Source code for cluster.neuralnet.neuralnet_node_w2v

from cluster.neuralnet.neuralnet_node import NeuralNetNode
from gensim.models import word2vec
from master.workflow.netconf.workflow_netconf_w2v import WorkFlowNetConfW2V
import os, json, logging
import numpy as np
from konlpy.tag import Mecab

class NeuralNetNodeWord2Vec(NeuralNetNode):
    def run(self, conf_data):
        try:
            # init parms for word2vec node
            self._init_node_parm(conf_data['node_id'])
            self.cls_pool = conf_data['cls_pool']

            # get prev node to load data from
            data_node_name = self._get_backward_node_with_type(conf_data['node_id'], 'preprocess')
            train_data_set = self.cls_pool[data_node_name[0]]

            # load model for train (reuse an existing model if one was saved before)
            update_flag = False
            model = word2vec.Word2Vec(size=self.vector_size, window=self.window_size,
                                      min_count=self.min_count, workers=4)
            if os.path.exists(self._get_model_path()):
                model = word2vec.Word2Vec.load(self._get_model_path())
                update_flag = True

            # first pass: build the vocabulary batch by batch
            while train_data_set.has_next():
                # iterate to improve model accuracy
                for x in range(0, self.iter_size):
                    # per line in file
                    for i in range(0, train_data_set.data_size(), self.batch_size):
                        data_set = train_data_set[i:i + self.batch_size]
                        if not update_flag:
                            model.build_vocab(data_set, update=False)
                            update_flag = True
                        else:
                            model.build_vocab(data_set, update=True)
                train_data_set.next()

            # after all new words are stacked on the vocab, start training
            train_data_set.reset_pointer()
            while train_data_set.has_next():
                # iterate to improve model accuracy
                for x in range(0, self.iter_size):
                    # per line in file
                    for i in range(0, train_data_set.data_size(), self.batch_size):
                        data_set = train_data_set[i:i + self.batch_size]
                        model.train(data_set)
                train_data_set.next()

            os.makedirs(self.md_store_path, exist_ok=True)
            model.save(self._get_model_path())
            return True
        except Exception as e:
            logging.info("[Word2vec Train Process] : {0}".format(e))
            raise Exception(e)
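    # A minimal sketch (an assumption, not part of the original workflow) of the
    # two-pass gensim flow run() performs: build the vocabulary first, then train.
    # 'sentences' is a hypothetical list of token lists in the same "word/tag"
    # form that _pos_raw_data below produces.
    #
    #   from gensim.models import word2vec
    #   sentences = [["학교/NNG", "에/JKB"], ["나/NP", "는/JX"]]
    #   model = word2vec.Word2Vec(size=100, window=5, min_count=1, workers=4)
    #   model.build_vocab(sentences)      # first pass: collect the vocabulary
    #   model.train(sentences)            # second pass: learn the embeddings
    #   model.save("/tmp/model.bin")      # same file layout run() writes via _get_model_path()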
    def _init_node_parm(self, node_id):
        wf_conf = WorkFlowNetConfW2V(node_id)
        self.md_store_path = wf_conf.get_model_store_path()
        self.window_size = wf_conf.get_window_size()
        self.vector_size = wf_conf.get_vector_size()
        self.batch_size = wf_conf.get_batch_size()
        self.iter_size = wf_conf.get_iter_size()
        self.min_count = wf_conf.get_min_count()
        self.preprocess = wf_conf.preprocess_type()

    def _get_model_path(self):
        return ''.join([self.md_store_path, '/model.bin'])

    def _set_progress_state(self):
        return None
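    # Illustrative only: if the netconf's model store path were
    # "/hoyai/model/nn00001/w2v" (a hypothetical value), _get_model_path() would
    # return "/hoyai/model/nn00001/w2v/model.bin", which is the single file that
    # run() saves and predict() loads.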
    def predict(self, node_id, parm={"type": "vector", "val_1": [], "val_2": []}):
        """
        predict service
        1. type (vector) : return vector
        2. type (sim) : positive list & negative list
        :param node_id:
        :param parm:
        :return:
        """
        try:
            self._init_node_parm(node_id)
            return_val = []
            if not os.path.exists(self._get_model_path()):
                raise Exception("No pretrained model exists")
            model = word2vec.Word2Vec.load(self._get_model_path())

            if parm['type'] in ['vector', 'sim', 'similarity']:
                if 'val_1' in parm:
                    parm['val_1'] = np.array(self._preprocess(parm['val_1'], type=self.preprocess)).flatten().tolist()
                if 'val_2' in parm:
                    parm['val_2'] = np.array(self._preprocess(parm['val_2'], type=self.preprocess)).flatten().tolist()

            if parm['type'] in ['vector', 'train']:
                return_val = self._predict_word2vec(parm, return_val, model)
            elif parm['type'] in ['sim']:
                return_val = self._predict_sim(parm, return_val, model)
            elif parm['type'] in ['similarity']:
                return_val.append(model.similarity(parm['val_1'][0], parm['val_2'][0]))
            elif parm['type'] in ['dict'] or parm['type'] in ['vocab2index']:
                return_val = self._predict_vocab2index(parm, return_val, model)
            elif parm['type'] in ['index2vocab']:
                return_val = self._predict_index2vocab(parm, return_val, model)
            elif parm['type'] in ['povb2vocab']:
                return_val = self._predict_prob2vocab(parm, return_val, model)
            elif parm['type'] in ['vec2word']:
                return_val = self._predict_vector2word(parm, return_val, model)
            elif parm['type'] in ['vocablen']:
                return len(model.wv.vocab) - 1
            elif parm['type'] in ['model']:
                return model
            else:
                raise Exception("Not available type : {0}".format(parm['type']))
            return return_val
        except Exception as e:
            raise Exception(e)
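    # Illustrative (not exhaustive) parm shapes accepted by predict() above;
    # the token values are hypothetical Mecab "word/tag" strings.
    #   {"type": "vector",      "val_1": ["학교/NNG"], "val_2": []}        -> list of embedding vectors
    #   {"type": "sim",         "val_1": ["왕/NNG"],   "val_2": ["남자/NNG"]} -> top-5 most_similar pairs
    #   {"type": "vocab2index", "val_1": ["학교/NNG"]}                      -> vocabulary indexes (PAD/START/UNKNOWN map past the vocab)
    #   {"type": "index2vocab", "val_1": [3, 7]}                            -> words for the given indexes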
    def _predict_word2vec(self, parm, return_val, model):
        """
        get word and return its embedded vector
        :param parm:
        :param return_val:
        :param model:
        :return:
        """
        for key in parm['val_1']:
            if key in ['#']:
                return_val.append([0.0005] * self.vector_size)
            elif key in ['@']:
                return_val.append([0.0009] * self.vector_size)
            elif key in model:
                return_val.append(model[key].tolist())
            else:
                return_val.append([0.0002] * self.vector_size)
        return return_val

    def _predict_sim(self, parm, return_val, model):
        """
        return most similar vocabs (close to each other)
        :param parm:
        :param return_val:
        :param model:
        :return:
        """
        try:
            return_val.append(model.most_similar(positive=parm['val_1'], negative=parm['val_2'], topn=5))
            return return_val
        except Exception as e:
            return_val.append(str(e))
            return return_val

    def _predict_vocab2index(self, parm, return_val, model):
        """
        find vocab index num
        :param parm:
        :param return_val:
        :param model:
        :return:
        """
        for key in parm['val_1']:
            if key in ['#']:       # padding
                return_val.append(len(model.wv.index2word))
            elif key in ['@']:     # starting
                return_val.append(len(model.wv.index2word) + 1)
            elif key in model:     # word on vocab
                return_val.append(model.wv.vocab[key].index)
            else:                  # unknown
                return_val.append(len(model.wv.index2word) + 2)
        return return_val

    def _predict_index2vocab(self, parm, return_val, model):
        """
        convert index number to word
        :param parm:
        :param return_val:
        :param model:
        :return:
        """
        for key in parm['val_1']:
            if len(model.wv.index2word) > key:
                return_val.append(model.wv.index2word[key])
        return return_val

    def _predict_prob2vocab(self, parm, return_val, model):
        """
        prob matrix to max arg matched vocab
        :param parm:
        :param return_val:
        :param model:
        :return:
        """
        for key in parm['val_1']:
            # zero out ignored words
            filter_list = []
            for filter_set in filter_list:
                if filter_set in model:
                    key[model.wv.vocab[filter_set].index] = 0.0

            # zero out ignored indexes
            filter_index = []
            for idx in filter_index:
                key[idx] = 0.0

            if 'prob_idx' in parm:
                sorted_list = sorted(key, reverse=True)
                index = key.index(sorted_list[parm['prob_idx']])
            else:
                index = key.argmax(axis=0)

            if len(model.wv.index2word) > index:
                return_val.append(model.wv.index2word[index])
            elif len(model.wv.index2word) == index:
                return_val.append("PAD")
            elif len(model.wv.index2word) + 1 == index:
                return_val.append("START")
            elif len(model.wv.index2word) + 2 == index:
                return_val.append("UNKNOWN")
        return return_val

    def _predict_vector2word(self, parm, return_val, model):
        """
        embedded vector to most similar word
        :param parm:
        :param return_val:
        :param model:
        :return:
        """
        for key in parm['val_1']:
            for guess in model.similar_by_vector(key):
                if guess[0] not in ['\n', '#', './SF'] and guess[1] > 0:
                    return_val = return_val + [guess[0]]
                    break
        return return_val

    def _pos_raw_data(self, lt):
        """
        run Mecab POS tagging on raw sentences
        :param lt: list type value
        :return:
        """
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        return_arr = []
        for raw in lt:
            pos = mecab.pos(raw)
            for word, tag in pos:
                return_arr.append("{0}/{1}".format(word, tag))
        return return_arr
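    # Example (tags shown are only a rough sketch of mecab-ko-dic output) of what
    # _pos_raw_data returns:
    #   self._pos_raw_data(["나는 학교에 간다"])
    #   -> e.g. ["나/NP", "는/JX", "학교/NNG", "에/JKB", ...]
    # Each surface form is joined with its POS tag by "/" so the Word2Vec
    # vocabulary can distinguish homographs that carry different tags.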
    def eval(self, node_id, conf, data=None, result=None):
        """
        :param node_id:
        :param conf:
        :param data:
        :param result:
        :return:
        """
        result.set_result_data_format({})
        return result
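
# --- Hypothetical usage sketch (not part of the original module) ---
# Assumes a workflow node id whose netconf points at an already trained model;
# "nn00001_w2v_node" and the tokens below are illustrative only.
if __name__ == "__main__":
    node = NeuralNetNodeWord2Vec()
    # embedding vectors for already POS-tagged tokens
    vectors = node.predict("nn00001_w2v_node",
                           parm={"type": "vector", "val_1": ["학교/NNG"], "val_2": []})
    # vocabulary index of each token (unknown words map past the end of the vocab)
    indexes = node.predict("nn00001_w2v_node",
                           parm={"type": "vocab2index", "val_1": ["학교/NNG"], "val_2": []})
    logging.info("vectors: {0}, indexes: {1}".format(len(vectors), indexes))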