Source code for cluster.neuralnet.neuralnet_node_d2v
from cluster.neuralnet.neuralnet_node import NeuralNetNode
from gensim.models import doc2vec
from master.workflow.netconf.workflow_netconf_d2v import WorkFlowNetConfD2V
import os, json, logging
from konlpy.tag import Mecab
class NeuralNetNodeDoc2Vec(NeuralNetNode):
    """Workflow node that trains a gensim Doc2Vec model and serves lookups on it.

    ``run`` trains (or incrementally updates) the model from the data produced
    by the preceding ``preprocess`` node and stores it as ``model.bin`` under
    the configured model store path. ``predict`` loads that stored model and
    answers vector / similarity / vocabulary queries.
    """

    def run(self, conf_data):
        """Train the Doc2Vec model on the previous node's data and save it.

        :param conf_data: mapping that provides ``node_id`` and ``cls_pool``
        :return: ``len(model.raw_vocab)`` of the trained model
        :raises Exception: any failure is logged and re-raised as ``Exception``
        """
        try:
            # init parms for doc2vec node
            self._init_node_parm(conf_data['node_id'])
            self.cls_pool = conf_data['cls_pool']

            # get prev node for load data
            data_node_name = self._get_backward_node_with_type(conf_data['node_id'], 'preprocess')
            train_data_set = self.cls_pool[data_node_name[0]]

            # Load the stored model when one exists; only build a fresh one
            # otherwise. update_flag tells build_vocab whether a vocabulary
            # already exists and must be extended instead of created.
            model_path = self._get_model_path()
            if os.path.exists(model_path):
                model = doc2vec.Doc2Vec.load(model_path)
                update_flag = True
            else:
                model = doc2vec.Doc2Vec(size=self.vector_size, window=self.window_size)
                update_flag = False

            # train vocab and model, one training file at a time
            while train_data_set.has_next():
                train_data = doc2vec.TaggedLineDocument(train_data_set.train_file_path())
                for _ in range(self.iter_size):
                    model.build_vocab(train_data, update=update_flag)
                    # after the very first build every later call is an update
                    update_flag = True
                    model.train(train_data)
                train_data_set.next()

            os.makedirs(self.md_store_path, exist_ok=True)
            model.save(model_path)
            # NOTE(review): some gensim versions clear raw_vocab once the
            # vocabulary is finalized - confirm this is the intended count.
            return len(model.raw_vocab)
        except Exception as e:
            # Log at error level with the traceback; the failure is re-raised.
            logging.exception("[Doc2Vector Train Process] : {0}".format(e))
            raise Exception(e) from e

    def _init_node_parm(self, node_id):
        """Read this node's settings from its workflow configuration.

        :param node_id: id of the node whose configuration is loaded
        """
        wf_conf = WorkFlowNetConfD2V(node_id)
        self.md_store_path = wf_conf.get_model_store_path()
        self.window_size = wf_conf.get_window_size()
        self.vector_size = wf_conf.get_vector_size()
        self.batch_size = wf_conf.get_batch_size()
        self.iter_size = wf_conf.get_iter_size()

    def _get_model_path(self):
        """Return the path of the stored model file (``<md_store_path>/model.bin``)."""
        return self.md_store_path + '/model.bin'

    def _set_progress_state(self):
        """Placeholder; always returns ``None``."""
        return None

    def predict(self, node_id, parm=None):
        """Answer a query against the stored Doc2Vec model.

        ``parm['type']`` selects the operation; ``parm['val_1']`` and
        ``parm['val_2']`` carry its inputs (both default to an empty list):

        * ``vector`` / ``train``: vector of every key in ``val_1``; a zero
          vector of ``vector_size`` for keys the model does not contain.
        * ``sim``: ``most_similar`` with ``val_1`` as positive and ``val_2``
          as negative list (top 5).
        * ``similarity``: similarity of ``val_1[0]`` and ``val_2[0]``.
        * ``dict`` / ``vocab2index``: vocabulary index of every key in
          ``val_1``; ``0`` for unknown keys.
        * ``index2vocab``: word for every index in ``val_1`` that lies below
          the vocabulary size; other indexes are skipped.
        * ``povb2vocab``: for every score vector in ``val_1`` the entries of
          ``'#'`` and ``'SF'`` are set to 0.0 (in place) and the word at the
          arg-max position is returned.
        * ``vec2word``: for every vector in ``val_1`` the first
          ``similar_by_vector`` guess that is not a newline, ``'#'`` or
          ``'./SF'`` and whose score is positive.
        * ``vocablen``: ``len(model.wv.vocab) - 1``.
        * ``vector-tag`` / ``sim-tag`` / ``similarity-tag``: as the type
          without the ``-tag`` suffix, but ``val_1`` / ``val_2`` are raw
          strings that are first converted to ``word/TAG`` tokens by Mecab.

        :param node_id: id of the node whose stored model is queried
        :param parm: query mapping as described above; when omitted a
            ``vector`` query with empty inputs is used. It is not modified.
        :return: list of results (an ``int`` for ``vocablen``)
        :raises Exception: when no stored model exists, the type is unknown,
            or the underlying model call fails
        """
        if parm is None:
            # Fresh dict per call; a dict literal as default would be shared.
            parm = {"type": "vector", "val_1": [], "val_2": []}
        try:
            self._init_node_parm(node_id)
            model_path = self._get_model_path()
            if not os.path.exists(model_path):
                raise Exception("No pretrained model exist")
            model = doc2vec.Doc2Vec.load(model_path)

            # Work on locals so the caller's mapping is left untouched.
            req_type = parm['type']
            val_1 = parm.get('val_1', [])
            val_2 = parm.get('val_2', [])

            if req_type in ('vector-tag', 'sim-tag', 'similarity-tag'):
                if 'val_1' in parm:
                    val_1 = self._pos_raw_data(val_1)
                if 'val_2' in parm:
                    val_2 = self._pos_raw_data(val_2)
                # Continue as the base type; without this the tagged request
                # would match no branch below and always be rejected.
                req_type = req_type[:-len('-tag')]

            return_val = []
            if req_type in ('vector', 'train'):
                for key in val_1:
                    if key in model:
                        return_val.append(model[key])
                    else:
                        return_val.append([0.] * self.vector_size)
            elif req_type == 'sim':
                return_val.append(model.most_similar(positive=val_1, negative=val_2, topn=5))
            elif req_type == 'similarity':
                return_val.append(model.similarity(val_1[0], val_2[0]))
            elif req_type in ('dict', 'vocab2index'):
                for key in val_1:
                    if key in model:
                        return_val.append(model.wv.vocab[key].index)
                    else:
                        return_val.append(0)
            elif req_type == 'index2vocab':
                index2word = model.wv.index2word
                for key in val_1:
                    # strict bound: key == len(index2word) is out of range
                    if len(index2word) > key:
                        return_val.append(index2word[key])
            elif req_type == 'povb2vocab':
                filter_list = ('#', 'SF')
                index2word = model.wv.index2word
                for key in val_1:
                    # suppress the filtered tokens before picking the best one
                    for filter_set in filter_list:
                        key[model.wv.vocab[filter_set].index] = 0.0
                    index = key.argmax(axis=0)
                    if len(index2word) > index:
                        return_val.append(index2word[index])
            elif req_type == 'vec2word':
                for key in val_1:
                    for guess in model.similar_by_vector(key):
                        if guess[0] not in ['\n', '#', './SF'] and guess[1] > 0:
                            return_val.append(guess[0])
                            break
            elif req_type == 'vocablen':
                return len(model.wv.vocab) - 1
            else:
                raise Exception("Not available type : {0}".format(parm['type']))
            return return_val
        except Exception as e:
            raise Exception(e) from e

    def _pos_raw_data(self, lt):
        """Convert raw strings to a flat list of ``word/TAG`` tokens via Mecab.

        :param lt: list type value (raw strings)
        :return: list with one ``"word/TAG"`` string per token, in input order
        """
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        return_arr = []
        for raw in lt:
            return_arr.extend("{0}/{1}".format(word, tag) for word, tag in mecab.pos(raw))
        return return_arr

    def eval(self, node_id, parm=None):
        """Evaluation hook; not implemented for this node (does nothing).

        :param node_id: unused
        :param parm: unused
        :return: None
        """
        pass