Source code for cluster.generator.ner_augmentation

import codecs, os
import pandas as pd
from konlpy.tag import Mecab
from numba import *
import numpy as np
import threading, logging

class DataAugmentation:
    """
    Data augmentation class for NLP; builds IOB / plain / intent training
    files by filling the entity placeholders of sentence patterns with
    words sampled from an entity dictionary.

    The dictionary CSV has one column per entity key; a pattern sentence
    refers to an entity by containing that column name as a word.

    Usage::

        aug = DataAugmentation(conf)   # conf: dict, see __init__
        aug.run()                      # multi-threaded

        # or a single pass without threads
        aug.load_dict()
        aug.convert_data(0)
    """

    # mecab-ko dictionary used when the conf carries no "mecab_dic_path".
    DEFAULT_MECAB_DIC_PATH = '/usr/local/lib/mecab/dic/mecab-ko-dic'

    class ThreadCls(threading.Thread):
        """
        Worker thread: repeats "sample the dictionary, then convert all
        patterns" ``dict_sample_iter`` times for one output file index.
        """

        def __init__(self, obj, idx):
            """
            :param obj: the owning DataAugmentation instance
            :param idx: worker index, used as prefix of the output file names
            """
            threading.Thread.__init__(self)
            self.obj = obj
            self.idx = idx

        def run(self):
            """
            run the sample/convert cycle
            :return: None
            """
            for _ in range(self.obj.dict_sample_iter):
                self.obj.load_dict()
                self.obj.convert_data(self.idx)

        def join(self, timeout=None):
            """
            wait for the thread to finish
            :param timeout: optional timeout in seconds, forwarded to Thread.join
            :return: always True
            """
            threading.Thread.join(self, timeout)
            return True

    def __init__(self, conf):
        """
        init params
        :param conf: dict with the keys use_mecab, max_file_size (bytes),
                     pattern_data_path, augmented_out_path, dict_path,
                     out_format_type ('iob', 'plain' or 'intent'),
                     dict_sample_size, dict_sample_iter, thread_num and,
                     optionally, mecab_dic_path
        """
        self.aug_file_cnt = 0
        self.use_mecab = conf.get("use_mecab")
        self.max_file_size = conf.get("max_file_size")
        self.pattern_data_path = conf.get("pattern_data_path")
        self.augmented_out_path = conf.get("augmented_out_path")
        self.dict_path = conf.get("dict_path")
        self.out_format_type = conf.get("out_format_type")
        self.mecab_dic_path = conf.get("mecab_dic_path", self.DEFAULT_MECAB_DIC_PATH)
        # Tokenizer instance; stays None unless use_mecab is set.
        self.mecab = None
        self.ner_dicts = {}
        self.gpu_use = True
        self.dict_sample_size = int(conf.get("dict_sample_size"))
        self.dict_sample_iter = int(conf.get("dict_sample_iter"))
        self.thread_num = int(conf.get("thread_num"))

    def run(self):
        """
        start ``thread_num`` worker threads and wait for all of them
        :return: None
        """
        job_list = [self.ThreadCls(self, idx) for idx in range(self.thread_num)]
        for job in job_list:
            job.start()
        for job in job_list:
            job.join()

    def load_dict(self):
        """
        load a random sample of the entity dictionary from the csv file
        into ``self.ner_dicts`` (column name -> list of distinct values)
        :return: None
        """
        df_csv_read = pd.read_csv(self.dict_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        # DataFrame.sample raises when n exceeds the row count, so clamp it.
        sample_size = min(self.dict_sample_size, len(df_csv_read))
        df_csv_read = df_csv_read.sample(n=sample_size)
        # Build the mapping locally and publish it with one assignment so
        # that other worker threads never observe a half-filled dict.
        self.ner_dicts = {
            col: df_csv_read[col].dropna().drop_duplicates().tolist()
            for col in df_csv_read.keys()
        }

    def _check_all_match(self, words):
        """
        find the words of a pattern which are entity keys
        :param words: list of words of one pattern sentence
        :return: list of matching keys (newline stripped), in sentence order
        """
        stripped = (word.replace('\n', '') for word in words)
        return [word for word in stripped if word in self.ner_dicts]

    def _entity_tokens(self, word):
        """
        split one dictionary word into tokens
        :param word: dictionary value
        :return: list of non-empty tokens; mecab morphs when a tokenizer is
                 loaded, whitespace separated pieces otherwise
        """
        text = str(word)
        mecab = getattr(self, 'mecab', None)
        pieces = mecab.morphs(text) if mecab is not None else text.split()
        tokens = (piece.replace(' ', '') for piece in pieces)
        return [token for token in tokens if token]

    def _seed_sentences(self, key, pattern, entity_tokens):
        """
        build the first generation of sentences: one per dictionary word
        :param key: entity key to fill in
        :param pattern: list of words of the pattern sentence
        :param entity_tokens: token lists, one per dictionary word of key
        :return: list of sentences, each a list of (token, tag) tuples
        """
        slots = []
        for slot in pattern:
            for junk in ('\n', 'NaN'):
                slot = slot.replace(junk, '')
            slots.append(slot)
        sentences = []
        for tokens in entity_tokens:
            sentence = []
            for slot in slots:
                if key in slot:
                    sentence.extend((token, key) for token in tokens)
                else:
                    # Placeholders of other keys are kept as plain 'O'
                    # words here and resolved by _expand_sentences.
                    sentence.append((slot, 'O'))
            sentences.append(sentence)
        return sentences

    def _expand_sentences(self, key, sentences, entity_tokens):
        """
        replace every sentence which still holds the placeholder ``key``
        by one variant per dictionary word
        :param key: entity key to fill in
        :param sentences: sentences generated so far
        :param entity_tokens: token lists, one per dictionary word of key
        :return: new list of sentences
        """
        # Tokens of one word are joined by a blank; the iob formatter
        # splits them again and tags every token.
        replacements = [' '.join(tokens) for tokens in entity_tokens if tokens]
        expanded = []
        for sentence in sentences:
            if all(token != key for token, _ in sentence):
                expanded.append(sentence)
                continue
            for replacement in replacements:
                expanded.append([(replacement, key) if token == key else (token, tag)
                                 for token, tag in sentence])
        return expanded

    def _aug_sent(self, keys, pattern, return_aug_sent=None):
        """
        function which actually augment sentences with given pattern and keys
        :param keys: entity keys found in the pattern (not modified)
        :param pattern: sentence pattern as list of words
        :param return_aug_sent: optional list of already generated
                                sentences; it is updated in place
        :return: list of augmented sentences (lists of (token, tag) tuples),
                 or None when the augmentation failed
        """
        if return_aug_sent is None:
            return_aug_sent = []
        try:
            for key in list(keys):
                entity_tokens = [self._entity_tokens(word) for word in self.ner_dicts[key]]
                if not return_aug_sent:
                    return_aug_sent.extend(self._seed_sentences(key, pattern, entity_tokens))
                else:
                    return_aug_sent[:] = self._expand_sentences(key, return_aug_sent, entity_tokens)
            return return_aug_sent
        except Exception as e:
            # Best effort: one broken pattern must not stop the whole job.
            print("error on nlp data augmentation :{0}".format(e))
            return None

    def _out_path(self, idx, extension):
        """
        :param idx: worker index
        :param extension: file extension including the dot
        :return: path of the current output file of that worker
        """
        return ''.join([self.augmented_out_path, '/' + str(idx), 'Test',
                        str(self.aug_file_cnt), extension])

    def _write_output(self, idx, extension, rows, header=None, rollover_mode="w"):
        """
        append rows to the current output file; once that file has reached
        ``max_file_size`` the file counter is increased and a new file is used
        :param idx: worker index
        :param extension: file extension including the dot
        :param rows: list of strings to write
        :param header: optional first line for new (or empty) files
        :param rollover_mode: open mode used for the file after a rollover
        :return: None
        """
        path = self._out_path(idx, extension)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        mode = "a"
        if os.path.exists(path) and os.path.getsize(path) >= self.max_file_size:
            # NOTE(review): the counter is shared by all worker threads, so
            # the file numbers of one worker may have gaps.
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = self._out_path(idx, extension)
            mode = rollover_mode
        needs_header = header is not None and (
            mode == "w" or not os.path.exists(path) or os.path.getsize(path) == 0)
        # Explicit encoding: the data is Korean text and must not depend on
        # the locale of the machine.
        with open(path, mode, encoding="utf-8") as out_file:
            if needs_header:
                out_file.write(header)
            out_file.writelines(rows)

    def _iob_formatter(self, aug_data, idx):
        """
        save aug list as iob file format ("token tag" per line, sentences
        separated by an empty line)
        :param aug_data: augmented list of sentence (None/empty is ignored)
        :param idx: worker index
        :return: None
        """
        if not aug_data:
            return
        rows = []
        for line in aug_data:
            for text, tag in line:
                # Empty tokens would produce malformed one column lines.
                rows.extend(''.join([token, ' ', tag, '\n'])
                            for token in text.split(' ') if token)
            rows.append('\n')
        self._write_output(idx, '.iob', rows)

    def _plain_formatter(self, aug_data, idx):
        """
        save aug list as plain text, one blank separated sentence per line
        :param aug_data: augmented list of sentence (None/empty is ignored)
        :param idx: worker index
        :return: None
        """
        if not aug_data:
            return
        rows = [''.join([text + ' ' for text, _ in line]) + '\n' for line in aug_data]
        self._write_output(idx, '.out', rows)

    def _intent_formatter(self, aug_data, key, idx):
        """
        save aug list as csv with the columns encode (sentence) and
        decode (intent)
        :param aug_data: augmented list of sentence (None/empty is ignored)
        :param key: intent label written to the decode column
        :param idx: worker index
        :return: None
        """
        if not aug_data:
            return
        # NOTE(review): fields are not quoted, a comma inside a sentence
        # would break the csv layout.
        rows = [''.join([text + ' ' for text, _ in line]) + ',' + str(key) + '\n'
                for line in aug_data]
        self._write_output(idx, '.csv', rows, header='encode,decode\n', rollover_mode="a")

    def convert_data(self, idx):
        """
        augment data with entity list and pattern; errors are printed,
        not raised
        :param idx: worker index
        :return: None
        """
        try:
            if self.out_format_type == 'intent':
                self._conv_type_b(idx)
            else:
                self._conv_type_a(idx)
        except Exception as e:
            print("error log : {0}".format(e))

    def _prepare_tokenizer(self):
        """
        create the mecab tokenizer once per conversion pass instead of
        once per pattern line
        :return: None
        """
        if self.use_mecab:
            dic_path = getattr(self, 'mecab_dic_path', self.DEFAULT_MECAB_DIC_PATH)
            self.mecab = Mecab(dic_path)

    def _augment_line(self, line):
        """
        augment one pattern sentence
        :param line: pattern sentence (cell of the 'encode' column)
        :return: list of augmented sentences, or None for an empty cell or
                 a failed augmentation
        """
        if pd.isna(line):
            return None
        if self.use_mecab:
            words = [word for word, _tag in self.mecab.pos(str(line))]
        else:
            words = str(line).split(' ')
        return self._aug_sent(self._check_all_match(words), words, [])

    @staticmethod
    def _report_progress(idx, line_no):
        """
        print a progress message every 100 lines
        :param idx: worker index
        :param line_no: zero based number of the processed line
        :return: None
        """
        if line_no % 100 == 0:
            print("====Therad{0} : {1} line job done".format(idx, line_no))

    def _conv_type_b(self, idx):
        """
        convert the pattern file to the intent csv format
        :param idx: worker index
        :return: None
        """
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        self._prepare_tokenizer()
        pairs = zip(df_csv_read['decode'].values, df_csv_read['encode'].values)
        for i, (key, line) in enumerate(pairs):
            self._intent_formatter(self._augment_line(line), key, idx)
            self._report_progress(idx, i)

    def _conv_type_a(self, idx):
        """
        convert the pattern file to the plain or iob format
        :param idx: worker index
        :return: None
        :raises ValueError: when out_format_type is neither 'plain' nor 'iob'
        """
        # Validate the format once, before any file is read.
        if self.out_format_type == 'plain':
            formatter = self._plain_formatter
        elif self.out_format_type == 'iob':
            formatter = self._iob_formatter
        else:
            raise ValueError(' '.join(['not', 'plain', 'or iob']))
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        self._prepare_tokenizer()
        for i, line in enumerate(df_csv_read['encode'].values):
            formatter(self._augment_line(line), idx)
            self._report_progress(idx, i)
# da = DataAugmentation({
#     "use_mecab": True,
#     "max_file_size": 100000000,
#     "pattern_data_path": "/hoya_model_root/aug/pattern.csv",
#     "augmented_out_path": "/hoya_model_root/aug/aug_0810/",
#     "dict_path": "/hoya_model_root/aug/dict.csv",
#     "out_format_type": "iob",
#     "dict_sample_size" : 3,
#     "dict_sample_iter" : 500,
#     "thread_num" : 8
# })
#