# Source code for chatbot.nlp.entity_analyzer
from chatbot.common.chat_share_data import ShareData
# from konlpy.tag import Kkma
# from konlpy.tag import Twitter
from chatbot.common.chat_knowledge_mem_dict import ChatKnowledgeMemDict
from konlpy.tag import Mecab
import logging
class EntityAnalyzer(ShareData):
    """Parse raw text into a POS-tagged, entity-filtered sentence.

    Nouns that match a registered proper-noun dictionary are replaced by
    their entity key, and meaningless particles/endings/symbols are dropped.

    * Example
        input  : I bought a car yesterday
        output : I bought a car [time]
    """

    # POS tags accepted as entity candidates (general noun, proper noun, foreign word).
    _NOUN_TAGS = frozenset(['NNG', 'NNP', 'SL'])
    # POS tags dropped from the output (symbols, verb endings, particles, ...).
    _SKIP_TAGS = frozenset(['SY', 'EC', 'EP', 'VA', 'VX', 'XSV+EC', 'VX+EC',
                            'VX+EF', 'SF', 'VCP+EF', 'ETN', 'ETM', 'JKO', 'EF',
                            'VCP+EC', 'SSO', 'SSC', 'EP+EF'])
    # Location of the mecab-ko dictionary used by the tagger.
    _MECAB_DIC_PATH = '/usr/local/lib/mecab/dic/mecab-ko-dic'

    def __init__(self, cb_id):
        """Load the per-chatbot proper-noun knowledge from the memory cache.

        :param cb_id: chatbot id used to look up cached knowledge data
        """
        # NOTE(review): ShareData.__init__ is deliberately not called here,
        # mirroring the original code — confirm ShareData needs no own init.
        self.cb_id = cb_id
        # Entity keys sorted by priority (order prepared by ChatKnowledgeMemDict).
        self.proper_key_list = ChatKnowledgeMemDict.data_order.get(self.cb_id)
        # key -> [values]; index 2 is used below as a merge/substring flag
        # — TODO confirm the exact value schema upstream.
        self.proper_noun = ChatKnowledgeMemDict.data.get(self.cb_id).get('proper_noun')
        # Lazily created tagger, reused across parse() calls (the original
        # rebuilt Mecab — and reloaded its dictionary — on every request).
        self._mecab = None

    def parse(self, share_data):
        """Tag the request sentence and replace matched entities in place.

        :param share_data: ShareData carrying the raw request text
        :return: the same ShareData with morphed/convert-dict data filled in
        :raises Exception: wraps any failure during tagging or entity matching
        """
        try:
            input_data = share_data.get_request_data()
            pos_tags = self._pos_tagger(input_data)
            logging.info("■■■■■■■■■■ 형태소 분석 결과 : %s", pos_tags)
            pairs = [self._preprocess_data(share_data, tag) for tag in pos_tags]
            # Drop tokens flagged as meaningless (returned as ("", "")).
            pairs = [pair for pair in pairs if pair[0] != ""]
            convert_dict_data = [pair[1] for pair in pairs]
            morphed_data = [pair[0] for pair in pairs]
            share_data.set_convert_dict_data(convert_dict_data)
            share_data.set_morphed_data(morphed_data)
            logging.info("■■■■■■■■■■ Entity 분석 결과 : %s", convert_dict_data)
            return share_data
        except Exception as e:
            # Chain the original cause so the real traceback survives.
            raise Exception("error on entity anal : {0}".format(e)) from e

    def _preprocess_data(self, share_data, pos_tags):
        """Map one (surface, tag) pair to a (surface, converted) pair.

        Returns ("", "") for tags carrying no meaning so the caller can
        filter them out. For a noun that matches a proper-noun dictionary,
        the converted value becomes the entity key and the story slot for
        that key is updated on *share_data*.
        """
        surface, tag = pos_tags[0], pos_tags[1]
        convert_dict_data = surface
        if tag in self._NOUN_TAGS and len(surface) > 1:  # nouns of length >= 2 only
            key_check = [key for key in self.proper_key_list
                         if self._extract_proper_entity(surface, key)]
            if key_check:  # highest-priority matching entity wins
                matched_key = key_check[0]
                key_slot = surface
                # Value index 2 acts as a "merge with previously stored slot
                # value" flag — TODO confirm schema upstream.
                if self.proper_noun[matched_key][2]:
                    existing = share_data.get_story_slot_entity(matched_key)
                    # Fix: the original indexed existing[0] whenever the slot
                    # was not None, raising IndexError on an empty slot list;
                    # it also called get_story_slot_entity twice.
                    if existing:
                        key_slot = existing[0] + " " + surface
                share_data.set_story_slot_entity(matched_key, [key_slot])
                convert_dict_data = matched_key
        elif tag in self._SKIP_TAGS:
            return "", ""
        return surface, convert_dict_data

    def _pos_tagger(self, input, type='mecab'):
        """Run POS tagging and return a list of (surface, tag) pairs.

        :param input: raw text (coerced to str before tagging)
        :param type: tagger backend; only 'mecab' is currently supported
            (parameter names kept — including the builtin-shadowing ``type``
            — for backward compatibility with keyword callers)
        :return: list of (morpheme, POS-tag) tuples, or None for an
            unsupported backend (mirrors the original behavior)
        """
        if type == 'mecab':
            # Build the tagger once and cache it; getattr guards instances
            # created without __init__ (e.g. deserialized objects).
            if getattr(self, '_mecab', None) is None:
                self._mecab = Mecab(self._MECAB_DIC_PATH)
            return self._mecab.pos(str(input))
        # kkma / twitter backends were removed; see version control history.

    def _extract_proper_entity(self, value, key):
        """Return True if *value* appears in the dictionary for entity *key*.

        Matching is substring-based when the entity's flag (value index 2)
        is truthy, exact otherwise; both are case-insensitive.
        """
        value = value.lower()
        entries = ChatKnowledgeMemDict.data.get(self.cb_id).get(key)
        if entries is None:
            return False
        # Hoist the loop-invariant flag lookup out of the loop.
        substring_match = self.proper_noun.get(key)[2]
        for line in entries:
            line = line.lower().strip()
            if substring_match:
                if value in line:
                    return True
            elif line == value:
                return True
        return False