from cluster.neuralnet.neuralnet_node import NeuralNetNode
from master.workflow.netconf.workflow_netconf_bilstmcrf import WorkFlowNetConfBiLstmCrf
import os, json, logging
import numpy as np
import tensorflow as tf
from cluster.common.neural_common_bilismcrf import BiLstmCommon
from common.graph.nn_graph_manager import NeuralNetModel
class NeuralNetNodeBiLstmCrf(NeuralNetNode, BiLstmCommon):
    def run(self, conf_data):
        """
        Train entry point: build the graph, then iterate over the files of
        the linked preprocess node and train the BiLSTM-CRF tagger on each.
        :param conf_data: workflow configuration dict (expects 'node_id')
        :return: [best_score] from the last trained file
        """
        try:
            self._init_node_parm(conf_data['node_id'])
# build graph
self.build_graph()
# get prev node for load data
train_data_set = self.get_linked_prev_node_with_grp('preprocess')[0]
best_score = 0
with tf.Session() as sess:
                while train_data_set.has_next():
logging.info("------------------------------")
logging.info(train_data_set.get_file_name())
logging.info("------------------------------")
                    # create datasets (note: dev and train are built from the
                    # same file here, so the dev score is an in-sample score)
                    dev = self.CoNLLDataset(train_data_set.get_file_name(),
                                            self.processing_word,
                                            self.processing_tag,
                                            self.max_iter)
                    train = self.CoNLLDataset(train_data_set.get_file_name(),
                                              self.processing_word,
                                              self.processing_tag,
                                              self.max_iter)
# train
best_score = self.train(train, dev, self.vocab_tags, sess)
# save model
self.model_output = ''.join([self.md_store_path, '/', self.make_batch(self.node_id)[1], '/'])
if not os.path.exists(self.model_output):
os.makedirs(self.model_output)
self.saver.save(sess, self.model_output)
train_data_set.next()
return [best_score]
        except Exception as e:
            logging.error("[BiLstmCrf Train Process] : {0}".format(e))
            raise Exception("error on bilstm crf train process : {0}".format(e))
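
    # Rough usage sketch (the node id is hypothetical and the linked
    # preprocess node must already have produced CoNLL-style files):
    #   node = NeuralNetNodeBiLstmCrf()
    #   scores = node.run({'node_id': 'nn00001_1_netconf_node'})
    #   # scores -> [best dev f1 of the last trained file]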
    def _init_node_parm(self, node_id):
        """
        Initialize parameters for the BiLSTM-CRF node from the workflow config
        :param node_id:
        :return:
        """
try:
wf_conf = WorkFlowNetConfBiLstmCrf(node_id)
self.node_id = node_id
self.md_store_path = wf_conf.get_model_store_path
# create dict folder for ner if not exists
dict_path = ''.join([self.md_store_path, '/dict/'])
if not os.path.exists(dict_path):
os.makedirs(dict_path)
self.trimmed_filename = ''.join([dict_path, 'words.vec'])
self.charembed_filename = ''.join([dict_path, 'char.vec'])
self.words_filename = ''.join([dict_path, 'words.txt'])
self.tags_filename = ''.join([dict_path, 'tags.txt'])
self.chars_filename = ''.join([dict_path, 'chars.txt'])
self.embeddings = self.get_trimmed_glove_vectors(self.trimmed_filename)
self.char_embed = self.get_trimmed_glove_vectors(self.charembed_filename)
self.vocab_words = self.load_vocab(self.words_filename)
self.vocab_tags = self.load_vocab(self.tags_filename)
self.vocab_chars = self.load_vocab(self.chars_filename)
self.nchars = len(self.vocab_chars)
self.ntags = len(self.vocab_tags)
self.lowercase = wf_conf.lowercase
self.max_iter = wf_conf.max_iter
self.crf = wf_conf.crf
self.chars = wf_conf.chars
self.processing_word = self.get_processing_word(self.vocab_words,
self.vocab_chars,
lowercase=self.lowercase,
chars=self.chars)
self.processing_tag = self.get_processing_word(self.vocab_tags,
lowercase=False)
self.dim = wf_conf.dim
self.dim_char = wf_conf.dim_char
self.train_embeddings = wf_conf.train_embeddings
self.nepochs = wf_conf.nepochs
self.p_dropout = wf_conf.p_dropout
self.batch_size = wf_conf.batch_size
self.p_lr = wf_conf.p_lr
self.lr_decay = wf_conf.lr_decay
self.nepoch_no_imprv = wf_conf.nepoch_no_imprv
self.hidden_size = wf_conf.hidden_size
self.char_hidden_size = wf_conf.char_hidden_size
            if self.check_batch_exist(node_id):
                self.output_path = ''.join([self.md_store_path, '/', self.get_eval_batch(node_id), '/'])
            else:
                self.output_path = ''.join([self.md_store_path, '/', self.make_batch(node_id)[1], '/'])
self.model_output = self.output_path
self.log_path = self.output_path + "/log/log.txt"
        except Exception as e:
            raise Exception(e)
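
    # Expected dict layout under md_store_path (file names from the code
    # above): dict/words.vec and dict/char.vec hold the trimmed word/char
    # embedding matrices; dict/words.txt, dict/tags.txt and dict/chars.txt
    # hold the vocabularies.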
    def add_placeholders(self):
"""
Adds placeholders to self
"""
# shape = (batch size, max length of sentence in batch)
self.word_ids = tf.placeholder(tf.int32, shape=[None, None], name="word_ids")
# shape = (batch size)
self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], name="sequence_lengths")
# shape = (batch size, max length of sentence, max length of word)
self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None], name="char_ids")
# shape = (batch_size, max_length of sentence)
self.word_lengths = tf.placeholder(tf.int32, shape=[None, None], name="word_lengths")
# shape = (batch size, max length of sentence in batch)
self.labels = tf.placeholder(tf.int32, shape=[None, None], name="labels")
# hyper parameters
self.dropout = tf.placeholder(dtype=tf.float32, shape=[], name="dropout")
self.lr = tf.placeholder(dtype=tf.float32, shape=[], name="lr")
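
    # Shape sketch for one feed (illustrative numbers only: a batch of 2
    # sentences, max 4 words per sentence, max 6 chars per word):
    #   word_ids         -> (2, 4)
    #   sequence_lengths -> (2,)   e.g. [3, 4]
    #   char_ids         -> (2, 4, 6)
    #   word_lengths     -> (2, 4)
    #   labels           -> (2, 4)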
    def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
"""
Given some data, pad it and build a feed dictionary
Args:
            words: list of sentences. A sentence is a list of word ids;
                with char features on, each element is a (char ids, word id) tuple
labels: list of ids
lr: (float) learning rate
dropout: (float) keep prob
Returns:
dict {placeholder: value}
"""
# perform padding of the given data
if self.chars:
char_ids, word_ids = zip(*words)
word_ids, sequence_lengths = self.pad_sequences(word_ids, 0)
char_ids, word_lengths = self.pad_sequences(char_ids, pad_tok=0, nlevels=2)
else:
word_ids, sequence_lengths = self.pad_sequences(words, 0)
# build feed dictionary
feed = {
self.word_ids: word_ids,
self.sequence_lengths: sequence_lengths
}
if self.chars:
feed[self.char_ids] = char_ids
feed[self.word_lengths] = word_lengths
if labels is not None:
labels, _ = self.pad_sequences(labels, 0)
feed[self.labels] = labels
if lr is not None:
feed[self.lr] = lr
if dropout is not None:
feed[self.dropout] = dropout
return feed, sequence_lengths
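
    # Padding sketch (word-level only, chars off; ids are hypothetical):
    #   words = [[4, 7, 2], [9, 1]]
    #   self.pad_sequences(words, 0) -> ([[4, 7, 2], [9, 1, 0]], [3, 2])
    # The feed carries the padded ids plus the true lengths, which both the
    # loss masking and the CRF rely on.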
    def add_word_embeddings_op(self):
"""
Adds word embeddings to self
"""
with tf.variable_scope("words"):
_word_embeddings = tf.Variable(self.embeddings, name="_word_embeddings", dtype=tf.float32,
trainable=self.train_embeddings)
word_embeddings = tf.nn.embedding_lookup(_word_embeddings, self.word_ids,
name="word_embeddings")
with tf.variable_scope("chars"):
if self.chars:
                if self.char_embed is not None:
_char_embeddings = tf.Variable(self.char_embed, name="_char_embeddings", dtype=tf.float32,
trainable=self.train_embeddings)
char_embeddings = tf.nn.embedding_lookup(_char_embeddings, self.char_ids,
name="char_embeddings")
                else:
# get embeddings matrix
_char_embeddings = tf.get_variable(name="_char_embeddings", dtype=tf.float32,
shape=[self.nchars, self.dim_char])
char_embeddings = tf.nn.embedding_lookup(_char_embeddings, self.char_ids,
name="char_embeddings")
# put the time dimension on axis=1
s = tf.shape(char_embeddings)
char_embeddings = tf.reshape(char_embeddings, shape=[-1, s[-2], self.dim_char])
word_lengths = tf.reshape(self.word_lengths, shape=[-1])
                # bi-lstm over characters
                lstm_fwd_cell = tf.contrib.rnn.LSTMCell(self.char_hidden_size,
                                                        state_is_tuple=True)
                lstm_back_cell = tf.contrib.rnn.LSTMCell(self.char_hidden_size,
                                                         state_is_tuple=True)
                _, ((_, output_fw), (_, output_bw)) = tf.nn.bidirectional_dynamic_rnn(
                    lstm_fwd_cell, lstm_back_cell, char_embeddings,
                    sequence_length=word_lengths, dtype=tf.float32)
output = tf.concat([output_fw, output_bw], axis=-1)
# shape = (batch size, max sentence length, char hidden size)
output = tf.reshape(output, shape=[-1, s[1], 2*self.char_hidden_size])
word_embeddings = tf.concat([word_embeddings, output], axis=-1)
self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout)
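
    # Resulting shape: with char features on, self.word_embeddings is
    # (batch, max sentence length, word dim + 2 * char_hidden_size); the
    # final fw/bw states of the char bi-lstm are concatenated per word and
    # appended to the pretrained word vectors before dropout.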
    def add_logits_op(self):
"""
Adds logits to self
"""
with tf.variable_scope("bi-lstm"):
            lstm_fwd_cell = tf.contrib.rnn.LSTMCell(self.hidden_size)
            lstm_back_cell = tf.contrib.rnn.LSTMCell(self.hidden_size)
            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                lstm_fwd_cell, lstm_back_cell, self.word_embeddings,
                sequence_length=self.sequence_lengths, dtype=tf.float32)
output = tf.concat([output_fw, output_bw], axis=-1)
output = tf.nn.dropout(output, self.dropout)
with tf.variable_scope("proj"):
W = tf.get_variable("W", shape=[2*self.hidden_size, self.ntags],
dtype=tf.float32)
b = tf.get_variable("b", shape=[self.ntags], dtype=tf.float32,
initializer=tf.zeros_initializer())
ntime_steps = tf.shape(output)[1]
output = tf.reshape(output, [-1, 2*self.hidden_size])
pred = tf.matmul(output, W) + b
self.logits = tf.reshape(pred, [-1, ntime_steps, self.ntags])
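
    # The projection flattens the bi-lstm output to
    # (batch * time, 2 * hidden_size) so a single matmul scores every time
    # step at once, then reshapes back to (batch, time, ntags) tag scores.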
    def add_pred_op(self):
"""
Adds labels_pred to self
"""
if not self.crf:
self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1), tf.int32)
    def add_loss_op(self):
"""
Adds loss to self
"""
if self.crf:
log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
self.logits, self.labels, self.sequence_lengths)
self.loss = tf.reduce_mean(-log_likelihood)
else:
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.labels)
mask = tf.sequence_mask(self.sequence_lengths)
losses = tf.boolean_mask(losses, mask)
self.loss = tf.reduce_mean(losses)
# for tensorboard
tf.summary.scalar("loss", self.loss)
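
    # With the CRF head, the loss is the mean negative log-likelihood of a
    # linear-chain CRF: -log p(y|x) with
    #   p(y|x) = exp(sum_t logits[t, y_t] + sum_t T[y_{t-1}, y_t]) / Z(x)
    # where T is the learned transition matrix; without the CRF it falls
    # back to per-token softmax cross-entropy masked to the true lengths.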
    def add_train_op(self):
"""
Add train_op to self
"""
with tf.variable_scope("train_step"):
optimizer = tf.train.AdamOptimizer(self.lr)
self.train_op = optimizer.minimize(self.loss)
    def add_init_op(self):
        self.init = tf.global_variables_initializer()
        # tf.all_variables() is deprecated; tf.global_variables() is the
        # TF 1.x replacement
        self.saver = tf.train.Saver(tf.global_variables())
    def add_summary(self, sess):
# tensorboard stuff
self.merged = tf.summary.merge_all()
self.file_writer = tf.summary.FileWriter(self.output_path, sess.graph)
    def build_graph(self):
self.add_placeholders()
self.add_word_embeddings_op()
self.add_logits_op()
self.add_pred_op()
self.add_loss_op()
self.add_train_op()
self.add_init_op()
    def predict_batch(self, sess, words):
"""
Args:
sess: a tensorflow session
words: list of sentences
Returns:
labels_pred: list of labels for each sentence
sequence_length
"""
fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0)
if self.crf:
viterbi_sequences = []
logits, transition_params = sess.run([self.logits, self.transition_params],
feed_dict=fd)
# iterate over the sentences
for logit, sequence_length in zip(logits, sequence_lengths):
# keep only the valid time steps
logit = logit[:sequence_length]
viterbi_sequence, viterbi_score = tf.contrib.crf.viterbi_decode(
logit, transition_params)
viterbi_sequences += [viterbi_sequence]
return viterbi_sequences, sequence_lengths
else:
labels_pred = sess.run(self.labels_pred, feed_dict=fd)
return labels_pred, sequence_lengths
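
    # Minimal call sketch (assumes a live session and words already run
    # through processing_word):
    #   labels_pred, lengths = self.predict_batch(sess, words)
    #   # labels_pred[i][:lengths[i]] holds the tag ids for sentence i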
    def run_epoch(self, sess, train, dev, tags, epoch):
"""
Performs one complete pass over the train set and evaluate on dev
Args:
sess: tensorflow session
train: dataset that yields tuple of sentences, tags
dev: dataset
tags: {tag: index} dictionary
epoch: (int) number of the epoch
"""
        try:
            # number of batches, rounded up (integer division for Python 3)
            nbatches = (len(train) + self.batch_size - 1) // self.batch_size
            for i, (words, labels) in enumerate(self.minibatches(train, self.batch_size)):
                fd, _ = self.get_feed_dict(words, labels, self.p_lr, self.p_dropout)
                _, train_loss, summary = sess.run([self.train_op, self.loss, self.merged], feed_dict=fd)
                # tensorboard
                if i % 10 == 0:
                    self.file_writer.add_summary(summary, epoch * nbatches + i)
            acc, f1, _ = self.run_evaluate(sess, dev, tags)
            logging.info("- dev acc {:04.2f} - f1 {:04.2f}".format(100 * acc, 100 * f1))
            return acc, f1
        except Exception as e:
            # re-raise so train() does not unpack a None return value
            logging.error("Exception on run_epoch {0}".format(e))
            raise
    def run_evaluate(self, sess, test, tags, result=None):
"""
Evaluates performance on test set
Args:
sess: tensorflow session
test: dataset that yields tuple of sentences, tags
tags: {tag: index} dictionary
Returns:
accuracy
f1 score
"""
try:
accs = []
correct_preds, total_correct, total_preds = 0., 0., 0.
for words, labels in self.minibatches(test, self.batch_size):
labels_pred, sequence_lengths = self.predict_batch(sess, words)
for lab, lab_pred, length in zip(labels, labels_pred, sequence_lengths):
lab = lab[:length]
lab_pred = lab_pred[:length]
accs += map(lambda x: x[0] == x[1], zip(lab, lab_pred))
lab_chunks = set(self.get_chunks(lab, tags))
lab_pred_chunks = set(self.get_chunks(lab_pred, tags))
                    if result:
                        # compute the ordered chunk lists once instead of three times
                        lab_chunk_list = self.get_chunks(lab, tags)
                        lab_pred_chunk_list = self.get_chunks(lab_pred, tags)
                        if len(lab_pred_chunk_list) > 0 and len(lab_chunk_list) > 0:
                            for label, pred in zip(lab_chunk_list, lab_pred_chunk_list):
                                result.set_result_info(label[0], pred[0])
correct_preds += len(lab_chunks & lab_pred_chunks)
total_preds += len(lab_pred_chunks)
total_correct += len(lab_chunks)
p = correct_preds / total_preds if correct_preds > 0 else 0
r = correct_preds / total_correct if correct_preds > 0 else 0
f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
acc = np.mean(accs)
return acc, f1, result
except Exception as e:
raise Exception(e)
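
    # Worked example of the chunk-level f1 (illustrative chunks): if the
    # gold chunks are {PER(0,1), LOC(3,4)} and the predictions are
    # {PER(0,1), ORG(3,4)}, then correct_preds=1, total_preds=2 and
    # total_correct=2, so p = r = 0.5 and f1 = 2*0.5*0.5/(0.5+0.5) = 0.5.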
    def train(self, train, dev, tags, sess):
"""
Performs training with early stopping and lr exponential decay
Args:
train: dataset that yields tuple of sentences, tags
dev: dataset
tags: {tag: index} dictionary
"""
best_score = 0
saver = tf.train.Saver()
# for early stopping
nepoch_no_imprv = 0
sess.run(self.init)
# restore model
        if self.check_batch_exist(self.node_id) and os.path.exists(self.model_output):
saver.restore(sess, self.model_output)
# tensorboard
self.add_summary(sess)
for epoch in range(self.nepochs):
logging.info("Epoch {:} out of {:}".format(epoch + 1, self.nepochs))
acc, f1 = self.run_epoch(sess, train, dev, tags, epoch)
# decay learning rate
self.p_lr *= self.lr_decay
# early stopping and saving best parameters
if f1 >= best_score:
nepoch_no_imprv = 0
best_score = f1
logging.info("- new best score!")
else:
nepoch_no_imprv += 1
if nepoch_no_imprv >= self.nepoch_no_imprv:
logging.info("- early stopping {} epochs without improvement".format(
nepoch_no_imprv))
break
return best_score
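
    # Training schedule: the learning rate decays geometrically
    # (p_lr *= lr_decay after each epoch) and training stops early after
    # nepoch_no_imprv consecutive epochs without a dev f1 improvement.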
    def eval(self, node_id, conf_data, data=None, result=None, stand=0.1):
        """
        Evaluation process: check that the model works well
        (accuracy with a cross table)
        :param node_id:
        :param conf_data:
        :param data:
        :param result:
        :param stand:
        :return:
        """
        try:
            node_id = self.get_node_name()
            labels = list(map(lambda x: x.split('-')[-1], list(self.vocab_tags.keys())))
            result.set_result_data_format({"labels": labels})
            result.set_nn_batch_ver_id(self.get_eval_batch(node_id))
with tf.Session() as sess:
# load trained model
                if self.check_batch_exist(self.node_id) and os.path.exists(self.model_output):
self.saver.restore(sess, self.model_output)
else:
raise Exception("bilstm crf error : no pretrained model exist")
                if not data.has_next():
result.set_result_info([''], [''])
logging.info("no test data exists")
return result
                while data.has_next():
test = self.CoNLLDataset(data.get_file_name(),
self.processing_word,
self.processing_tag,
self.max_iter)
acc, f1, result = self.run_evaluate(sess, test, self.vocab_tags, result=result)
logging.info("- test acc {:04.2f} - f1 {:04.2f}".format(100 * acc, 100 * f1))
data.next()
return result
        except Exception as e:
            raise Exception("error on eval bilstm crf : {0}".format(e))
    def predict(self, node_id, parm={"input_data": {}}):
        """
        predict logic for NER:
        tokenize the input text and find a matching tag for each token
        :param node_id:
        :param parm:
        :return:
        """
try:
# get unique key
unique_key = '__'.join([node_id, self.get_eval_batch(node_id)])
            # create tensorflow graph
            if NeuralNetModel.dict.get(unique_key):
self = NeuralNetModel.dict.get(unique_key)
graph = NeuralNetModel.graph.get(unique_key)
else:
# set init params
self._init_node_parm(node_id)
self.idx_to_tag = {idx: tag for tag, idx in iter(self.vocab_tags.items())}
self.build_graph()
graph = tf.get_default_graph()
            if NeuralNetModel.sess.get(unique_key) is None:
sess = tf.Session(graph=graph)
# load trained model
if (self.check_batch_exist(self.node_id) and os.path.exists(self.model_output)):
self.saver.restore(sess, self.model_output)
NeuralNetModel.set_dict(unique_key, self)
NeuralNetModel.set_graph(unique_key, graph)
NeuralNetModel.set_sess(unique_key, sess)
else:
raise Exception("bilstm crf error : no pretrained model exist")
            else:
sess = NeuralNetModel.sess.get(unique_key)
input_arr = parm["input_data"].split(' ')
words = list(map(lambda x: self.processing_word(x), input_arr))
            if isinstance(words[0], tuple):
words = zip(*words)
pred_ids, _ = self.predict_batch(sess, [words])
preds = list(map(lambda idx: self.idx_to_tag[idx], list(pred_ids[0])))
return preds
except Exception as e:
raise Exception(e)
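
    # Hedged end-to-end sketch (node id and the resulting tag sequence are
    # hypothetical; a trained batch must exist under md_store_path):
    #   node = NeuralNetNodeBiLstmCrf()
    #   preds = node.predict('nn00001_1_netconf_node',
    #                        {"input_data": "kim lives in seoul"})
    #   # preds -> e.g. ['B-PER', 'O', 'O', 'B-LOC']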