Repository: createmomo/CRF-Layer-on-the-Top-of-BiLSTM
Branch: master
Commit: 562b2ea744d3
Files: 3
Total size: 11.0 KB

Directory structure:
gitextract_7n5x16cj/

├── MyCRFLayer.py
├── README.md
└── demo.py

================================================
FILE CONTENTS
================================================

================================================
FILE: MyCRFLayer.py
================================================
import chainer.links as L
import chainer.functions as F
from chainer import variable

import numpy as np
from chainer.functions.math import sum as _sum
from chainer.functions.math import exponential as _exponential

# More details: https://github.com/createmomo/CRF-Layer-on-the-Top-of-BiLSTM
# This code is modified based on https://github.com/glample/tagger

class My_CRF(L.CRF1d):
    def __init__(self, n_label):
        super(My_CRF, self).__init__(n_label)
        with self.init_scope():
            '''
            [Initialization]
            '''
            # Generate random values for transition matrix.
            # The shape of transition matrix is (n_label+2, n_label+2).
            # "2" means the extra added labels, START and END. (see 3.2)
            drange = np.sqrt(6. / (np.sum((n_label + 2, n_label + 2))))
            value = drange * np.random.uniform(low=-1.0, high=1.0, size=(n_label + 2, n_label + 2))
            transitions = np.array(value, dtype=np.float32)
            self.cost = variable.Parameter(transitions)

            # The number of unique labels in training data set (e.g B-Person, I-Person, O)
            self.n_label = n_label

            # The small value will fill the expanded emission score matrix as described in 3.2
            self.small = -1000

    def __call__(self, xs, ys):
        '''
        :param xs: the outputs of BiLSTM layer (the emission score matrix)
        :param ys: the ground truth labels
        :return: CRF loss
        '''
        '''
        Loss Function
        '''

        # Assign new id for extra added labels (START and END)
        b_id = np.array([self.n_label], dtype='i')
        e_id = np.array([self.n_label + 1], dtype='i')

        total_loss = 0.0
        small = self.small

        #Compute crf loss for each sentence
        for xs_i, ys_i in zip(xs,ys):
            s_len = len(xs_i)# how many words does the sentence have

            # Expand the emission score matrix by adding two extra labels (START and END).
            # For more details, please see the example in 3.2
            b_s = np.array([[small] * self.n_label + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * self.n_label + [small, 0]]).astype(np.float32)
            observations = F.concat((xs_i, small * np.ones((s_len,2),dtype='f')),axis=1)
            observations = F.concat((b_s,observations,e_s), axis=0)

            # Compute the real path score according the ground truth labels (see 2.4)
            # Emission score of the real path
            real_path_score = _sum.sum(xs_i[list(range(s_len)), ys_i])

            # Transition score of the real path
            padded_tags_ids = F.concat((b_id, ys_i, e_id), axis=0)

            real_path_score += _sum.sum(self.cost[
                padded_tags_ids[list(range(s_len + 1))].data,
                padded_tags_ids[[current_item + 1 for current_item in range(s_len + 1)]].data
            ])

            # Compute the score of all the possible paths of current sentence (see 2.5)
            all_paths_scores = self.forward(observations, self.cost)

            # The crf cost of current sentence (see 2.5)
            current_cost = - (real_path_score - all_paths_scores)

            total_loss += current_cost

        return total_loss

    def log_sum_exp(self, x, axis=None):
        '''
        Compute the sum of scores in log space (see 2.5).
        This function is used in forward.
        '''
        xmax = F.max(x, axis=axis, keepdims=True)
        xmax_ = F.max(x, axis=axis)

        second_item = _exponential.log(_sum.sum(_exponential.exp(x - F.broadcast_to(xmax,x.shape)),axis=axis))

        return xmax_ + second_item

    def forward(self, observations, transitions,
                viterbi=False,
                return_best_sequence=False
                ):
        '''
        :param observations: (see 2.5) In 2.5, 'obs' are the observations here.
        :param transitions: Transition score matrix
        :param viterbi: When the viterbi and return_best_sequence are true, this method will return the predicted best paths.
        If false, this function will return the sum of scores in log space
        :param return_best_sequence: Please see above.
        :return: The sum of scores in log space or the predicted best sequence
        '''
        '''
        This function is described in detail in 2.5 and 2.6.
        '''
        def recurrence(obs, previous, transitions):
            previous = previous.reshape((previous.shape[0],1))
            obs = obs.reshape((1,obs.shape[0]))
            if viterbi:# Please see 2.6
                scores = F.broadcast_to(previous,(self.n_label+2, self.n_label+2)) + F.broadcast_to(obs,(self.n_label+2, self.n_label+2)) + transitions
                scores = scores.data
                out = scores.max(axis=0)
                if return_best_sequence:
                    out2 = scores.argmax(axis=0)
                    out2 = np.array(out2,dtype='i')
                    return out, out2
            else:# Please see 2.5 (Return the sum of scores in log space)
                previous = F.broadcast_to(previous,(self.n_label+2, self.n_label+2))
                obs = F.broadcast_to(obs,(self.n_label+2, self.n_label+2))
                return self.log_sum_exp(previous + obs + transitions, axis=0)

        def mini_function_for_best_sequence(beta_i, previous):
            return beta_i[previous]

        if return_best_sequence:# Return the best predicted path for one sentence (see 2.6)
            initial_0 = observations[0]
            alpha_0 = np.array(initial_0.data, dtype='f')
            alpha_0 = F.expand_dims(alpha_0,axis=0)

            alpha_1 = None

            flag = True
            for obs in observations[1:]:
                initial_0, initial_1 = recurrence(obs, initial_0, transitions)
                alpha_0 = F.vstack((alpha_0,F.expand_dims(initial_0,axis=0)))

                if flag == True:
                    alpha_1 = np.array(initial_1, dtype='i')
                    alpha_1 = F.expand_dims(alpha_1, axis=0)
                    flag = False

                alpha_1 = F.vstack((alpha_1, F.expand_dims(initial_1, axis=0)))

            alpha_0 = alpha_0.data[1:]

            initial_beta = np.argmax(alpha_0[-1])
            initial_beta = initial_beta.astype('i')
            sequence = np.array(initial_beta,dtype='i')
            sequence = F.expand_dims(sequence,axis=0)

            for item in alpha_1.data[::-1].astype('i'):
                initial_beta = mini_function_for_best_sequence(item,initial_beta)
                sequence = F.concat((sequence, F.expand_dims(np.array(initial_beta), axis=0)), axis=0)

            sequence = sequence[::-1][2:-1]
            sequence = sequence.reshape(1,sequence.shape[0])
            return sequence[0]# Return best path

        else:# Please see 2.5 (Return the sum of scores in log space)
            initial = observations[0]
            alpha = []
            alpha.append(initial)

            for obs in observations[1:]:
                initial = recurrence(obs, initial, transitions)
                alpha.append(initial)

            alpha = alpha[1:]

            return self.log_sum_exp(alpha[-1], axis=0)

    def argmax(self, xs):
        '''
        :param xs: The list of new sentences
        :return: Predicted labels for the new sentences
        '''
        best_sequence = []

        small = self.small

        # Predict the labels for new sentences (Please see 2.6)
        for xs_i in xs:
            s_len = len(xs_i)

            b_s = np.array([[small] * self.n_label + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * self.n_label + [small, 0]]).astype(np.float32)
            observations = F.concat((xs_i, small * np.ones((s_len, 2), dtype='f')), axis=1)
            observations = F.concat((b_s, observations, e_s), axis=0)

            current_best_sequence = self.forward(observations, self.cost, viterbi=True, return_best_sequence=True)
            best_sequence.append(current_best_sequence.data)

        return best_sequence


================================================
FILE: README.md
================================================
# CRF-Layer-on-the-Top-of-BiLSTM (BiLSTM-CRF)
The article series include:
- **Introduction** - the general idea of the CRF layer on the top of BiLSTM for named entity recognition tasks
- **A Detailed Example** -  a toy example to explain how CRF layer works step-by-step
- **Chainer Implementation** - a chainer implementation of the CRF Layer

Links:
  * [CRF Layer on the Top of BiLSTM - 1](https://createmomo.github.io/2017/09/12/CRF_Layer_on_the_Top_of_BiLSTM_1/) Outline and Introduction
  * [CRF Layer on the Top of BiLSTM - 2](https://createmomo.github.io/2017/09/23/CRF_Layer_on_the_Top_of_BiLSTM_2/) CRF Layer (Emission and Transition Score)
  * [CRF Layer on the Top of BiLSTM - 3](https://createmomo.github.io/2017/10/08/CRF-Layer-on-the-Top-of-BiLSTM-3/) CRF Loss Function
  * [CRF Layer on the Top of BiLSTM - 4](https://createmomo.github.io/2017/10/17/CRF-Layer-on-the-Top-of-BiLSTM-4/) Real Path Score
  * [CRF Layer on the Top of BiLSTM - 5](https://createmomo.github.io/2017/11/11/CRF-Layer-on-the-Top-of-BiLSTM-5/) The Total Score of All the Paths
  * [CRF Layer on the Top of BiLSTM - 6](https://createmomo.github.io/2017/11/24/CRF-Layer-on-the-Top-of-BiLSTM-6/) Infer the Labels for a New Sentence
  * [CRF Layer on the Top of BiLSTM - 7](https://createmomo.github.io/2017/12/06/CRF-Layer-on-the-Top-of-BiLSTM-7/) Chainer Implementation Warm Up
  * [CRF Layer on the Top of BiLSTM - 8](https://createmomo.github.io/2017/12/07/CRF-Layer-on-the-Top-of-BiLSTM-8/) Demo Code

GitHub: https://github.com/createmomo/CRF-Layer-on-the-Top-of-BiLSTM

# Wechat Public Account
Please note that: The **Wechat Public Account** is avaiable now! If you found this article is useful and would like to found more information about this series, please subscribe to the public account by your Wechat! **(2020-04-03)**
<img src="/qr_code.jpg" alt="QR Code" title="QR Code" width="393" height="127" />


================================================
FILE: demo.py
================================================
import numpy as np
import chainer

import MyCRFLayer

n_label = 2

a = np.random.uniform(-1, 1, n_label).astype('f')
b = np.random.uniform(-1, 1, n_label).astype('f')

x1 = np.stack([b, a])
x2 = np.stack([a])

xs = [x1, x2]

ys = [np.random.randint(n_label,size = x.shape[0],dtype='i') for x in xs]

my_crf = MyCRFLayer.My_CRF(n_label)

loss = my_crf(xs,ys)

print('Ground Truth:')
for i,y in enumerate(ys):
    print('\tsentence {0}: [{1}]'.format(str(i),' '.join([str(label) for label in y])))

from chainer import optimizers
optimizer = optimizers.SGD(lr=0.01)
optimizer.setup(my_crf)
optimizer.add_hook(chainer.optimizer.GradientClipping(5.0))

print('Predictions:')
for epoch_i in range(201):
    with chainer.using_config('train', True):
        loss = my_crf(xs,ys)

        # update parameters
        optimizer.target.zerograds()
        loss.backward()
        optimizer.update()

    with chainer.using_config('train', False):
        if epoch_i % 50 == 0:
            print('\tEpoch {0}: (loss={1})'.format(str(epoch_i),str(loss.data)))
            for i, prediction in enumerate(my_crf.argmax(xs)):
                print('\t\tsentence {0}: [{1}]'.format(str(i), ' '.join([str(label) for label in prediction])))