[
  {
    "path": "MyCRFLayer.py",
    "content": "import chainer.links as L\nimport chainer.functions as F\nfrom chainer import variable\n\nimport numpy as np\nfrom chainer.functions.math import sum as _sum\nfrom chainer.functions.math import exponential as _exponential\n\n# More details: https://github.com/createmomo/CRF-Layer-on-the-Top-of-BiLSTM\n# This code is modified based on https://github.com/glample/tagger\n\nclass My_CRF(L.CRF1d):\n    def __init__(self, n_label):\n        super(My_CRF, self).__init__(n_label)\n        with self.init_scope():\n            '''\n            [Initialization]\n            '''\n            # Generate random values for transition matrix.\n            # The shape of transition matrix is (n_label+2, n_label+2).\n            # \"2\" means the extra added labels, START and END. (see 3.2)\n            drange = np.sqrt(6. / (np.sum((n_label + 2, n_label + 2))))\n            value = drange * np.random.uniform(low=-1.0, high=1.0, size=(n_label + 2, n_label + 2))\n            transitions = np.array(value, dtype=np.float32)\n            self.cost = variable.Parameter(transitions)\n\n            # The number of unique labels in training data set (e.g B-Person, I-Person, O)\n            self.n_label = n_label\n\n            # The small value will fill the expanded emission score matrix as described in 3.2\n            self.small = -1000\n\n    def __call__(self, xs, ys):\n        '''\n        :param xs: the outputs of BiLSTM layer (the emission score matrix)\n        :param ys: the ground truth labels\n        :return: CRF loss\n        '''\n        '''\n        Loss Function\n        '''\n\n        # Assign new id for extra added labels (START and END)\n        b_id = np.array([self.n_label], dtype='i')\n        e_id = np.array([self.n_label + 1], dtype='i')\n\n        total_loss = 0.0\n        small = self.small\n\n        #Compute crf loss for each sentence\n        for xs_i, ys_i in zip(xs,ys):\n            s_len = len(xs_i)# how many words does the sentence have\n\n            # Expand the emission score matrix by adding two extra labels (START and END).\n            # For more details, please see the example in 3.2\n            b_s = np.array([[small] * self.n_label + [0, small]]).astype(np.float32)\n            e_s = np.array([[small] * self.n_label + [small, 0]]).astype(np.float32)\n            observations = F.concat((xs_i, small * np.ones((s_len,2),dtype='f')),axis=1)\n            observations = F.concat((b_s,observations,e_s), axis=0)\n\n            # Compute the real path score according the ground truth labels (see 2.4)\n            # Emission score of the real path\n            real_path_score = _sum.sum(xs_i[list(range(s_len)), ys_i])\n\n            # Transition score of the real path\n            padded_tags_ids = F.concat((b_id, ys_i, e_id), axis=0)\n\n            real_path_score += _sum.sum(self.cost[\n                padded_tags_ids[list(range(s_len + 1))].data,\n                padded_tags_ids[[current_item + 1 for current_item in range(s_len + 1)]].data\n            ])\n\n            # Compute the score of all the possible paths of current sentence (see 2.5)\n            all_paths_scores = self.forward(observations, self.cost)\n\n            # The crf cost of current sentence (see 2.5)\n            current_cost = - (real_path_score - all_paths_scores)\n\n            total_loss += current_cost\n\n        return total_loss\n\n    def log_sum_exp(self, x, axis=None):\n        '''\n        Compute the sum of scores in log space (see 2.5).\n        This function is used in forward.\n        '''\n        xmax = F.max(x, axis=axis, keepdims=True)\n        xmax_ = F.max(x, axis=axis)\n\n        second_item = _exponential.log(_sum.sum(_exponential.exp(x - F.broadcast_to(xmax,x.shape)),axis=axis))\n\n        return xmax_ + second_item\n\n    def forward(self, observations, transitions,\n                viterbi=False,\n                return_best_sequence=False\n                ):\n        '''\n        :param observations: (see 2.5) In 2.5, 'obs' are the observations here.\n        :param transitions: Transition score matrix\n        :param viterbi: When the viterbi and return_best_sequence are true, this method will return the predicted best paths.\n        If false, this function will return the sum of scores in log space\n        :param return_best_sequence: Please see above.\n        :return: The sum of scores in log space or the predicted best sequence\n        '''\n        '''\n        This function is described in detail in 2.5 and 2.6.\n        '''\n        def recurrence(obs, previous, transitions):\n            previous = previous.reshape((previous.shape[0],1))\n            obs = obs.reshape((1,obs.shape[0]))\n            if viterbi:# Please see 2.6\n                scores = F.broadcast_to(previous,(self.n_label+2, self.n_label+2)) + F.broadcast_to(obs,(self.n_label+2, self.n_label+2)) + transitions\n                scores = scores.data\n                out = scores.max(axis=0)\n                if return_best_sequence:\n                    out2 = scores.argmax(axis=0)\n                    out2 = np.array(out2,dtype='i')\n                    return out, out2\n            else:# Please see 2.5 (Return the sum of scores in log space)\n                previous = F.broadcast_to(previous,(self.n_label+2, self.n_label+2))\n                obs = F.broadcast_to(obs,(self.n_label+2, self.n_label+2))\n                return self.log_sum_exp(previous + obs + transitions, axis=0)\n\n        def mini_function_for_best_sequence(beta_i, previous):\n            return beta_i[previous]\n\n        if return_best_sequence:# Return the best predicted path for one sentence (see 2.6)\n            initial_0 = observations[0]\n            alpha_0 = np.array(initial_0.data, dtype='f')\n            alpha_0 = F.expand_dims(alpha_0,axis=0)\n\n            alpha_1 = None\n\n            flag = True\n            for obs in observations[1:]:\n                initial_0, initial_1 = recurrence(obs, initial_0, transitions)\n                alpha_0 = F.vstack((alpha_0,F.expand_dims(initial_0,axis=0)))\n\n                if flag == True:\n                    alpha_1 = np.array(initial_1, dtype='i')\n                    alpha_1 = F.expand_dims(alpha_1, axis=0)\n                    flag = False\n\n                alpha_1 = F.vstack((alpha_1, F.expand_dims(initial_1, axis=0)))\n\n            alpha_0 = alpha_0.data[1:]\n\n            initial_beta = np.argmax(alpha_0[-1])\n            initial_beta = initial_beta.astype('i')\n            sequence = np.array(initial_beta,dtype='i')\n            sequence = F.expand_dims(sequence,axis=0)\n\n            for item in alpha_1.data[::-1].astype('i'):\n                initial_beta = mini_function_for_best_sequence(item,initial_beta)\n                sequence = F.concat((sequence, F.expand_dims(np.array(initial_beta), axis=0)), axis=0)\n\n            sequence = sequence[::-1][2:-1]\n            sequence = sequence.reshape(1,sequence.shape[0])\n            return sequence[0]# Return best path\n\n        else:# Please see 2.5 (Return the sum of scores in log space)\n            initial = observations[0]\n            alpha = []\n            alpha.append(initial)\n\n            for obs in observations[1:]:\n                initial = recurrence(obs, initial, transitions)\n                alpha.append(initial)\n\n            alpha = alpha[1:]\n\n            return self.log_sum_exp(alpha[-1], axis=0)\n\n    def argmax(self, xs):\n        '''\n        :param xs: The list of new sentences\n        :return: Predicted labels for the new sentences\n        '''\n        best_sequence = []\n\n        small = self.small\n\n        # Predict the labels for new sentences (Please see 2.6)\n        for xs_i in xs:\n            s_len = len(xs_i)\n\n            b_s = np.array([[small] * self.n_label + [0, small]]).astype(np.float32)\n            e_s = np.array([[small] * self.n_label + [small, 0]]).astype(np.float32)\n            observations = F.concat((xs_i, small * np.ones((s_len, 2), dtype='f')), axis=1)\n            observations = F.concat((b_s, observations, e_s), axis=0)\n\n            current_best_sequence = self.forward(observations, self.cost, viterbi=True, return_best_sequence=True)\n            best_sequence.append(current_best_sequence.data)\n\n        return best_sequence\n\n"
  },
  {
    "path": "README.md",
    "content": "# CRF-Layer-on-the-Top-of-BiLSTM (BiLSTM-CRF)\nThe article series include:\n- **Introduction** - the general idea of the CRF layer on the top of BiLSTM for named entity recognition tasks\n- **A Detailed Example** -  a toy example to explain how CRF layer works step-by-step\n- **Chainer Implementation** - a chainer implementation of the CRF Layer\n\nLinks:\n  * [CRF Layer on the Top of BiLSTM - 1](https://createmomo.github.io/2017/09/12/CRF_Layer_on_the_Top_of_BiLSTM_1/) Outline and Introduction\n  * [CRF Layer on the Top of BiLSTM - 2](https://createmomo.github.io/2017/09/23/CRF_Layer_on_the_Top_of_BiLSTM_2/) CRF Layer (Emission and Transition Score)\n  * [CRF Layer on the Top of BiLSTM - 3](https://createmomo.github.io/2017/10/08/CRF-Layer-on-the-Top-of-BiLSTM-3/) CRF Loss Function\n  * [CRF Layer on the Top of BiLSTM - 4](https://createmomo.github.io/2017/10/17/CRF-Layer-on-the-Top-of-BiLSTM-4/) Real Path Score\n  * [CRF Layer on the Top of BiLSTM - 5](https://createmomo.github.io/2017/11/11/CRF-Layer-on-the-Top-of-BiLSTM-5/) The Total Score of All the Paths\n  * [CRF Layer on the Top of BiLSTM - 6](https://createmomo.github.io/2017/11/24/CRF-Layer-on-the-Top-of-BiLSTM-6/) Infer the Labels for a New Sentence\n  * [CRF Layer on the Top of BiLSTM - 7](https://createmomo.github.io/2017/12/06/CRF-Layer-on-the-Top-of-BiLSTM-7/) Chainer Implementation Warm Up\n  * [CRF Layer on the Top of BiLSTM - 8](https://createmomo.github.io/2017/12/07/CRF-Layer-on-the-Top-of-BiLSTM-8/) Demo Code\n\nGitHub: https://github.com/createmomo/CRF-Layer-on-the-Top-of-BiLSTM\n\n# Wechat Public Account\nPlease note that: The **Wechat Public Account** is avaiable now! If you found this article is useful and would like to found more information about this series, please subscribe to the public account by your Wechat! **(2020-04-03)**\n<img src=\"/qr_code.jpg\" alt=\"QR Code\" title=\"QR Code\" width=\"393\" height=\"127\" />\n"
  },
  {
    "path": "demo.py",
    "content": "import numpy as np\nimport chainer\n\nimport MyCRFLayer\n\nn_label = 2\n\na = np.random.uniform(-1, 1, n_label).astype('f')\nb = np.random.uniform(-1, 1, n_label).astype('f')\n\nx1 = np.stack([b, a])\nx2 = np.stack([a])\n\nxs = [x1, x2]\n\nys = [np.random.randint(n_label,size = x.shape[0],dtype='i') for x in xs]\n\nmy_crf = MyCRFLayer.My_CRF(n_label)\n\nloss = my_crf(xs,ys)\n\nprint('Ground Truth:')\nfor i,y in enumerate(ys):\n    print('\\tsentence {0}: [{1}]'.format(str(i),' '.join([str(label) for label in y])))\n\nfrom chainer import optimizers\noptimizer = optimizers.SGD(lr=0.01)\noptimizer.setup(my_crf)\noptimizer.add_hook(chainer.optimizer.GradientClipping(5.0))\n\nprint('Predictions:')\nfor epoch_i in range(201):\n    with chainer.using_config('train', True):\n        loss = my_crf(xs,ys)\n\n        # update parameters\n        optimizer.target.zerograds()\n        loss.backward()\n        optimizer.update()\n\n    with chainer.using_config('train', False):\n        if epoch_i % 50 == 0:\n            print('\\tEpoch {0}: (loss={1})'.format(str(epoch_i),str(loss.data)))\n            for i, prediction in enumerate(my_crf.argmax(xs)):\n                print('\\t\\tsentence {0}: [{1}]'.format(str(i), ' '.join([str(label) for label in prediction])))\n"
  }
]