Repository: graykode/nlp-tutorial
Branch: master
Commit: d05e31ec81d5
Files: 42
Total size: 248.5 KB

Directory structure:
gitextract_mxhf1sqm/

├── .github/
│   └── workflows/
│       └── python-app.yml
├── .gitignore
├── 1-1.NNLM/
│   ├── NNLM.ipynb
│   └── NNLM.py
├── 1-2.Word2Vec/
│   ├── Word2Vec-Skipgram(Softmax).ipynb
│   └── Word2Vec-Skipgram(Softmax).py
├── 1-3.FastText/
│   ├── FastText.ipynb
│   ├── test.txt
│   └── train.txt
├── 2-1.TextCNN/
│   ├── TextCNN.ipynb
│   └── TextCNN.py
├── 3-1.TextRNN/
│   ├── TextRNN.ipynb
│   └── TextRNN.py
├── 3-2.TextLSTM/
│   ├── TextLSTM.ipynb
│   └── TextLSTM.py
├── 3-3.Bi-LSTM/
│   ├── Bi-LSTM.ipynb
│   └── Bi-LSTM.py
├── 4-1.Seq2Seq/
│   ├── Seq2Seq.ipynb
│   └── Seq2Seq.py
├── 4-2.Seq2Seq(Attention)/
│   ├── Seq2Seq(Attention).ipynb
│   └── Seq2Seq(Attention).py
├── 4-3.Bi-LSTM(Attention)/
│   ├── Bi-LSTM(Attention).ipynb
│   └── Bi-LSTM(Attention).py
├── 5-1.Transformer/
│   ├── Transformer(Greedy_decoder).ipynb
│   ├── Transformer(Greedy_decoder).py
│   ├── Transformer.ipynb
│   └── Transformer.py
├── 5-2.BERT/
│   ├── BERT.ipynb
│   └── BERT.py
├── CONTRIBUTING.md
├── LICENSE
├── README.md
└── archive/
    └── tensorflow/
        └── v1/
            ├── 1-1.NNLM/
            │   └── NNLM.py
            ├── 1-2.Word2Vec/
            │   ├── Word2Vec-Skipgram(NCE_loss).py
            │   └── Word2Vec-Skipgram(Softmax).py
            ├── 2-1.TextCNN/
            │   └── TextCNN.py
            ├── 3-1.TextRNN/
            │   └── TextRNN.py
            ├── 3-2.TextLSTM/
            │   └── TextLSTM.py
            ├── 3-3.Bi-LSTM/
            │   └── Bi-LSTM.py
            ├── 4-1.Seq2Seq/
            │   └── Seq2Seq.py
            ├── 4-2.Seq2Seq(Attention)/
            │   └── Seq2Seq(Attention).py
            └── 4-3.Bi-LSTM(Attention)/
                └── Bi-LSTM(Attention).py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/python-app.yml
================================================
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python application

on:
  push:
    branches: [ master ]
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python 3.8
      uses: actions/setup-python@v2
      with:
        python-version: 3.8
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install py2ipynb==0.0.5
    - name: Test with py2ipynb
      run: |
        py2ipynb '*/*.py'
    - name: Commit changes
      uses: EndBug/add-and-commit@v4
      with:
        author_name: graykode
        author_email: nlkey2022@gmail.com
        message: "Automatic convert from py to ipynb"
        add: "*/*.ipynb"
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


================================================
FILE: .gitignore
================================================
.idea


================================================
FILE: 1-1.NNLM/NNLM.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# code by Tae Hwan Jung @graykode\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "\n",
        "def make_batch():\n",
        "    input_batch = []\n",
        "    target_batch = []\n",
        "\n",
        "    for sen in sentences:\n",
        "        word = sen.split() # space tokenizer\n",
        "        input = [word_dict[n] for n in word[:-1]] # create (1~n-1) as input\n",
        "        target = word_dict[word[-1]] # create (n) as target, We usually call this 'casual language model'\n",
        "\n",
        "        input_batch.append(input)\n",
        "        target_batch.append(target)\n",
        "\n",
        "    return input_batch, target_batch\n",
        "\n",
        "# Model\n",
        "class NNLM(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(NNLM, self).__init__()\n",
        "        self.C = nn.Embedding(n_class, m)\n",
        "        self.H = nn.Linear(n_step * m, n_hidden, bias=False)\n",
        "        self.d = nn.Parameter(torch.ones(n_hidden))\n",
        "        self.U = nn.Linear(n_hidden, n_class, bias=False)\n",
        "        self.W = nn.Linear(n_step * m, n_class, bias=False)\n",
        "        self.b = nn.Parameter(torch.ones(n_class))\n",
        "\n",
        "    def forward(self, X):\n",
        "        X = self.C(X) # X : [batch_size, n_step, m]\n",
        "        X = X.view(-1, n_step * m) # [batch_size, n_step * m]\n",
        "        tanh = torch.tanh(self.d + self.H(X)) # [batch_size, n_hidden]\n",
        "        output = self.b + self.W(X) + self.U(tanh) # [batch_size, n_class]\n",
        "        return output\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    n_step = 2 # number of steps, n-1 in paper\n",
        "    n_hidden = 2 # number of hidden size, h in paper\n",
        "    m = 2 # embedding size, m in paper\n",
        "\n",
        "    sentences = [\"i like dog\", \"i love coffee\", \"i hate milk\"]\n",
        "\n",
        "    word_list = \" \".join(sentences).split()\n",
        "    word_list = list(set(word_list))\n",
        "    word_dict = {w: i for i, w in enumerate(word_list)}\n",
        "    number_dict = {i: w for i, w in enumerate(word_list)}\n",
        "    n_class = len(word_dict)  # number of Vocabulary\n",
        "\n",
        "    model = NNLM()\n",
        "\n",
        "    criterion = nn.CrossEntropyLoss()\n",
        "    optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
        "\n",
        "    input_batch, target_batch = make_batch()\n",
        "    input_batch = torch.LongTensor(input_batch)\n",
        "    target_batch = torch.LongTensor(target_batch)\n",
        "\n",
        "    # Training\n",
        "    for epoch in range(5000):\n",
        "        optimizer.zero_grad()\n",
        "        output = model(input_batch)\n",
        "\n",
        "        # output : [batch_size, n_class], target_batch : [batch_size]\n",
        "        loss = criterion(output, target_batch)\n",
        "        if (epoch + 1) % 1000 == 0:\n",
        "            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
        "\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "\n",
        "    # Predict\n",
        "    predict = model(input_batch).data.max(1, keepdim=True)[1]\n",
        "\n",
        "    # Test\n",
        "    print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()])"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

================================================
FILE: 1-1.NNLM/NNLM.py
================================================
# %%
# code by Tae Hwan Jung @graykode
import torch
import torch.nn as nn
import torch.optim as optim

def make_batch():
    input_batch = []
    target_batch = []

    for sen in sentences:
        word = sen.split() # space tokenizer
        input = [word_dict[n] for n in word[:-1]] # create (1~n-1) as input
        target = word_dict[word[-1]] # create (n) as target, We usually call this 'casual language model'

        input_batch.append(input)
        target_batch.append(target)

    return input_batch, target_batch

# Model
class NNLM(nn.Module):
    def __init__(self):
        super(NNLM, self).__init__()
        self.C = nn.Embedding(n_class, m)
        self.H = nn.Linear(n_step * m, n_hidden, bias=False)
        self.d = nn.Parameter(torch.ones(n_hidden))
        self.U = nn.Linear(n_hidden, n_class, bias=False)
        self.W = nn.Linear(n_step * m, n_class, bias=False)
        self.b = nn.Parameter(torch.ones(n_class))

    def forward(self, X):
        X = self.C(X) # X : [batch_size, n_step, m]
        X = X.view(-1, n_step * m) # [batch_size, n_step * m]
        tanh = torch.tanh(self.d + self.H(X)) # [batch_size, n_hidden]
        output = self.b + self.W(X) + self.U(tanh) # [batch_size, n_class]
        return output

if __name__ == '__main__':
    n_step = 2 # number of steps, n-1 in paper
    n_hidden = 2 # number of hidden size, h in paper
    m = 2 # embedding size, m in paper

    sentences = ["i like dog", "i love coffee", "i hate milk"]

    word_list = " ".join(sentences).split()
    word_list = list(set(word_list))
    word_dict = {w: i for i, w in enumerate(word_list)}
    number_dict = {i: w for i, w in enumerate(word_list)}
    n_class = len(word_dict)  # number of Vocabulary

    model = NNLM()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    input_batch, target_batch = make_batch()
    input_batch = torch.LongTensor(input_batch)
    target_batch = torch.LongTensor(target_batch)

    # Training
    for epoch in range(5000):
        optimizer.zero_grad()
        output = model(input_batch)

        # output : [batch_size, n_class], target_batch : [batch_size]
        loss = criterion(output, target_batch)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

    # Predict
    predict = model(input_batch).data.max(1, keepdim=True)[1]

    # Test
    print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()])

================================================
FILE: 1-2.Word2Vec/Word2Vec-Skipgram(Softmax).ipynb
================================================
{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# code by Tae Hwan Jung @graykode\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "def random_batch():\n",
        "    random_inputs = []\n",
        "    random_labels = []\n",
        "    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False)\n",
        "\n",
        "    for i in random_index:\n",
        "        random_inputs.append(np.eye(voc_size)[skip_grams[i][0]])  # target\n",
        "        random_labels.append(skip_grams[i][1])  # context word\n",
        "\n",
        "    return random_inputs, random_labels\n",
        "\n",
        "# Model\n",
        "class Word2Vec(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(Word2Vec, self).__init__()\n",
        "        # W and WT is not Traspose relationship\n",
        "        self.W = nn.Linear(voc_size, embedding_size, bias=False) # voc_size > embedding_size Weight\n",
        "        self.WT = nn.Linear(embedding_size, voc_size, bias=False) # embedding_size > voc_size Weight\n",
        "\n",
        "    def forward(self, X):\n",
        "        # X : [batch_size, voc_size]\n",
        "        hidden_layer = self.W(X) # hidden_layer : [batch_size, embedding_size]\n",
        "        output_layer = self.WT(hidden_layer) # output_layer : [batch_size, voc_size]\n",
        "        return output_layer\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    batch_size = 2 # mini-batch size\n",
        "    embedding_size = 2 # embedding size\n",
        "\n",
        "    sentences = [\"apple banana fruit\", \"banana orange fruit\", \"orange banana fruit\",\n",
        "                 \"dog cat animal\", \"cat monkey animal\", \"monkey dog animal\"]\n",
        "\n",
        "    word_sequence = \" \".join(sentences).split()\n",
        "    word_list = \" \".join(sentences).split()\n",
        "    word_list = list(set(word_list))\n",
        "    word_dict = {w: i for i, w in enumerate(word_list)}\n",
        "    voc_size = len(word_list)\n",
        "\n",
        "    # Make skip gram of one size window\n",
        "    skip_grams = []\n",
        "    for i in range(1, len(word_sequence) - 1):\n",
        "        target = word_dict[word_sequence[i]]\n",
        "        context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]\n",
        "        for w in context:\n",
        "            skip_grams.append([target, w])\n",
        "\n",
        "    model = Word2Vec()\n",
        "\n",
        "    criterion = nn.CrossEntropyLoss()\n",
        "    optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
        "\n",
        "    # Training\n",
        "    for epoch in range(5000):\n",
        "        input_batch, target_batch = random_batch()\n",
        "        input_batch = torch.Tensor(input_batch)\n",
        "        target_batch = torch.LongTensor(target_batch)\n",
        "\n",
        "        optimizer.zero_grad()\n",
        "        output = model(input_batch)\n",
        "\n",
        "        # output : [batch_size, voc_size], target_batch : [batch_size] (LongTensor, not one-hot)\n",
        "        loss = criterion(output, target_batch)\n",
        "        if (epoch + 1) % 1000 == 0:\n",
        "            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
        "\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "\n",
        "    for i, label in enumerate(word_list):\n",
        "        W, WT = model.parameters()\n",
        "        x, y = W[0][i].item(), W[1][i].item()\n",
        "        plt.scatter(x, y)\n",
        "        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')\n",
        "    plt.show()\n"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

================================================
FILE: 1-2.Word2Vec/Word2Vec-Skipgram(Softmax).py
================================================
# %%
# code by Tae Hwan Jung @graykode
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

def random_batch():
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False)

    for i in random_index:
        random_inputs.append(np.eye(voc_size)[skip_grams[i][0]])  # target
        random_labels.append(skip_grams[i][1])  # context word

    return random_inputs, random_labels

# Model
class Word2Vec(nn.Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        # W and WT is not Traspose relationship
        self.W = nn.Linear(voc_size, embedding_size, bias=False) # voc_size > embedding_size Weight
        self.WT = nn.Linear(embedding_size, voc_size, bias=False) # embedding_size > voc_size Weight

    def forward(self, X):
        # X : [batch_size, voc_size]
        hidden_layer = self.W(X) # hidden_layer : [batch_size, embedding_size]
        output_layer = self.WT(hidden_layer) # output_layer : [batch_size, voc_size]
        return output_layer

if __name__ == '__main__':
    batch_size = 2 # mini-batch size
    embedding_size = 2 # embedding size

    sentences = ["apple banana fruit", "banana orange fruit", "orange banana fruit",
                 "dog cat animal", "cat monkey animal", "monkey dog animal"]

    word_sequence = " ".join(sentences).split()
    word_list = " ".join(sentences).split()
    word_list = list(set(word_list))
    word_dict = {w: i for i, w in enumerate(word_list)}
    voc_size = len(word_list)

    # Make skip gram of one size window
    skip_grams = []
    for i in range(1, len(word_sequence) - 1):
        target = word_dict[word_sequence[i]]
        context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]
        for w in context:
            skip_grams.append([target, w])

    model = Word2Vec()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training
    for epoch in range(5000):
        input_batch, target_batch = random_batch()
        input_batch = torch.Tensor(input_batch)
        target_batch = torch.LongTensor(target_batch)

        optimizer.zero_grad()
        output = model(input_batch)

        # output : [batch_size, voc_size], target_batch : [batch_size] (LongTensor, not one-hot)
        loss = criterion(output, target_batch)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

    for i, label in enumerate(word_list):
        W, WT = model.parameters()
        x, y = W[0][i].item(), W[1][i].item()
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.show()


================================================
FILE: 1-3.FastText/FastText.ipynb
================================================
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "FastText.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "metadata": {
        "id": "kg9kgMnGqYkU",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "## Install [FastText](https://fasttext.cc/docs/en/supervised-tutorial.html)"
      ]
    },
    {
      "metadata": {
        "id": "3Iod5UKTqZnC",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 11051
        },
        "outputId": "b10c85c0-c4cf-4f0b-a30e-2207ae4512b2"
      },
      "cell_type": "code",
      "source": [
        "!wget https://github.com/facebookresearch/fastText/archive/0.2.0.zip\n",
        "!unzip 0.2.0.zip\n",
        "%cd fastText-0.2.0\n",
        "!make"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "--2019-02-02 14:43:56--  https://github.com/facebookresearch/fastText/archive/0.2.0.zip\n",
            "Resolving github.com (github.com)... 140.82.118.3, 140.82.118.4\n",
            "Connecting to github.com (github.com)|140.82.118.3|:443... connected.\n",
            "HTTP request sent, awaiting response... 302 Found\n",
            "Location: https://codeload.github.com/facebookresearch/fastText/zip/0.2.0 [following]\n",
            "--2019-02-02 14:43:56--  https://codeload.github.com/facebookresearch/fastText/zip/0.2.0\n",
            "Resolving codeload.github.com (codeload.github.com)... 192.30.253.121, 192.30.253.120\n",
            "Connecting to codeload.github.com (codeload.github.com)|192.30.253.121|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: unspecified [application/zip]\n",
            "Saving to: ‘0.2.0.zip’\n",
            "\n",
            "0.2.0.zip               [    <=>             ]   4.10M  6.17MB/s    in 0.7s    \n",
            "\n",
            "2019-02-02 14:43:57 (6.17 MB/s) - ‘0.2.0.zip’ saved [4304799]\n",
            "\n",
            "Archive:  0.2.0.zip\n",
            "7842495a4d64c7a3bb4339d45d6e64321d002ed8\n",
            "   creating: fastText-0.2.0/\n",
            "   creating: fastText-0.2.0/.circleci/\n",
            "  inflating: fastText-0.2.0/.circleci/cmake_test.sh  \n",
            "  inflating: fastText-0.2.0/.circleci/config.yml  \n",
            "  inflating: fastText-0.2.0/.circleci/gcc_test.sh  \n",
            "  inflating: fastText-0.2.0/.circleci/pip_test.sh  \n",
            "  inflating: fastText-0.2.0/.circleci/pull_data.sh  \n",
            "  inflating: fastText-0.2.0/.circleci/python_test.sh  \n",
            "  inflating: fastText-0.2.0/.circleci/run_locally.sh  \n",
            "  inflating: fastText-0.2.0/.circleci/setup_circleimg.sh  \n",
            "  inflating: fastText-0.2.0/.circleci/setup_debian.sh  \n",
            "  inflating: fastText-0.2.0/.gitignore  \n",
            "  inflating: fastText-0.2.0/CMakeLists.txt  \n",
            "  inflating: fastText-0.2.0/CONTRIBUTING.md  \n",
            "  inflating: fastText-0.2.0/LICENSE  \n",
            "  inflating: fastText-0.2.0/MANIFEST.in  \n",
            "  inflating: fastText-0.2.0/Makefile  \n",
            "  inflating: fastText-0.2.0/README.md  \n",
            "   creating: fastText-0.2.0/alignment/\n",
            "  inflating: fastText-0.2.0/alignment/README.md  \n",
            "  inflating: fastText-0.2.0/alignment/align.py  \n",
            "  inflating: fastText-0.2.0/alignment/eval.py  \n",
            "  inflating: fastText-0.2.0/alignment/example.sh  \n",
            "  inflating: fastText-0.2.0/alignment/utils.py  \n",
            "  inflating: fastText-0.2.0/classification-example.sh  \n",
            "  inflating: fastText-0.2.0/classification-results.sh  \n",
            "   creating: fastText-0.2.0/docs/\n",
            "  inflating: fastText-0.2.0/docs/aligned-vectors.md  \n",
            "  inflating: fastText-0.2.0/docs/api.md  \n",
            "  inflating: fastText-0.2.0/docs/cheatsheet.md  \n",
            "  inflating: fastText-0.2.0/docs/crawl-vectors.md  \n",
            "  inflating: fastText-0.2.0/docs/dataset.md  \n",
            "  inflating: fastText-0.2.0/docs/english-vectors.md  \n",
            "  inflating: fastText-0.2.0/docs/faqs.md  \n",
            "  inflating: fastText-0.2.0/docs/language-identification.md  \n",
            "  inflating: fastText-0.2.0/docs/options.md  \n",
            "  inflating: fastText-0.2.0/docs/pretrained-vectors.md  \n",
            "  inflating: fastText-0.2.0/docs/references.md  \n",
            "  inflating: fastText-0.2.0/docs/supervised-models.md  \n",
            "  inflating: fastText-0.2.0/docs/supervised-tutorial.md  \n",
            "  inflating: fastText-0.2.0/docs/support.md  \n",
            "  inflating: fastText-0.2.0/docs/unsupervised-tutorials.md  \n",
            "  inflating: fastText-0.2.0/eval.py  \n",
            "  inflating: fastText-0.2.0/get-wikimedia.sh  \n",
            "  inflating: fastText-0.2.0/pretrained-vectors.md  \n",
            "   creating: fastText-0.2.0/python/\n",
            "  inflating: fastText-0.2.0/python/README.md  \n",
            "  inflating: fastText-0.2.0/python/README.rst  \n",
            "   creating: fastText-0.2.0/python/benchmarks/\n",
            "  inflating: fastText-0.2.0/python/benchmarks/README.rst  \n",
            "  inflating: fastText-0.2.0/python/benchmarks/get_word_vector.py  \n",
            "   creating: fastText-0.2.0/python/doc/\n",
            "   creating: fastText-0.2.0/python/doc/examples/\n",
            "  inflating: fastText-0.2.0/python/doc/examples/FastTextEmbeddingBag.py  \n",
            "  inflating: fastText-0.2.0/python/doc/examples/bin_to_vec.py  \n",
            "  inflating: fastText-0.2.0/python/doc/examples/compute_accuracy.py  \n",
            "  inflating: fastText-0.2.0/python/doc/examples/get_vocab.py  \n",
            "  inflating: fastText-0.2.0/python/doc/examples/train_supervised.py  \n",
            "  inflating: fastText-0.2.0/python/doc/examples/train_unsupervised.py  \n",
            "   creating: fastText-0.2.0/python/fastText/\n",
            "  inflating: fastText-0.2.0/python/fastText/FastText.py  \n",
            "  inflating: fastText-0.2.0/python/fastText/__init__.py  \n",
            "   creating: fastText-0.2.0/python/fastText/pybind/\n",
            "  inflating: fastText-0.2.0/python/fastText/pybind/fasttext_pybind.cc  \n",
            "   creating: fastText-0.2.0/python/fastText/tests/\n",
            "  inflating: fastText-0.2.0/python/fastText/tests/__init__.py  \n",
            "  inflating: fastText-0.2.0/python/fastText/tests/test_configurations.py  \n",
            "  inflating: fastText-0.2.0/python/fastText/tests/test_script.py  \n",
            "   creating: fastText-0.2.0/python/fastText/util/\n",
            "  inflating: fastText-0.2.0/python/fastText/util/__init__.py  \n",
            "  inflating: fastText-0.2.0/python/fastText/util/util.py  \n",
            "  inflating: fastText-0.2.0/quantization-example.sh  \n",
            "  inflating: fastText-0.2.0/runtests.py  \n",
            "   creating: fastText-0.2.0/scripts/\n",
            "   creating: fastText-0.2.0/scripts/kbcompletion/\n",
            "  inflating: fastText-0.2.0/scripts/kbcompletion/README.md  \n",
            "  inflating: fastText-0.2.0/scripts/kbcompletion/data.sh  \n",
            "  inflating: fastText-0.2.0/scripts/kbcompletion/eval.cpp  \n",
            "  inflating: fastText-0.2.0/scripts/kbcompletion/fb15k.sh  \n",
            "  inflating: fastText-0.2.0/scripts/kbcompletion/fb15k237.sh  \n",
            "  inflating: fastText-0.2.0/scripts/kbcompletion/svo.sh  \n",
            "  inflating: fastText-0.2.0/scripts/kbcompletion/wn18.sh  \n",
            "   creating: fastText-0.2.0/scripts/quantization/\n",
            "  inflating: fastText-0.2.0/scripts/quantization/quantization-results.sh  \n",
            " extracting: fastText-0.2.0/setup.cfg  \n",
            "  inflating: fastText-0.2.0/setup.py  \n",
            "   creating: fastText-0.2.0/src/\n",
            "  inflating: fastText-0.2.0/src/args.cc  \n",
            "  inflating: fastText-0.2.0/src/args.h  \n",
            "  inflating: fastText-0.2.0/src/dictionary.cc  \n",
            "  inflating: fastText-0.2.0/src/dictionary.h  \n",
            "  inflating: fastText-0.2.0/src/fasttext.cc  \n",
            "  inflating: fastText-0.2.0/src/fasttext.h  \n",
            "  inflating: fastText-0.2.0/src/main.cc  \n",
            "  inflating: fastText-0.2.0/src/matrix.cc  \n",
            "  inflating: fastText-0.2.0/src/matrix.h  \n",
            "  inflating: fastText-0.2.0/src/meter.cc  \n",
            "  inflating: fastText-0.2.0/src/meter.h  \n",
            "  inflating: fastText-0.2.0/src/model.cc  \n",
            "  inflating: fastText-0.2.0/src/model.h  \n",
            "  inflating: fastText-0.2.0/src/productquantizer.cc  \n",
            "  inflating: fastText-0.2.0/src/productquantizer.h  \n",
            "  inflating: fastText-0.2.0/src/qmatrix.cc  \n",
            "  inflating: fastText-0.2.0/src/qmatrix.h  \n",
            "  inflating: fastText-0.2.0/src/real.h  \n",
            "  inflating: fastText-0.2.0/src/utils.cc  \n",
            "  inflating: fastText-0.2.0/src/utils.h  \n",
            "  inflating: fastText-0.2.0/src/vector.cc  \n",
            "  inflating: fastText-0.2.0/src/vector.h  \n",
            "   creating: fastText-0.2.0/tests/\n",
            "  inflating: fastText-0.2.0/tests/fetch_test_data.sh  \n",
            "   creating: fastText-0.2.0/website/\n",
            "  inflating: fastText-0.2.0/website/README.md  \n",
            "   creating: fastText-0.2.0/website/blog/\n",
            "  inflating: fastText-0.2.0/website/blog/2016-08-18-blog-post.md  \n",
            "  inflating: fastText-0.2.0/website/blog/2017-05-02-blog-post.md  \n",
            "  inflating: fastText-0.2.0/website/blog/2017-10-02-blog-post.md  \n",
            "   creating: fastText-0.2.0/website/core/\n",
            "  inflating: fastText-0.2.0/website/core/Footer.js  \n",
            "  inflating: fastText-0.2.0/website/package.json  \n",
            "   creating: fastText-0.2.0/website/pages/\n",
            "   creating: fastText-0.2.0/website/pages/en/\n",
            "  inflating: fastText-0.2.0/website/pages/en/index.js  \n",
            "  inflating: fastText-0.2.0/website/sidebars.json  \n",
            "  inflating: fastText-0.2.0/website/siteConfig.js  \n",
            "   creating: fastText-0.2.0/website/static/\n",
            "   creating: fastText-0.2.0/website/static/docs/\n",
            "   creating: fastText-0.2.0/website/static/docs/en/\n",
            "   creating: fastText-0.2.0/website/static/docs/en/html/\n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/.classfasttext_1_1QMatrix-members.html.i4eKqy  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/annotated.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/annotated_dup.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/args_8cc.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/args_8h.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/args_8h.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/args_8h_source.html  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/bc_s.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/bdwn.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classes.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Args-members.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Args.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Args.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Dictionary-members.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Dictionary.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Dictionary.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1FastText-members.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1FastText.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1FastText.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Matrix-members.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Matrix.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Matrix.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Model-members.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Model.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Model.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1ProductQuantizer-members.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1ProductQuantizer.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1ProductQuantizer.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1QMatrix-members.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1QMatrix.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1QMatrix.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Vector-members.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Vector.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/classfasttext_1_1Vector.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/closed.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/dictionary_8cc.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/dictionary_8h.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/dictionary_8h.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/dictionary_8h_source.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.js  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/doc.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/doxygen.css  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/doxygen.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/dynsections.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/fasttext_8cc.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/fasttext_8h.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/fasttext_8h.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/fasttext_8h_source.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/favicon.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/files.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/files.js  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/folderclosed.png  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/folderopen.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_0x7e.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_b.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_c.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_d.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_dup.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_e.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_f.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_func.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_g.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_h.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_i.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_k.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_l.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_m.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_n.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_o.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_p.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_q.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_r.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_s.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_t.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_u.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_v.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_vars.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_w.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/functions_z.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/globals.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/globals_defs.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/globals_func.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/index.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/jquery.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/main_8cc.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/main_8cc.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/matrix_8cc.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/matrix_8h.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/matrix_8h_source.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/menu.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/menudata.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/model_8cc.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/model_8h.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/model_8h.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/model_8h_source.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/namespacefasttext.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/namespacefasttext.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/namespacefasttext_1_1utils.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/namespacemembers.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/namespacemembers_enum.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/namespacemembers_func.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/namespacemembers_type.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/namespaces.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/namespaces.js  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/nav_f.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/nav_g.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/nav_h.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/navtree.css  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/navtree.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/navtreedata.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/navtreeindex0.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/navtreeindex1.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/open.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/productquantizer_8cc.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/productquantizer_8cc.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/productquantizer_8h.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/productquantizer_8h_source.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/qmatrix_8cc.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/qmatrix_8h.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/qmatrix_8h_source.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/real_8h.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/real_8h.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/real_8h_source.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/resize.js  \n",
            "   creating: fastText-0.2.0/website/static/docs/en/html/search/\n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/search/.files_7.html.StRRNc  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/search/.variables_a.html.1MGQ27  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_0.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_0.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_1.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_1.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_10.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_10.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_11.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_11.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_12.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_12.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_13.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_13.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_14.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_14.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_15.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_15.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_16.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_16.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_17.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_17.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_2.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_2.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_3.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_3.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_4.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_4.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_5.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_5.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_6.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_6.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_7.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_7.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_8.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_8.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_9.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_9.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_a.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_a.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_b.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_b.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_c.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_c.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_d.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_d.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_e.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_e.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_f.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/all_f.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_0.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_0.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_1.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_1.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_2.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_2.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_3.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_3.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_4.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_4.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_5.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_5.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_6.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_6.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_7.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_7.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_8.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/classes_8.js  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/search/close.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/defines_0.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/defines_0.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/defines_1.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/defines_1.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/defines_2.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/defines_2.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/defines_3.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/defines_3.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enums_0.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enums_0.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enums_1.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enums_1.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enums_2.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enums_2.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enumvalues_0.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enumvalues_0.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enumvalues_1.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enumvalues_1.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enumvalues_2.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enumvalues_2.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enumvalues_3.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enumvalues_3.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enumvalues_4.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enumvalues_4.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enumvalues_5.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/enumvalues_5.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_0.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_0.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_1.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_1.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_2.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_2.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_3.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_3.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_4.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_4.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_5.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_5.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_6.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_6.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_7.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_7.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_8.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/files_8.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_0.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_0.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_1.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_1.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_10.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_10.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_11.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_11.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_12.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_12.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_13.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_13.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_14.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_14.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_15.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_15.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_16.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_16.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_17.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_17.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_2.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_2.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_3.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_3.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_4.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_4.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_5.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_5.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_6.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_6.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_7.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_7.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_8.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_8.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_9.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_9.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_a.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_a.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_b.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_b.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_c.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_c.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_d.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_d.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_e.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_e.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_f.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/functions_f.js  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/search/mag_sel.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/namespaces_0.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/namespaces_0.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/nomatches.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/search.css  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/search.js  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/search/search_l.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/search_m.png  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/search/search_r.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/searchdata.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/typedefs_0.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/typedefs_0.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/typedefs_1.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/typedefs_1.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_0.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_0.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_1.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_1.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_10.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_10.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_11.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_11.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_12.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_12.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_13.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_13.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_2.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_2.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_3.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_3.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_4.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_4.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_5.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_5.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_6.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_6.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_7.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_7.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_8.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_8.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_9.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_9.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_a.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_a.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_b.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_b.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_c.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_c.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_d.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_d.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_e.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_e.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_f.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/search/variables_f.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/splitbar.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/structfasttext_1_1Node-members.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/structfasttext_1_1Node.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/structfasttext_1_1Node.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/structfasttext_1_1entry-members.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/structfasttext_1_1entry.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/structfasttext_1_1entry.js  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/sync_off.png  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/sync_on.png  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/tab_a.png  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/tab_b.png  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/tab_h.png  \n",
            " extracting: fastText-0.2.0/website/static/docs/en/html/tab_s.png  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/tabs.css  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/utils_8cc.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/utils_8cc.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/utils_8h.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/utils_8h.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/utils_8h_source.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/vector_8cc.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/vector_8cc.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/vector_8h.html  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/vector_8h.js  \n",
            "  inflating: fastText-0.2.0/website/static/docs/en/html/vector_8h_source.html  \n",
            "  inflating: fastText-0.2.0/website/static/fasttext.css  \n",
            "   creating: fastText-0.2.0/website/static/img/\n",
            "   creating: fastText-0.2.0/website/static/img/authors/\n",
            "  inflating: fastText-0.2.0/website/static/img/authors/armand_joulin.jpg  \n",
            "  inflating: fastText-0.2.0/website/static/img/authors/christian_puhrsch.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/authors/edouard_grave.jpeg  \n",
            "  inflating: fastText-0.2.0/website/static/img/authors/piotr_bojanowski.jpg  \n",
            "  inflating: fastText-0.2.0/website/static/img/authors/tomas_mikolov.jpg  \n",
            "   creating: fastText-0.2.0/website/static/img/blog/\n",
            "  inflating: fastText-0.2.0/website/static/img/blog/2016-08-18-blog-post-img1.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/blog/2016-08-18-blog-post-img2.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/blog/2017-05-02-blog-post-img1.jpg  \n",
            "  inflating: fastText-0.2.0/website/static/img/blog/2017-05-02-blog-post-img2.jpg  \n",
            "  inflating: fastText-0.2.0/website/static/img/blog/2017-10-02-blog-post-img1.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/cbo_vs_skipgram.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/fasttext-icon-api.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/fasttext-icon-bg-web.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/fasttext-icon-color-square.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/fasttext-icon-color-web.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/fasttext-icon-faq.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/fasttext-icon-tutorial.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/fasttext-icon-white-web.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/fasttext-logo-color-web.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/fasttext-logo-white-web.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/logo-color.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/model-black.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/model-blue.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/model-red.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/ogimage.png  \n",
            "  inflating: fastText-0.2.0/website/static/img/oss_logo.png  \n",
            "  inflating: fastText-0.2.0/wikifil.pl  \n",
            "  inflating: fastText-0.2.0/word-vector-example.sh  \n",
            "/content/fastText-0.2.0\n",
            "c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/args.cc\n",
            "c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/dictionary.cc\n",
            "c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/productquantizer.cc\n",
            "c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/matrix.cc\n",
            "c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/qmatrix.cc\n",
            "c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/vector.cc\n",
            "c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/model.cc\n",
            "c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/utils.cc\n",
            "c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/meter.cc\n",
            "c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/fasttext.cc\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:\u001b[m\u001b[K In member function ‘\u001b[01m\u001b[Kvoid fasttext::FastText::quantize(const fasttext::Args&)\u001b[m\u001b[K’:\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:302:45:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[K‘\u001b[01m\u001b[Kstd::vector<int> fasttext::FastText::selectEmbeddings(int32_t) const\u001b[m\u001b[K’ is deprecated: selectEmbeddings is being deprecated. [\u001b[01;35m\u001b[K-Wdeprecated-declarations\u001b[m\u001b[K]\n",
            "     auto idx = selectEmbeddings(qargs.cutoff\u001b[01;35m\u001b[K)\u001b[m\u001b[K;\n",
            "                                             \u001b[01;35m\u001b[K^\u001b[m\u001b[K\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:279:22:\u001b[m\u001b[K \u001b[01;36m\u001b[Knote: \u001b[m\u001b[Kdeclared here\n",
            " std::vector<int32_t> \u001b[01;36m\u001b[KFastText\u001b[m\u001b[K::selectEmbeddings(int32_t cutoff) const {\n",
            "                      \u001b[01;36m\u001b[K^~~~~~~~\u001b[m\u001b[K\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:\u001b[m\u001b[K In member function ‘\u001b[01m\u001b[Kvoid fasttext::FastText::lazyComputeWordVectors()\u001b[m\u001b[K’:\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:531:40:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[K‘\u001b[01m\u001b[Kvoid fasttext::FastText::precomputeWordVectors(fasttext::Matrix&)\u001b[m\u001b[K’ is deprecated: precomputeWordVectors is being deprecated. [\u001b[01;35m\u001b[K-Wdeprecated-declarations\u001b[m\u001b[K]\n",
            "     precomputeWordVectors(*wordVectors_\u001b[01;35m\u001b[K)\u001b[m\u001b[K;\n",
            "                                        \u001b[01;35m\u001b[K^\u001b[m\u001b[K\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:514:6:\u001b[m\u001b[K \u001b[01;36m\u001b[Knote: \u001b[m\u001b[Kdeclared here\n",
            " void \u001b[01;36m\u001b[KFastText\u001b[m\u001b[K::precomputeWordVectors(Matrix& wordVectors) {\n",
            "      \u001b[01;36m\u001b[K^~~~~~~~\u001b[m\u001b[K\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:\u001b[m\u001b[K In member function ‘\u001b[01m\u001b[Kvoid fasttext::FastText::trainThread(int32_t)\u001b[m\u001b[K’:\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:650:41:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[K‘\u001b[01m\u001b[Kvoid fasttext::FastText::supervised(fasttext::Model&, fasttext::real, const std::vector<int>&, const std::vector<int>&)\u001b[m\u001b[K’ is deprecated: supervised is being deprecated. [\u001b[01;35m\u001b[K-Wdeprecated-declarations\u001b[m\u001b[K]\n",
            "       supervised(model, lr, line, labels\u001b[01;35m\u001b[K)\u001b[m\u001b[K;\n",
            "                                         \u001b[01;35m\u001b[K^\u001b[m\u001b[K\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:338:6:\u001b[m\u001b[K \u001b[01;36m\u001b[Knote: \u001b[m\u001b[Kdeclared here\n",
            " void \u001b[01;36m\u001b[KFastText\u001b[m\u001b[K::supervised(\n",
            "      \u001b[01;36m\u001b[K^~~~~~~~\u001b[m\u001b[K\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:653:27:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[K‘\u001b[01m\u001b[Kvoid fasttext::FastText::cbow(fasttext::Model&, fasttext::real, const std::vector<int>&)\u001b[m\u001b[K’ is deprecated: cbow is being deprecated. [\u001b[01;35m\u001b[K-Wdeprecated-declarations\u001b[m\u001b[K]\n",
            "       cbow(model, lr, line\u001b[01;35m\u001b[K)\u001b[m\u001b[K;\n",
            "                           \u001b[01;35m\u001b[K^\u001b[m\u001b[K\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:355:6:\u001b[m\u001b[K \u001b[01;36m\u001b[Knote: \u001b[m\u001b[Kdeclared here\n",
            " void \u001b[01;36m\u001b[KFastText\u001b[m\u001b[K::cbow(Model& model, real lr, const std::vector<int32_t>& line) {\n",
            "      \u001b[01;36m\u001b[K^~~~~~~~\u001b[m\u001b[K\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:656:31:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[K‘\u001b[01m\u001b[Kvoid fasttext::FastText::skipgram(fasttext::Model&, fasttext::real, const std::vector<int>&)\u001b[m\u001b[K’ is deprecated: skipgram is being deprecated. [\u001b[01;35m\u001b[K-Wdeprecated-declarations\u001b[m\u001b[K]\n",
            "       skipgram(model, lr, line\u001b[01;35m\u001b[K)\u001b[m\u001b[K;\n",
            "                               \u001b[01;35m\u001b[K^\u001b[m\u001b[K\n",
            "\u001b[01m\u001b[Ksrc/fasttext.cc:371:6:\u001b[m\u001b[K \u001b[01;36m\u001b[Knote: \u001b[m\u001b[Kdeclared here\n",
            " void \u001b[01;36m\u001b[KFastText\u001b[m\u001b[K::skipgram(\n",
            "      \u001b[01;36m\u001b[K^~~~~~~~\u001b[m\u001b[K\n",
            "c++ -pthread -std=c++0x -march=native -O3 -funroll-loops args.o dictionary.o productquantizer.o matrix.o qmatrix.o vector.o model.o utils.o meter.o fasttext.o src/main.cc -o fasttext\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "5JauDviyqqL-",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "## Make simple dataset"
      ]
    },
    {
      "metadata": {
        "id": "ALMQ3gjFqqZS",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "# 1 is positive, 0 is negative\n",
        "f = open('train.txt', 'w')\n",
        "f.write('__label__1 i love you\\n')\n",
        "f.write('__label__1 he loves me\\n')\n",
        "f.write('__label__1 she likes baseball\\n')\n",
        "f.write('__label__0 i hate you\\n')\n",
        "f.write('__label__0 sorry for that\\n')\n",
        "f.write('__label__0 this is awful')\n",
        "f.close()\n",
        "\n",
        "f = open('test.txt', 'w')\n",
        "f.write('sorry hate you')\n",
        "f.close()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "i3_PpexwsN_a",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "## Training"
      ]
    },
    {
      "metadata": {
        "id": "q06m76JusOQ8",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 92
        },
        "outputId": "4ed3502d-4aec-4d06-cb02-b8392978ce14"
      },
      "cell_type": "code",
      "source": [
        "!./fasttext supervised -input train.txt -output model -dim 2"
      ],
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "\rRead 0M words\n",
            "Number of words:  17\n",
            "Number of labels: 2\n",
            "\rProgress: 100.0% words/sec/thread:   17608 lr:  0.000000 loss:  0.672308 ETA:   0h 0m\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "C77MXO-GsOpi",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "## Predict"
      ]
    },
    {
      "metadata": {
        "id": "y1yDPCjVsO6x",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 36
        },
        "outputId": "8963d7bd-01c8-40b9-e1ee-1446cb1b3454"
      },
      "cell_type": "code",
      "source": [
        "!cat test.txt\n",
        "!./fasttext predict model.bin test.txt"
      ],
      "execution_count": 22,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "sorry hate you__label__0\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}

================================================
FILE: 1-3.FastText/test.txt
================================================
﻿sorry hate you

================================================
FILE: 1-3.FastText/train.txt
================================================
﻿__label__1 i love you
__label__1 he loves me
__label__1 she likes baseball
__label__0 i hate you
__label__0 sorry for that
__label__0 this is awful

================================================
FILE: 2-1.TextCNN/TextCNN.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# code by Tae Hwan Jung @graykode\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "import torch.nn.functional as F\n",
        "\n",
        "class TextCNN(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(TextCNN, self).__init__()\n",
        "        self.num_filters_total = num_filters * len(filter_sizes)\n",
        "        self.W = nn.Embedding(vocab_size, embedding_size)\n",
        "        self.Weight = nn.Linear(self.num_filters_total, num_classes, bias=False)\n",
        "        self.Bias = nn.Parameter(torch.ones([num_classes]))\n",
        "        self.filter_list = nn.ModuleList([nn.Conv2d(1, num_filters, (size, embedding_size)) for size in filter_sizes])\n",
        "\n",
        "    def forward(self, X):\n",
        "        embedded_chars = self.W(X) # [batch_size, sequence_length, sequence_length]\n",
        "        embedded_chars = embedded_chars.unsqueeze(1) # add channel(=1) [batch, channel(=1), sequence_length, embedding_size]\n",
        "\n",
        "        pooled_outputs = []\n",
        "        for i, conv in enumerate(self.filter_list):\n",
        "            # conv : [input_channel(=1), output_channel(=3), (filter_height, filter_width), bias_option]\n",
        "            h = F.relu(conv(embedded_chars))\n",
        "            # mp : ((filter_height, filter_width))\n",
        "            mp = nn.MaxPool2d((sequence_length - filter_sizes[i] + 1, 1))\n",
        "            # pooled : [batch_size(=6), output_height(=1), output_width(=1), output_channel(=3)]\n",
        "            pooled = mp(h).permute(0, 3, 2, 1)\n",
        "            pooled_outputs.append(pooled)\n",
        "\n",
        "        h_pool = torch.cat(pooled_outputs, len(filter_sizes)) # [batch_size(=6), output_height(=1), output_width(=1), output_channel(=3) * 3]\n",
        "        h_pool_flat = torch.reshape(h_pool, [-1, self.num_filters_total]) # [batch_size(=6), output_height * output_width * (output_channel * 3)]\n",
        "        model = self.Weight(h_pool_flat) + self.Bias # [batch_size, num_classes]\n",
        "        return model\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    embedding_size = 2 # embedding size\n",
        "    sequence_length = 3 # sequence length\n",
        "    num_classes = 2 # number of classes\n",
        "    filter_sizes = [2, 2, 2] # n-gram windows\n",
        "    num_filters = 3 # number of filters\n",
        "\n",
        "    # 3 words sentences (=sequence_length is 3)\n",
        "    sentences = [\"i love you\", \"he loves me\", \"she likes baseball\", \"i hate you\", \"sorry for that\", \"this is awful\"]\n",
        "    labels = [1, 1, 1, 0, 0, 0]  # 1 is good, 0 is not good.\n",
        "\n",
        "    word_list = \" \".join(sentences).split()\n",
        "    word_list = list(set(word_list))\n",
        "    word_dict = {w: i for i, w in enumerate(word_list)}\n",
        "    vocab_size = len(word_dict)\n",
        "\n",
        "    model = TextCNN()\n",
        "\n",
        "    criterion = nn.CrossEntropyLoss()\n",
        "    optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
        "\n",
        "    inputs = torch.LongTensor([np.asarray([word_dict[n] for n in sen.split()]) for sen in sentences])\n",
        "    targets = torch.LongTensor([out for out in labels]) # To using Torch Softmax Loss function\n",
        "\n",
        "    # Training\n",
        "    for epoch in range(5000):\n",
        "        optimizer.zero_grad()\n",
        "        output = model(inputs)\n",
        "\n",
        "        # output : [batch_size, num_classes], target_batch : [batch_size] (LongTensor, not one-hot)\n",
        "        loss = criterion(output, targets)\n",
        "        if (epoch + 1) % 1000 == 0:\n",
        "            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
        "\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "\n",
        "    # Test\n",
        "    test_text = 'sorry hate you'\n",
        "    tests = [np.asarray([word_dict[n] for n in test_text.split()])]\n",
        "    test_batch = torch.LongTensor(tests)\n",
        "\n",
        "    # Predict\n",
        "    predict = model(test_batch).data.max(1, keepdim=True)[1]\n",
        "    if predict[0][0] == 0:\n",
        "        print(test_text,\"is Bad Mean...\")\n",
        "    else:\n",
        "        print(test_text,\"is Good Mean!!\")"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

================================================
FILE: 2-1.TextCNN/TextCNN.py
================================================
# %%
# code by Tae Hwan Jung @graykode
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class TextCNN(nn.Module):
    def __init__(self):
        super(TextCNN, self).__init__()
        self.num_filters_total = num_filters * len(filter_sizes)
        self.W = nn.Embedding(vocab_size, embedding_size)
        self.Weight = nn.Linear(self.num_filters_total, num_classes, bias=False)
        self.Bias = nn.Parameter(torch.ones([num_classes]))
        self.filter_list = nn.ModuleList([nn.Conv2d(1, num_filters, (size, embedding_size)) for size in filter_sizes])

    def forward(self, X):
        embedded_chars = self.W(X) # [batch_size, sequence_length, sequence_length]
        embedded_chars = embedded_chars.unsqueeze(1) # add channel(=1) [batch, channel(=1), sequence_length, embedding_size]

        pooled_outputs = []
        for i, conv in enumerate(self.filter_list):
            # conv : [input_channel(=1), output_channel(=3), (filter_height, filter_width), bias_option]
            h = F.relu(conv(embedded_chars))
            # mp : ((filter_height, filter_width))
            mp = nn.MaxPool2d((sequence_length - filter_sizes[i] + 1, 1))
            # pooled : [batch_size(=6), output_height(=1), output_width(=1), output_channel(=3)]
            pooled = mp(h).permute(0, 3, 2, 1)
            pooled_outputs.append(pooled)

        h_pool = torch.cat(pooled_outputs, len(filter_sizes)) # [batch_size(=6), output_height(=1), output_width(=1), output_channel(=3) * 3]
        h_pool_flat = torch.reshape(h_pool, [-1, self.num_filters_total]) # [batch_size(=6), output_height * output_width * (output_channel * 3)]
        model = self.Weight(h_pool_flat) + self.Bias # [batch_size, num_classes]
        return model

if __name__ == '__main__':
    embedding_size = 2 # embedding size
    sequence_length = 3 # sequence length
    num_classes = 2 # number of classes
    filter_sizes = [2, 2, 2] # n-gram windows
    num_filters = 3 # number of filters

    # 3 words sentences (=sequence_length is 3)
    sentences = ["i love you", "he loves me", "she likes baseball", "i hate you", "sorry for that", "this is awful"]
    labels = [1, 1, 1, 0, 0, 0]  # 1 is good, 0 is not good.

    word_list = " ".join(sentences).split()
    word_list = list(set(word_list))
    word_dict = {w: i for i, w in enumerate(word_list)}
    vocab_size = len(word_dict)

    model = TextCNN()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    inputs = torch.LongTensor([np.asarray([word_dict[n] for n in sen.split()]) for sen in sentences])
    targets = torch.LongTensor([out for out in labels]) # To using Torch Softmax Loss function

    # Training
    for epoch in range(5000):
        optimizer.zero_grad()
        output = model(inputs)

        # output : [batch_size, num_classes], target_batch : [batch_size] (LongTensor, not one-hot)
        loss = criterion(output, targets)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

    # Test
    test_text = 'sorry hate you'
    tests = [np.asarray([word_dict[n] for n in test_text.split()])]
    test_batch = torch.LongTensor(tests)

    # Predict
    predict = model(test_batch).data.max(1, keepdim=True)[1]
    if predict[0][0] == 0:
        print(test_text,"is Bad Mean...")
    else:
        print(test_text,"is Good Mean!!")

================================================
FILE: 3-1.TextRNN/TextRNN.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# code by Tae Hwan Jung @graykode\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "\n",
        "def make_batch():\n",
        "    input_batch = []\n",
        "    target_batch = []\n",
        "\n",
        "    for sen in sentences:\n",
        "        word = sen.split()  # space tokenizer\n",
        "        input = [word_dict[n] for n in word[:-1]]  # create (1~n-1) as input\n",
        "        target = word_dict[word[-1]]  # create (n) as target, We usually call this 'casual language model'\n",
        "\n",
        "        input_batch.append(np.eye(n_class)[input])\n",
        "        target_batch.append(target)\n",
        "\n",
        "    return input_batch, target_batch\n",
        "\n",
        "class TextRNN(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(TextRNN, self).__init__()\n",
        "        self.rnn = nn.RNN(input_size=n_class, hidden_size=n_hidden)\n",
        "        self.W = nn.Linear(n_hidden, n_class, bias=False)\n",
        "        self.b = nn.Parameter(torch.ones([n_class]))\n",
        "\n",
        "    def forward(self, hidden, X):\n",
        "        X = X.transpose(0, 1) # X : [n_step, batch_size, n_class]\n",
        "        outputs, hidden = self.rnn(X, hidden)\n",
        "        # outputs : [n_step, batch_size, num_directions(=1) * n_hidden]\n",
        "        # hidden : [num_layers(=1) * num_directions(=1), batch_size, n_hidden]\n",
        "        outputs = outputs[-1] # [batch_size, num_directions(=1) * n_hidden]\n",
        "        model = self.W(outputs) + self.b # model : [batch_size, n_class]\n",
        "        return model\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    n_step = 2 # number of cells(= number of Step)\n",
        "    n_hidden = 5 # number of hidden units in one cell\n",
        "\n",
        "    sentences = [\"i like dog\", \"i love coffee\", \"i hate milk\"]\n",
        "\n",
        "    word_list = \" \".join(sentences).split()\n",
        "    word_list = list(set(word_list))\n",
        "    word_dict = {w: i for i, w in enumerate(word_list)}\n",
        "    number_dict = {i: w for i, w in enumerate(word_list)}\n",
        "    n_class = len(word_dict)\n",
        "    batch_size = len(sentences)\n",
        "\n",
        "    model = TextRNN()\n",
        "\n",
        "    criterion = nn.CrossEntropyLoss()\n",
        "    optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
        "\n",
        "    input_batch, target_batch = make_batch()\n",
        "    input_batch = torch.FloatTensor(input_batch)\n",
        "    target_batch = torch.LongTensor(target_batch)\n",
        "\n",
        "    # Training\n",
        "    for epoch in range(5000):\n",
        "        optimizer.zero_grad()\n",
        "\n",
        "        # hidden : [num_layers * num_directions, batch, hidden_size]\n",
        "        hidden = torch.zeros(1, batch_size, n_hidden)\n",
        "        # input_batch : [batch_size, n_step, n_class]\n",
        "        output = model(hidden, input_batch)\n",
        "\n",
        "        # output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)\n",
        "        loss = criterion(output, target_batch)\n",
        "        if (epoch + 1) % 1000 == 0:\n",
        "            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
        "\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "\n",
        "    input = [sen.split()[:2] for sen in sentences]\n",
        "\n",
        "    # Predict\n",
        "    hidden = torch.zeros(1, batch_size, n_hidden)\n",
        "    predict = model(hidden, input_batch).data.max(1, keepdim=True)[1]\n",
        "    print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()])"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

================================================
FILE: 3-1.TextRNN/TextRNN.py
================================================
# %%
# code by Tae Hwan Jung @graykode
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

def make_batch():
    input_batch = []
    target_batch = []

    for sen in sentences:
        word = sen.split()  # space tokenizer
        input = [word_dict[n] for n in word[:-1]]  # create (1~n-1) as input
        target = word_dict[word[-1]]  # create (n) as target, We usually call this 'casual language model'

        input_batch.append(np.eye(n_class)[input])
        target_batch.append(target)

    return input_batch, target_batch

class TextRNN(nn.Module):
    def __init__(self):
        super(TextRNN, self).__init__()
        self.rnn = nn.RNN(input_size=n_class, hidden_size=n_hidden)
        self.W = nn.Linear(n_hidden, n_class, bias=False)
        self.b = nn.Parameter(torch.ones([n_class]))

    def forward(self, hidden, X):
        X = X.transpose(0, 1) # X : [n_step, batch_size, n_class]
        outputs, hidden = self.rnn(X, hidden)
        # outputs : [n_step, batch_size, num_directions(=1) * n_hidden]
        # hidden : [num_layers(=1) * num_directions(=1), batch_size, n_hidden]
        outputs = outputs[-1] # [batch_size, num_directions(=1) * n_hidden]
        model = self.W(outputs) + self.b # model : [batch_size, n_class]
        return model

if __name__ == '__main__':
    n_step = 2 # number of cells(= number of Step)
    n_hidden = 5 # number of hidden units in one cell

    sentences = ["i like dog", "i love coffee", "i hate milk"]

    word_list = " ".join(sentences).split()
    word_list = list(set(word_list))
    word_dict = {w: i for i, w in enumerate(word_list)}
    number_dict = {i: w for i, w in enumerate(word_list)}
    n_class = len(word_dict)
    batch_size = len(sentences)

    model = TextRNN()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    input_batch, target_batch = make_batch()
    input_batch = torch.FloatTensor(input_batch)
    target_batch = torch.LongTensor(target_batch)

    # Training
    for epoch in range(5000):
        optimizer.zero_grad()

        # hidden : [num_layers * num_directions, batch, hidden_size]
        hidden = torch.zeros(1, batch_size, n_hidden)
        # input_batch : [batch_size, n_step, n_class]
        output = model(hidden, input_batch)

        # output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)
        loss = criterion(output, target_batch)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

    input = [sen.split()[:2] for sen in sentences]

    # Predict
    hidden = torch.zeros(1, batch_size, n_hidden)
    predict = model(hidden, input_batch).data.max(1, keepdim=True)[1]
    print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()])

================================================
FILE: 3-2.TextLSTM/TextLSTM.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# code by Tae Hwan Jung @graykode\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "\n",
        "def make_batch():\n",
        "    input_batch, target_batch = [], []\n",
        "\n",
        "    for seq in seq_data:\n",
        "        input = [word_dict[n] for n in seq[:-1]] # 'm', 'a' , 'k' is input\n",
        "        target = word_dict[seq[-1]] # 'e' is target\n",
        "        input_batch.append(np.eye(n_class)[input])\n",
        "        target_batch.append(target)\n",
        "\n",
        "    return input_batch, target_batch\n",
        "\n",
        "class TextLSTM(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(TextLSTM, self).__init__()\n",
        "\n",
        "        self.lstm = nn.LSTM(input_size=n_class, hidden_size=n_hidden)\n",
        "        self.W = nn.Linear(n_hidden, n_class, bias=False)\n",
        "        self.b = nn.Parameter(torch.ones([n_class]))\n",
        "\n",
        "    def forward(self, X):\n",
        "        input = X.transpose(0, 1)  # X : [n_step, batch_size, n_class]\n",
        "\n",
        "        hidden_state = torch.zeros(1, len(X), n_hidden)  # [num_layers(=1) * num_directions(=1), batch_size, n_hidden]\n",
        "        cell_state = torch.zeros(1, len(X), n_hidden)     # [num_layers(=1) * num_directions(=1), batch_size, n_hidden]\n",
        "\n",
        "        outputs, (_, _) = self.lstm(input, (hidden_state, cell_state))\n",
        "        outputs = outputs[-1]  # [batch_size, n_hidden]\n",
        "        model = self.W(outputs) + self.b  # model : [batch_size, n_class]\n",
        "        return model\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    n_step = 3 # number of cells(= number of Step)\n",
        "    n_hidden = 128 # number of hidden units in one cell\n",
        "\n",
        "    char_arr = [c for c in 'abcdefghijklmnopqrstuvwxyz']\n",
        "    word_dict = {n: i for i, n in enumerate(char_arr)}\n",
        "    number_dict = {i: w for i, w in enumerate(char_arr)}\n",
        "    n_class = len(word_dict)  # number of class(=number of vocab)\n",
        "\n",
        "    seq_data = ['make', 'need', 'coal', 'word', 'love', 'hate', 'live', 'home', 'hash', 'star']\n",
        "\n",
        "    model = TextLSTM()\n",
        "\n",
        "    criterion = nn.CrossEntropyLoss()\n",
        "    optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
        "\n",
        "    input_batch, target_batch = make_batch()\n",
        "    input_batch = torch.FloatTensor(input_batch)\n",
        "    target_batch = torch.LongTensor(target_batch)\n",
        "\n",
        "    # Training\n",
        "    for epoch in range(1000):\n",
        "        optimizer.zero_grad()\n",
        "\n",
        "        output = model(input_batch)\n",
        "        loss = criterion(output, target_batch)\n",
        "        if (epoch + 1) % 100 == 0:\n",
        "            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
        "\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "\n",
        "    inputs = [sen[:3] for sen in seq_data]\n",
        "\n",
        "    predict = model(input_batch).data.max(1, keepdim=True)[1]\n",
        "    print(inputs, '->', [number_dict[n.item()] for n in predict.squeeze()])"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

================================================
FILE: 3-2.TextLSTM/TextLSTM.py
================================================
# %%
# code by Tae Hwan Jung @graykode
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

def make_batch():
    input_batch, target_batch = [], []

    for seq in seq_data:
        input = [word_dict[n] for n in seq[:-1]] # 'm', 'a' , 'k' is input
        target = word_dict[seq[-1]] # 'e' is target
        input_batch.append(np.eye(n_class)[input])
        target_batch.append(target)

    return input_batch, target_batch

class TextLSTM(nn.Module):
    def __init__(self):
        super(TextLSTM, self).__init__()

        self.lstm = nn.LSTM(input_size=n_class, hidden_size=n_hidden)
        self.W = nn.Linear(n_hidden, n_class, bias=False)
        self.b = nn.Parameter(torch.ones([n_class]))

    def forward(self, X):
        input = X.transpose(0, 1)  # X : [n_step, batch_size, n_class]

        hidden_state = torch.zeros(1, len(X), n_hidden)  # [num_layers(=1) * num_directions(=1), batch_size, n_hidden]
        cell_state = torch.zeros(1, len(X), n_hidden)     # [num_layers(=1) * num_directions(=1), batch_size, n_hidden]

        outputs, (_, _) = self.lstm(input, (hidden_state, cell_state))
        outputs = outputs[-1]  # [batch_size, n_hidden]
        model = self.W(outputs) + self.b  # model : [batch_size, n_class]
        return model

if __name__ == '__main__':
    n_step = 3 # number of cells(= number of Step)
    n_hidden = 128 # number of hidden units in one cell

    char_arr = [c for c in 'abcdefghijklmnopqrstuvwxyz']
    word_dict = {n: i for i, n in enumerate(char_arr)}
    number_dict = {i: w for i, w in enumerate(char_arr)}
    n_class = len(word_dict)  # number of class(=number of vocab)

    seq_data = ['make', 'need', 'coal', 'word', 'love', 'hate', 'live', 'home', 'hash', 'star']

    model = TextLSTM()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    input_batch, target_batch = make_batch()
    input_batch = torch.FloatTensor(input_batch)
    target_batch = torch.LongTensor(target_batch)

    # Training
    for epoch in range(1000):
        optimizer.zero_grad()

        output = model(input_batch)
        loss = criterion(output, target_batch)
        if (epoch + 1) % 100 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

    inputs = [sen[:3] for sen in seq_data]

    predict = model(input_batch).data.max(1, keepdim=True)[1]
    print(inputs, '->', [number_dict[n.item()] for n in predict.squeeze()])

================================================
FILE: 3-3.Bi-LSTM/Bi-LSTM.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# code by Tae Hwan Jung @graykode\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "\n",
        "def make_batch():\n",
        "    input_batch = []\n",
        "    target_batch = []\n",
        "\n",
        "    words = sentence.split()\n",
        "    for i, word in enumerate(words[:-1]):\n",
        "        input = [word_dict[n] for n in words[:(i + 1)]]\n",
        "        input = input + [0] * (max_len - len(input))\n",
        "        target = word_dict[words[i + 1]]\n",
        "        input_batch.append(np.eye(n_class)[input])\n",
        "        target_batch.append(target)\n",
        "\n",
        "    return input_batch, target_batch\n",
        "\n",
        "class BiLSTM(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(BiLSTM, self).__init__()\n",
        "\n",
        "        self.lstm = nn.LSTM(input_size=n_class, hidden_size=n_hidden, bidirectional=True)\n",
        "        self.W = nn.Linear(n_hidden * 2, n_class, bias=False)\n",
        "        self.b = nn.Parameter(torch.ones([n_class]))\n",
        "\n",
        "    def forward(self, X):\n",
        "        input = X.transpose(0, 1)  # input : [n_step, batch_size, n_class]\n",
        "\n",
        "        hidden_state = torch.zeros(1*2, len(X), n_hidden)   # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]\n",
        "        cell_state = torch.zeros(1*2, len(X), n_hidden)     # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]\n",
        "\n",
        "        outputs, (_, _) = self.lstm(input, (hidden_state, cell_state))\n",
        "        outputs = outputs[-1]  # [batch_size, n_hidden]\n",
        "        model = self.W(outputs) + self.b  # model : [batch_size, n_class]\n",
        "        return model\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    n_hidden = 5 # number of hidden units in one cell\n",
        "\n",
        "    sentence = (\n",
        "        'Lorem ipsum dolor sit amet consectetur adipisicing elit '\n",
        "        'sed do eiusmod tempor incididunt ut labore et dolore magna '\n",
        "        'aliqua Ut enim ad minim veniam quis nostrud exercitation'\n",
        "    )\n",
        "\n",
        "    word_dict = {w: i for i, w in enumerate(list(set(sentence.split())))}\n",
        "    number_dict = {i: w for i, w in enumerate(list(set(sentence.split())))}\n",
        "    n_class = len(word_dict)\n",
        "    max_len = len(sentence.split())\n",
        "\n",
        "    model = BiLSTM()\n",
        "\n",
        "    criterion = nn.CrossEntropyLoss()\n",
        "    optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
        "\n",
        "    input_batch, target_batch = make_batch()\n",
        "    input_batch = torch.FloatTensor(input_batch)\n",
        "    target_batch = torch.LongTensor(target_batch)\n",
        "\n",
        "    # Training\n",
        "    for epoch in range(10000):\n",
        "        optimizer.zero_grad()\n",
        "        output = model(input_batch)\n",
        "        loss = criterion(output, target_batch)\n",
        "        if (epoch + 1) % 1000 == 0:\n",
        "            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
        "\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "\n",
        "    predict = model(input_batch).data.max(1, keepdim=True)[1]\n",
        "    print(sentence)\n",
        "    print([number_dict[n.item()] for n in predict.squeeze()])\n"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

================================================
FILE: 3-3.Bi-LSTM/Bi-LSTM.py
================================================
# %%
# code by Tae Hwan Jung @graykode
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

def make_batch():
    input_batch = []
    target_batch = []

    words = sentence.split()
    for i, word in enumerate(words[:-1]):
        input = [word_dict[n] for n in words[:(i + 1)]]
        input = input + [0] * (max_len - len(input))
        target = word_dict[words[i + 1]]
        input_batch.append(np.eye(n_class)[input])
        target_batch.append(target)

    return input_batch, target_batch

class BiLSTM(nn.Module):
    def __init__(self):
        super(BiLSTM, self).__init__()

        self.lstm = nn.LSTM(input_size=n_class, hidden_size=n_hidden, bidirectional=True)
        self.W = nn.Linear(n_hidden * 2, n_class, bias=False)
        self.b = nn.Parameter(torch.ones([n_class]))

    def forward(self, X):
        input = X.transpose(0, 1)  # input : [n_step, batch_size, n_class]

        hidden_state = torch.zeros(1*2, len(X), n_hidden)   # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]
        cell_state = torch.zeros(1*2, len(X), n_hidden)     # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]

        outputs, (_, _) = self.lstm(input, (hidden_state, cell_state))
        outputs = outputs[-1]  # [batch_size, n_hidden]
        model = self.W(outputs) + self.b  # model : [batch_size, n_class]
        return model

if __name__ == '__main__':
    n_hidden = 5 # number of hidden units in one cell

    sentence = (
        'Lorem ipsum dolor sit amet consectetur adipisicing elit '
        'sed do eiusmod tempor incididunt ut labore et dolore magna '
        'aliqua Ut enim ad minim veniam quis nostrud exercitation'
    )

    word_dict = {w: i for i, w in enumerate(list(set(sentence.split())))}
    number_dict = {i: w for i, w in enumerate(list(set(sentence.split())))}
    n_class = len(word_dict)
    max_len = len(sentence.split())

    model = BiLSTM()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    input_batch, target_batch = make_batch()
    input_batch = torch.FloatTensor(input_batch)
    target_batch = torch.LongTensor(target_batch)

    # Training
    for epoch in range(10000):
        optimizer.zero_grad()
        output = model(input_batch)
        loss = criterion(output, target_batch)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

    predict = model(input_batch).data.max(1, keepdim=True)[1]
    print(sentence)
    print([number_dict[n.item()] for n in predict.squeeze()])


================================================
FILE: 4-1.Seq2Seq/Seq2Seq.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# code by Tae Hwan Jung @graykode\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "\n",
        "# S: Symbol that shows starting of decoding input\n",
        "# E: Symbol that shows starting of decoding output\n",
        "# P: Symbol that will fill in blank sequence if current batch data size is short than time steps\n",
        "\n",
        "def make_batch():\n",
        "    input_batch, output_batch, target_batch = [], [], []\n",
        "\n",
        "    for seq in seq_data:\n",
        "        for i in range(2):\n",
        "            seq[i] = seq[i] + 'P' * (n_step - len(seq[i]))\n",
        "\n",
        "        input = [num_dic[n] for n in seq[0]]\n",
        "        output = [num_dic[n] for n in ('S' + seq[1])]\n",
        "        target = [num_dic[n] for n in (seq[1] + 'E')]\n",
        "\n",
        "        input_batch.append(np.eye(n_class)[input])\n",
        "        output_batch.append(np.eye(n_class)[output])\n",
        "        target_batch.append(target) # not one-hot\n",
        "\n",
        "    # make tensor\n",
        "    return torch.FloatTensor(input_batch), torch.FloatTensor(output_batch), torch.LongTensor(target_batch)\n",
        "\n",
        "# make test batch\n",
        "def make_testbatch(input_word):\n",
        "    input_batch, output_batch = [], []\n",
        "\n",
        "    input_w = input_word + 'P' * (n_step - len(input_word))\n",
        "    input = [num_dic[n] for n in input_w]\n",
        "    output = [num_dic[n] for n in 'S' + 'P' * n_step]\n",
        "\n",
        "    input_batch = np.eye(n_class)[input]\n",
        "    output_batch = np.eye(n_class)[output]\n",
        "\n",
        "    return torch.FloatTensor(input_batch).unsqueeze(0), torch.FloatTensor(output_batch).unsqueeze(0)\n",
        "\n",
        "# Model\n",
        "class Seq2Seq(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(Seq2Seq, self).__init__()\n",
        "\n",
        "        self.enc_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden, dropout=0.5)\n",
        "        self.dec_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden, dropout=0.5)\n",
        "        self.fc = nn.Linear(n_hidden, n_class)\n",
        "\n",
        "    def forward(self, enc_input, enc_hidden, dec_input):\n",
        "        enc_input = enc_input.transpose(0, 1) # enc_input: [max_len(=n_step, time step), batch_size, n_class]\n",
        "        dec_input = dec_input.transpose(0, 1) # dec_input: [max_len(=n_step, time step), batch_size, n_class]\n",
        "\n",
        "        # enc_states : [num_layers(=1) * num_directions(=1), batch_size, n_hidden]\n",
        "        _, enc_states = self.enc_cell(enc_input, enc_hidden)\n",
        "        # outputs : [max_len+1(=6), batch_size, num_directions(=1) * n_hidden(=128)]\n",
        "        outputs, _ = self.dec_cell(dec_input, enc_states)\n",
        "\n",
        "        model = self.fc(outputs) # model : [max_len+1(=6), batch_size, n_class]\n",
        "        return model\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    n_step = 5\n",
        "    n_hidden = 128\n",
        "\n",
        "    char_arr = [c for c in 'SEPabcdefghijklmnopqrstuvwxyz']\n",
        "    num_dic = {n: i for i, n in enumerate(char_arr)}\n",
        "    seq_data = [['man', 'women'], ['black', 'white'], ['king', 'queen'], ['girl', 'boy'], ['up', 'down'], ['high', 'low']]\n",
        "\n",
        "    n_class = len(num_dic)\n",
        "    batch_size = len(seq_data)\n",
        "\n",
        "    model = Seq2Seq()\n",
        "\n",
        "    criterion = nn.CrossEntropyLoss()\n",
        "    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)\n",
        "\n",
        "    input_batch, output_batch, target_batch = make_batch()\n",
        "\n",
        "    for epoch in range(5000):\n",
        "        # make hidden shape [num_layers * num_directions, batch_size, n_hidden]\n",
        "        hidden = torch.zeros(1, batch_size, n_hidden)\n",
        "\n",
        "        optimizer.zero_grad()\n",
        "        # input_batch : [batch_size, max_len(=n_step, time step), n_class]\n",
        "        # output_batch : [batch_size, max_len+1(=n_step, time step) (becase of 'S' or 'E'), n_class]\n",
        "        # target_batch : [batch_size, max_len+1(=n_step, time step)], not one-hot\n",
        "        output = model(input_batch, hidden, output_batch)\n",
        "        # output : [max_len+1, batch_size, n_class]\n",
        "        output = output.transpose(0, 1) # [batch_size, max_len+1(=6), n_class]\n",
        "        loss = 0\n",
        "        for i in range(0, len(target_batch)):\n",
        "            # output[i] : [max_len+1, n_class, target_batch[i] : max_len+1]\n",
        "            loss += criterion(output[i], target_batch[i])\n",
        "        if (epoch + 1) % 1000 == 0:\n",
        "            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "\n",
        "    # Test\n",
        "    def translate(word):\n",
        "        input_batch, output_batch = make_testbatch(word)\n",
        "\n",
        "        # make hidden shape [num_layers * num_directions, batch_size, n_hidden]\n",
        "        hidden = torch.zeros(1, 1, n_hidden)\n",
        "        output = model(input_batch, hidden, output_batch)\n",
        "        # output : [max_len+1(=6), batch_size(=1), n_class]\n",
        "\n",
        "        predict = output.data.max(2, keepdim=True)[1] # select n_class dimension\n",
        "        decoded = [char_arr[i] for i in predict]\n",
        "        end = decoded.index('E')\n",
        "        translated = ''.join(decoded[:end])\n",
        "\n",
        "        return translated.replace('P', '')\n",
        "\n",
        "    print('test')\n",
        "    print('man ->', translate('man'))\n",
        "    print('mans ->', translate('mans'))\n",
        "    print('king ->', translate('king'))\n",
        "    print('black ->', translate('black'))\n",
        "    print('upp ->', translate('upp'))"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

================================================
FILE: 4-1.Seq2Seq/Seq2Seq.py
================================================
# %%
# code by Tae Hwan Jung @graykode
import numpy as np
import torch
import torch.nn as nn

# S: Symbol that shows starting of decoding input
# E: Symbol that shows starting of decoding output
# P: Symbol that will fill in blank sequence if current batch data size is short than time steps

def make_batch():
    input_batch, output_batch, target_batch = [], [], []

    for seq in seq_data:
        for i in range(2):
            seq[i] = seq[i] + 'P' * (n_step - len(seq[i]))

        input = [num_dic[n] for n in seq[0]]
        output = [num_dic[n] for n in ('S' + seq[1])]
        target = [num_dic[n] for n in (seq[1] + 'E')]

        input_batch.append(np.eye(n_class)[input])
        output_batch.append(np.eye(n_class)[output])
        target_batch.append(target) # not one-hot

    # make tensor
    return torch.FloatTensor(input_batch), torch.FloatTensor(output_batch), torch.LongTensor(target_batch)

# make test batch
def make_testbatch(input_word):
    input_batch, output_batch = [], []

    input_w = input_word + 'P' * (n_step - len(input_word))
    input = [num_dic[n] for n in input_w]
    output = [num_dic[n] for n in 'S' + 'P' * n_step]

    input_batch = np.eye(n_class)[input]
    output_batch = np.eye(n_class)[output]

    return torch.FloatTensor(input_batch).unsqueeze(0), torch.FloatTensor(output_batch).unsqueeze(0)

# Model
class Seq2Seq(nn.Module):
    def __init__(self):
        super(Seq2Seq, self).__init__()

        self.enc_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden, dropout=0.5)
        self.dec_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden, dropout=0.5)
        self.fc = nn.Linear(n_hidden, n_class)

    def forward(self, enc_input, enc_hidden, dec_input):
        enc_input = enc_input.transpose(0, 1) # enc_input: [max_len(=n_step, time step), batch_size, n_class]
        dec_input = dec_input.transpose(0, 1) # dec_input: [max_len(=n_step, time step), batch_size, n_class]

        # enc_states : [num_layers(=1) * num_directions(=1), batch_size, n_hidden]
        _, enc_states = self.enc_cell(enc_input, enc_hidden)
        # outputs : [max_len+1(=6), batch_size, num_directions(=1) * n_hidden(=128)]
        outputs, _ = self.dec_cell(dec_input, enc_states)

        model = self.fc(outputs) # model : [max_len+1(=6), batch_size, n_class]
        return model

if __name__ == '__main__':
    n_step = 5
    n_hidden = 128

    char_arr = [c for c in 'SEPabcdefghijklmnopqrstuvwxyz']
    num_dic = {n: i for i, n in enumerate(char_arr)}
    seq_data = [['man', 'women'], ['black', 'white'], ['king', 'queen'], ['girl', 'boy'], ['up', 'down'], ['high', 'low']]

    n_class = len(num_dic)
    batch_size = len(seq_data)

    model = Seq2Seq()

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    input_batch, output_batch, target_batch = make_batch()

    for epoch in range(5000):
        # make hidden shape [num_layers * num_directions, batch_size, n_hidden]
        hidden = torch.zeros(1, batch_size, n_hidden)

        optimizer.zero_grad()
        # input_batch : [batch_size, max_len(=n_step, time step), n_class]
        # output_batch : [batch_size, max_len+1(=n_step, time step) (becase of 'S' or 'E'), n_class]
        # target_batch : [batch_size, max_len+1(=n_step, time step)], not one-hot
        output = model(input_batch, hidden, output_batch)
        # output : [max_len+1, batch_size, n_class]
        output = output.transpose(0, 1) # [batch_size, max_len+1(=6), n_class]
        loss = 0
        for i in range(0, len(target_batch)):
            # output[i] : [max_len+1, n_class, target_batch[i] : max_len+1]
            loss += criterion(output[i], target_batch[i])
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
        loss.backward()
        optimizer.step()

    # Test
    def translate(word):
        input_batch, output_batch = make_testbatch(word)

        # make hidden shape [num_layers * num_directions, batch_size, n_hidden]
        hidden = torch.zeros(1, 1, n_hidden)
        output = model(input_batch, hidden, output_batch)
        # output : [max_len+1(=6), batch_size(=1), n_class]

        predict = output.data.max(2, keepdim=True)[1] # select n_class dimension
        decoded = [char_arr[i] for i in predict]
        end = decoded.index('E')
        translated = ''.join(decoded[:end])

        return translated.replace('P', '')

    print('test')
    print('man ->', translate('man'))
    print('mans ->', translate('mans'))
    print('king ->', translate('king'))
    print('black ->', translate('black'))
    print('upp ->', translate('upp'))

================================================
FILE: 4-2.Seq2Seq(Attention)/Seq2Seq(Attention).ipynb
================================================
{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# code by Tae Hwan Jung @graykode\n",
        "# Reference : https://github.com/hunkim/PyTorchZeroToAll/blob/master/14_2_seq2seq_att.py\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.nn.functional as F\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# S: Symbol that shows starting of decoding input\n",
        "# E: Symbol that shows starting of decoding output\n",
        "# P: Symbol that will fill in blank sequence if current batch data size is short than time steps\n",
        "\n",
        "def make_batch():\n",
        "    input_batch = [np.eye(n_class)[[word_dict[n] for n in sentences[0].split()]]]\n",
        "    output_batch = [np.eye(n_class)[[word_dict[n] for n in sentences[1].split()]]]\n",
        "    target_batch = [[word_dict[n] for n in sentences[2].split()]]\n",
        "\n",
        "    # make tensor\n",
        "    return torch.FloatTensor(input_batch), torch.FloatTensor(output_batch), torch.LongTensor(target_batch)\n",
        "\n",
        "class Attention(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(Attention, self).__init__()\n",
        "        self.enc_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden, dropout=0.5)\n",
        "        self.dec_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden, dropout=0.5)\n",
        "\n",
        "        # Linear for attention\n",
        "        self.attn = nn.Linear(n_hidden, n_hidden)\n",
        "        self.out = nn.Linear(n_hidden * 2, n_class)\n",
        "\n",
        "    def forward(self, enc_inputs, hidden, dec_inputs):\n",
        "        enc_inputs = enc_inputs.transpose(0, 1)  # enc_inputs: [n_step(=n_step, time step), batch_size, n_class]\n",
        "        dec_inputs = dec_inputs.transpose(0, 1)  # dec_inputs: [n_step(=n_step, time step), batch_size, n_class]\n",
        "\n",
        "        # enc_outputs : [n_step, batch_size, num_directions(=1) * n_hidden], matrix F\n",
        "        # enc_hidden : [num_layers(=1) * num_directions(=1), batch_size, n_hidden]\n",
        "        enc_outputs, enc_hidden = self.enc_cell(enc_inputs, hidden)\n",
        "\n",
        "        trained_attn = []\n",
        "        hidden = enc_hidden\n",
        "        n_step = len(dec_inputs)\n",
        "        model = torch.empty([n_step, 1, n_class])\n",
        "\n",
        "        for i in range(n_step):  # each time step\n",
        "            # dec_output : [n_step(=1), batch_size(=1), num_directions(=1) * n_hidden]\n",
        "            # hidden : [num_layers(=1) * num_directions(=1), batch_size(=1), n_hidden]\n",
        "            dec_output, hidden = self.dec_cell(dec_inputs[i].unsqueeze(0), hidden)\n",
        "            attn_weights = self.get_att_weight(dec_output, enc_outputs)  # attn_weights : [1, 1, n_step]\n",
        "            trained_attn.append(attn_weights.squeeze().data.numpy())\n",
        "\n",
        "            # matrix-matrix product of matrices [1,1,n_step] x [1,n_step,n_hidden] = [1,1,n_hidden]\n",
        "            context = attn_weights.bmm(enc_outputs.transpose(0, 1))\n",
        "            dec_output = dec_output.squeeze(0)  # dec_output : [batch_size(=1), num_directions(=1) * n_hidden]\n",
        "            context = context.squeeze(1)  # [1, num_directions(=1) * n_hidden]\n",
        "            model[i] = self.out(torch.cat((dec_output, context), 1))\n",
        "\n",
        "        # make model shape [n_step, n_class]\n",
        "        return model.transpose(0, 1).squeeze(0), trained_attn\n",
        "\n",
        "    def get_att_weight(self, dec_output, enc_outputs):  # get attention weight one 'dec_output' with 'enc_outputs'\n",
        "        n_step = len(enc_outputs)\n",
        "        attn_scores = torch.zeros(n_step)  # attn_scores : [n_step]\n",
        "\n",
        "        for i in range(n_step):\n",
        "            attn_scores[i] = self.get_att_score(dec_output, enc_outputs[i])\n",
        "\n",
        "        # Normalize scores to weights in range 0 to 1\n",
        "        return F.softmax(attn_scores).view(1, 1, -1)\n",
        "\n",
        "    def get_att_score(self, dec_output, enc_output):  # enc_outputs [batch_size, num_directions(=1) * n_hidden]\n",
        "        score = self.attn(enc_output)  # score : [batch_size, n_hidden]\n",
        "        return torch.dot(dec_output.view(-1), score.view(-1))  # inner product make scalar value\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    n_step = 5 # number of cells(= number of Step)\n",
        "    n_hidden = 128 # number of hidden units in one cell\n",
        "\n",
        "    sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']\n",
        "\n",
        "    word_list = \" \".join(sentences).split()\n",
        "    word_list = list(set(word_list))\n",
        "    word_dict = {w: i for i, w in enumerate(word_list)}\n",
        "    number_dict = {i: w for i, w in enumerate(word_list)}\n",
        "    n_class = len(word_dict)  # vocab list\n",
        "\n",
        "    # hidden : [num_layers(=1) * num_directions(=1), batch_size, n_hidden]\n",
        "    hidden = torch.zeros(1, 1, n_hidden)\n",
        "\n",
        "    model = Attention()\n",
        "    criterion = nn.CrossEntropyLoss()\n",
        "    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)\n",
        "\n",
        "    input_batch, output_batch, target_batch = make_batch()\n",
        "\n",
        "    # Train\n",
        "    for epoch in range(2000):\n",
        "        optimizer.zero_grad()\n",
        "        output, _ = model(input_batch, hidden, output_batch)\n",
        "\n",
        "        loss = criterion(output, target_batch.squeeze(0))\n",
        "        if (epoch + 1) % 400 == 0:\n",
        "            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
        "\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "\n",
        "    # Test\n",
        "    test_batch = [np.eye(n_class)[[word_dict[n] for n in 'SPPPP']]]\n",
        "    test_batch = torch.FloatTensor(test_batch)\n",
        "    predict, trained_attn = model(input_batch, hidden, test_batch)\n",
        "    predict = predict.data.max(1, keepdim=True)[1]\n",
        "    print(sentences[0], '->', [number_dict[n.item()] for n in predict.squeeze()])\n",
        "\n",
        "    # Show Attention\n",
        "    fig = plt.figure(figsize=(5, 5))\n",
        "    ax = fig.add_subplot(1, 1, 1)\n",
        "    ax.matshow(trained_attn, cmap='viridis')\n",
        "    ax.set_xticklabels([''] + sentences[0].split(), fontdict={'fontsize': 14})\n",
        "    ax.set_yticklabels([''] + sentences[2].split(), fontdict={'fontsize': 14})\n",
        "    plt.show()"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

================================================
FILE: 4-2.Seq2Seq(Attention)/Seq2Seq(Attention).py
================================================
# %%
# code by Tae Hwan Jung @graykode
# Reference : https://github.com/hunkim/PyTorchZeroToAll/blob/master/14_2_seq2seq_att.py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

# S: Symbol that shows starting of decoding input
# E: Symbol that shows starting of decoding output
# P: Symbol that will fill in blank sequence if current batch data size is short than time steps

def make_batch():
    input_batch = [np.eye(n_class)[[word_dict[n] for n in sentences[0].split()]]]
    output_batch = [np.eye(n_class)[[word_dict[n] for n in sentences[1].split()]]]
    target_batch = [[word_dict[n] for n in sentences[2].split()]]

    # make tensor
    return torch.FloatTensor(input_batch), torch.FloatTensor(output_batch), torch.LongTensor(target_batch)

class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()
        self.enc_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden, dropout=0.5)
        self.dec_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden, dropout=0.5)

        # Linear for attention
        self.attn = nn.Linear(n_hidden, n_hidden)
        self.out = nn.Linear(n_hidden * 2, n_class)

    def forward(self, enc_inputs, hidden, dec_inputs):
        enc_inputs = enc_inputs.transpose(0, 1)  # enc_inputs: [n_step(=n_step, time step), batch_size, n_class]
        dec_inputs = dec_inputs.transpose(0, 1)  # dec_inputs: [n_step(=n_step, time step), batch_size, n_class]

        # enc_outputs : [n_step, batch_size, num_directions(=1) * n_hidden], matrix F
        # enc_hidden : [num_layers(=1) * num_directions(=1), batch_size, n_hidden]
        enc_outputs, enc_hidden = self.enc_cell(enc_inputs, hidden)

        trained_attn = []
        hidden = enc_hidden
        n_step = len(dec_inputs)
        model = torch.empty([n_step, 1, n_class])

        for i in range(n_step):  # each time step
            # dec_output : [n_step(=1), batch_size(=1), num_directions(=1) * n_hidden]
            # hidden : [num_layers(=1) * num_directions(=1), batch_size(=1), n_hidden]
            dec_output, hidden = self.dec_cell(dec_inputs[i].unsqueeze(0), hidden)
            attn_weights = self.get_att_weight(dec_output, enc_outputs)  # attn_weights : [1, 1, n_step]
            trained_attn.append(attn_weights.squeeze().data.numpy())

            # matrix-matrix product of matrices [1,1,n_step] x [1,n_step,n_hidden] = [1,1,n_hidden]
            context = attn_weights.bmm(enc_outputs.transpose(0, 1))
            dec_output = dec_output.squeeze(0)  # dec_output : [batch_size(=1), num_directions(=1) * n_hidden]
            context = context.squeeze(1)  # [1, num_directions(=1) * n_hidden]
            model[i] = self.out(torch.cat((dec_output, context), 1))

        # make model shape [n_step, n_class]
        return model.transpose(0, 1).squeeze(0), trained_attn

    def get_att_weight(self, dec_output, enc_outputs):  # get attention weight one 'dec_output' with 'enc_outputs'
        n_step = len(enc_outputs)
        attn_scores = torch.zeros(n_step)  # attn_scores : [n_step]

        for i in range(n_step):
            attn_scores[i] = self.get_att_score(dec_output, enc_outputs[i])

        # Normalize scores to weights in range 0 to 1
        return F.softmax(attn_scores).view(1, 1, -1)

    def get_att_score(self, dec_output, enc_output):  # enc_outputs [batch_size, num_directions(=1) * n_hidden]
        score = self.attn(enc_output)  # score : [batch_size, n_hidden]
        return torch.dot(dec_output.view(-1), score.view(-1))  # inner product make scalar value

if __name__ == '__main__':
    n_step = 5 # number of cells(= number of Step)
    n_hidden = 128 # number of hidden units in one cell

    sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']

    word_list = " ".join(sentences).split()
    word_list = list(set(word_list))
    word_dict = {w: i for i, w in enumerate(word_list)}
    number_dict = {i: w for i, w in enumerate(word_list)}
    n_class = len(word_dict)  # vocab list

    # hidden : [num_layers(=1) * num_directions(=1), batch_size, n_hidden]
    hidden = torch.zeros(1, 1, n_hidden)

    model = Attention()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    input_batch, output_batch, target_batch = make_batch()

    # Train
    for epoch in range(2000):
        optimizer.zero_grad()
        output, _ = model(input_batch, hidden, output_batch)

        loss = criterion(output, target_batch.squeeze(0))
        if (epoch + 1) % 400 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

    # Test
    test_batch = [np.eye(n_class)[[word_dict[n] for n in 'SPPPP']]]
    test_batch = torch.FloatTensor(test_batch)
    predict, trained_attn = model(input_batch, hidden, test_batch)
    predict = predict.data.max(1, keepdim=True)[1]
    print(sentences[0], '->', [number_dict[n.item()] for n in predict.squeeze()])

    # Show Attention
    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(trained_attn, cmap='viridis')
    ax.set_xticklabels([''] + sentences[0].split(), fontdict={'fontsize': 14})
    ax.set_yticklabels([''] + sentences[2].split(), fontdict={'fontsize': 14})
    plt.show()

================================================
FILE: 4-3.Bi-LSTM(Attention)/Bi-LSTM(Attention).ipynb
================================================
{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# code by Tae Hwan Jung(Jeff Jung) @graykode\n",
        "# Reference : https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM_Attn.py\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "import torch.nn.functional as F\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "class BiLSTM_Attention(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(BiLSTM_Attention, self).__init__()\n",
        "\n",
        "        self.embedding = nn.Embedding(vocab_size, embedding_dim)\n",
        "        self.lstm = nn.LSTM(embedding_dim, n_hidden, bidirectional=True)\n",
        "        self.out = nn.Linear(n_hidden * 2, num_classes)\n",
        "\n",
        "    # lstm_output : [batch_size, n_step, n_hidden * num_directions(=2)], F matrix\n",
        "    def attention_net(self, lstm_output, final_state):\n",
        "        hidden = final_state.view(-1, n_hidden * 2, 1)   # hidden : [batch_size, n_hidden * num_directions(=2), 1(=n_layer)]\n",
        "        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2) # attn_weights : [batch_size, n_step]\n",
        "        soft_attn_weights = F.softmax(attn_weights, 1)\n",
        "        # [batch_size, n_hidden * num_directions(=2), n_step] * [batch_size, n_step, 1] = [batch_size, n_hidden * num_directions(=2), 1]\n",
        "        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)\n",
        "        return context, soft_attn_weights.data.numpy() # context : [batch_size, n_hidden * num_directions(=2)]\n",
        "\n",
        "    def forward(self, X):\n",
        "        input = self.embedding(X) # input : [batch_size, len_seq, embedding_dim]\n",
        "        input = input.permute(1, 0, 2) # input : [len_seq, batch_size, embedding_dim]\n",
        "\n",
        "        hidden_state = torch.zeros(1*2, len(X), n_hidden) # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]\n",
        "        cell_state = torch.zeros(1*2, len(X), n_hidden) # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]\n",
        "\n",
        "        # final_hidden_state, final_cell_state : [num_layers(=1) * num_directions(=2), batch_size, n_hidden]\n",
        "        output, (final_hidden_state, final_cell_state) = self.lstm(input, (hidden_state, cell_state))\n",
        "        output = output.permute(1, 0, 2) # output : [batch_size, len_seq, n_hidden]\n",
        "        attn_output, attention = self.attention_net(output, final_hidden_state)\n",
        "        return self.out(attn_output), attention # model : [batch_size, num_classes], attention : [batch_size, n_step]\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    embedding_dim = 2 # embedding size\n",
        "    n_hidden = 5  # number of hidden units in one cell\n",
        "    num_classes = 2  # 0 or 1\n",
        "\n",
        "    # 3 words sentences (=sequence_length is 3)\n",
        "    sentences = [\"i love you\", \"he loves me\", \"she likes baseball\", \"i hate you\", \"sorry for that\", \"this is awful\"]\n",
        "    labels = [1, 1, 1, 0, 0, 0]  # 1 is good, 0 is not good.\n",
        "\n",
        "    word_list = \" \".join(sentences).split()\n",
        "    word_list = list(set(word_list))\n",
        "    word_dict = {w: i for i, w in enumerate(word_list)}\n",
        "    vocab_size = len(word_dict)\n",
        "\n",
        "    model = BiLSTM_Attention()\n",
        "\n",
        "    criterion = nn.CrossEntropyLoss()\n",
        "    optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
        "\n",
        "    inputs = torch.LongTensor([np.asarray([word_dict[n] for n in sen.split()]) for sen in sentences])\n",
        "    targets = torch.LongTensor([out for out in labels])  # To using Torch Softmax Loss function\n",
        "\n",
        "    # Training\n",
        "    for epoch in range(5000):\n",
        "        optimizer.zero_grad()\n",
        "        output, attention = model(inputs)\n",
        "        loss = criterion(output, targets)\n",
        "        if (epoch + 1) % 1000 == 0:\n",
        "            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
        "\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "\n",
        "    # Test\n",
        "    test_text = 'sorry hate you'\n",
        "    tests = [np.asarray([word_dict[n] for n in test_text.split()])]\n",
        "    test_batch = torch.LongTensor(tests)\n",
        "\n",
        "    # Predict\n",
        "    predict, _ = model(test_batch)\n",
        "    predict = predict.data.max(1, keepdim=True)[1]\n",
        "    if predict[0][0] == 0:\n",
        "        print(test_text,\"is Bad Mean...\")\n",
        "    else:\n",
        "        print(test_text,\"is Good Mean!!\")\n",
        "\n",
        "    fig = plt.figure(figsize=(6, 3)) # [batch_size, n_step]\n",
        "    ax = fig.add_subplot(1, 1, 1)\n",
        "    ax.matshow(attention, cmap='viridis')\n",
        "    ax.set_xticklabels(['']+['first_word', 'second_word', 'third_word'], fontdict={'fontsize': 14}, rotation=90)\n",
        "    ax.set_yticklabels(['']+['batch_1', 'batch_2', 'batch_3', 'batch_4', 'batch_5', 'batch_6'], fontdict={'fontsize': 14})\n",
        "    plt.show()"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

================================================
FILE: 4-3.Bi-LSTM(Attention)/Bi-LSTM(Attention).py
================================================
# %%
# code by Tae Hwan Jung(Jeff Jung) @graykode
# Reference : https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM_Attn.py
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt

class BiLSTM_Attention(nn.Module):
    def __init__(self):
        super(BiLSTM_Attention, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, n_hidden, bidirectional=True)
        self.out = nn.Linear(n_hidden * 2, num_classes)

    # lstm_output : [batch_size, n_step, n_hidden * num_directions(=2)], F matrix
    def attention_net(self, lstm_output, final_state):
        hidden = final_state.view(-1, n_hidden * 2, 1)   # hidden : [batch_size, n_hidden * num_directions(=2), 1(=n_layer)]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2) # attn_weights : [batch_size, n_step]
        soft_attn_weights = F.softmax(attn_weights, 1)
        # [batch_size, n_hidden * num_directions(=2), n_step] * [batch_size, n_step, 1] = [batch_size, n_hidden * num_directions(=2), 1]
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return context, soft_attn_weights.data.numpy() # context : [batch_size, n_hidden * num_directions(=2)]

    def forward(self, X):
        input = self.embedding(X) # input : [batch_size, len_seq, embedding_dim]
        input = input.permute(1, 0, 2) # input : [len_seq, batch_size, embedding_dim]

        hidden_state = torch.zeros(1*2, len(X), n_hidden) # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]
        cell_state = torch.zeros(1*2, len(X), n_hidden) # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]

        # final_hidden_state, final_cell_state : [num_layers(=1) * num_directions(=2), batch_size, n_hidden]
        output, (final_hidden_state, final_cell_state) = self.lstm(input, (hidden_state, cell_state))
        output = output.permute(1, 0, 2) # output : [batch_size, len_seq, n_hidden]
        attn_output, attention = self.attention_net(output, final_hidden_state)
        return self.out(attn_output), attention # model : [batch_size, num_classes], attention : [batch_size, n_step]

if __name__ == '__main__':
    embedding_dim = 2 # embedding size
    n_hidden = 5  # number of hidden units in one cell
    num_classes = 2  # 0 or 1

    # 3 words sentences (=sequence_length is 3)
    sentences = ["i love you", "he loves me", "she likes baseball", "i hate you", "sorry for that", "this is awful"]
    labels = [1, 1, 1, 0, 0, 0]  # 1 is good, 0 is not good.

    word_list = " ".join(sentences).split()
    word_list = list(set(word_list))
    word_dict = {w: i for i, w in enumerate(word_list)}
    vocab_size = len(word_dict)

    model = BiLSTM_Attention()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    inputs = torch.LongTensor([np.asarray([word_dict[n] for n in sen.split()]) for sen in sentences])
    targets = torch.LongTensor([out for out in labels])  # To using Torch Softmax Loss function

    # Training
    for epoch in range(5000):
        optimizer.zero_grad()
        output, attention = model(inputs)
        loss = criterion(output, targets)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

    # Test
    test_text = 'sorry hate you'
    tests = [np.asarray([word_dict[n] for n in test_text.split()])]
    test_batch = torch.LongTensor(tests)

    # Predict
    predict, _ = model(test_batch)
    predict = predict.data.max(1, keepdim=True)[1]
    if predict[0][0] == 0:
        print(test_text,"is Bad Mean...")
    else:
        print(test_text,"is Good Mean!!")

    fig = plt.figure(figsize=(6, 3)) # [batch_size, n_step]
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    ax.set_xticklabels(['']+['first_word', 'second_word', 'third_word'], fontdict={'fontsize': 14}, rotation=90)
    ax.set_yticklabels(['']+['batch_1', 'batch_2', 'batch_3', 'batch_4', 'batch_5', 'batch_6'], fontdict={'fontsize': 14})
    plt.show()

================================================
FILE: 5-1.Transformer/Transformer(Greedy_decoder).ipynb
================================================
{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# code by Tae Hwan Jung(Jeff Jung) @graykode, Derek Miller @dmmiller612\n",
        "# Reference : https://github.com/jadore801120/attention-is-all-you-need-pytorch\n",
        "#           https://github.com/JayParks/transformer\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# S: Symbol that shows starting of decoding input\n",
        "# E: Symbol that shows starting of decoding output\n",
        "# P: Symbol that will fill in blank sequence if current batch data size is short than time steps\n",
        "\n",
        "def make_batch():\n",
        "    input_batch = [[src_vocab[n] for n in sentences[0].split()]]\n",
        "    output_batch = [[tgt_vocab[n] for n in sentences[1].split()]]\n",
        "    target_batch = [[tgt_vocab[n] for n in sentences[2].split()]]\n",
        "    return torch.LongTensor(input_batch), torch.LongTensor(output_batch), torch.LongTensor(target_batch)\n",
        "\n",
        "def get_sinusoid_encoding_table(n_position, d_model):\n",
        "    def cal_angle(position, hid_idx):\n",
        "        return position / np.power(10000, 2 * (hid_idx // 2) / d_model)\n",
        "    def get_posi_angle_vec(position):\n",
        "        return [cal_angle(position, hid_j) for hid_j in range(d_model)]\n",
        "\n",
        "    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])\n",
        "    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i\n",
        "    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1\n",
        "    return torch.FloatTensor(sinusoid_table)\n",
        "\n",
        "def get_attn_pad_mask(seq_q, seq_k):\n",
        "    # print(seq_q)\n",
        "    batch_size, len_q = seq_q.size()\n",
        "    batch_size, len_k = seq_k.size()\n",
        "    # eq(zero) is PAD token\n",
        "    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking\n",
        "    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k\n",
        "\n",
        "def get_attn_subsequent_mask(seq):\n",
        "    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]\n",
        "    subsequent_mask = np.triu(np.ones(attn_shape), k=1)\n",
        "    subsequent_mask = torch.from_numpy(subsequent_mask).byte()\n",
        "    return subsequent_mask\n",
        "\n",
        "class ScaledDotProductAttention(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(ScaledDotProductAttention, self).__init__()\n",
        "\n",
        "    def forward(self, Q, K, V, attn_mask):\n",
        "        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n",
        "        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.\n",
        "        attn = nn.Softmax(dim=-1)(scores)\n",
        "        context = torch.matmul(attn, V)\n",
        "        return context, attn\n",
        "\n",
        "class MultiHeadAttention(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(MultiHeadAttention, self).__init__()\n",
        "        self.W_Q = nn.Linear(d_model, d_k * n_heads)\n",
        "        self.W_K = nn.Linear(d_model, d_k * n_heads)\n",
        "        self.W_V = nn.Linear(d_model, d_v * n_heads)\n",
        "        self.linear = nn.Linear(n_heads * d_v, d_model)\n",
        "        self.layer_norm = nn.LayerNorm(d_model)\n",
        "\n",
        "    def forward(self, Q, K, V, attn_mask):\n",
        "        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]\n",
        "        residual, batch_size = Q, Q.size(0)\n",
        "        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)\n",
        "        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]\n",
        "        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]\n",
        "        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]\n",
        "\n",
        "        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]\n",
        "\n",
        "        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n",
        "        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)\n",
        "        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]\n",
        "        output = self.linear(context)\n",
        "        return self.layer_norm(output + residual), attn # output: [batch_size x len_q x d_model]\n",
        "\n",
        "class PoswiseFeedForwardNet(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(PoswiseFeedForwardNet, self).__init__()\n",
        "        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)\n",
        "        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)\n",
        "        self.layer_norm = nn.LayerNorm(d_model)\n",
        "\n",
        "    def forward(self, inputs):\n",
        "        residual = inputs # inputs : [batch_size, len_q, d_model]\n",
        "        output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))\n",
        "        output = self.conv2(output).transpose(1, 2)\n",
        "        return self.layer_norm(output + residual)\n",
        "\n",
        "class EncoderLayer(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(EncoderLayer, self).__init__()\n",
        "        self.enc_self_attn = MultiHeadAttention()\n",
        "        self.pos_ffn = PoswiseFeedForwardNet()\n",
        "\n",
        "    def forward(self, enc_inputs, enc_self_attn_mask):\n",
        "        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V\n",
        "        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]\n",
        "        return enc_outputs, attn\n",
        "\n",
        "class DecoderLayer(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(DecoderLayer, self).__init__()\n",
        "        self.dec_self_attn = MultiHeadAttention()\n",
        "        self.dec_enc_attn = MultiHeadAttention()\n",
        "        self.pos_ffn = PoswiseFeedForwardNet()\n",
        "\n",
        "    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):\n",
        "        dec_outputs, dec_self_attn = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)\n",
        "        dec_outputs, dec_enc_attn = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)\n",
        "        dec_outputs = self.pos_ffn(dec_outputs)\n",
        "        return dec_outputs, dec_self_attn, dec_enc_attn\n",
        "\n",
        "class Encoder(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(Encoder, self).__init__()\n",
        "        self.src_emb = nn.Embedding(src_vocab_size, d_model)\n",
        "        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(src_len+1, d_model),freeze=True)\n",
        "        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n",
        "\n",
        "    def forward(self, enc_inputs): # enc_inputs : [batch_size x source_len]\n",
        "        enc_outputs = self.src_emb(enc_inputs) + self.pos_emb(torch.LongTensor([[1,2,3,4,0]]))\n",
        "        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)\n",
        "        enc_self_attns = []\n",
        "        for layer in self.layers:\n",
        "            enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)\n",
        "            enc_self_attns.append(enc_self_attn)\n",
        "        return enc_outputs, enc_self_attns\n",
        "\n",
        "class Decoder(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(Decoder, self).__init__()\n",
        "        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)\n",
        "        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(tgt_len+1, d_model),freeze=True)\n",
        "        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])\n",
        "\n",
        "    def forward(self, dec_inputs, enc_inputs, enc_outputs): # dec_inputs : [batch_size x target_len]\n",
        "        dec_outputs = self.tgt_emb(dec_inputs) + self.pos_emb(torch.LongTensor([[5,1,2,3,4]]))\n",
        "        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)\n",
        "        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs)\n",
        "        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)\n",
        "\n",
        "        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)\n",
        "\n",
        "        dec_self_attns, dec_enc_attns = [], []\n",
        "        for layer in self.layers:\n",
        "            dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)\n",
        "            dec_self_attns.append(dec_self_attn)\n",
        "            dec_enc_attns.append(dec_enc_attn)\n",
        "        return dec_outputs, dec_self_attns, dec_enc_attns\n",
        "\n",
        "class Transformer(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(Transformer, self).__init__()\n",
        "        self.encoder = Encoder()\n",
        "        self.decoder = Decoder()\n",
        "        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False)\n",
        "    def forward(self, enc_inputs, dec_inputs):\n",
        "        enc_outputs, enc_self_attns = self.encoder(enc_inputs)\n",
        "        dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs)\n",
        "        dec_logits = self.projection(dec_outputs) # dec_logits : [batch_size x src_vocab_size x tgt_vocab_size]\n",
        "        return dec_logits.view(-1, dec_logits.size(-1)), enc_self_attns, dec_self_attns, dec_enc_attns\n",
        "\n",
        "def greedy_decoder(model, enc_input, start_symbol):\n",
        "    \"\"\"\n",
        "    For simplicity, a Greedy Decoder is Beam search when K=1. This is necessary for inference as we don't know the\n",
        "    target sequence input. Therefore we try to generate the target input word by word, then feed it into the transformer.\n",
        "    Starting Reference: http://nlp.seas.harvard.edu/2018/04/03/attention.html#greedy-decoding\n",
        "    :param model: Transformer Model\n",
        "    :param enc_input: The encoder input\n",
        "    :param start_symbol: The start symbol. In this example it is 'S' which corresponds to index 4\n",
        "    :return: The target input\n",
        "    \"\"\"\n",
        "    enc_outputs, enc_self_attns = model.encoder(enc_input)\n",
        "    dec_input = torch.zeros(1, 5).type_as(enc_input.data)\n",
        "    next_symbol = start_symbol\n",
        "    for i in range(0, 5):\n",
        "        dec_input[0][i] = next_symbol\n",
        "        dec_outputs, _, _ = model.decoder(dec_input, enc_input, enc_outputs)\n",
        "        projected = model.projection(dec_outputs)\n",
        "        prob = projected.squeeze(0).max(dim=-1, keepdim=False)[1]\n",
        "        next_word = prob.data[i]\n",
        "        next_symbol = next_word.item()\n",
        "    return dec_input\n",
        "\n",
        "def showgraph(attn):\n",
        "    attn = attn[-1].squeeze(0)[0]\n",
        "    attn = attn.squeeze(0).data.numpy()\n",
        "    fig = plt.figure(figsize=(n_heads, n_heads)) # [n_heads, n_heads]\n",
        "    ax = fig.add_subplot(1, 1, 1)\n",
        "    ax.matshow(attn, cmap='viridis')\n",
        "    ax.set_xticklabels(['']+sentences[0].split(), fontdict={'fontsize': 14}, rotation=90)\n",
        "    ax.set_yticklabels(['']+sentences[2].split(), fontdict={'fontsize': 14})\n",
        "    plt.show()\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']\n",
        "    # Transformer Parameters\n",
        "    # Padding Should be Zero index\n",
        "    src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4}\n",
        "    src_vocab_size = len(src_vocab)\n",
        "\n",
        "    tgt_vocab = {'P': 0, 'i': 1, 'want': 2, 'a': 3, 'beer': 4, 'S': 5, 'E': 6}\n",
        "    number_dict = {i: w for i, w in enumerate(tgt_vocab)}\n",
        "    tgt_vocab_size = len(tgt_vocab)\n",
        "\n",
        "    src_len = 5 # length of source\n",
        "    tgt_len = 5 # length of target\n",
        "\n",
        "    d_model = 512  # Embedding Size\n",
        "    d_ff = 2048  # FeedForward dimension\n",
        "    d_k = d_v = 64  # dimension of K(=Q), V\n",
        "    n_layers = 6  # number of Encoder of Decoder Layer\n",
        "    n_heads = 8  # number of heads in Multi-Head Attention\n",
        "\n",
        "    model = Transformer()\n",
        "\n",
        "    criterion = nn.CrossEntropyLoss()\n",
        "    optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
        "\n",
        "    enc_inputs, dec_inputs, target_batch = make_batch()\n",
        "\n",
        "    for epoch in range(20):\n",
        "        optimizer.zero_grad()\n",
        "        outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)\n",
        "        loss = criterion(outputs, target_batch.contiguous().view(-1))\n",
        "        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "\n",
        "    # Test\n",
        "    greedy_dec_input = greedy_decoder(model, enc_inputs, start_symbol=tgt_vocab[\"S\"])\n",
        "    predict, _, _, _ = model(enc_inputs, greedy_dec_input)\n",
        "    predict = predict.data.max(1, keepdim=True)[1]\n",
        "    print(sentences[0], '->', [number_dict[n.item()] for n in predict.squeeze()])\n",
        "\n",
        "    print('first head of last state enc_self_attns')\n",
        "    showgraph(enc_self_attns)\n",
        "\n",
        "    print('first head of last state dec_self_attns')\n",
        "    showgraph(dec_self_attns)\n",
        "\n",
        "    print('first head of last state dec_enc_attns')\n",
        "    showgraph(dec_enc_attns)"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

================================================
FILE: 5-1.Transformer/Transformer(Greedy_decoder).py
================================================
# %%
# code by Tae Hwan Jung(Jeff Jung) @graykode, Derek Miller @dmmiller612
# Reference : https://github.com/jadore801120/attention-is-all-you-need-pytorch
#           https://github.com/JayParks/transformer
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# S: Symbol that shows starting of decoding input
# E: Symbol that shows starting of decoding output
# P: Symbol that will fill in blank sequence if current batch data size is short than time steps

def make_batch():
    input_batch = [[src_vocab[n] for n in sentences[0].split()]]
    output_batch = [[tgt_vocab[n] for n in sentences[1].split()]]
    target_batch = [[tgt_vocab[n] for n in sentences[2].split()]]
    return torch.LongTensor(input_batch), torch.LongTensor(output_batch), torch.LongTensor(target_batch)

def get_sinusoid_encoding_table(n_position, d_model):
    def cal_angle(position, hid_idx):
        return position / np.power(10000, 2 * (hid_idx // 2) / d_model)
    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    return torch.FloatTensor(sinusoid_table)

def get_attn_pad_mask(seq_q, seq_k):
    # print(seq_q)
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

def get_attn_subsequent_mask(seq):
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    subsequent_mask = np.triu(np.ones(attn_shape), k=1)
    subsequent_mask = torch.from_numpy(subsequent_mask).byte()
    return subsequent_mask

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context, attn

class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
        self.linear = nn.Linear(n_heads * d_v, d_model)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = self.linear(context)
        return self.layer_norm(output + residual), attn # output: [batch_size x len_q x d_model]

class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, inputs):
        residual = inputs # inputs : [batch_size, len_q, d_model]
        output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))
        output = self.conv2(output).transpose(1, 2)
        return self.layer_norm(output + residual)

class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn

class DecoderLayer(nn.Module):
    def __init__(self):
        super(DecoderLayer, self).__init__()
        self.dec_self_attn = MultiHeadAttention()
        self.dec_enc_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        dec_outputs, dec_self_attn = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)
        dec_outputs, dec_enc_attn = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)
        dec_outputs = self.pos_ffn(dec_outputs)
        return dec_outputs, dec_self_attn, dec_enc_attn

class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(src_len+1, d_model),freeze=True)
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])

    def forward(self, enc_inputs): # enc_inputs : [batch_size x source_len]
        enc_outputs = self.src_emb(enc_inputs) + self.pos_emb(torch.LongTensor([[1,2,3,4,0]]))
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)
        enc_self_attns = []
        for layer in self.layers:
            enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)
            enc_self_attns.append(enc_self_attn)
        return enc_outputs, enc_self_attns

class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(tgt_len+1, d_model),freeze=True)
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])

    def forward(self, dec_inputs, enc_inputs, enc_outputs): # dec_inputs : [batch_size x target_len]
        dec_outputs = self.tgt_emb(dec_inputs) + self.pos_emb(torch.LongTensor([[5,1,2,3,4]]))
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)
        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs)
        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)

        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)

        dec_self_attns, dec_enc_attns = [], []
        for layer in self.layers:
            dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
            dec_self_attns.append(dec_self_attn)
            dec_enc_attns.append(dec_enc_attn)
        return dec_outputs, dec_self_attns, dec_enc_attns

class Transformer(nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False)
    def forward(self, enc_inputs, dec_inputs):
        enc_outputs, enc_self_attns = self.encoder(enc_inputs)
        dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        dec_logits = self.projection(dec_outputs) # dec_logits : [batch_size x src_vocab_size x tgt_vocab_size]
        return dec_logits.view(-1, dec_logits.size(-1)), enc_self_attns, dec_self_attns, dec_enc_attns

def greedy_decoder(model, enc_input, start_symbol):
    """
    For simplicity, a Greedy Decoder is Beam search when K=1. This is necessary for inference as we don't know the
    target sequence input. Therefore we try to generate the target input word by word, then feed it into the transformer.
    Starting Reference: http://nlp.seas.harvard.edu/2018/04/03/attention.html#greedy-decoding
    :param model: Transformer Model
    :param enc_input: The encoder input
    :param start_symbol: The start symbol. In this example it is 'S' which corresponds to index 4
    :return: The target input
    """
    enc_outputs, enc_self_attns = model.encoder(enc_input)
    dec_input = torch.zeros(1, 5).type_as(enc_input.data)
    next_symbol = start_symbol
    for i in range(0, 5):
        dec_input[0][i] = next_symbol
        dec_outputs, _, _ = model.decoder(dec_input, enc_input, enc_outputs)
        projected = model.projection(dec_outputs)
        prob = projected.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()
    return dec_input

def showgraph(attn):
    attn = attn[-1].squeeze(0)[0]
    attn = attn.squeeze(0).data.numpy()
    fig = plt.figure(figsize=(n_heads, n_heads)) # [n_heads, n_heads]
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attn, cmap='viridis')
    ax.set_xticklabels(['']+sentences[0].split(), fontdict={'fontsize': 14}, rotation=90)
    ax.set_yticklabels(['']+sentences[2].split(), fontdict={'fontsize': 14})
    plt.show()

if __name__ == '__main__':
    sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']
    # Transformer Parameters
    # Padding Should be Zero index
    src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4}
    src_vocab_size = len(src_vocab)

    tgt_vocab = {'P': 0, 'i': 1, 'want': 2, 'a': 3, 'beer': 4, 'S': 5, 'E': 6}
    number_dict = {i: w for i, w in enumerate(tgt_vocab)}
    tgt_vocab_size = len(tgt_vocab)

    src_len = 5 # length of source
    tgt_len = 5 # length of target

    d_model = 512  # Embedding Size
    d_ff = 2048  # FeedForward dimension
    d_k = d_v = 64  # dimension of K(=Q), V
    n_layers = 6  # number of Encoder of Decoder Layer
    n_heads = 8  # number of heads in Multi-Head Attention

    model = Transformer()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    enc_inputs, dec_inputs, target_batch = make_batch()

    for epoch in range(20):
        optimizer.zero_grad()
        outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)
        loss = criterion(outputs, target_batch.contiguous().view(-1))
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
        loss.backward()
        optimizer.step()

    # Test
    greedy_dec_input = greedy_decoder(model, enc_inputs, start_symbol=tgt_vocab["S"])
    predict, _, _, _ = model(enc_inputs, greedy_dec_input)
    predict = predict.data.max(1, keepdim=True)[1]
    print(sentences[0], '->', [number_dict[n.item()] for n in predict.squeeze()])

    print('first head of last state enc_self_attns')
    showgraph(enc_self_attns)

    print('first head of last state dec_self_attns')
    showgraph(dec_self_attns)

    print('first head of last state dec_enc_attns')
    showgraph(dec_enc_attns)

================================================
FILE: 5-1.Transformer/Transformer.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# code by Tae Hwan Jung(Jeff Jung) @graykode, Derek Miller @dmmiller612\n",
        "# Reference : https://github.com/jadore801120/attention-is-all-you-need-pytorch\n",
        "#           https://github.com/JayParks/transformer\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# S: Symbol that shows starting of decoding input\n",
        "# E: Symbol that shows starting of decoding output\n",
        "# P: Symbol that will fill in blank sequence if current batch data size is short than time steps\n",
        "\n",
        "def make_batch(sentences):\n",
        "    input_batch = [[src_vocab[n] for n in sentences[0].split()]]\n",
        "    output_batch = [[tgt_vocab[n] for n in sentences[1].split()]]\n",
        "    target_batch = [[tgt_vocab[n] for n in sentences[2].split()]]\n",
        "    return torch.LongTensor(input_batch), torch.LongTensor(output_batch), torch.LongTensor(target_batch)\n",
        "\n",
        "def get_sinusoid_encoding_table(n_position, d_model):\n",
        "    def cal_angle(position, hid_idx):\n",
        "        return position / np.power(10000, 2 * (hid_idx // 2) / d_model)\n",
        "    def get_posi_angle_vec(position):\n",
        "        return [cal_angle(position, hid_j) for hid_j in range(d_model)]\n",
        "\n",
        "    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])\n",
        "    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i\n",
        "    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1\n",
        "    return torch.FloatTensor(sinusoid_table)\n",
        "\n",
        "def get_attn_pad_mask(seq_q, seq_k):\n",
        "    batch_size, len_q = seq_q.size()\n",
        "    batch_size, len_k = seq_k.size()\n",
        "    # eq(zero) is PAD token\n",
        "    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking\n",
        "    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k\n",
        "\n",
        "def get_attn_subsequent_mask(seq):\n",
        "    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]\n",
        "    subsequent_mask = np.triu(np.ones(attn_shape), k=1)\n",
        "    subsequent_mask = torch.from_numpy(subsequent_mask).byte()\n",
        "    return subsequent_mask\n",
        "\n",
        "class ScaledDotProductAttention(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(ScaledDotProductAttention, self).__init__()\n",
        "\n",
        "    def forward(self, Q, K, V, attn_mask):\n",
        "        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n",
        "        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.\n",
        "        attn = nn.Softmax(dim=-1)(scores)\n",
        "        context = torch.matmul(attn, V)\n",
        "        return context, attn\n",
        "\n",
        "class MultiHeadAttention(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(MultiHeadAttention, self).__init__()\n",
        "        self.W_Q = nn.Linear(d_model, d_k * n_heads)\n",
        "        self.W_K = nn.Linear(d_model, d_k * n_heads)\n",
        "        self.W_V = nn.Linear(d_model, d_v * n_heads)\n",
        "        self.linear = nn.Linear(n_heads * d_v, d_model)\n",
        "        self.layer_norm = nn.LayerNorm(d_model)\n",
        "\n",
        "    def forward(self, Q, K, V, attn_mask):\n",
        "        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]\n",
        "        residual, batch_size = Q, Q.size(0)\n",
        "        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)\n",
        "        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]\n",
        "        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]\n",
        "        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]\n",
        "\n",
        "        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]\n",
        "\n",
        "        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n",
        "        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)\n",
        "        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]\n",
        "        output = self.linear(context)\n",
        "        return self.layer_norm(output + residual), attn # output: [batch_size x len_q x d_model]\n",
        "\n",
        "class PoswiseFeedForwardNet(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(PoswiseFeedForwardNet, self).__init__()\n",
        "        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)\n",
        "        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)\n",
        "        self.layer_norm = nn.LayerNorm(d_model)\n",
        "\n",
        "    def forward(self, inputs):\n",
        "        residual = inputs # inputs : [batch_size, len_q, d_model]\n",
        "        output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))\n",
        "        output = self.conv2(output).transpose(1, 2)\n",
        "        return self.layer_norm(output + residual)\n",
        "\n",
        "class EncoderLayer(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(EncoderLayer, self).__init__()\n",
        "        self.enc_self_attn = MultiHeadAttention()\n",
        "        self.pos_ffn = PoswiseFeedForwardNet()\n",
        "\n",
        "    def forward(self, enc_inputs, enc_self_attn_mask):\n",
        "        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V\n",
        "        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]\n",
        "        return enc_outputs, attn\n",
        "\n",
        "class DecoderLayer(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(DecoderLayer, self).__init__()\n",
        "        self.dec_self_attn = MultiHeadAttention()\n",
        "        self.dec_enc_attn = MultiHeadAttention()\n",
        "        self.pos_ffn = PoswiseFeedForwardNet()\n",
        "\n",
        "    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):\n",
        "        dec_outputs, dec_self_attn = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)\n",
        "        dec_outputs, dec_enc_attn = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)\n",
        "        dec_outputs = self.pos_ffn(dec_outputs)\n",
        "        return dec_outputs, dec_self_attn, dec_enc_attn\n",
        "\n",
        "class Encoder(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(Encoder, self).__init__()\n",
        "        self.src_emb = nn.Embedding(src_vocab_size, d_model)\n",
        "        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(src_len+1, d_model),freeze=True)\n",
        "        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n",
        "\n",
        "    def forward(self, enc_inputs): # enc_inputs : [batch_size x source_len]\n",
        "        enc_outputs = self.src_emb(enc_inputs) + self.pos_emb(torch.LongTensor([[1,2,3,4,0]]))\n",
        "        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)\n",
        "        enc_self_attns = []\n",
        "        for layer in self.layers:\n",
        "            enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)\n",
        "            enc_self_attns.append(enc_self_attn)\n",
        "        return enc_outputs, enc_self_attns\n",
        "\n",
        "class Decoder(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(Decoder, self).__init__()\n",
        "        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)\n",
        "        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(tgt_len+1, d_model),freeze=True)\n",
        "        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])\n",
        "\n",
        "    def forward(self, dec_inputs, enc_inputs, enc_outputs): # dec_inputs : [batch_size x target_len]\n",
        "        dec_outputs = self.tgt_emb(dec_inputs) + self.pos_emb(torch.LongTensor([[5,1,2,3,4]]))\n",
        "        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)\n",
        "        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs)\n",
        "        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)\n",
        "\n",
        "        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)\n",
        "\n",
        "        dec_self_attns, dec_enc_attns = [], []\n",
        "        for layer in self.layers:\n",
        "            dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)\n",
        "            dec_self_attns.append(dec_self_attn)\n",
        "            dec_enc_attns.append(dec_enc_attn)\n",
        "        return dec_outputs, dec_self_attns, dec_enc_attns\n",
        "\n",
        "class Transformer(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(Transformer, self).__init__()\n",
        "        self.encoder = Encoder()\n",
        "        self.decoder = Decoder()\n",
        "        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False)\n",
        "    def forward(self, enc_inputs, dec_inputs):\n",
        "        enc_outputs, enc_self_attns = self.encoder(enc_inputs)\n",
        "        dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs)\n",
        "        dec_logits = self.projection(dec_outputs) # dec_logits : [batch_size x src_vocab_size x tgt_vocab_size]\n",
        "        return dec_logits.view(-1, dec_logits.size(-1)), enc_self_attns, dec_self_attns, dec_enc_attns\n",
        "\n",
        "def showgraph(attn):\n",
        "    attn = attn[-1].squeeze(0)[0]\n",
        "    attn = attn.squeeze(0).data.numpy()\n",
        "    fig = plt.figure(figsize=(n_heads, n_heads)) # [n_heads, n_heads]\n",
        "    ax = fig.add_subplot(1, 1, 1)\n",
        "    ax.matshow(attn, cmap='viridis')\n",
        "    ax.set_xticklabels(['']+sentences[0].split(), fontdict={'fontsize': 14}, rotation=90)\n",
        "    ax.set_yticklabels(['']+sentences[2].split(), fontdict={'fontsize': 14})\n",
        "    plt.show()\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']\n",
        "\n",
        "    # Transformer Parameters\n",
        "    # Padding Should be Zero\n",
        "    src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4}\n",
        "    src_vocab_size = len(src_vocab)\n",
        "\n",
        "    tgt_vocab = {'P': 0, 'i': 1, 'want': 2, 'a': 3, 'beer': 4, 'S': 5, 'E': 6}\n",
        "    number_dict = {i: w for i, w in enumerate(tgt_vocab)}\n",
        "    tgt_vocab_size = len(tgt_vocab)\n",
        "\n",
        "    src_len = 5 # length of source\n",
        "    tgt_len = 5 # length of target\n",
        "\n",
        "    d_model = 512  # Embedding Size\n",
        "    d_ff = 2048  # FeedForward dimension\n",
        "    d_k = d_v = 64  # dimension of K(=Q), V\n",
        "    n_layers = 6  # number of Encoder of Decoder Layer\n",
        "    n_heads = 8  # number of heads in Multi-Head Attention\n",
        "\n",
        "    model = Transformer()\n",
        "\n",
        "    criterion = nn.CrossEntropyLoss()\n",
        "    optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
        "\n",
        "    enc_inputs, dec_inputs, target_batch = make_batch(sentences)\n",
        "\n",
        "    for epoch in range(20):\n",
        "        optimizer.zero_grad()\n",
        "        outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)\n",
        "        loss = criterion(outputs, target_batch.contiguous().view(-1))\n",
        "        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "\n",
        "    # Test\n",
        "    predict, _, _, _ = model(enc_inputs, dec_inputs)\n",
        "    predict = predict.data.max(1, keepdim=True)[1]\n",
        "    print(sentences[0], '->', [number_dict[n.item()] for n in predict.squeeze()])\n",
        "\n",
        "    print('first head of last state enc_self_attns')\n",
        "    showgraph(enc_self_attns)\n",
        "\n",
        "    print('first head of last state dec_self_attns')\n",
        "    showgraph(dec_self_attns)\n",
        "\n",
        "    print('first head of last state dec_enc_attns')\n",
        "    showgraph(dec_enc_attns)"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

================================================
FILE: 5-1.Transformer/Transformer.py
================================================
# %%
# code by Tae Hwan Jung(Jeff Jung) @graykode, Derek Miller @dmmiller612
# Reference : https://github.com/jadore801120/attention-is-all-you-need-pytorch
#           https://github.com/JayParks/transformer
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# S: Symbol that shows starting of decoding input
# E: Symbol that shows starting of decoding output
# P: Symbol that will fill in blank sequence if current batch data size is short than time steps

def make_batch(sentences):
    input_batch = [[src_vocab[n] for n in sentences[0].split()]]
    output_batch = [[tgt_vocab[n] for n in sentences[1].split()]]
    target_batch = [[tgt_vocab[n] for n in sentences[2].split()]]
    return torch.LongTensor(input_batch), torch.LongTensor(output_batch), torch.LongTensor(target_batch)

def get_sinusoid_encoding_table(n_position, d_model):
    def cal_angle(position, hid_idx):
        return position / np.power(10000, 2 * (hid_idx // 2) / d_model)
    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    return torch.FloatTensor(sinusoid_table)

def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

def get_attn_subsequent_mask(seq):
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    subsequent_mask = np.triu(np.ones(attn_shape), k=1)
    subsequent_mask = torch.from_numpy(subsequent_mask).byte()
    return subsequent_mask

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context, attn

class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
        self.linear = nn.Linear(n_heads * d_v, d_model)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = self.linear(context)
        return self.layer_norm(output + residual), attn # output: [batch_size x len_q x d_model]

class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, inputs):
        residual = inputs # inputs : [batch_size, len_q, d_model]
        output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))
        output = self.conv2(output).transpose(1, 2)
        return self.layer_norm(output + residual)

class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn

class DecoderLayer(nn.Module):
    def __init__(self):
        super(DecoderLayer, self).__init__()
        self.dec_self_attn = MultiHeadAttention()
        self.dec_enc_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        dec_outputs, dec_self_attn = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)
        dec_outputs, dec_enc_attn = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)
        dec_outputs = self.pos_ffn(dec_outputs)
        return dec_outputs, dec_self_attn, dec_enc_attn

class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(src_len+1, d_model),freeze=True)
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])

    def forward(self, enc_inputs): # enc_inputs : [batch_size x source_len]
        enc_outputs = self.src_emb(enc_inputs) + self.pos_emb(torch.LongTensor([[1,2,3,4,0]]))
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)
        enc_self_attns = []
        for layer in self.layers:
            enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)
            enc_self_attns.append(enc_self_attn)
        return enc_outputs, enc_self_attns

class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(tgt_len+1, d_model),freeze=True)
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])

    def forward(self, dec_inputs, enc_inputs, enc_outputs): # dec_inputs : [batch_size x target_len]
        dec_outputs = self.tgt_emb(dec_inputs) + self.pos_emb(torch.LongTensor([[5,1,2,3,4]]))
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)
        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs)
        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)

        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)

        dec_self_attns, dec_enc_attns = [], []
        for layer in self.layers:
            dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
            dec_self_attns.append(dec_self_attn)
            dec_enc_attns.append(dec_enc_attn)
        return dec_outputs, dec_self_attns, dec_enc_attns

class Transformer(nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False)
    def forward(self, enc_inputs, dec_inputs):
        enc_outputs, enc_self_attns = self.encoder(enc_inputs)
        dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        dec_logits = self.projection(dec_outputs) # dec_logits : [batch_size x src_vocab_size x tgt_vocab_size]
        return dec_logits.view(-1, dec_logits.size(-1)), enc_self_attns, dec_self_attns, dec_enc_attns

def showgraph(attn):
    attn = attn[-1].squeeze(0)[0]
    attn = attn.squeeze(0).data.numpy()
    fig = plt.figure(figsize=(n_heads, n_heads)) # [n_heads, n_heads]
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attn, cmap='viridis')
    ax.set_xticklabels(['']+sentences[0].split(), fontdict={'fontsize': 14}, rotation=90)
    ax.set_yticklabels(['']+sentences[2].split(), fontdict={'fontsize': 14})
    plt.show()

if __name__ == '__main__':
    sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']

    # Transformer Parameters
    # Padding Should be Zero
    src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4}
    src_vocab_size = len(src_vocab)

    tgt_vocab = {'P': 0, 'i': 1, 'want': 2, 'a': 3, 'beer': 4, 'S': 5, 'E': 6}
    number_dict = {i: w for i, w in enumerate(tgt_vocab)}
    tgt_vocab_size = len(tgt_vocab)

    src_len = 5 # length of source
    tgt_len = 5 # length of target

    d_model = 512  # Embedding Size
    d_ff = 2048  # FeedForward dimension
    d_k = d_v = 64  # dimension of K(=Q), V
    n_layers = 6  # number of Encoder of Decoder Layer
    n_heads = 8  # number of heads in Multi-Head Attention

    model = Transformer()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    enc_inputs, dec_inputs, target_batch = make_batch(sentences)

    for epoch in range(20):
        optimizer.zero_grad()
        outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)
        loss = criterion(outputs, target_batch.contiguous().view(-1))
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
        loss.backward()
        optimizer.step()

    # Test
    predict, _, _, _ = model(enc_inputs, dec_inputs)
    predict = predict.data.max(1, keepdim=True)[1]
    print(sentences[0], '->', [number_dict[n.item()] for n in predict.squeeze()])

    print('first head of last state enc_self_attns')
    showgraph(enc_self_attns)

    print('first head of last state dec_self_attns')
    showgraph(dec_self_attns)

    print('first head of last state dec_enc_attns')
    showgraph(dec_enc_attns)

================================================
FILE: 5-2.BERT/BERT.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# code by Tae Hwan Jung(Jeff Jung) @graykode\n",
        "# Reference : https://github.com/jadore801120/attention-is-all-you-need-pytorch\n",
        "#           https://github.com/JayParks/transformer, https://github.com/dhlee347/pytorchic-bert\n",
        "import math\n",
        "import re\n",
        "from random import *\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "\n",
        "# sample IsNext and NotNext to be same in small batch size\n",
        "def make_batch():\n",
        "    batch = []\n",
        "    positive = negative = 0\n",
        "    while positive != batch_size/2 or negative != batch_size/2:\n",
        "        tokens_a_index, tokens_b_index= randrange(len(sentences)), randrange(len(sentences)) # sample random index in sentences\n",
        "        tokens_a, tokens_b= token_list[tokens_a_index], token_list[tokens_b_index]\n",
        "        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]\n",
        "        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)\n",
        "\n",
        "        # MASK LM\n",
        "        n_pred =  min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 15 % of tokens in one sentence\n",
        "        cand_maked_pos = [i for i, token in enumerate(input_ids)\n",
        "                          if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]\n",
        "        shuffle(cand_maked_pos)\n",
        "        masked_tokens, masked_pos = [], []\n",
        "        for pos in cand_maked_pos[:n_pred]:\n",
        "            masked_pos.append(pos)\n",
        "            masked_tokens.append(input_ids[pos])\n",
        "            if random() < 0.8:  # 80%\n",
        "                input_ids[pos] = word_dict['[MASK]'] # make mask\n",
        "            elif random() < 0.5:  # 10%\n",
        "                index = randint(0, vocab_size - 1) # random index in vocabulary\n",
        "                input_ids[pos] = word_dict[number_dict[index]] # replace\n",
        "\n",
        "        # Zero Paddings\n",
        "        n_pad = maxlen - len(input_ids)\n",
        "        input_ids.extend([0] * n_pad)\n",
        "        segment_ids.extend([0] * n_pad)\n",
        "\n",
        "        # Zero Padding (100% - 15%) tokens\n",
        "        if max_pred > n_pred:\n",
        "            n_pad = max_pred - n_pred\n",
        "            masked_tokens.extend([0] * n_pad)\n",
        "            masked_pos.extend([0] * n_pad)\n",
        "\n",
        "        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:\n",
        "            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext\n",
        "            positive += 1\n",
        "        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:\n",
        "            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext\n",
        "            negative += 1\n",
        "    return batch\n",
        "# Proprecessing Finished\n",
        "\n",
        "def get_attn_pad_mask(seq_q, seq_k):\n",
        "    batch_size, len_q = seq_q.size()\n",
        "    batch_size, len_k = seq_k.size()\n",
        "    # eq(zero) is PAD token\n",
        "    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking\n",
        "    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k\n",
        "\n",
        "def gelu(x):\n",
        "    \"Implementation of the gelu activation function by Hugging Face\"\n",
        "    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))\n",
        "\n",
        "class Embedding(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(Embedding, self).__init__()\n",
        "        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding\n",
        "        self.pos_embed = nn.Embedding(maxlen, d_model)  # position embedding\n",
        "        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding\n",
        "        self.norm = nn.LayerNorm(d_model)\n",
        "\n",
        "    def forward(self, x, seg):\n",
        "        seq_len = x.size(1)\n",
        "        pos = torch.arange(seq_len, dtype=torch.long)\n",
        "        pos = pos.unsqueeze(0).expand_as(x)  # (seq_len,) -> (batch_size, seq_len)\n",
        "        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)\n",
        "        return self.norm(embedding)\n",
        "\n",
        "class ScaledDotProductAttention(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(ScaledDotProductAttention, self).__init__()\n",
        "\n",
        "    def forward(self, Q, K, V, attn_mask):\n",
        "        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n",
        "        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.\n",
        "        attn = nn.Softmax(dim=-1)(scores)\n",
        "        context = torch.matmul(attn, V)\n",
        "        return context, attn\n",
        "\n",
        "class MultiHeadAttention(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(MultiHeadAttention, self).__init__()\n",
        "        self.W_Q = nn.Linear(d_model, d_k * n_heads)\n",
        "        self.W_K = nn.Linear(d_model, d_k * n_heads)\n",
        "        self.W_V = nn.Linear(d_model, d_v * n_heads)\n",
        "    def forward(self, Q, K, V, attn_mask):\n",
        "        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]\n",
        "        residual, batch_size = Q, Q.size(0)\n",
        "        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)\n",
        "        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]\n",
        "        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]\n",
        "        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]\n",
        "\n",
        "        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]\n",
        "\n",
        "        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n",
        "        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)\n",
        "        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]\n",
        "        output = nn.Linear(n_heads * d_v, d_model)(context)\n",
        "        return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]\n",
        "\n",
        "class PoswiseFeedForwardNet(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(PoswiseFeedForwardNet, self).__init__()\n",
        "        self.fc1 = nn.Linear(d_model, d_ff)\n",
        "        self.fc2 = nn.Linear(d_ff, d_model)\n",
        "\n",
        "    def forward(self, x):\n",
        "        # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)\n",
        "        return self.fc2(gelu(self.fc1(x)))\n",
        "\n",
        "class EncoderLayer(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(EncoderLayer, self).__init__()\n",
        "        self.enc_self_attn = MultiHeadAttention()\n",
        "        self.pos_ffn = PoswiseFeedForwardNet()\n",
        "\n",
        "    def forward(self, enc_inputs, enc_self_attn_mask):\n",
        "        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V\n",
        "        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]\n",
        "        return enc_outputs, attn\n",
        "\n",
        "class BERT(nn.Module):\n",
        "    def __init__(self):\n",
        "        super(BERT, self).__init__()\n",
        "        self.embedding = Embedding()\n",
        "        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n",
        "        self.fc = nn.Linear(d_model, d_model)\n",
        "        self.activ1 = nn.Tanh()\n",
        "        self.linear = nn.Linear(d_model, d_model)\n",
        "        self.activ2 = gelu\n",
        "        self.norm = nn.LayerNorm(d_model)\n",
        "        self.classifier = nn.Linear(d_model, 2)\n",
        "        # decoder is shared with embedding layer\n",
        "        embed_weight = self.embedding.tok_embed.weight\n",
        "        n_vocab, n_dim = embed_weight.size()\n",
        "        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)\n",
        "        self.decoder.weight = embed_weight\n",
        "        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))\n",
        "\n",
        "    def forward(self, input_ids, segment_ids, masked_pos):\n",
        "        output = self.embedding(input_ids, segment_ids)\n",
        "        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)\n",
        "        for layer in self.layers:\n",
        "            output, enc_self_attn = layer(output, enc_self_attn_mask)\n",
        "        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]\n",
        "        # it will be decided by first token(CLS)\n",
        "        h_pooled = self.activ1(self.fc(output[:, 0])) # [batch_size, d_model]\n",
        "        logits_clsf = self.classifier(h_pooled) # [batch_size, 2]\n",
        "\n",
        "        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]\n",
        "        # get masked position from final output of transformer.\n",
        "        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]\n",
        "        h_masked = self.norm(self.activ2(self.linear(h_masked)))\n",
        "        logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]\n",
        "\n",
        "        return logits_lm, logits_clsf\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    # BERT Parameters\n",
        "    maxlen = 30 # maximum of length\n",
        "    batch_size = 6\n",
        "    max_pred = 5  # max tokens of prediction\n",
        "    n_layers = 6 # number of Encoder of Encoder Layer\n",
        "    n_heads = 12 # number of heads in Multi-Head Attention\n",
        "    d_model = 768 # Embedding Size\n",
        "    d_ff = 768 * 4  # 4*d_model, FeedForward dimension\n",
        "    d_k = d_v = 64  # dimension of K(=Q), V\n",
        "    n_segments = 2\n",
        "\n",
        "    text = (\n",
        "        'Hello, how are you? I am Romeo.\\n'\n",
        "        'Hello, Romeo My name is Juliet. Nice to meet you.\\n'\n",
        "        'Nice meet you too. How are you today?\\n'\n",
        "        'Great. My baseball team won the competition.\\n'\n",
        "        'Oh Congratulations, Juliet\\n'\n",
        "        'Thanks you Romeo'\n",
        "    )\n",
        "    sentences = re.sub(\"[.,!?\\\\-]\", '', text.lower()).split('\\n')  # filter '.', ',', '?', '!'\n",
        "    word_list = list(set(\" \".join(sentences).split()))\n",
        "    word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}\n",
        "    for i, w in enumerate(word_list):\n",
        "        word_dict[w] = i + 4\n",
        "    number_dict = {i: w for i, w in enumerate(word_dict)}\n",
        "    vocab_size = len(word_dict)\n",
        "\n",
        "    token_list = list()\n",
        "    for sentence in sentences:\n",
        "        arr = [word_dict[s] for s in sentence.split()]\n",
        "        token_list.append(arr)\n",
        "\n",
        "    model = BERT()\n",
        "    criterion = nn.CrossEntropyLoss()\n",
        "    optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
        "\n",
        "    batch = make_batch()\n",
        "    input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))\n",
        "\n",
        "    for epoch in range(100):\n",
        "        optimizer.zero_grad()\n",
        "        logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)\n",
        "        loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM\n",
        "        loss_lm = (loss_lm.float()).mean()\n",
        "        loss_clsf = criterion(logits_clsf, isNext) # for sentence classification\n",
        "        loss = loss_lm + loss_clsf\n",
        "        if (epoch + 1) % 10 == 0:\n",
        "            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "\n",
        "    # Predict mask tokens ans isNext\n",
        "    input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))\n",
        "    print(text)\n",
        "    print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])\n",
        "\n",
        "    logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)\n",
        "    logits_lm = logits_lm.data.max(2)[1][0].data.numpy()\n",
        "    print('masked tokens list : ',[pos.item() for pos in masked_tokens[0] if pos.item() != 0])\n",
        "    print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])\n",
        "\n",
        "    logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]\n",
        "    print('isNext : ', True if isNext else False)\n",
        "    print('predict isNext : ',True if logits_clsf else False)\n"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

================================================
FILE: 5-2.BERT/BERT.py
================================================
# %%
# code by Tae Hwan Jung(Jeff Jung) @graykode
# Reference : https://github.com/jadore801120/attention-is-all-you-need-pytorch
#           https://github.com/JayParks/transformer, https://github.com/dhlee347/pytorchic-bert
import math
import re
from random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# sample IsNext and NotNext to be same in small batch size
def make_batch():
    batch = []
    positive = negative = 0
    while positive != batch_size/2 or negative != batch_size/2:
        tokens_a_index, tokens_b_index= randrange(len(sentences)), randrange(len(sentences)) # sample random index in sentences
        tokens_a, tokens_b= token_list[tokens_a_index], token_list[tokens_b_index]
        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # MASK LM
        n_pred =  min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 15 % of tokens in one sentence
        cand_maked_pos = [i for i, token in enumerate(input_ids)
                          if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]
        shuffle(cand_maked_pos)
        masked_tokens, masked_pos = [], []
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random() < 0.8:  # 80%
                input_ids[pos] = word_dict['[MASK]'] # make mask
            elif random() < 0.5:  # 10%
                index = randint(0, vocab_size - 1) # random index in vocabulary
                input_ids[pos] = word_dict[number_dict[index]] # replace

        # Zero Paddings
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero Padding (100% - 15%) tokens
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
            negative += 1
    return batch
# Proprecessing Finished

def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

def gelu(x):
    "Implementation of the gelu activation function by Hugging Face"
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # (seq_len,) -> (batch_size, seq_len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context, attn

class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)
        return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]

class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)
        return self.fc2(gelu(self.fc1(x)))

class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn

class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, d_model)
        self.activ1 = nn.Tanh()
        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)
        # decoder is shared with embedding layer
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]
        # it will be decided by first token(CLS)
        h_pooled = self.activ1(self.fc(output[:, 0])) # [batch_size, d_model]
        logits_clsf = self.classifier(h_pooled) # [batch_size, 2]

        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]
        # get masked position from final output of transformer.
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        h_masked = self.norm(self.activ2(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]

        return logits_lm, logits_clsf

if __name__ == '__main__':
    # BERT Parameters
    maxlen = 30 # maximum of length
    batch_size = 6
    max_pred = 5  # max tokens of prediction
    n_layers = 6 # number of Encoder of Encoder Layer
    n_heads = 12 # number of heads in Multi-Head Attention
    d_model = 768 # Embedding Size
    d_ff = 768 * 4  # 4*d_model, FeedForward dimension
    d_k = d_v = 64  # dimension of K(=Q), V
    n_segments = 2

    text = (
        'Hello, how are you? I am Romeo.\n'
        'Hello, Romeo My name is Juliet. Nice to meet you.\n'
        'Nice meet you too. How are you today?\n'
        'Great. My baseball team won the competition.\n'
        'Oh Congratulations, Juliet\n'
        'Thanks you Romeo'
    )
    sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n')  # filter '.', ',', '?', '!'
    word_list = list(set(" ".join(sentences).split()))
    word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
    for i, w in enumerate(word_list):
        word_dict[w] = i + 4
    number_dict = {i: w for i, w in enumerate(word_dict)}
    vocab_size = len(word_dict)

    token_list = list()
    for sentence in sentences:
        arr = [word_dict[s] for s in sentence.split()]
        token_list.append(arr)

    model = BERT()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    batch = make_batch()
    input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

    for epoch in range(100):
        optimizer.zero_grad()
        logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
        loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
        loss_lm = (loss_lm.float()).mean()
        loss_clsf = criterion(logits_clsf, isNext) # for sentence classification
        loss = loss_lm + loss_clsf
        if (epoch + 1) % 10 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
        loss.backward()
        optimizer.step()

    # Predict mask tokens ans isNext
    input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))
    print(text)
    print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])

    logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
    logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
    print('masked tokens list : ',[pos.item() for pos in masked_tokens[0] if pos.item() != 0])
    print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])

    logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
    print('isNext : ', True if isNext else False)
    print('predict isNext : ',True if logits_clsf else False)


================================================
FILE: CONTRIBUTING.md
================================================
## Contribution Guidelines

Thank you to everyone who contributes. Here are some rules to follow before contributing.
1. Contributions are open to the smallest details such as typos, comments and code refactors.
2. Do not commit the jupyter notebook file(*.ipynb). When the modified python code is merged into the master branch, the github action automatically generates an ipynb.
3. Please attach a commit message appropriate to the modified code.

================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2019 TaeHwan Jung

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
## nlp-tutorial

<p align="center"><img width="100" src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/11/TensorFlowLogo.svg/225px-TensorFlowLogo.svg.png" />  <img width="100" src="https://media-thumbs.golden.com/OLqzmrmwAzY1P7Sl29k2T9WjJdM=/200x200/smart/golden-storage-production.s3.amazonaws.com/topic_images/e08914afa10a4179893eeb07cb5e4713.png" /></p>

`nlp-tutorial` is a tutorial for who is studying NLP(Natural Language Processing) using **Pytorch**. Most of the models in NLP were implemented with less than **100 lines** of code.(except comments or blank lines)

- [08-14-2020] Old TensorFlow v1 code is archived in [the archive folder](archive). For beginner readability, only pytorch version 1.0 or higher is supported.


## Curriculum - (Example Purpose)

#### 1. Basic Embedding Model

- 1-1. [NNLM(Neural Network Language Model)](1-1.NNLM) - **Predict Next Word**
  - Paper -  [A Neural Probabilistic Language Model(2003)](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)
  - Colab - [NNLM.ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/1-1.NNLM/NNLM.ipynb)
- 1-2. [Word2Vec(Skip-gram)](1-2.Word2Vec) - **Embedding Words and Show Graph**
  - Paper - [Distributed Representations of Words and Phrases
    and their Compositionality(2013)](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)
  - Colab - [Word2Vec.ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/1-2.Word2Vec/Word2Vec_Skipgram(Softmax).ipynb)
- 1-3. [FastText(Application Level)](1-3.FastText) - **Sentence Classification**
  - Paper - [Bag of Tricks for Efficient Text Classification(2016)](https://arxiv.org/pdf/1607.01759.pdf)
  - Colab - [FastText.ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/1-3.FastText/FastText.ipynb)


#### 2. CNN(Convolutional Neural Network)

- 2-1. [TextCNN](2-1.TextCNN) - **Binary Sentiment Classification**
  - Paper - [Convolutional Neural Networks for Sentence Classification(2014)](http://www.aclweb.org/anthology/D14-1181)
  - [TextCNN.ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/2-1.TextCNN/TextCNN.ipynb)


#### 3. RNN(Recurrent Neural Network)

- 3-1. [TextRNN](3-1.TextRNN) - **Predict Next Step**
  - Paper - [Finding Structure in Time(1990)](http://psych.colorado.edu/~kimlab/Elman1990.pdf)
  - Colab - [TextRNN.ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/3-1.TextRNN/TextRNN.ipynb)
- 3-2. [TextLSTM](https://github.com/graykode/nlp-tutorial/tree/master/3-2.TextLSTM) - **Autocomplete**
  - Paper - [LONG SHORT-TERM MEMORY(1997)](https://www.bioinf.jku.at/publications/older/2604.pdf)
  - Colab - [TextLSTM.ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/3-2.TextLSTM/TextLSTM.ipynb)
- 3-3. [Bi-LSTM](3-3.Bi-LSTM) - **Predict Next Word in Long Sentence**
  - Colab - [Bi_LSTM.ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/3-3.Bi-LSTM/Bi_LSTM.ipynb)


#### 4. Attention Mechanism

- 4-1. [Seq2Seq](4-1.Seq2Seq) - **Change Word**
  - Paper - [Learning Phrase Representations using RNN Encoder–Decoder
    for Statistical Machine Translation(2014)](https://arxiv.org/pdf/1406.1078.pdf)
  - Colab - [Seq2Seq.ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/4-1.Seq2Seq/Seq2Seq.ipynb)
- 4-2. [Seq2Seq with Attention](4-2.Seq2Seq(Attention)) - **Translate**
  - Paper - [Neural Machine Translation by Jointly Learning to Align and Translate(2014)](https://arxiv.org/abs/1409.0473)
  - Colab - [Seq2Seq(Attention).ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/4-2.Seq2Seq(Attention)/Seq2Seq(Attention).ipynb)
- 4-3. [Bi-LSTM with Attention](4-3.Bi-LSTM(Attention)) - **Binary Sentiment Classification**
  - Colab - [Bi_LSTM(Attention).ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/4-3.Bi-LSTM(Attention)/Bi_LSTM(Attention).ipynb)


#### 5. Model based on Transformer

- 5-1.  [The Transformer](5-1.Transformer) - **Translate**
  - Paper - [Attention Is All You Need(2017)](https://arxiv.org/abs/1706.03762)
  - Colab - [Transformer.ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/5-1.Transformer/Transformer.ipynb), [Transformer(Greedy_decoder).ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/5-1.Transformer/Transformer(Greedy_decoder).ipynb)
- 5-2. [BERT](5-2.BERT) - **Classification Next Sentence & Predict Masked Tokens**
  - Paper - [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding(2018)](https://arxiv.org/abs/1810.04805)
  - Colab - [BERT.ipynb](https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/5-2.BERT/BERT.ipynb)


## Dependencies

- Python 3.5+
- Pytorch 1.0.0+


## Author

- Tae Hwan Jung(Jeff Jung) @graykode
- Author Email : nlkey2022@gmail.com
- Acknowledgements to [mojitok](http://mojitok.com/) as NLP Research Internship.


================================================
FILE: archive/tensorflow/v1/1-1.NNLM/NNLM.py
================================================
# code by Tae Hwan Jung @graykode
import tensorflow as tf
import numpy as np

tf.reset_default_graph()

sentences = [ "i like dog", "i love coffee", "i hate milk"]

word_list = " ".join(sentences).split()
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}
number_dict = {i: w for i, w in enumerate(word_list)}
n_class = len(word_dict) # number of Vocabulary

# NNLM Parameter
n_step = 2 # number of steps ['i like', 'i love', 'i hate']
n_hidden = 2 # number of hidden units

def make_batch(sentences):
    input_batch = []
    target_batch = []

    for sen in sentences:
        word = sen.split()
        input = [word_dict[n] for n in word[:-1]]
        target = word_dict[word[-1]]

        input_batch.append(np.eye(n_class)[input])
        target_batch.append(np.eye(n_class)[target])

    return input_batch, target_batch

# Model
X = tf.placeholder(tf.float32, [None, n_step, n_class]) # [batch_size, number of steps, number of Vocabulary]
Y = tf.placeholder(tf.float32, [None, n_class])

input = tf.reshape(X, shape=[-1, n_step * n_class]) # [batch_size, n_step * n_class]
H = tf.Variable(tf.random_normal([n_step * n_class, n_hidden]))
d = tf.Variable(tf.random_normal([n_hidden]))
U = tf.Variable(tf.random_normal([n_hidden, n_class]))
b = tf.Variable(tf.random_normal([n_class]))

tanh = tf.nn.tanh(d + tf.matmul(input, H)) # [batch_size, n_hidden]
model = tf.matmul(tanh, U) + b # [batch_size, n_class]

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)
prediction =tf.argmax(model, 1)

# Training
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

input_batch, target_batch = make_batch(sentences)

for epoch in range(5000):
    _, loss = sess.run([optimizer, cost], feed_dict={X: input_batch, Y: target_batch})
    if (epoch + 1)%1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

# Predict
predict =  sess.run([prediction], feed_dict={X: input_batch})

# Test
input = [sen.split()[:2] for sen in sentences]
print([sen.split()[:2] for sen in sentences], '->', [number_dict[n] for n in predict[0]])

================================================
FILE: archive/tensorflow/v1/1-2.Word2Vec/Word2Vec-Skipgram(NCE_loss).py
================================================
'''
  code by Tae Hwan Jung(Jeff Jung) @graykode
  reference : https://github.com/golbin/TensorFlow-Tutorials/blob/master/04%20-%20Neural%20Network%20Basic/03%20-%20Word2Vec.py
'''
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

tf.reset_default_graph()

# 3 Words Sentence
sentences = [ "i like dog", "i like cat", "i like animal",
              "dog cat animal", "apple cat dog like", "dog fish milk like",
              "dog cat eyes like", "i like apple", "apple i hate",
              "apple i movie book music like", "cat dog hate", "cat dog like"]

word_sequence = " ".join(sentences).split()
word_list = " ".join(sentences).split()
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}

# Word2Vec Parameter
batch_size = 20
embedding_size = 2 # To show 2 dim embedding graph
num_sampled = 10 # for negative sampling, less than batch_size
voc_size = len(word_list)

def random_batch(data, size):
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(data)), size, replace=False)

    for i in random_index:
        random_inputs.append(data[i][0])  # target
        random_labels.append([data[i][1]])  # context word

    return random_inputs, random_labels

# Make skip gram of one size window
skip_grams = []
for i in range(1, len(word_sequence) - 1):
    target = word_dict[word_sequence[i]]
    context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]

    for w in context:
        skip_grams.append([target, w])

# Model
inputs = tf.placeholder(tf.int32, shape=[batch_size])
labels = tf.placeholder(tf.int32, shape=[batch_size, 1]) # To use tf.nn.nce_loss, [batch_size, 1]

embeddings = tf.Variable(tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
selected_embed = tf.nn.embedding_lookup(embeddings, inputs)

nce_weights = tf.Variable(tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
nce_biases = tf.Variable(tf.zeros([voc_size]))

# Loss and optimizer
cost = tf.reduce_mean(tf.nn.nce_loss(nce_weights, nce_biases, labels, selected_embed, num_sampled, voc_size))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

# Training
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)

    for epoch in range(5000):
        batch_inputs, batch_labels = random_batch(skip_grams, batch_size)
        _, loss = sess.run([optimizer, cost], feed_dict={inputs: batch_inputs, labels: batch_labels})

        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    trained_embeddings = embeddings.eval()

for i, label in enumerate(word_list):
    x, y = trained_embeddings[i]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()

================================================
FILE: archive/tensorflow/v1/1-2.Word2Vec/Word2Vec-Skipgram(Softmax).py
================================================
'''
  code by Tae Hwan Jung(Jeff Jung) @graykode
'''
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

tf.reset_default_graph()

# 3 Words Sentence
sentences = [ "i like dog", "i like cat", "i like animal",
              "dog cat animal", "apple cat dog like", "dog fish milk like",
              "dog cat eyes like", "i like apple", "apple i hate",
              "apple i movie book music like", "cat dog hate", "cat dog like"]

word_sequence = " ".join(sentences).split()
word_list = " ".join(sentences).split()
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}

# Word2Vec Parameter
batch_size = 20
embedding_size = 2 # To show 2 dim embedding graph
voc_size = len(word_list)

def random_batch(data, size):
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(data)), size, replace=False)

    for i in random_index:
        random_inputs.append(np.eye(voc_size)[data[i][0]])  # target
        random_labels.append(np.eye(voc_size)[data[i][1]])  # context word

    return random_inputs, random_labels

# Make skip gram of one size window
skip_grams = []
for i in range(1, len(word_sequence) - 1):
    target = word_dict[word_sequence[i]]
    context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]

    for w in context:
        skip_grams.append([target, w])

# Model
inputs = tf.placeholder(tf.float32, shape=[None, voc_size])
labels = tf.placeholder(tf.float32, shape=[None, voc_size])

# W and WT is not Traspose relationship
W = tf.Variable(tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
WT = tf.Variable(tf.random_uniform([embedding_size, voc_size], -1.0, 1.0))

hidden_layer = tf.matmul(inputs, W) # [batch_size, embedding_size]
output_layer = tf.matmul(hidden_layer, WT) # [batch_size, voc_size]

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=output_layer, labels=labels))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)

    for epoch in range(5000):
        batch_inputs, batch_labels = random_batch(skip_grams, batch_size)
        _, loss = sess.run([optimizer, cost], feed_dict={inputs: batch_inputs, labels: batch_labels})

        if (epoch + 1)%1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        trained_embeddings = W.eval()

for i, label in enumerate(word_list):
    x, y = trained_embeddings[i]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()

================================================
FILE: archive/tensorflow/v1/2-1.TextCNN/TextCNN.py
================================================
'''
  code by Tae Hwan Jung(Jeff Jung) @graykode
  Reference : https://github.com/ioatr/textcnn
'''
import tensorflow as tf
import numpy as np

tf.reset_default_graph()

# Text-CNN Parameter
embedding_size = 2 # n-gram
sequence_length = 3
num_classes = 2 # 0 or 1
filter_sizes = [2,2,2] # n-gram window
num_filters = 3

# 3 words sentences (=sequence_length is 3)
sentences = ["i love you","he loves me", "she likes baseball", "i hate you","sorry for that", "this is awful"]
labels = [1,1,1,0,0,0] # 1 is good, 0 is not good.

word_list = " ".join(sentences).split()
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}
vocab_size = len(word_dict)

inputs = []
for sen in sentences:
    inputs.append(np.asarray([word_dict[n] for n in sen.split()]))

outputs = []
for out in labels:
    outputs.append(np.eye(num_classes)[out]) # ONE-HOT : To using Tensor Softmax Loss function

# Model
X = tf.placeholder(tf.int32, [None, sequence_length])
Y = tf.placeholder(tf.int32, [None, num_classes])

W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
embedded_chars = tf.nn.embedding_lookup(W, X) # [batch_size, sequence_length, embedding_size]
embedded_chars = tf.expand_dims(embedded_chars, -1) # add channel(=1) [batch_size, sequence_length, embedding_size, 1]

pooled_outputs = []
for i, filter_size in enumerate(filter_sizes):
    filter_shape = [filter_size, embedding_size, 1, num_filters]
    W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
    b = tf.Variable(tf.constant(0.1, shape=[num_filters]))

    conv = tf.nn.conv2d(embedded_chars, # [batch_size, sequence_length, embedding_size, 1]
                        W,              # [filter_size(n-gram window), embedding_size, 1, num_filters(=3)]
                        strides=[1, 1, 1, 1],
                        padding='VALID')
    h = tf.nn.relu(tf.nn.bias_add(conv, b))
    pooled = tf.nn.max_pool(h,
                            ksize=[1, sequence_length - filter_size + 1, 1, 1], # [batch_size, filter_height, filter_width, channel]
                            strides=[1, 1, 1, 1],
                            padding='VALID')
    pooled_outputs.append(pooled) # dim of pooled : [batch_size(=6), output_height(=1), output_width(=1), channel(=1)]

num_filters_total = num_filters * len(filter_sizes)
h_pool = tf.concat(pooled_outputs, num_filters) # h_pool : [batch_size(=6), output_height(=1), output_width(=1), channel(=1) * 3]
h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total]) # [batch_size, ]

# Model-Training
Weight = tf.get_variable('W', shape=[num_filters_total, num_classes], 
                    initializer=tf.contrib.layers.xavier_initializer())
Bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
model = tf.nn.xw_plus_b(h_pool_flat, Weight, Bias)  
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

# Model-Predict
hypothesis = tf.nn.softmax(model)
predictions = tf.argmax(hypothesis, 1)
# Training
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

for epoch in range(5000):
    _, loss = sess.run([optimizer, cost], feed_dict={X: inputs, Y: outputs})
    if (epoch + 1)%1000 == 0:
        print('Epoch:', '%06d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

# Test
test_text = 'sorry hate you'
tests = []
tests.append(np.asarray([word_dict[n] for n in test_text.split()]))

predict = sess.run([predictions], feed_dict={X: tests})
result = predict[0][0]
if result == 0:
    print(test_text,"is Bad Mean...")
else:
    print(test_text,"is Good Mean!!")

================================================
FILE: archive/tensorflow/v1/3-1.TextRNN/TextRNN.py
================================================
'''
  code by Tae Hwan Jung(Jeff Jung) @graykode
'''
import tensorflow as tf
import numpy as np

tf.reset_default_graph()

sentences = [ "i like dog", "i love coffee", "i hate milk"]

word_list = " ".join(sentences).split()
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}
number_dict = {i: w for i, w in enumerate(word_list)}
n_class = len(word_dict)

# TextRNN Parameter
n_step = 2 # number of cells(= number of Step)
n_hidden = 5 # number of hidden units in one cell

def make_batch(sentences):
    input_batch = []
    target_batch = []
    
    for sen in sentences:
        word = sen.split()
        input = [word_dict[n] for n in word[:-1]]
        target = word_dict[word[-1]]

        input_batch.append(np.eye(n_class)[input])
        target_batch.append(np.eye(n_class)[target])

    return input_batch, target_batch

# Model
X = tf.placeholder(tf.float32, [None, n_step, n_class]) # [batch_size, n_step, n_class]
Y = tf.placeholder(tf.float32, [None, n_class])         # [batch_size, n_class]

W = tf.Variable(tf.random_normal([n_hidden, n_class]))
b = tf.Variable(tf.random_normal([n_class]))

cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

# outputs : [batch_size, n_step, n_hidden]
outputs = tf.transpose(outputs, [1, 0, 2]) # [n_step, batch_size, n_hidden]
outputs = outputs[-1] # [batch_size, n_hidden]
model = tf.matmul(outputs, W) + b # model : [batch_size, n_class]

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

prediction = tf.cast(tf.argmax(model, 1), tf.int32)

# Training
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

input_batch, target_batch = make_batch(sentences)

for epoch in range(5000):
    _, loss = sess.run([optimizer, cost], feed_dict={X: input_batch, Y: target_batch})
    if (epoch + 1)%1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
        
input = [sen.split()[:2] for sen in sentences]

predict =  sess.run([prediction], feed_dict={X: input_batch})
print([sen.split()[:2] for sen in sentences], '->', [number_dict[n] for n in predict[0]])

================================================
FILE: archive/tensorflow/v1/3-2.TextLSTM/TextLSTM.py
================================================
'''
  code by Tae Hwan Jung(Jeff Jung) @graykode
'''
import tensorflow as tf
import numpy as np

tf.reset_default_graph()

char_arr = [c for c in 'abcdefghijklmnopqrstuvwxyz']
word_dict = {n: i for i, n in enumerate(char_arr)}
number_dict = {i: w for i, w in enumerate(char_arr)}
n_class = len(word_dict) # number of class(=number of vocab)

seq_data = ['make', 'need', 'coal', 'word', 'love', 'hate', 'live', 'home', 'hash', 'star']

# TextLSTM Parameters
n_step = 3
n_hidden = 128

def make_batch(seq_data):
    input_batch, target_batch = [], []

    for seq in seq_data:
        input = [word_dict[n] for n in seq[:-1]] # 'm', 'a' , 'k' is input
        target = word_dict[seq[-1]] # 'e' is target
        input_batch.append(np.eye(n_class)[input])
        target_batch.append(np.eye(n_class)[target])

    return input_batch, target_batch

# Model
X = tf.placeholder(tf.float32, [None, n_step, n_class]) # [batch_size, n_step, n_class]
Y = tf.placeholder(tf.float32, [None, n_class])         # [batch_size, n_class]

W = tf.Variable(tf.random_normal([n_hidden, n_class]))
b = tf.Variable(tf.random_normal([n_class]))

cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

# outputs : [batch_size, n_step, n_hidden]
outputs = tf.transpose(outputs, [1, 0, 2]) # [n_step, batch_size, n_hidden]
outputs = outputs[-1] # [batch_size, n_hidden]
model = tf.matmul(outputs, W) + b # model : [batch_size, n_class]

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

prediction = tf.cast(tf.argmax(model, 1), tf.int32)

# Training
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

input_batch, target_batch = make_batch(seq_data)

for epoch in range(1000):
    _, loss = sess.run([optimizer, cost], feed_dict={X: input_batch, Y: target_batch})
    if (epoch + 1)%100 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

inputs = [sen[:3] for sen in seq_data]

predict =  sess.run([prediction], feed_dict={X: input_batch})
print(inputs, '->', [number_dict[n] for n in predict[0]])

================================================
FILE: archive/tensorflow/v1/3-3.Bi-LSTM/Bi-LSTM.py
================================================
'''
  code by Tae Hwan Jung(Jeff Jung) @graykode
'''
import tensorflow as tf
import numpy as np

tf.reset_default_graph()

sentence = (
    'Lorem ipsum dolor sit amet consectetur adipisicing elit '
    'sed do eiusmod tempor incididunt ut labore et dolore magna '
    'aliqua Ut enim ad minim veniam quis nostrud exercitation'
)

word_dict = {w: i for i, w in enumerate(list(set(sentence.split())))}
number_dict = {i: w for i, w in enumerate(list(set(sentence.split())))}
n_class = len(word_dict)
n_step = len(sentence.split())
n_hidden = 5

def make_batch(sentence):
    input_batch = []
    target_batch = []

    words = sentence.split()
    for i, word in enumerate(words[:-1]):
        input = [word_dict[n] for n in words[:(i + 1)]]
        input = input + [0] * (n_step - len(input))
        target = word_dict[words[i + 1]]
        input_batch.append(np.eye(n_class)[input])
        target_batch.append(np.eye(n_class)[target])

    return input_batch, target_batch

# Bi-LSTM Model
X = tf.placeholder(tf.float32, [None, n_step, n_class])
Y = tf.placeholder(tf.float32, [None, n_class])

W = tf.Variable(tf.random_normal([n_hidden * 2, n_class]))
b = tf.Variable(tf.random_normal([n_class]))

lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(n_hidden)
lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(n_hidden)

# outputs : [batch_size, len_seq, n_hidden], states : [batch_size, n_hidden]
outputs, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,lstm_bw_cell, X, dtype=tf.float32)

outputs = tf.concat([outputs[0], outputs[1]], 2) # output[0] : lstm_fw, output[1] : lstm_bw
outputs = tf.transpose(outputs, [1, 0, 2]) # [n_step, batch_size, n_hidden]
outputs = outputs[-1] # [batch_size, n_hidden]

model = tf.matmul(outputs, W) + b

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

prediction = tf.cast(tf.argmax(model, 1), tf.int32)

# Training
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

input_batch, target_batch = make_batch(sentence)

for epoch in range(10000):
    _, loss = sess.run([optimizer, cost], feed_dict={X: input_batch, Y: target_batch})
    if (epoch + 1)%1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

predict =  sess.run([prediction], feed_dict={X: input_batch})
print(sentence)
print([number_dict[n] for n in [pre for pre in predict[0]]])

================================================
FILE: archive/tensorflow/v1/4-1.Seq2Seq/Seq2Seq.py
================================================
'''
  code by Tae Hwan Jung(Jeff Jung) @graykode
  reference : https://github.com/golbin/TensorFlow-Tutorials/blob/master/10%20-%20RNN/03%20-%20Seq2Seq.py
'''
import tensorflow as tf
import numpy as np

tf.reset_default_graph()
# S: Symbol that shows starting of decoding input
# E: Symbol that shows starting of decoding output
# P: Symbol that will fill in blank sequence if current batch data size is short than time steps

char_arr = [c for c in 'SEPabcdefghijklmnopqrstuvwxyz']
num_dic = {n: i for i, n in enumerate(char_arr)}

seq_data = [['man', 'women'], ['black', 'white'], ['king', 'queen'], ['girl', 'boy'], ['up', 'down'], ['high', 'low']]

# Seq2Seq Parameter
n_step = 5
n_hidden = 128
n_class = len(num_dic) # number of class(=number of vocab)

def make_batch(seq_data):
    input_batch, output_batch, target_batch = [], [], []

    for seq in seq_data:
        for i in range(2):
            seq[i] = seq[i] + 'P' * (n_step - len(seq[i]))

        input = [num_dic[n] for n in seq[0]]
        output = [num_dic[n] for n in ('S' + seq[1])]
        target = [num_dic[n] for n in (seq[1] + 'E')]

        input_batch.append(np.eye(n_class)[input])
        output_batch.append(np.eye(n_class)[output])

        target_batch.append(target)

    return input_batch, output_batch, target_batch

# Model
enc_input = tf.placeholder(tf.float32, [None, None, n_class]) # [batch_size, max_len(=encoder_step), n_class]
dec_input = tf.placeholder(tf.float32, [None, None, n_class]) # [batch_size, max_len+1(=decoder_step) (becase of 'S' or 'E'), n_class]
targets = tf.placeholder(tf.int64, [None, None]) # [batch_size, max_len+1], not one-hot

with tf.variable_scope('encode'):
    enc_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
    enc_cell = tf.nn.rnn_cell.DropoutWrapper(enc_cell, output_keep_prob=0.5)
    _, enc_states = tf.nn.dynamic_rnn(enc_cell, enc_input, dtype=tf.float32)
    # encoder state will go to decoder initial_state, enc_states : [batch_size, n_hidden(=128)]

with tf.variable_scope('decode'):
    dec_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
    dec_cell = tf.nn.rnn_cell.DropoutWrapper(dec_cell, output_keep_prob=0.5)
    outputs, _ = tf.nn.dynamic_rnn(dec_cell, dec_input, initial_state=enc_states, dtype=tf.float32)
    # outputs : [batch_size, max_len+1, n_hidden(=128)]

model = tf.layers.dense(outputs, n_class, activation=None) # model : [batch_size, max_len+1, n_class]

cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model, labels=targets))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

# Training
sess = tf.Session()
sess.run(tf.global_variables_initializer())
input_batch, output_batch, target_batch = make_batch(seq_data)

for epoch in range(5000):
    _, loss = sess.run([optimizer, cost], feed_dict={enc_input: input_batch, dec_input: output_batch, targets: target_batch})
    if (epoch + 1)%1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

# Test
def translate(word):
    seq_data = [word, 'P' * len(word)]

    input_batch, output_batch, _ = make_batch([seq_data])
    prediction = tf.argmax(model, 2)

    result = sess.run(prediction, feed_dict={enc_input: input_batch, dec_input: output_batch})

    decoded = [char_arr[i] for i in result[0]]
    end = decoded.index('E')
    translated = ''.join(decoded[:end])

    return translated.replace('P','')

print('test')
print('man ->', translate('man'))
print('mans ->', translate('mans'))
print('king ->', translate('king'))
print('black ->', translate('black'))
print('upp ->', translate('upp'))

================================================
FILE: archive/tensorflow/v1/4-2.Seq2Seq(Attention)/Seq2Seq(Attention).py
================================================
# code by Tae Hwan Jung(Jeff Jung) @graykode
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

tf.reset_default_graph()
# S: Symbol that shows starting of decoding input
# E: Symbol that shows starting of decoding output
# P: Symbol that will fill in blank sequence if current batch data size is short than time steps
sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']

word_list = " ".join(sentences).split()
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}
number_dict = {i: w for i, w in enumerate(word_list)}
n_class = len(word_dict)  # vocab list

# Parameter
n_step = 5  # maxium number of words in one sentence(=number of time steps)
n_hidden = 128

def make_batch(sentences):
    input_batch = [np.eye(n_class)[[word_dict[n] for n in sentences[0].split()]]]
    output_batch = [np.eye(n_class)[[word_dict[n] for n in sentences[1].split()]]]
    target_batch = [[word_dict[n] for n in sentences[2].split()]]
    return input_batch, output_batch, target_batch

# Model
enc_inputs = tf.placeholder(tf.float32, [None, None, n_class])  # [batch_size, n_step, n_class]
dec_inputs = tf.placeholder(tf.float32, [None, None, n_class])  # [batch_size, n_step, n_class]
targets = tf.placeholder(tf.int64, [1, n_step])  # [batch_size, n_step], not one-hot

# Linear for attention
attn = tf.Variable(tf.random_normal([n_hidden, n_hidden]))
out = tf.Variable(tf.random_normal([n_hidden * 2, n_class]))

def get_att_score(dec_output, enc_output):  # enc_output [n_step, n_hidden]
    score = tf.squeeze(tf.matmul(enc_output, attn), 0)  # score : [n_hidden]
    dec_output = tf.squeeze(dec_output, [0, 1])  # dec_output : [n_hidden]
    return tf.tensordot(dec_output, score, 1)  # inner product make scalar value

def get_att_weight(dec_output, enc_outputs):
    attn_scores = []  # list of attention scalar : [n_step]
    enc_outputs = tf.transpose(enc_outputs, [1, 0, 2])  # enc_outputs : [n_step, batch_size, n_hidden]
    for i in range(n_step):
        attn_scores.append(get_att_score(dec_output, enc_outputs[i]))

    # Normalize scores to weights in range 0 to 1
    return tf.reshape(tf.nn.softmax(attn_scores), [1, 1, -1])  # [1, 1, n_step]

model = []
Attention = []
with tf.variable_scope('encode'):
    enc_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
    enc_cell = tf.nn.rnn_cell.DropoutWrapper(enc_cell, output_keep_prob=0.5)
    # enc_outputs : [batch_size(=1), n_step(=decoder_step), n_hidden(=128)]
    # enc_hidden : [batch_size(=1), n_hidden(=128)]
    enc_outputs, enc_hidden = tf.nn.dynamic_rnn(enc_cell, enc_inputs, dtype=tf.float32)

with tf.variable_scope('decode'):
    dec_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
    dec_cell = tf.nn.rnn_cell.DropoutWrapper(dec_cell, output_keep_prob=0.5)

    inputs = tf.transpose(dec_inputs, [1, 0, 2])
    hidden = enc_hidden
    for i in range(n_step):
        # time_major True mean inputs shape: [max_time, batch_size, ...]
        dec_output, hidden = tf.nn.dynamic_rnn(dec_cell, tf.expand_dims(inputs[i], 1),
                                               initial_state=hidden, dtype=tf.float32, time_major=True)
        attn_weights = get_att_weight(dec_output, enc_outputs)  # attn_weights : [1, 1, n_step]
        Attention.append(tf.squeeze(attn_weights))

        # matrix-matrix product of matrices [1, 1, n_step] x [1, n_step, n_hidden] = [1, 1, n_hidden]
        context = tf.matmul(attn_weights, enc_outputs)
        dec_output = tf.squeeze(dec_output, 0)  # [1, n_step]
        context = tf.squeeze(context, 1)  # [1, n_hidden]

        model.append(tf.matmul(tf.concat((dec_output, context), 1), out))  # [n_step, batch_size(=1), n_class]

trained_attn = tf.stack([Attention[0], Attention[1], Attention[2], Attention[3], Attention[4]], 0)  # to show attention matrix
model = tf.transpose(model, [1, 0, 2])  # model : [n_step, n_class]
prediction = tf.argmax(model, 2)
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model, labels=targets))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

# Training and Test
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    for epoch in range(2000):
        input_batch, output_batch, target_batch = make_batch(sentences)
        _, loss, attention = sess.run([optimizer, cost, trained_attn],
                                      feed_dict={enc_inputs: input_batch, dec_inputs: output_batch, targets: target_batch})

        if (epoch + 1) % 400 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    predict_batch = [np.eye(n_class)[[word_dict[n] for n in 'P P P P P'.split()]]]
    result = sess.run(prediction, feed_dict={enc_inputs: input_batch, dec_inputs: predict_batch})
    print(sentences[0].split(), '->', [number_dict[n] for n in result[0]])

    # Show Attention
    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    ax.set_xticklabels([''] + sentences[0].split(), fontdict={'fontsize': 14})
    ax.set_yticklabels([''] + sentences[2].split(), fontdict={'fontsize': 14})
    plt.show()

================================================
FILE: archive/tensorflow/v1/4-3.Bi-LSTM(Attention)/Bi-LSTM(Attention).py
================================================
'''
  code by Tae Hwan Jung(Jeff Jung) @graykode
  Reference : https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM_Attn.py
'''
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

tf.reset_default_graph()

# Bi-LSTM(Attention) Parameters
embedding_dim = 2
n_hidden = 5 # number of hidden units in one cell
n_step = 3 # all sentence is consist of 3 words
n_class = 2  # 0 or 1

# 3 words sentences (=sequence_length is 3)
sentences = ["i love you", "he loves me", "she likes baseball", "i hate you", "sorry for that", "this is awful"]
labels = [1, 1, 1, 0, 0, 0]  # 1 is good, 0 is not good.

word_list = " ".join(sentences).split()
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}
vocab_size = len(word_dict)

input_batch = []
for sen in sentences:
    input_batch.append(np.asarray([word_dict[n] for n in sen.split()]))

target_batch = []
for out in labels:
    target_batch.append(np.eye(n_class)[out]) # ONE-HOT : To using Tensor Softmax Loss function

# LSTM Model
X = tf.placeholder(tf.int32, [None, n_step])
Y = tf.placeholder(tf.int32, [None, n_class])
out = tf.Variable(tf.random_normal([n_hidden * 2, n_class]))

embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_dim]))
input = tf.nn.embedding_lookup(embedding, X) # [batch_size, len_seq, embedding_dim]

lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(n_hidden)
lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(n_hidden)

# output : [batch_size, len_seq, n_hidden], states : [batch_size, n_hidden]
output, final_state = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,lstm_bw_cell, input, dtype=tf.float32)

# Attention
output = tf.concat([output[0], output[1]], 2)                             # output[0] : lstm_fw, output[1] : lstm_bw
final_hidden_state = tf.concat([final_state[1][0], final_state[1][1]], 1) # final_hidden_state : [batch_size, n_hidden * num_directions(=2)]
final_hidden_state = tf.expand_dims(final_hidden_state, 2)                # final_hidden_state : [batch_size, n_hidden * num_directions(=2), 1]

attn_weights = tf.squeeze(tf.matmul(output, final_hidden_state), 2) # attn_weights : [batch_size, n_step]
soft_attn_weights = tf.nn.softmax(attn_weights, 1)
context = tf.matmul(tf.transpose(output, [0, 2, 1]), tf.expand_dims(soft_attn_weights, 2)) # context : [batch_size, n_hidden * num_directions(=2), 1]
context = tf.squeeze(context, 2) # [batch_size, n_hidden * num_directions(=2)]

model = tf.matmul(context, out)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

# Model-Predict
hypothesis = tf.nn.softmax(model)
predictions = tf.argmax(hypothesis, 1)

# Training
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    for epoch in range(5000):
        _, loss, attention = sess.run([optimizer, cost, soft_attn_weights], feed_dict={X: input_batch, Y: target_batch})
        if (epoch + 1)%1000 == 0:
            print('Epoch:', '%06d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    # Test
    test_text = 'sorry hate you'
    tests = [np.asarray([word_dict[n] for n in test_text.split()])]

    predict = sess.run([predictions], feed_dict={X: tests})
    result = predict[0][0]
    if result == 0:
        print(test_text,"is Bad Mean...")
    else:
        print(test_text,"is Good Mean!!")

    fig = plt.figure(figsize=(6, 3)) # [batch_size, n_step]
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    ax.set_xticklabels([''] + ['first_word', 'second_word', 'third_word'], fontdict={'fontsize': 14}, rotation=90)
    ax.set_yticklabels([''] + ['batch_1', 'batch_2', 'batch_3', 'batch_4', 'batch_5', 'batch_6'], fontdict={'fontsize': 14})
    plt.show()