[
  {
    "path": "LICENSE",
    "content": "\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "MANIFEST.in",
    "content": "include LICENSE\n"
  },
  {
    "path": "README.md",
    "content": "### 实现说明\n\n主要实现文章前半部分的工作，PyTorch实现，基于[huggingface](https://github.com/huggingface/pytorch-pretrained-BERT)的工作，PyTorch才是世界上最屌的框架，逃。\n\n### 实现参考\n\n![img1](http://wx2.sinaimg.cn/mw690/aba7d18bgy1g47p0g5ln3j210n0drtas.jpg)\n\n\n### 代码说明\n\n（1）主要修改：[modeling.py](https://github.com/zhpmatrix/BERTem/blob/master/pytorch_pretrained_bert/modeling.py)\n\noutput representation: **BertForSequenceClassification**\n\ninput representation:  **BertEmbeddings**\n\ninput和output都实现了多种策略，可以结合具体的任务，找到最佳的组合。\n\n\n（2）非主要实现：examples下的关于classification的文件\n\n（3）服务部署：基于Flask，可以在本地开启一个服务。具体实现在[tacred\\_run\\_infer.py](https://github.com/zhpmatrix/BERTem/blob/master/examples/tacred_run_infer.py)中。\n\n（4）代码仅供参考，不提供数据集，不提供预训练模型，不提供训练后的模型（希望理解吧）。\n\n（5）相关工作可以参考[我的博客-神经关系抽取](https://zhpmatrix.github.io/2019/06/30/neural-relation-extraction/)，可能比这个代码更有价值一些吧。\n\n\n### 实现结果：\n\n 数据集TACRED上的结果：\n\n|模型序号|输入类型|输出类型|指标类型|P|R|F1|备注|\n|------|------|------|------|------|------|------|------|\n|0|entity marker|sum(entity start)|micro|**0.68**|**0.63**|**0.65**|**base-model**,lr=3e-5,epoch=3|\n||||macro|**0.60**|**0.54**|**0.55**|\n|1|entity marker|sum(entity start)|micro|**0.70**|**0.62**|**0.65**|**large-model**,lr=3e-5,epoch=1|\n||||macro|**0.63**|**0.52**|**0.55**|\n|-1|None|None|micro|**0.69**|**0.66**|**0.67**|手误之后，再也找不到了，尴尬|||\n||||macro|**0.58**|**0.50**|**0.53**||||\n\n\n数据集SemEval2010 Task 8上的结果：\n\n|模型序号|输入类型|输出类型|指标类型|P|R|F1|备注|\n|------|------|------|------|------|------|------|------|\n|0|entity marker|maxpool(entity emb)+relu|micro|**0.86**|**0.86**|**0.86**|bert-large|\n||||macro|**0.82**|**0.83**|**0.82**||||\n\n\n### 混合精度加速结果\n\n在具体任务上，延续之前的setting，将train和dev合并共同作为新的train集，test集不变。在fp32\n和fp16的两种setting下，比较相同batch\\_size下，一个epoch的用时或者每个迭代的用时。\n\n|比较方面|fp32|fp16|备注|\n|------|------|------|------|\n|训练阶段|1.04it/s|4.41it/s|12.76it/s（独占显卡）|\n|推断阶段|4.14it/s|8.63it/s||\n|测试集指标|0.65/0.55|0.64/0.53|格式：micro/macro|\n|模型大小|421M|212M||\n"
  },
  {
    "path": "docker/Dockerfile",
    "content": "FROM pytorch/pytorch:latest\n\nRUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext\n\nRUN pip install pytorch-pretrained-bert\n\nWORKDIR /workspace"
  },
  {
    "path": "examples/bertology.py",
    "content": "#!/usr/bin/env python3\nimport os\nimport argparse\nimport logging\nfrom datetime import timedelta, datetime\nfrom tqdm import tqdm\n\nimport numpy as np\n\nimport torch\nfrom torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subset\nfrom torch.utils.data.distributed import DistributedSampler\nfrom torch.nn import CrossEntropyLoss, MSELoss\n\nfrom pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer\n\nfrom run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics\n\n\nlogger = logging.getLogger(__name__)\n\n\ndef entropy(p):\n    plogp = p * torch.log(p)\n    plogp[p == 0] = 0\n    return -plogp.sum(dim=-1)\n\n\ndef print_1d_tensor(tensor, prefix=\"\"):\n    if tensor.dtype != torch.long:\n        logger.info(prefix + \"\\t\".join(f\"{x:.5f}\" for x in tensor.cpu().data))\n    else:\n        logger.info(prefix + \"\\t\".join(f\"{x:d}\" for x in tensor.cpu().data))\n\n\ndef print_2d_tensor(tensor):\n    logger.info(\"lv, h >\\t\" + \"\\t\".join(f\"{x + 1}\" for x in range(len(tensor))))\n    for row in range(len(tensor)):\n        print_1d_tensor(tensor[row], prefix=f\"layer {row + 1}:\\t\")\n\n\ndef compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None):\n    \"\"\" Example on how to use model outputs to compute:\n        - head attention entropy (activated by setting output_attentions=True when we created the model\n        - head importance scores according to http://arxiv.org/abs/1905.10650\n            (activated by setting keep_multihead_output=True when we created the model)\n    \"\"\"\n    # Prepare our tensors\n    n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads\n    head_importance = torch.zeros(n_layers, n_heads).to(args.device)\n    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)\n    preds = None\n    labels = None\n    
tot_tokens = 0.0\n\n    for step, batch in enumerate(tqdm(eval_dataloader, desc=\"Iteration\", disable=args.local_rank not in [-1, 0])):\n        batch = tuple(t.to(args.device) for t in batch)\n        input_ids, input_mask, segment_ids, label_ids = batch\n\n        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)\n        all_attentions, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, head_mask=head_mask)\n\n        if compute_entropy:\n            # Update head attention entropy\n            for layer, attn in enumerate(all_attentions):\n                masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)\n                attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()\n\n        if compute_importance:\n            # Update head importance scores with regards to our loss\n            # First, backpropagate to populate the gradients\n            if args.output_mode == \"classification\":\n                loss_fct = CrossEntropyLoss()\n                loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1))\n            elif args.output_mode == \"regression\":\n                loss_fct = MSELoss()\n                loss = loss_fct(logits.view(-1), label_ids.view(-1))\n            loss.backward()\n            # Second, compute importance scores according to http://arxiv.org/abs/1905.10650\n            multihead_outputs = model.bert.get_multihead_outputs()\n            for layer, mh_layer_output in enumerate(multihead_outputs):\n                dot = torch.einsum(\"bhli,bhli->bhl\", [mh_layer_output.grad, mh_layer_output])\n                head_importance[layer] += dot.abs().sum(-1).sum(0).detach()\n\n        # Also store our logits/labels if we want to compute metrics afterwards\n        if preds is None:\n            preds = logits.detach().cpu().numpy()\n            labels = label_ids.detach().cpu().numpy()\n        else:\n        
    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)\n            labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0)\n\n        tot_tokens += input_mask.float().detach().sum().data\n\n    # Normalize\n    attn_entropy /= tot_tokens\n    head_importance /= tot_tokens\n    # Layerwise importance normalization\n    if not args.dont_normalize_importance_by_layer:\n        exponent = 2\n        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent)\n        head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20\n\n    if not args.dont_normalize_global_importance:\n        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())\n\n    return attn_entropy, head_importance, preds, labels\n\n\ndef run_model():\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--model_name_or_path', type=str, default='bert-base-cased-finetuned-mrpc', help='pretrained model name or path to local checkpoint')\n    parser.add_argument(\"--task_name\", type=str, default='mrpc', help=\"The name of the task to train.\")\n    parser.add_argument(\"--data_dir\", type=str, required=True, help=\"The input data dir. 
Should contain the .tsv files (or other data files) for the task.\")\n    parser.add_argument(\"--output_dir\", type=str, required=True, help=\"The output directory where the model predictions and checkpoints will be written.\")\n    parser.add_argument(\"--data_subset\", type=int, default=-1, help=\"If > 0: limit the data to a subset of data_subset instances.\")\n    parser.add_argument(\"--overwrite_output_dir\", action='store_true', help=\"Whether to overwrite data in output directory\")\n\n    parser.add_argument(\"--dont_normalize_importance_by_layer\", action='store_true', help=\"Don't normalize importance score by layers\")\n    parser.add_argument(\"--dont_normalize_global_importance\", action='store_true', help=\"Don't normalize all importance scores between 0 and 1\")\n\n    parser.add_argument(\"--try_masking\", action='store_true', help=\"Whether to try to mask head until a threshold of accuracy.\")\n    parser.add_argument(\"--masking_threshold\", default=0.9, type=float, help=\"masking threshold in term of metrics\"\n                                                                             \"(stop masking when metric < threshold * original metric value).\")\n    parser.add_argument(\"--masking_amount\", default=0.1, type=float, help=\"Amount to heads to masking at each masking step.\")\n    parser.add_argument(\"--metric_name\", default=\"acc\", type=str, help=\"Metric to use for head masking.\")\n\n    parser.add_argument(\"--max_seq_length\", default=128, type=int, help=\"The maximum total input sequence length after WordPiece tokenization. 
\\n\"\n                             \"Sequences longer than this will be truncated, and sequences shorter \\n\"\n                             \"than this will be padded.\")\n    parser.add_argument(\"--batch_size\", default=1, type=int, help=\"Batch size.\")\n\n    parser.add_argument(\"--seed\", type=int, default=42)\n    parser.add_argument(\"--local_rank\", type=int, default=-1, help=\"local_rank for distributed training on gpus\")\n    parser.add_argument(\"--no_cuda\", action='store_true', help=\"Whether not to use CUDA when available\")\n    parser.add_argument('--server_ip', type=str, default='', help=\"Can be used for distant debugging.\")\n    parser.add_argument('--server_port', type=str, default='', help=\"Can be used for distant debugging.\")\n    args = parser.parse_args()\n\n    if args.server_ip and args.server_port:\n        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script\n        import ptvsd\n        print(\"Waiting for debugger attach\")\n        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)\n        ptvsd.wait_for_attach()\n\n    # Setup devices and distributed training\n    if args.local_rank == -1 or args.no_cuda:\n        args.device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n        n_gpu = torch.cuda.device_count()\n    else:\n        torch.cuda.set_device(args.local_rank)\n        args.device = torch.device(\"cuda\", args.local_rank)\n        n_gpu = 1\n        torch.distributed.init_process_group(backend='nccl')  # Initializes the distributed backend\n\n    # Setup logging\n    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)\n    logger.info(\"device: {} n_gpu: {}, distributed: {}\".format(args.device, n_gpu, bool(args.local_rank != -1)))\n\n    # Set seeds\n    np.random.seed(args.seed)\n    torch.random.manual_seed(args.seed)\n    if n_gpu > 0:\n        
torch.cuda.manual_seed(args.seed)\n\n    # Prepare GLUE task\n    task_name = args.task_name.lower()\n    processor = processors[task_name]()\n    label_list = processor.get_labels()\n    args.output_mode = output_modes[task_name]\n    args.num_labels = len(label_list)\n\n    # Prepare output directory\n    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir:\n        raise ValueError(\"Output directory ({}) already exists and is not empty.\".format(args.output_dir))\n    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:\n        os.makedirs(args.output_dir)\n\n    # Load model & tokenizer\n    if args.local_rank not in [-1, 0]:\n        torch.distributed.barrier()  # Make sure only one distributed process download model & vocab\n    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)\n\n    # Load a model with all BERTology options on:\n    #   output_attentions => will output attention weights\n    #   keep_multihead_output => will store gradient of attention head outputs for head importance computation\n    #       see: http://arxiv.org/abs/1905.10650\n    model = BertForSequenceClassification.from_pretrained(args.model_name_or_path,\n                                                          num_labels=args.num_labels,\n                                                          output_attentions=True,\n                                                          keep_multihead_output=True)\n    if args.local_rank == 0:\n        torch.distributed.barrier()  # Make sure only one distributed process download model & vocab\n    model.to(args.device)\n    if args.local_rank != -1:\n        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)\n    model.eval()\n\n    # Prepare dataset for the GLUE task\n    eval_examples = processor.get_dev_examples(args.data_dir)\n    cached_eval_features_file = 
os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(\n        list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length), str(task_name)))\n    try:\n        eval_features = torch.load(cached_eval_features_file)\n    except:\n        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, args.output_mode)\n        if args.local_rank in [-1, 0]:\n            logger.info(\"Saving eval features to cache file %s\", cached_eval_features_file)\n            torch.save(eval_features, cached_eval_features_file)\n\n    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\n    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\n    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\n    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long if args.output_mode == \"classification\" else torch.float)\n    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)\n\n    if args.data_subset > 0:\n        eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data)))))\n\n    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)\n    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)\n\n    # Print/save training arguments\n    print(args)\n    torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))\n\n    # Compute head entropy and importance score\n    attn_entropy, head_importance, _, _ = compute_heads_importance(args, model, eval_dataloader)\n\n    # Print/save matrices\n    np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy())\n    np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy())\n\n    logger.info(\"Attention entropies\")\n    
print_2d_tensor(attn_entropy)\n    logger.info(\"Head importance scores\")\n    print_2d_tensor(head_importance)\n    logger.info(\"Head ranked by importance scores\")\n    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)\n    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device)\n    head_ranks = head_ranks.view_as(head_importance)\n    print_2d_tensor(head_ranks)\n\n    # Do masking if we want to\n    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:\n        _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)\n        preds = np.argmax(preds, axis=1) if args.output_mode == \"classification\" else np.squeeze(preds)\n        original_score = compute_metrics(task_name, preds, labels)[args.metric_name]\n        logger.info(\"Pruning: original score: %f, threshold: %f\", original_score, original_score * args.masking_threshold)\n\n        new_head_mask = torch.ones_like(head_importance)\n        num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))\n\n        current_score = original_score\n        while current_score >= original_score * args.masking_threshold:\n            head_mask = new_head_mask.clone() # save current head mask\n            # heads from least important to most - keep only not-masked heads\n            head_importance[head_mask == 0.0] = float('Inf')\n            current_heads_to_mask = head_importance.view(-1).sort()[1]\n\n            if len(current_heads_to_mask) <= num_to_mask:\n                break\n\n            # mask heads\n            current_heads_to_mask = current_heads_to_mask[:num_to_mask]\n            logger.info(\"Heads to mask: %s\", str(current_heads_to_mask.tolist()))\n            new_head_mask = new_head_mask.view(-1)\n            new_head_mask[current_heads_to_mask] = 0.0\n            new_head_mask = 
new_head_mask.view_as(head_mask)\n            print_2d_tensor(new_head_mask)\n\n            # Compute metric and head importance again\n            _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask)\n            preds = np.argmax(preds, axis=1) if args.output_mode == \"classification\" else np.squeeze(preds)\n            current_score = compute_metrics(task_name, preds, labels)[args.metric_name]\n            logger.info(\"Masking: current score: %f, remaning heads %d (%.1f percents)\", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100)\n\n        logger.info(\"Final head mask\")\n        print_2d_tensor(head_mask)\n        np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy())\n\n        # Try pruning and test time speedup\n        # Pruning is like masking but we actually remove the masked weights\n        before_time = datetime.now()\n        _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,\n                                                       compute_entropy=False, compute_importance=False, head_mask=head_mask)\n        preds = np.argmax(preds, axis=1) if args.output_mode == \"classification\" else np.squeeze(preds)\n        score_masking = compute_metrics(task_name, preds, labels)[args.metric_name]\n        original_time = datetime.now() - before_time\n\n        original_num_params = sum(p.numel() for p in model.parameters())\n        heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask)))\n        assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()\n        model.bert.prune_heads(heads_to_prune)\n        pruned_num_params = sum(p.numel() for p in model.parameters())\n\n        before_time = datetime.now()\n        _, _, preds, labels = compute_heads_importance(args, model, 
eval_dataloader,\n                                                       compute_entropy=False, compute_importance=False, head_mask=None)\n        preds = np.argmax(preds, axis=1) if args.output_mode == \"classification\" else np.squeeze(preds)\n        score_pruning = compute_metrics(task_name, preds, labels)[args.metric_name]\n        new_time = datetime.now() - before_time\n\n        logger.info(\"Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)\", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100)\n        logger.info(\"Pruning: score with masking: %f score with pruning: %f\", score_masking, score_pruning)\n        logger.info(\"Pruning: speed ratio (new timing / original timing): %f percents\", original_time/new_time * 100)\n\nif __name__ == '__main__':\n    run_model()\n"
  },
  {
    "path": "examples/extract_features.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Extract pre-computed feature vectors from a PyTorch BERT model.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport argparse\nimport collections\nimport logging\nimport json\nimport re\n\nimport torch\nfrom torch.utils.data import TensorDataset, DataLoader, SequentialSampler\nfrom torch.utils.data.distributed import DistributedSampler\n\nfrom pytorch_pretrained_bert.tokenization import BertTokenizer\nfrom pytorch_pretrained_bert.modeling import BertModel\n\nlogging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', \n                    datefmt = '%m/%d/%Y %H:%M:%S',\n                    level = logging.INFO)\nlogger = logging.getLogger(__name__)\n\n\nclass InputExample(object):\n\n    def __init__(self, unique_id, text_a, text_b):\n        self.unique_id = unique_id\n        self.text_a = text_a\n        self.text_b = text_b\n\n\nclass InputFeatures(object):\n    \"\"\"A single set of features of data.\"\"\"\n\n    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):\n        self.unique_id = unique_id\n        self.tokens = tokens\n        self.input_ids = input_ids\n        self.input_mask = input_mask\n        self.input_type_ids = input_type_ids\n\n\ndef 
convert_examples_to_features(examples, seq_length, tokenizer):\n    \"\"\"Loads a data file into a list of `InputFeature`s.\"\"\"\n\n    features = []\n    for (ex_index, example) in enumerate(examples):\n        tokens_a = tokenizer.tokenize(example.text_a)\n\n        tokens_b = None\n        if example.text_b:\n            tokens_b = tokenizer.tokenize(example.text_b)\n\n        if tokens_b:\n            # Modifies `tokens_a` and `tokens_b` in place so that the total\n            # length is less than the specified length.\n            # Account for [CLS], [SEP], [SEP] with \"- 3\"\n            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)\n        else:\n            # Account for [CLS] and [SEP] with \"- 2\"\n            if len(tokens_a) > seq_length - 2:\n                tokens_a = tokens_a[0:(seq_length - 2)]\n\n        # The convention in BERT is:\n        # (a) For sequence pairs:\n        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n        #  type_ids:   0   0  0    0    0     0      0   0    1  1  1   1  1   1\n        # (b) For single sequences:\n        #  tokens:   [CLS] the dog is hairy . [SEP]\n        #  type_ids:   0   0   0   0  0     0   0\n        #\n        # Where \"type_ids\" are used to indicate whether this is the first\n        # sequence or the second sequence. The embedding vectors for `type=0` and\n        # `type=1` were learned during pre-training and are added to the wordpiece\n        # embedding vector (and position vector). This is not *strictly* necessary\n        # since the [SEP] token unambigiously separates the sequences, but it makes\n        # it easier for the model to learn the concept of sequences.\n        #\n        # For classification tasks, the first vector (corresponding to [CLS]) is\n        # used as as the \"sentence vector\". 
Note that this only makes sense because\n        # the entire model is fine-tuned.\n        tokens = []\n        input_type_ids = []\n        tokens.append(\"[CLS]\")\n        input_type_ids.append(0)\n        for token in tokens_a:\n            tokens.append(token)\n            input_type_ids.append(0)\n        tokens.append(\"[SEP]\")\n        input_type_ids.append(0)\n\n        if tokens_b:\n            for token in tokens_b:\n                tokens.append(token)\n                input_type_ids.append(1)\n            tokens.append(\"[SEP]\")\n            input_type_ids.append(1)\n\n        input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n        # The mask has 1 for real tokens and 0 for padding tokens. Only real\n        # tokens are attended to.\n        input_mask = [1] * len(input_ids)\n\n        # Zero-pad up to the sequence length.\n        while len(input_ids) < seq_length:\n            input_ids.append(0)\n            input_mask.append(0)\n            input_type_ids.append(0)\n\n        assert len(input_ids) == seq_length\n        assert len(input_mask) == seq_length\n        assert len(input_type_ids) == seq_length\n\n        if ex_index < 5:\n            logger.info(\"*** Example ***\")\n            logger.info(\"unique_id: %s\" % (example.unique_id))\n            logger.info(\"tokens: %s\" % \" \".join([str(x) for x in tokens]))\n            logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n            logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n            logger.info(\n                \"input_type_ids: %s\" % \" \".join([str(x) for x in input_type_ids]))\n\n        features.append(\n            InputFeatures(\n                unique_id=example.unique_id,\n                tokens=tokens,\n                input_ids=input_ids,\n                input_mask=input_mask,\n                input_type_ids=input_type_ids))\n    return features\n\n\ndef _truncate_seq_pair(tokens_a, tokens_b, 
max_length):\n    \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n\n    # This is a simple heuristic which will always truncate the longer sequence\n    # one token at a time. This makes more sense than truncating an equal percent\n    # of tokens from each, since if one sequence is very short then each token\n    # that's truncated likely contains more information than a longer sequence.\n    while True:\n        total_length = len(tokens_a) + len(tokens_b)\n        if total_length <= max_length:\n            break\n        if len(tokens_a) > len(tokens_b):\n            tokens_a.pop()\n        else:\n            tokens_b.pop()\n\n\ndef read_examples(input_file):\n    \"\"\"Read a list of `InputExample`s from an input file.\"\"\"\n    examples = []\n    unique_id = 0\n    with open(input_file, \"r\", encoding='utf-8') as reader:\n        while True:\n            line = reader.readline()\n            if not line:\n                break\n            line = line.strip()\n            text_a = None\n            text_b = None\n            m = re.match(r\"^(.*) \\|\\|\\| (.*)$\", line)\n            if m is None:\n                text_a = line\n            else:\n                text_a = m.group(1)\n                text_b = m.group(2)\n            examples.append(\n                InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))\n            unique_id += 1\n    return examples\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n\n    ## Required parameters\n    parser.add_argument(\"--input_file\", default=None, type=str, required=True)\n    parser.add_argument(\"--output_file\", default=None, type=str, required=True)\n    parser.add_argument(\"--bert_model\", default=None, type=str, required=True,\n                        help=\"Bert pre-trained model selected in the list: bert-base-uncased, \"\n                             \"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.\")\n\n    ## Other 
parameters\n    parser.add_argument(\"--do_lower_case\", action='store_true', help=\"Set this flag if you are using an uncased model.\")\n    parser.add_argument(\"--layers\", default=\"-1,-2,-3,-4\", type=str)\n    parser.add_argument(\"--max_seq_length\", default=128, type=int,\n                        help=\"The maximum total input sequence length after WordPiece tokenization. Sequences longer \"\n                            \"than this will be truncated, and sequences shorter than this will be padded.\")\n    parser.add_argument(\"--batch_size\", default=32, type=int, help=\"Batch size for predictions.\")\n    parser.add_argument(\"--local_rank\",\n                        type=int,\n                        default=-1,\n                        help = \"local_rank for distributed training on gpus\")\n    parser.add_argument(\"--no_cuda\",\n                        action='store_true',\n                        help=\"Whether not to use CUDA when available\")\n\n    args = parser.parse_args()\n\n    if args.local_rank == -1 or args.no_cuda:\n        device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n        n_gpu = torch.cuda.device_count()\n    else:\n        device = torch.device(\"cuda\", args.local_rank)\n        n_gpu = 1\n        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs\n        torch.distributed.init_process_group(backend='nccl')\n    logger.info(\"device: {} n_gpu: {} distributed training: {}\".format(device, n_gpu, bool(args.local_rank != -1)))\n\n    layer_indexes = [int(x) for x in args.layers.split(\",\")]\n\n    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)\n\n    examples = read_examples(args.input_file)\n\n    features = convert_examples_to_features(\n        examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)\n\n    unique_id_to_feature = {}\n    for feature in features:\n        
unique_id_to_feature[feature.unique_id] = feature\n\n    model = BertModel.from_pretrained(args.bert_model)\n    model.to(device)\n\n    if args.local_rank != -1:\n        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],\n                                                          output_device=args.local_rank)\n    elif n_gpu > 1:\n        model = torch.nn.DataParallel(model)\n\n    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n\n    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)\n    if args.local_rank == -1:\n        eval_sampler = SequentialSampler(eval_data)\n    else:\n        eval_sampler = DistributedSampler(eval_data)\n    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)\n\n    model.eval()\n    with open(args.output_file, \"w\", encoding='utf-8') as writer:\n        for input_ids, input_mask, example_indices in eval_dataloader:\n            input_ids = input_ids.to(device)\n            input_mask = input_mask.to(device)\n\n            all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)\n            all_encoder_layers = all_encoder_layers\n\n            for b, example_index in enumerate(example_indices):\n                feature = features[example_index.item()]\n                unique_id = int(feature.unique_id)\n                # feature = unique_id_to_feature[unique_id]\n                output_json = collections.OrderedDict()\n                output_json[\"linex_index\"] = unique_id\n                all_out_features = []\n                for (i, token) in enumerate(feature.tokens):\n                    all_layers = []\n                    for (j, layer_index) in enumerate(layer_indexes):\n                        
layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()\n                        layer_output = layer_output[b]\n                        layers = collections.OrderedDict()\n                        layers[\"index\"] = layer_index\n                        layers[\"values\"] = [\n                            round(x.item(), 6) for x in layer_output[i]\n                        ]\n                        all_layers.append(layers)\n                    out_features = collections.OrderedDict()\n                    out_features[\"token\"] = token\n                    out_features[\"layers\"] = all_layers\n                    all_out_features.append(out_features)\n                output_json[\"features\"] = all_out_features\n                writer.write(json.dumps(output_json) + \"\\n\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/lm_finetuning/README.md",
    "content": "# BERT Model Finetuning using Masked Language Modeling objective\n\n## Introduction\n\nThe three example scripts in this folder can be used to **fine-tune** a pre-trained BERT model using the pretraining objective (combination of masked language modeling and next sentence prediction loss). In general, pretrained models like BERT are first trained with a pretraining objective (masked language modeling and next sentence prediction for BERT) on a large and general natural language corpus. A classifier head is then added on top of the pre-trained architecture and the model is quickly fine-tuned on a target task, while still (hopefully) retaining its general language understanding. This greatly reduces overfitting and yields state-of-the-art results, especially when training data for the target task are limited.\n\nThe [ULMFiT paper](https://arxiv.org/abs/1801.06146) took a slightly different approach, however, and added an intermediate step in which the model is fine-tuned on text **from the same domain as the target task and using the pretraining objective** before the final stage in which the classifier head is added and the model is trained on the target task itself. This paper reported significantly improved results from this step, and found that they could get high-quality classifications even with only tiny numbers (<1000) of labelled training examples, as long as they had a lot of unlabelled data from the target domain.\n\nAlthough this wasn't covered in the original BERT paper, domain-specific fine-tuning of Transformer models has [recently been reported by other authors](https://arxiv.org/pdf/1905.05583.pdf), and they report performance improvements as well.\n\n## Input format\n\nThe scripts in this folder expect a single file as input, consisting of untokenized text, with one **sentence** per line, and one blank line between documents. 
The reason for the sentence splitting is that part of BERT's training involves a _next sentence_ objective in which the model must predict whether two sequences of text are contiguous text from the same document or not, and to avoid making the task _too easy_, the split point between the sequences is always at the end of a sentence. The linebreaks in the file are therefore necessary to mark the points where the text can be split.\n\n## Usage\n\nThere are two ways to fine-tune a language model using these scripts. The first _quick_ approach is to use [`simple_lm_finetuning.py`](./simple_lm_finetuning.py). This script does everything in a single script, but generates training instances that consist of just two sentences. This is quite different from the BERT paper, where (confusingly) the NextSentence task concatenated sentences together from each document to form two long multi-sentences, which the paper just referred to as _sentences_. The difference between this simple approach and the original paper approach can have a significant effect for long sequences since two sentences will be much shorter than the max sequence length. In this case, most of each training example will just consist of blank padding characters, which wastes a lot of computation and results in a model that isn't really training on long sequences.\n\nAs such, the preferred approach (assuming you have documents containing multiple contiguous sentences from your target domain) is to use [`pregenerate_training_data.py`](./pregenerate_training_data.py) to pre-process your data into training examples following the methodology used for LM training in the original BERT paper and repository. Since there is a significant random component to training data generation for BERT, this script includes an option to generate multiple _epochs_ of pre-processed data, to avoid training on the same random splits each epoch. 
Generating an epoch of data for each training epoch should result a better final model, and so we recommend doing so.\n\nYou can then train on the pregenerated data using [`finetune_on_pregenerated.py`](./finetune_on_pregenerated.py), and pointing it to the folder created by [`pregenerate_training_data.py`](./pregenerate_training_data.py). Note that you should use the same `bert_model` and case options for both! Also note that `max_seq_len` does not need to be specified for the [`finetune_on_pregenerated.py`](./finetune_on_pregenerated.py) script, as it is inferred from the training examples.\n\nThere are various options that can be tweaked, but they are mostly set to the values from the BERT paper/repository and default values should make sense. The most relevant ones are:\n\n- `--max_seq_len`: Controls the length of training examples (in wordpiece tokens) seen by the model. Defaults to 128 but can be set as high as 512. Higher values may yield stronger language models at the cost of slower and more memory-intensive training.\n- `--fp16`: Enables fast half-precision training on recent GPUs.\n\nIn addition, if memory usage is an issue, especially when training on a single GPU, reducing `--train_batch_size` from the default 32 to a lower number (4-16) can be helpful, or leaving `--train_batch_size` at the default and increasing `--gradient_accumulation_steps` to 2-8. Changing `--gradient_accumulation_steps` may be preferable as alterations to the batch size may require corresponding changes in the learning rate to compensate. 
There is also a `--reduce_memory` option for both the `pregenerate_training_data.py` and `finetune_on_pregenerated.py` scripts that spills data to disc in shelf objects or numpy memmaps rather than retaining it in memory, which significantly reduces memory usage with little performance impact.\n\n## Examples\n\n### Simple fine-tuning\n\n```\npython3 simple_lm_finetuning.py \n--train_corpus my_corpus.txt \n--bert_model bert-base-uncased \n--do_lower_case \n--output_dir finetuned_lm/\n--do_train\n```\n\n### Pregenerating training data\n\n```\npython3 pregenerate_training_data.py\n--train_corpus my_corpus.txt\n--bert_model bert-base-uncased\n--do_lower_case\n--output_dir training/\n--epochs_to_generate 3\n--max_seq_len 256\n```\n\n### Training on pregenerated data\n\n```\npython3 finetune_on_pregenerated.py\n--pregenerated_data training/\n--bert_model bert-base-uncased\n--do_lower_case\n--output_dir finetuned_lm/\n--epochs 3\n```\n"
  },
  {
    "path": "examples/lm_finetuning/finetune_on_pregenerated.py",
    "content": "from argparse import ArgumentParser\nfrom pathlib import Path\nimport os\nimport torch\nimport logging\nimport json\nimport random\nimport numpy as np\nfrom collections import namedtuple\nfrom tempfile import TemporaryDirectory\n\nfrom torch.utils.data import DataLoader, Dataset, RandomSampler\nfrom torch.utils.data.distributed import DistributedSampler\nfrom tqdm import tqdm\n\nfrom pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME\nfrom pytorch_pretrained_bert.modeling import BertForPreTraining\nfrom pytorch_pretrained_bert.tokenization import BertTokenizer\nfrom pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule\n\nInputFeatures = namedtuple(\"InputFeatures\", \"input_ids input_mask segment_ids lm_label_ids is_next\")\n\nlog_format = '%(asctime)-10s: %(message)s'\nlogging.basicConfig(level=logging.INFO, format=log_format)\n\n\ndef convert_example_to_features(example, tokenizer, max_seq_length):\n    tokens = example[\"tokens\"]\n    segment_ids = example[\"segment_ids\"]\n    is_random_next = example[\"is_random_next\"]\n    masked_lm_positions = example[\"masked_lm_positions\"]\n    masked_lm_labels = example[\"masked_lm_labels\"]\n\n    assert len(tokens) == len(segment_ids) <= max_seq_length  # The preprocessed data should be already truncated\n    input_ids = tokenizer.convert_tokens_to_ids(tokens)\n    masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)\n\n    input_array = np.zeros(max_seq_length, dtype=np.int)\n    input_array[:len(input_ids)] = input_ids\n\n    mask_array = np.zeros(max_seq_length, dtype=np.bool)\n    mask_array[:len(input_ids)] = 1\n\n    segment_array = np.zeros(max_seq_length, dtype=np.bool)\n    segment_array[:len(segment_ids)] = segment_ids\n\n    lm_label_array = np.full(max_seq_length, dtype=np.int, fill_value=-1)\n    lm_label_array[masked_lm_positions] = masked_label_ids\n\n    features = InputFeatures(input_ids=input_array,\n                             
input_mask=mask_array,\n                             segment_ids=segment_array,\n                             lm_label_ids=lm_label_array,\n                             is_next=is_random_next)\n    return features\n\n\nclass PregeneratedDataset(Dataset):\n    def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memory=False):\n        self.vocab = tokenizer.vocab\n        self.tokenizer = tokenizer\n        self.epoch = epoch\n        self.data_epoch = epoch % num_data_epochs\n        data_file = training_path / f\"epoch_{self.data_epoch}.json\"\n        metrics_file = training_path / f\"epoch_{self.data_epoch}_metrics.json\"\n        assert data_file.is_file() and metrics_file.is_file()\n        metrics = json.loads(metrics_file.read_text())\n        num_samples = metrics['num_training_examples']\n        seq_len = metrics['max_seq_len']\n        self.temp_dir = None\n        self.working_dir = None\n        if reduce_memory:\n            self.temp_dir = TemporaryDirectory()\n            self.working_dir = Path(self.temp_dir.name)\n            input_ids = np.memmap(filename=self.working_dir/'input_ids.memmap',\n                                  mode='w+', dtype=np.int32, shape=(num_samples, seq_len))\n            input_masks = np.memmap(filename=self.working_dir/'input_masks.memmap',\n                                    shape=(num_samples, seq_len), mode='w+', dtype=np.bool)\n            segment_ids = np.memmap(filename=self.working_dir/'segment_ids.memmap',\n                                    shape=(num_samples, seq_len), mode='w+', dtype=np.bool)\n            lm_label_ids = np.memmap(filename=self.working_dir/'lm_label_ids.memmap',\n                                     shape=(num_samples, seq_len), mode='w+', dtype=np.int32)\n            lm_label_ids[:] = -1\n            is_nexts = np.memmap(filename=self.working_dir/'is_nexts.memmap',\n                                 shape=(num_samples,), mode='w+', dtype=np.bool)\n        else:\n       
     input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32)\n            input_masks = np.zeros(shape=(num_samples, seq_len), dtype=np.bool)\n            segment_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.bool)\n            lm_label_ids = np.full(shape=(num_samples, seq_len), dtype=np.int32, fill_value=-1)\n            is_nexts = np.zeros(shape=(num_samples,), dtype=np.bool)\n        logging.info(f\"Loading training examples for epoch {epoch}\")\n        with data_file.open() as f:\n            for i, line in enumerate(tqdm(f, total=num_samples, desc=\"Training examples\")):\n                line = line.strip()\n                example = json.loads(line)\n                features = convert_example_to_features(example, tokenizer, seq_len)\n                input_ids[i] = features.input_ids\n                segment_ids[i] = features.segment_ids\n                input_masks[i] = features.input_mask\n                lm_label_ids[i] = features.lm_label_ids\n                is_nexts[i] = features.is_next\n        assert i == num_samples - 1  # Assert that the sample count metric was true\n        logging.info(\"Loading complete!\")\n        self.num_samples = num_samples\n        self.seq_len = seq_len\n        self.input_ids = input_ids\n        self.input_masks = input_masks\n        self.segment_ids = segment_ids\n        self.lm_label_ids = lm_label_ids\n        self.is_nexts = is_nexts\n\n    def __len__(self):\n        return self.num_samples\n\n    def __getitem__(self, item):\n        return (torch.tensor(self.input_ids[item].astype(np.int64)),\n                torch.tensor(self.input_masks[item].astype(np.int64)),\n                torch.tensor(self.segment_ids[item].astype(np.int64)),\n                torch.tensor(self.lm_label_ids[item].astype(np.int64)),\n                torch.tensor(self.is_nexts[item].astype(np.int64)))\n\n\ndef main():\n    parser = ArgumentParser()\n    parser.add_argument('--pregenerated_data', type=Path, 
required=True)\n    parser.add_argument('--output_dir', type=Path, required=True)\n    parser.add_argument(\"--bert_model\", type=str, required=True, help=\"Bert pre-trained model selected in the list: bert-base-uncased, \"\n                             \"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.\")\n    parser.add_argument(\"--do_lower_case\", action=\"store_true\")\n    parser.add_argument(\"--reduce_memory\", action=\"store_true\",\n                        help=\"Store training data as on-disc memmaps to massively reduce memory usage\")\n\n    parser.add_argument(\"--epochs\", type=int, default=3, help=\"Number of epochs to train for\")\n    parser.add_argument(\"--local_rank\",\n                        type=int,\n                        default=-1,\n                        help=\"local_rank for distributed training on gpus\")\n    parser.add_argument(\"--no_cuda\",\n                        action='store_true',\n                        help=\"Whether not to use CUDA when available\")\n    parser.add_argument('--gradient_accumulation_steps',\n                        type=int,\n                        default=1,\n                        help=\"Number of updates steps to accumulate before performing a backward/update pass.\")\n    parser.add_argument(\"--train_batch_size\",\n                        default=32,\n                        type=int,\n                        help=\"Total batch size for training.\")\n    parser.add_argument('--fp16',\n                        action='store_true',\n                        help=\"Whether to use 16-bit float precision instead of 32-bit\")\n    parser.add_argument('--loss_scale',\n                        type=float, default=0,\n                        help=\"Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\\n\"\n                        \"0 (default value): dynamic loss scaling.\\n\"\n                        \"Positive power of 2: static loss scaling value.\\n\")\n    parser.add_argument(\"--warmup_proportion\",\n                        default=0.1,\n                        type=float,\n                        help=\"Proportion of training to perform linear learning rate warmup for. \"\n                             \"E.g., 0.1 = 10%% of training.\")\n    parser.add_argument(\"--learning_rate\",\n                        default=3e-5,\n                        type=float,\n                        help=\"The initial learning rate for Adam.\")\n    parser.add_argument('--seed',\n                        type=int,\n                        default=42,\n                        help=\"random seed for initialization\")\n    args = parser.parse_args()\n\n    assert args.pregenerated_data.is_dir(), \\\n        \"--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!\"\n\n    samples_per_epoch = []\n    for i in range(args.epochs):\n        epoch_file = args.pregenerated_data / f\"epoch_{i}.json\"\n        metrics_file = args.pregenerated_data / f\"epoch_{i}_metrics.json\"\n        if epoch_file.is_file() and metrics_file.is_file():\n            metrics = json.loads(metrics_file.read_text())\n            samples_per_epoch.append(metrics['num_training_examples'])\n        else:\n            if i == 0:\n                exit(\"No training data was found!\")\n            print(f\"Warning! 
There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).\")\n            print(\"This script will loop over the available data, but training diversity may be negatively impacted.\")\n            num_data_epochs = i\n            break\n    else:\n        num_data_epochs = args.epochs\n\n    if args.local_rank == -1 or args.no_cuda:\n        device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n        n_gpu = torch.cuda.device_count()\n    else:\n        torch.cuda.set_device(args.local_rank)\n        device = torch.device(\"cuda\", args.local_rank)\n        n_gpu = 1\n        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs\n        torch.distributed.init_process_group(backend='nccl')\n    logging.info(\"device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}\".format(\n        device, n_gpu, bool(args.local_rank != -1), args.fp16))\n\n    if args.gradient_accumulation_steps < 1:\n        raise ValueError(\"Invalid gradient_accumulation_steps parameter: {}, should be >= 1\".format(\n                            args.gradient_accumulation_steps))\n\n    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps\n\n    random.seed(args.seed)\n    np.random.seed(args.seed)\n    torch.manual_seed(args.seed)\n    if n_gpu > 0:\n        torch.cuda.manual_seed_all(args.seed)\n\n    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):\n        logging.warning(f\"Output directory ({args.output_dir}) already exists and is not empty!\")\n    args.output_dir.mkdir(parents=True, exist_ok=True)\n\n    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)\n\n    total_train_examples = 0\n    for i in range(args.epochs):\n        # The modulo takes into account the fact that we may loop over limited epochs of data\n        total_train_examples += samples_per_epoch[i % 
len(samples_per_epoch)]\n\n    num_train_optimization_steps = int(\n        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)\n    if args.local_rank != -1:\n        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()\n\n    # Prepare model\n    model = BertForPreTraining.from_pretrained(args.bert_model)\n    if args.fp16:\n        model.half()\n    model.to(device)\n    if args.local_rank != -1:\n        try:\n            from apex.parallel import DistributedDataParallel as DDP\n        except ImportError:\n            raise ImportError(\n                \"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n        model = DDP(model)\n    elif n_gpu > 1:\n        model = torch.nn.DataParallel(model)\n\n    # Prepare optimizer\n    param_optimizer = list(model.named_parameters())\n    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n    optimizer_grouped_parameters = [\n        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],\n         'weight_decay': 0.01},\n        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n    ]\n\n    if args.fp16:\n        try:\n            from apex.optimizers import FP16_Optimizer\n            from apex.optimizers import FusedAdam\n        except ImportError:\n            raise ImportError(\n                \"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n\n        optimizer = FusedAdam(optimizer_grouped_parameters,\n                              lr=args.learning_rate,\n                              bias_correction=False,\n                              max_grad_norm=1.0)\n        if args.loss_scale == 0:\n            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)\n        else:\n            optimizer = FP16_Optimizer(optimizer, 
static_loss_scale=args.loss_scale)\n        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,\n                                             t_total=num_train_optimization_steps)\n    else:\n        optimizer = BertAdam(optimizer_grouped_parameters,\n                             lr=args.learning_rate,\n                             warmup=args.warmup_proportion,\n                             t_total=num_train_optimization_steps)\n\n    global_step = 0\n    logging.info(\"***** Running training *****\")\n    logging.info(f\"  Num examples = {total_train_examples}\")\n    logging.info(\"  Batch size = %d\", args.train_batch_size)\n    logging.info(\"  Num steps = %d\", num_train_optimization_steps)\n    model.train()\n    for epoch in range(args.epochs):\n        epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer,\n                                            num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory)\n        if args.local_rank == -1:\n            train_sampler = RandomSampler(epoch_dataset)\n        else:\n            train_sampler = DistributedSampler(epoch_dataset)\n        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size)\n        tr_loss = 0\n        nb_tr_examples, nb_tr_steps = 0, 0\n        with tqdm(total=len(train_dataloader), desc=f\"Epoch {epoch}\") as pbar:\n            for step, batch in enumerate(train_dataloader):\n                batch = tuple(t.to(device) for t in batch)\n                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch\n                loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)\n                if n_gpu > 1:\n                    loss = loss.mean() # mean() to average on multi-gpu.\n                if args.gradient_accumulation_steps > 1:\n                    loss = loss / args.gradient_accumulation_steps\n                if args.fp16:\n       
             optimizer.backward(loss)\n                else:\n                    loss.backward()\n                tr_loss += loss.item()\n                nb_tr_examples += input_ids.size(0)\n                nb_tr_steps += 1\n                pbar.update(1)\n                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps\n                pbar.set_postfix_str(f\"Loss: {mean_loss:.5f}\")\n                if (step + 1) % args.gradient_accumulation_steps == 0:\n                    if args.fp16:\n                        # modify learning rate with special warm up BERT uses\n                        # if args.fp16 is False, BertAdam is used that handles this automatically\n                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)\n                        for param_group in optimizer.param_groups:\n                            param_group['lr'] = lr_this_step\n                    optimizer.step()\n                    optimizer.zero_grad()\n                    global_step += 1\n\n    # Save a trained model\n    logging.info(\"** ** * Saving fine-tuned model ** ** * \")\n    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self\n    \n    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)\n    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)\n\n    torch.save(model_to_save.state_dict(), output_model_file)\n    model_to_save.config.to_json_file(output_config_file)\n    tokenizer.save_vocabulary(args.output_dir)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/lm_finetuning/pregenerate_training_data.py",
    "content": "from argparse import ArgumentParser\nfrom pathlib import Path\nfrom tqdm import tqdm, trange\nfrom tempfile import TemporaryDirectory\nimport shelve\nfrom multiprocessing import Pool\n\nfrom random import random, randrange, randint, shuffle, choice\nfrom pytorch_pretrained_bert.tokenization import BertTokenizer\nimport numpy as np\nimport json\nimport collections\n\nclass DocumentDatabase:\n    def __init__(self, reduce_memory=False):\n        if reduce_memory:\n            self.temp_dir = TemporaryDirectory()\n            self.working_dir = Path(self.temp_dir.name)\n            self.document_shelf_filepath = self.working_dir / 'shelf.db'\n            self.document_shelf = shelve.open(str(self.document_shelf_filepath),\n                                              flag='n', protocol=-1)\n            self.documents = None\n        else:\n            self.documents = []\n            self.document_shelf = None\n            self.document_shelf_filepath = None\n            self.temp_dir = None\n        self.doc_lengths = []\n        self.doc_cumsum = None\n        self.cumsum_max = None\n        self.reduce_memory = reduce_memory\n\n    def add_document(self, document):\n        if not document:\n            return\n        if self.reduce_memory:\n            current_idx = len(self.doc_lengths)\n            self.document_shelf[str(current_idx)] = document\n        else:\n            self.documents.append(document)\n        self.doc_lengths.append(len(document))\n\n    def _precalculate_doc_weights(self):\n        self.doc_cumsum = np.cumsum(self.doc_lengths)\n        self.cumsum_max = self.doc_cumsum[-1]\n\n    def sample_doc(self, current_idx, sentence_weighted=True):\n        # Uses the current iteration counter to ensure we don't sample the same doc twice\n        if sentence_weighted:\n            # With sentence weighting, we sample docs proportionally to their sentence length\n            if self.doc_cumsum is None or len(self.doc_cumsum) != 
len(self.doc_lengths):\n                self._precalculate_doc_weights()\n            rand_start = self.doc_cumsum[current_idx]\n            rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]\n            sentence_index = randrange(rand_start, rand_end) % self.cumsum_max\n            sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')\n        else:\n            # If we don't use sentence weighting, then every doc has an equal chance to be chosen\n            sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths)\n        assert sampled_doc_index != current_idx\n        if self.reduce_memory:\n            return self.document_shelf[str(sampled_doc_index)]\n        else:\n            return self.documents[sampled_doc_index]\n\n    def __len__(self):\n        return len(self.doc_lengths)\n\n    def __getitem__(self, item):\n        if self.reduce_memory:\n            return self.document_shelf[str(item)]\n        else:\n            return self.documents[item]\n\n    def __enter__(self):\n        return self\n\n    def __exit__(self, exc_type, exc_val, traceback):\n        if self.document_shelf is not None:\n            self.document_shelf.close()\n        if self.temp_dir is not None:\n            self.temp_dir.cleanup()\n\n\ndef truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):\n    \"\"\"Truncates a pair of sequences to a maximum sequence length. 
Lifted from Google's BERT repo.\"\"\"\n    while True:\n        total_length = len(tokens_a) + len(tokens_b)\n        if total_length <= max_num_tokens:\n            break\n\n        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b\n        assert len(trunc_tokens) >= 1\n\n        # We want to sometimes truncate from the front and sometimes from the\n        # back to add more randomness and avoid biases.\n        if random() < 0.5:\n            del trunc_tokens[0]\n        else:\n            trunc_tokens.pop()\n\nMaskedLmInstance = collections.namedtuple(\"MaskedLmInstance\",\n                                          [\"index\", \"label\"])\n\ndef create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):\n    \"\"\"Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but\n    with several refactors to clean it up and remove a lot of unnecessary variables.\"\"\"\n    cand_indices = []\n    for (i, token) in enumerate(tokens):\n        if token == \"[CLS]\" or token == \"[SEP]\":\n            continue\n        # Whole Word Masking means that if we mask all of the wordpieces\n        # corresponding to an original word. When a word has been split into\n        # WordPieces, the first token does not have any marker and any subsequence\n        # tokens are prefixed with ##. 
So whenever we see the ## token, we\n        # append it to the previous set of word indexes.\n        #\n        # Note that Whole Word Masking does *not* change the training code\n        # at all -- we still predict each WordPiece independently, softmaxed\n        # over the entire vocabulary.\n        if (whole_word_mask and len(cand_indices) >= 1 and token.startswith(\"##\")):\n            cand_indices[-1].append(i)\n        else:\n            cand_indices.append([i])\n\n    num_to_mask = min(max_predictions_per_seq,\n                      max(1, int(round(len(tokens) * masked_lm_prob))))\n    shuffle(cand_indices)\n    masked_lms = []\n    covered_indexes = set()\n    for index_set in cand_indices:\n        if len(masked_lms) >= num_to_mask:\n            break\n        # If adding a whole-word mask would exceed the maximum number of\n        # predictions, then just skip this candidate.\n        if len(masked_lms) + len(index_set) > num_to_mask:\n            continue\n        is_any_index_covered = False\n        for index in index_set:\n            if index in covered_indexes:\n                is_any_index_covered = True\n                break\n        if is_any_index_covered:\n            continue\n        for index in index_set:\n            covered_indexes.add(index)\n\n            masked_token = None\n            # 80% of the time, replace with [MASK]\n            if random() < 0.8:\n                masked_token = \"[MASK]\"\n            else:\n                # 10% of the time, keep original\n                if random() < 0.5:\n                    masked_token = tokens[index]\n                # 10% of the time, replace with random word\n                else:\n                    masked_token = choice(vocab_list)\n            masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))\n            tokens[index] = masked_token\n\n    assert len(masked_lms) <= num_to_mask\n    masked_lms = sorted(masked_lms, key=lambda x: x.index)\n    
mask_indices = [p.index for p in masked_lms]\n    masked_token_labels = [p.label for p in masked_lms]\n\n    return tokens, mask_indices, masked_token_labels\n\n\ndef create_instances_from_document(\n        doc_database, doc_idx, max_seq_length, short_seq_prob,\n        masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):\n    \"\"\"This code is mostly a duplicate of the equivalent function from Google BERT's repo.\n    However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function.\n    Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence\n    (rather than each document) has an equal chance of being sampled as a false example for the NextSentence task.\"\"\"\n    document = doc_database[doc_idx]\n    # Account for [CLS], [SEP], [SEP]\n    max_num_tokens = max_seq_length - 3\n\n    # We *usually* want to fill up the entire sequence since we are padding\n    # to `max_seq_length` anyways, so short sequences are generally wasted\n    # computation. However, we *sometimes*\n    # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter\n    # sequences to minimize the mismatch between pre-training and fine-tuning.\n    # The `target_seq_length` is just a rough target however, whereas\n    # `max_seq_length` is a hard limit.\n    target_seq_length = max_num_tokens\n    if random() < short_seq_prob:\n        target_seq_length = randint(2, max_num_tokens)\n\n    # We DON'T just concatenate all of the tokens from a document into a long\n    # sequence and choose an arbitrary split point because this would make the\n    # next sentence prediction task too easy. 
Instead, we split the input into\n    # segments \"A\" and \"B\" based on the actual \"sentences\" provided by the user\n    # input.\n    instances = []\n    current_chunk = []\n    current_length = 0\n    i = 0\n    while i < len(document):\n        segment = document[i]\n        current_chunk.append(segment)\n        current_length += len(segment)\n        if i == len(document) - 1 or current_length >= target_seq_length:\n            if current_chunk:\n                # `a_end` is how many segments from `current_chunk` go into the `A`\n                # (first) sentence.\n                a_end = 1\n                if len(current_chunk) >= 2:\n                    a_end = randrange(1, len(current_chunk))\n\n                tokens_a = []\n                for j in range(a_end):\n                    tokens_a.extend(current_chunk[j])\n\n                tokens_b = []\n\n                # Random next\n                if len(current_chunk) == 1 or random() < 0.5:\n                    is_random_next = True\n                    target_b_length = target_seq_length - len(tokens_a)\n\n                    # Sample a random document, with longer docs being sampled more frequently\n                    random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True)\n\n                    random_start = randrange(0, len(random_document))\n                    for j in range(random_start, len(random_document)):\n                        tokens_b.extend(random_document[j])\n                        if len(tokens_b) >= target_b_length:\n                            break\n                    # We didn't actually use these segments so we \"put them back\" so\n                    # they don't go to waste.\n                    num_unused_segments = len(current_chunk) - a_end\n                    i -= num_unused_segments\n                # Actual next\n                else:\n                    is_random_next = False\n                    for j in range(a_end, 
len(current_chunk)):\n                        tokens_b.extend(current_chunk[j])\n                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)\n\n                assert len(tokens_a) >= 1\n                assert len(tokens_b) >= 1\n\n                tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"] + tokens_b + [\"[SEP]\"]\n                # The segment IDs are 0 for the [CLS] token, the A tokens and the first [SEP]\n                # They are 1 for the B tokens and the final [SEP]\n                segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)]\n\n                tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(\n                    tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list)\n\n                instance = {\n                    \"tokens\": tokens,\n                    \"segment_ids\": segment_ids,\n                    \"is_random_next\": is_random_next,\n                    \"masked_lm_positions\": masked_lm_positions,\n                    \"masked_lm_labels\": masked_lm_labels}\n                instances.append(instance)\n            current_chunk = []\n            current_length = 0\n        i += 1\n\n    return instances\n\n\ndef create_training_file(docs, vocab_list, args, epoch_num):\n    epoch_filename = args.output_dir / \"epoch_{}.json\".format(epoch_num)\n    num_instances = 0\n    with epoch_filename.open('w') as epoch_file:\n        for doc_idx in trange(len(docs), desc=\"Document\"):\n            doc_instances = create_instances_from_document(\n                docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob,\n                masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq,\n                whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list)\n            doc_instances = [json.dumps(instance) for instance in doc_instances]\n            for instance in 
doc_instances:\n                epoch_file.write(instance + '\\n')\n                num_instances += 1\n    metrics_file = args.output_dir / \"epoch_{}_metrics.json\".format(epoch_num)\n    with metrics_file.open('w') as metrics_file:\n        metrics = {\n            \"num_training_examples\": num_instances,\n            \"max_seq_len\": args.max_seq_len\n        }\n        metrics_file.write(json.dumps(metrics))\n\n\ndef main():\n    parser = ArgumentParser()\n    parser.add_argument('--train_corpus', type=Path, required=True)\n    parser.add_argument(\"--output_dir\", type=Path, required=True)\n    parser.add_argument(\"--bert_model\", type=str, required=True,\n                        choices=[\"bert-base-uncased\", \"bert-large-uncased\", \"bert-base-cased\",\n                                 \"bert-base-multilingual-uncased\", \"bert-base-chinese\", \"bert-base-multilingual-cased\"])\n    parser.add_argument(\"--do_lower_case\", action=\"store_true\")\n    parser.add_argument(\"--do_whole_word_mask\", action=\"store_true\",\n                        help=\"Whether to use whole word masking rather than per-WordPiece masking.\")\n    parser.add_argument(\"--reduce_memory\", action=\"store_true\",\n                        help=\"Reduce memory usage for large datasets by keeping data on disc rather than in memory\")\n\n    parser.add_argument(\"--num_workers\", type=int, default=1,\n                        help=\"The number of workers to use to write the files\")\n    parser.add_argument(\"--epochs_to_generate\", type=int, default=3,\n                        help=\"Number of epochs of data to pregenerate\")\n    parser.add_argument(\"--max_seq_len\", type=int, default=128)\n    parser.add_argument(\"--short_seq_prob\", type=float, default=0.1,\n                        help=\"Probability of making a short sentence as a training example\")\n    parser.add_argument(\"--masked_lm_prob\", type=float, default=0.15,\n                        help=\"Probability of masking 
each token for the LM task\")\n    parser.add_argument(\"--max_predictions_per_seq\", type=int, default=20,\n                        help=\"Maximum number of tokens to mask in each sequence\")\n\n    args = parser.parse_args()\n\n    if args.num_workers > 1 and args.reduce_memory:\n        raise ValueError(\"Cannot use multiple workers while reducing memory\")\n\n    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)\n    vocab_list = list(tokenizer.vocab.keys())\n    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:\n        with args.train_corpus.open() as f:\n            doc = []\n            for line in tqdm(f, desc=\"Loading Dataset\", unit=\" lines\"):\n                line = line.strip()\n                if line == \"\":\n                    docs.add_document(doc)\n                    doc = []\n                else:\n                    tokens = tokenizer.tokenize(line)\n                    doc.append(tokens)\n            if doc:\n                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added\n        if len(docs) <= 1:\n            exit(\"ERROR: No document breaks were found in the input file! These are necessary to allow the script to \"\n                 \"ensure that random NextSentences are not sampled from the same document. Please add blank lines to \"\n                 \"indicate breaks between documents in your input file. 
If your dataset does not contain multiple \"\n                 \"documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, \"\n                 \"sections or paragraphs.\")\n\n        args.output_dir.mkdir(exist_ok=True)\n\n        if args.num_workers > 1:\n            writer_workers = Pool(min(args.num_workers, args.epochs_to_generate))\n            arguments = [(docs, vocab_list, args, idx) for idx in range(args.epochs_to_generate)]\n            writer_workers.starmap(create_training_file, arguments)\n        else:\n            for epoch in trange(args.epochs_to_generate, desc=\"Epoch\"):\n                create_training_file(docs, vocab_list, args, epoch)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/lm_finetuning/simple_lm_finetuning.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"BERT finetuning runner.\"\"\"\n\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport argparse\nimport logging\nimport os\nimport random\nfrom io import open\n\nimport numpy as np\nimport torch\nfrom torch.utils.data import DataLoader, Dataset, RandomSampler\nfrom torch.utils.data.distributed import DistributedSampler\nfrom tqdm import tqdm, trange\n\nfrom pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME\nfrom pytorch_pretrained_bert.modeling import BertForPreTraining\nfrom pytorch_pretrained_bert.tokenization import BertTokenizer\nfrom pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule\n\nlogging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',\n                    datefmt='%m/%d/%Y %H:%M:%S',\n                    level=logging.INFO)\nlogger = logging.getLogger(__name__)\n\n\nclass BERTDataset(Dataset):\n    def __init__(self, corpus_path, tokenizer, seq_len, encoding=\"utf-8\", corpus_lines=None, on_memory=True):\n        self.vocab = tokenizer.vocab\n        self.tokenizer = tokenizer\n        self.seq_len = seq_len\n        self.on_memory = on_memory\n        self.corpus_lines = corpus_lines  # number of non-empty lines in 
input corpus\n        self.corpus_path = corpus_path\n        self.encoding = encoding\n        self.current_doc = 0  # to avoid random sentence from same doc\n\n        # for loading samples directly from file\n        self.sample_counter = 0  # used to keep track of full epochs on file\n        self.line_buffer = None  # keep second sentence of a pair in memory and use as first sentence in next pair\n\n        # for loading samples in memory\n        self.current_random_doc = 0\n        self.num_docs = 0\n        self.sample_to_doc = [] # map sample index to doc and line\n\n        # load samples into memory\n        if on_memory:\n            self.all_docs = []\n            doc = []\n            self.corpus_lines = 0\n            with open(corpus_path, \"r\", encoding=encoding) as f:\n                for line in tqdm(f, desc=\"Loading Dataset\", total=corpus_lines):\n                    line = line.strip()\n                    if line == \"\":\n                        self.all_docs.append(doc)\n                        doc = []\n                        #remove last added sample because there won't be a subsequent line anymore in the doc\n                        self.sample_to_doc.pop()\n                    else:\n                        #store as one sample\n                        sample = {\"doc_id\": len(self.all_docs),\n                                  \"line\": len(doc)}\n                        self.sample_to_doc.append(sample)\n                        doc.append(line)\n                        self.corpus_lines = self.corpus_lines + 1\n\n            # if last row in file is not empty\n            if self.all_docs[-1] != doc:\n                self.all_docs.append(doc)\n                self.sample_to_doc.pop()\n\n            self.num_docs = len(self.all_docs)\n\n        # load samples later lazily from disk\n        else:\n            if self.corpus_lines is None:\n                with open(corpus_path, \"r\", encoding=encoding) as f:\n                    
self.corpus_lines = 0\n                    for line in tqdm(f, desc=\"Loading Dataset\", total=corpus_lines):\n                        if line.strip() == \"\":\n                            self.num_docs += 1\n                        else:\n                            self.corpus_lines += 1\n\n                    # if doc does not end with empty line\n                    if line.strip() != \"\":\n                        self.num_docs += 1\n\n            self.file = open(corpus_path, \"r\", encoding=encoding)\n            self.random_file = open(corpus_path, \"r\", encoding=encoding)\n\n    def __len__(self):\n        # last line of doc won't be used, because there's no \"nextSentence\". Additionally, we start counting at 0.\n        return self.corpus_lines - self.num_docs - 1\n\n    def __getitem__(self, item):\n        cur_id = self.sample_counter\n        self.sample_counter += 1\n        if not self.on_memory:\n            # after one epoch we start again from beginning of file\n            if cur_id != 0 and (cur_id % len(self) == 0):\n                self.file.close()\n                self.file = open(self.corpus_path, \"r\", encoding=self.encoding)\n\n        t1, t2, is_next_label = self.random_sent(item)\n\n        # tokenize\n        tokens_a = self.tokenizer.tokenize(t1)\n        tokens_b = self.tokenizer.tokenize(t2)\n\n        # combine to one sample\n        cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label)\n\n        # transform sample to features\n        cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer)\n\n        cur_tensors = (torch.tensor(cur_features.input_ids),\n                       torch.tensor(cur_features.input_mask),\n                       torch.tensor(cur_features.segment_ids),\n                       torch.tensor(cur_features.lm_label_ids),\n                       torch.tensor(cur_features.is_next))\n\n        return cur_tensors\n\n    def 
random_sent(self, index):\n        \"\"\"\n        Get one sample from corpus consisting of two sentences. With prob. 50% these are two subsequent sentences\n        from one doc. With 50% the second sentence will be a random one from another doc.\n        :param index: int, index of sample.\n        :return: (str, str, int), sentence 1, sentence 2, isNextSentence Label\n        \"\"\"\n        t1, t2 = self.get_corpus_line(index)\n        if random.random() > 0.5:\n            label = 0\n        else:\n            t2 = self.get_random_line()\n            label = 1\n\n        assert len(t1) > 0\n        assert len(t2) > 0\n        return t1, t2, label\n\n    def get_corpus_line(self, item):\n        \"\"\"\n        Get one sample from corpus consisting of a pair of two subsequent lines from the same doc.\n        :param item: int, index of sample.\n        :return: (str, str), two subsequent sentences from corpus\n        \"\"\"\n        t1 = \"\"\n        t2 = \"\"\n        assert item < self.corpus_lines\n        if self.on_memory:\n            sample = self.sample_to_doc[item]\n            t1 = self.all_docs[sample[\"doc_id\"]][sample[\"line\"]]\n            t2 = self.all_docs[sample[\"doc_id\"]][sample[\"line\"]+1]\n            # used later to avoid random nextSentence from same doc\n            self.current_doc = sample[\"doc_id\"]\n            return t1, t2\n        else:\n            if self.line_buffer is None:\n                # read first non-empty line of file\n                while t1 == \"\" :\n                    t1 = next(self.file).strip()\n                    t2 = next(self.file).strip()\n            else:\n                # use t2 from previous iteration as new t1\n                t1 = self.line_buffer\n                t2 = next(self.file).strip()\n                # skip empty rows that are used for separating documents and keep track of current doc id\n                while t2 == \"\" or t1 == \"\":\n                    t1 = 
next(self.file).strip()\n                    t2 = next(self.file).strip()\n                    self.current_doc = self.current_doc+1\n            self.line_buffer = t2\n\n        assert t1 != \"\"\n        assert t2 != \"\"\n        return t1, t2\n\n    def get_random_line(self):\n        \"\"\"\n        Get random line from another document for nextSentence task.\n        :return: str, content of one line\n        \"\"\"\n        # Similar to original tf repo: This outer loop should rarely go for more than one iteration for large\n        # corpora. However, just to be careful, we try to make sure that\n        # the random document is not the same as the document we're processing.\n        for _ in range(10):\n            if self.on_memory:\n                rand_doc_idx = random.randint(0, len(self.all_docs)-1)\n                rand_doc = self.all_docs[rand_doc_idx]\n                line = rand_doc[random.randrange(len(rand_doc))]\n            else:\n                rand_index = random.randint(1, self.corpus_lines if self.corpus_lines < 1000 else 1000)\n                #pick random line\n                for _ in range(rand_index):\n                    line = self.get_next_line()\n            #check if our picked random line is really from another doc like we want it to be\n            if self.current_random_doc != self.current_doc:\n                break\n        return line\n\n    def get_next_line(self):\n        \"\"\" Gets next line of random_file and starts over when reaching end of file\"\"\"\n        try:\n            line = next(self.random_file).strip()\n            #keep track of which document we are currently looking at to later avoid having the same doc as t1\n            if line == \"\":\n                self.current_random_doc = self.current_random_doc + 1\n                line = next(self.random_file).strip()\n        except StopIteration:\n            self.random_file.close()\n            self.random_file = open(self.corpus_path, \"r\", 
encoding=self.encoding)\n            line = next(self.random_file).strip()\n        return line\n\n\nclass InputExample(object):\n    \"\"\"A single training/test example for the language model.\"\"\"\n\n    def __init__(self, guid, tokens_a, tokens_b=None, is_next=None, lm_labels=None):\n        \"\"\"Constructs a InputExample.\n\n        Args:\n            guid: Unique id for the example.\n            tokens_a: string. The untokenized text of the first sequence. For single\n            sequence tasks, only this sequence must be specified.\n            tokens_b: (Optional) string. The untokenized text of the second sequence.\n            Only must be specified for sequence pair tasks.\n            label: (Optional) string. The label of the example. This should be\n            specified for train and dev examples, but not for test examples.\n        \"\"\"\n        self.guid = guid\n        self.tokens_a = tokens_a\n        self.tokens_b = tokens_b\n        self.is_next = is_next  # nextSentence\n        self.lm_labels = lm_labels  # masked words for language model\n\n\nclass InputFeatures(object):\n    \"\"\"A single set of features of data.\"\"\"\n\n    def __init__(self, input_ids, input_mask, segment_ids, is_next, lm_label_ids):\n        self.input_ids = input_ids\n        self.input_mask = input_mask\n        self.segment_ids = segment_ids\n        self.is_next = is_next\n        self.lm_label_ids = lm_label_ids\n\n\ndef random_word(tokens, tokenizer):\n    \"\"\"\n    Masking some random tokens for Language Model task with probabilities as in the original BERT paper.\n    :param tokens: list of str, tokenized sentence.\n    :param tokenizer: Tokenizer, object used for tokenization (we need it's vocab here)\n    :return: (list of str, list of int), masked tokens and related labels for LM prediction\n    \"\"\"\n    output_label = []\n\n    for i, token in enumerate(tokens):\n        prob = random.random()\n        # mask token with 15% probability\n        if 
prob < 0.15:\n            prob /= 0.15\n\n            # 80% randomly change token to mask token\n            if prob < 0.8:\n                tokens[i] = \"[MASK]\"\n\n            # 10% randomly change token to random token\n            elif prob < 0.9:\n                tokens[i] = random.choice(list(tokenizer.vocab.items()))[0]\n\n            # -> rest 10% randomly keep current token\n\n            # append current token to output (we will predict these later)\n            try:\n                output_label.append(tokenizer.vocab[token])\n            except KeyError:\n                # For unknown words (should not occur with BPE vocab)\n                output_label.append(tokenizer.vocab[\"[UNK]\"])\n                logger.warning(\"Cannot find token '{}' in vocab. Using [UNK] insetad\".format(token))\n        else:\n            # no masking token (will be ignored by loss function later)\n            output_label.append(-1)\n\n    return tokens, output_label\n\n\ndef convert_example_to_features(example, max_seq_length, tokenizer):\n    \"\"\"\n    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with\n    IDs, LM labels, input_mask, CLS and SEP tokens etc.\n    :param example: InputExample, containing sentence input as strings and is_next label\n    :param max_seq_length: int, maximum length of sequence.\n    :param tokenizer: Tokenizer\n    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)\n    \"\"\"\n    tokens_a = example.tokens_a\n    tokens_b = example.tokens_b\n    # Modifies `tokens_a` and `tokens_b` in place so that the total\n    # length is less than the specified length.\n    # Account for [CLS], [SEP], [SEP] with \"- 3\"\n    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n\n    tokens_a, t1_label = random_word(tokens_a, tokenizer)\n    tokens_b, t2_label = random_word(tokens_b, tokenizer)\n    # concatenate lm labels and account for CLS, 
SEP, SEP\n    lm_label_ids = ([-1] + t1_label + [-1] + t2_label + [-1])\n\n    # The convention in BERT is:\n    # (a) For sequence pairs:\n    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n    #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1\n    # (b) For single sequences:\n    #  tokens:   [CLS] the dog is hairy . [SEP]\n    #  type_ids: 0   0   0   0  0     0 0\n    #\n    # Where \"type_ids\" are used to indicate whether this is the first\n    # sequence or the second sequence. The embedding vectors for `type=0` and\n    # `type=1` were learned during pre-training and are added to the wordpiece\n    # embedding vector (and position vector). This is not *strictly* necessary\n    # since the [SEP] token unambigiously separates the sequences, but it makes\n    # it easier for the model to learn the concept of sequences.\n    #\n    # For classification tasks, the first vector (corresponding to [CLS]) is\n    # used as as the \"sentence vector\". Note that this only makes sense because\n    # the entire model is fine-tuned.\n    tokens = []\n    segment_ids = []\n    tokens.append(\"[CLS]\")\n    segment_ids.append(0)\n    for token in tokens_a:\n        tokens.append(token)\n        segment_ids.append(0)\n    tokens.append(\"[SEP]\")\n    segment_ids.append(0)\n\n    assert len(tokens_b) > 0\n    for token in tokens_b:\n        tokens.append(token)\n        segment_ids.append(1)\n    tokens.append(\"[SEP]\")\n    segment_ids.append(1)\n\n    input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n    # The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n    # tokens are attended to.\n    input_mask = [1] * len(input_ids)\n\n    # Zero-pad up to the sequence length.\n    while len(input_ids) < max_seq_length:\n        input_ids.append(0)\n        input_mask.append(0)\n        segment_ids.append(0)\n        lm_label_ids.append(-1)\n\n    assert len(input_ids) == max_seq_length\n    assert len(input_mask) == max_seq_length\n    assert len(segment_ids) == max_seq_length\n    assert len(lm_label_ids) == max_seq_length\n\n    if example.guid < 5:\n        logger.info(\"*** Example ***\")\n        logger.info(\"guid: %s\" % (example.guid))\n        logger.info(\"tokens: %s\" % \" \".join(\n                [str(x) for x in tokens]))\n        logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n        logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n        logger.info(\n                \"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n        logger.info(\"LM label: %s \" % (lm_label_ids))\n        logger.info(\"Is next sentence label: %s \" % (example.is_next))\n\n    features = InputFeatures(input_ids=input_ids,\n                             input_mask=input_mask,\n                             segment_ids=segment_ids,\n                             lm_label_ids=lm_label_ids,\n                             is_next=example.is_next)\n    return features\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n\n    ## Required parameters\n    parser.add_argument(\"--train_corpus\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The input train corpus.\")\n    parser.add_argument(\"--bert_model\", default=None, type=str, required=True,\n                        help=\"Bert pre-trained model selected in the list: bert-base-uncased, \"\n                             \"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.\")\n    
parser.add_argument(\"--output_dir\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The output directory where the model checkpoints will be written.\")\n\n    ## Other parameters\n    parser.add_argument(\"--max_seq_length\",\n                        default=128,\n                        type=int,\n                        help=\"The maximum total input sequence length after WordPiece tokenization. \\n\"\n                             \"Sequences longer than this will be truncated, and sequences shorter \\n\"\n                             \"than this will be padded.\")\n    parser.add_argument(\"--do_train\",\n                        action='store_true',\n                        help=\"Whether to run training.\")\n    parser.add_argument(\"--train_batch_size\",\n                        default=32,\n                        type=int,\n                        help=\"Total batch size for training.\")\n    parser.add_argument(\"--learning_rate\",\n                        default=3e-5,\n                        type=float,\n                        help=\"The initial learning rate for Adam.\")\n    parser.add_argument(\"--num_train_epochs\",\n                        default=3.0,\n                        type=float,\n                        help=\"Total number of training epochs to perform.\")\n    parser.add_argument(\"--warmup_proportion\",\n                        default=0.1,\n                        type=float,\n                        help=\"Proportion of training to perform linear learning rate warmup for. 
\"\n                             \"E.g., 0.1 = 10%% of training.\")\n    parser.add_argument(\"--no_cuda\",\n                        action='store_true',\n                        help=\"Whether not to use CUDA when available\")\n    parser.add_argument(\"--on_memory\",\n                        action='store_true',\n                        help=\"Whether to load train samples into memory or use disk\")\n    parser.add_argument(\"--do_lower_case\",\n                        action='store_true',\n                        help=\"Whether to lower case the input text. True for uncased models, False for cased models.\")\n    parser.add_argument(\"--local_rank\",\n                        type=int,\n                        default=-1,\n                        help=\"local_rank for distributed training on gpus\")\n    parser.add_argument('--seed',\n                        type=int,\n                        default=42,\n                        help=\"random seed for initialization\")\n    parser.add_argument('--gradient_accumulation_steps',\n                        type=int,\n                        default=1,\n                        help=\"Number of updates steps to accumualte before performing a backward/update pass.\")\n    parser.add_argument('--fp16',\n                        action='store_true',\n                        help=\"Whether to use 16-bit float precision instead of 32-bit\")\n    parser.add_argument('--loss_scale',\n                        type = float, default = 0,\n                        help = \"Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\\n\"\n                        \"0 (default value): dynamic loss scaling.\\n\"\n                        \"Positive power of 2: static loss scaling value.\\n\")\n\n    args = parser.parse_args()\n\n    if args.local_rank == -1 or args.no_cuda:\n        device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n        n_gpu = torch.cuda.device_count()\n    else:\n        torch.cuda.set_device(args.local_rank)\n        device = torch.device(\"cuda\", args.local_rank)\n        n_gpu = 1\n        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs\n        torch.distributed.init_process_group(backend='nccl')\n    logger.info(\"device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}\".format(\n        device, n_gpu, bool(args.local_rank != -1), args.fp16))\n\n    if args.gradient_accumulation_steps < 1:\n        raise ValueError(\"Invalid gradient_accumulation_steps parameter: {}, should be >= 1\".format(\n                            args.gradient_accumulation_steps))\n\n    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps\n\n    random.seed(args.seed)\n    np.random.seed(args.seed)\n    torch.manual_seed(args.seed)\n    if n_gpu > 0:\n        torch.cuda.manual_seed_all(args.seed)\n\n    if not args.do_train:\n        raise ValueError(\"Training is currently the only implemented execution option. 
Please set `do_train`.\")\n\n    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):\n        raise ValueError(\"Output directory ({}) already exists and is not empty.\".format(args.output_dir))\n    if not os.path.exists(args.output_dir):\n        os.makedirs(args.output_dir)\n\n    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)\n\n    #train_examples = None\n    num_train_optimization_steps = None\n    if args.do_train:\n        print(\"Loading Train Dataset\", args.train_corpus)\n        train_dataset = BERTDataset(args.train_corpus, tokenizer, seq_len=args.max_seq_length,\n                                    corpus_lines=None, on_memory=args.on_memory)\n        num_train_optimization_steps = int(\n            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs\n        if args.local_rank != -1:\n            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()\n\n    # Prepare model\n    model = BertForPreTraining.from_pretrained(args.bert_model)\n    if args.fp16:\n        model.half()\n    model.to(device)\n    if args.local_rank != -1:\n        try:\n            from apex.parallel import DistributedDataParallel as DDP\n        except ImportError:\n            raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n        model = DDP(model)\n    elif n_gpu > 1:\n        model = torch.nn.DataParallel(model)\n\n    # Prepare optimizer\n    if args.do_train:\n        param_optimizer = list(model.named_parameters())\n        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n        optimizer_grouped_parameters = [\n            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
'weight_decay': 0.0}\n            ]\n\n        if args.fp16:\n            try:\n                from apex.optimizers import FP16_Optimizer\n                from apex.optimizers import FusedAdam\n            except ImportError:\n                raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n\n            optimizer = FusedAdam(optimizer_grouped_parameters,\n                                  lr=args.learning_rate,\n                                  bias_correction=False,\n                                  max_grad_norm=1.0)\n            if args.loss_scale == 0:\n                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)\n            else:\n                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)\n            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,\n                                                 t_total=num_train_optimization_steps)\n\n        else:\n            optimizer = BertAdam(optimizer_grouped_parameters,\n                                 lr=args.learning_rate,\n                                 warmup=args.warmup_proportion,\n                                 t_total=num_train_optimization_steps)\n\n    global_step = 0\n    if args.do_train:\n        logger.info(\"***** Running training *****\")\n        logger.info(\"  Num examples = %d\", len(train_dataset))\n        logger.info(\"  Batch size = %d\", args.train_batch_size)\n        logger.info(\"  Num steps = %d\", num_train_optimization_steps)\n\n        if args.local_rank == -1:\n            train_sampler = RandomSampler(train_dataset)\n        else:\n            #TODO: check if this works with current data generator from disk that relies on next(file)\n            # (it doesn't return item back by index)\n            train_sampler = DistributedSampler(train_dataset)\n        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, 
batch_size=args.train_batch_size)\n\n        model.train()\n        for _ in trange(int(args.num_train_epochs), desc=\"Epoch\"):\n            tr_loss = 0\n            nb_tr_examples, nb_tr_steps = 0, 0\n            for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")):\n                batch = tuple(t.to(device) for t in batch)\n                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch\n                loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)\n                if n_gpu > 1:\n                    loss = loss.mean() # mean() to average on multi-gpu.\n                if args.gradient_accumulation_steps > 1:\n                    loss = loss / args.gradient_accumulation_steps\n                if args.fp16:\n                    optimizer.backward(loss)\n                else:\n                    loss.backward()\n                tr_loss += loss.item()\n                nb_tr_examples += input_ids.size(0)\n                nb_tr_steps += 1\n                if (step + 1) % args.gradient_accumulation_steps == 0:\n                    if args.fp16:\n                        # modify learning rate with special warm up BERT uses\n                        # if args.fp16 is False, BertAdam is used that handles this automatically\n                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)\n                        for param_group in optimizer.param_groups:\n                            param_group['lr'] = lr_this_step\n                    optimizer.step()\n                    optimizer.zero_grad()\n                    global_step += 1\n\n        # Save a trained model\n        logger.info(\"** ** * Saving fine - tuned model ** ** * \")\n        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self\n        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)\n        output_config_file = 
os.path.join(args.output_dir, CONFIG_NAME)\n        if args.do_train:\n            torch.save(model_to_save.state_dict(), output_model_file)\n            model_to_save.config.to_json_file(output_config_file)\n            tokenizer.save_vocabulary(args.output_dir)\n\n\ndef _truncate_seq_pair(tokens_a, tokens_b, max_length):\n    \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n\n    # This is a simple heuristic which will always truncate the longer sequence\n    # one token at a time. This makes more sense than truncating an equal percent\n    # of tokens from each, since if one sequence is very short then each token\n    # that's truncated likely contains more information than a longer sequence.\n    while True:\n        total_length = len(tokens_a) + len(tokens_b)\n        if total_length <= max_length:\n            break\n        if len(tokens_a) > len(tokens_b):\n            tokens_a.pop()\n        else:\n            tokens_b.pop()\n\n\ndef accuracy(out, labels):\n    outputs = np.argmax(out, axis=1)\n    return np.sum(outputs == labels)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/run_classifier.py",
    "content": "#coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"BERT finetuning runner.\"\"\"\n\nfrom __future__ import absolute_import, division, print_function\n\nimport argparse\nimport csv\nimport logging\nimport os\nimport random\n\nimport sys\nsys.path.append('..')\n\nimport copy\n\nimport numpy as np\nimport torch\nfrom torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,\n                              TensorDataset)\nfrom torch.utils.data.distributed import DistributedSampler\nfrom tqdm import tqdm, trange\n\nfrom torch.nn import CrossEntropyLoss, MSELoss\nfrom scipy.stats import pearsonr, spearmanr\nfrom sklearn.metrics import matthews_corrcoef, f1_score, classification_report\n\n\nfrom pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME\nfrom pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig\nfrom pytorch_pretrained_bert.tokenization import BertTokenizer\nfrom pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule\n\nlogger = logging.getLogger(__name__)\n\n\nclass InputExample(object):\n    \"\"\"A single training/test example for simple sequence classification.\"\"\"\n\n    def __init__(self, guid, text_a, text_b=None, label=None, entity_pos=None):\n        
\"\"\"Constructs a InputExample.\n\n        Args:\n            guid: Unique id for the example.\n            text_a: string. The untokenized text of the first sequence. For single\n            sequence tasks, only this sequence must be specified.\n            text_b: (Optional) string. The untokenized text of the second sequence.\n            Only must be specified for sequence pair tasks.\n            label: (Optional) string. The label of the example. This should be\n            specified for train and dev examples, but not for test examples.\n        \"\"\"\n        self.guid = guid\n        self.text_a = text_a\n        self.text_b = text_b\n        self.label = label\n        self.entity_pos = entity_pos\n\nclass InputFeatures(object):\n    \"\"\"A single set of features of data.\"\"\"\n\n    def __init__(self, input_ids, input_mask, segment_ids, label_id, entity_mask=None, entity_seg_pos=None, entity_span1_pos=None, entity_span2_pos=None):\n        self.input_ids = input_ids\n        self.input_mask = input_mask\n        self.segment_ids = segment_ids\n        self.label_id = label_id\n        self.entity_mask = entity_mask\n        self.entity_seg_pos = entity_seg_pos\n        self.entity_span1_pos = entity_span1_pos\n        self.entity_span2_pos = entity_span2_pos\n\n\nclass DataProcessor(object):\n    \"\"\"Base class for data converters for sequence classification data sets.\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"Gets a collection of `InputExample`s for the train set.\"\"\"\n        raise NotImplementedError()\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"Gets a collection of `InputExample`s for the dev set.\"\"\"\n        raise NotImplementedError()\n\n    def get_labels(self):\n        \"\"\"Gets the list of labels for this data set.\"\"\"\n        raise NotImplementedError()\n\n    @classmethod\n    def _read_tsv(cls, input_file, quotechar=None):\n        \"\"\"Reads a tab separated value file.\"\"\"\n        
with open(input_file, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.reader(f, delimiter=\"\\t\", quotechar=quotechar)\n            lines = []\n            for line in reader:\n                if sys.version_info[0] == 2:\n                    line = list(unicode(cell, 'utf-8') for cell in line)\n                lines.append(line)\n            return lines\n\n\nclass MrpcProcessor(DataProcessor):\n    \"\"\"Processor for the MRPC data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        logger.info(\"LOOKING AT {}\".format(os.path.join(data_dir, \"train.tsv\")))\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = line[3]\n            text_b = line[4]\n            label = line[0]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\nclass SemProcessor(DataProcessor):\n    \"\"\"Processor for the SemEval 2010 Task 8 dataset.\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        logger.info(\"LOOKING AT {}\".format(os.path.join(data_dir, \"train.jsonl\")))\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.jsonl\")), \"train\")\n\n    def get_dev_examples(self, 
data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"test.jsonl\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return ['Message-Topic(e2,e1)', 'Instrument-Agency(e2,e1)', 'Entity-Origin(e2,e1)', 'Member-Collection(e1,e2)', 'Member-Collection(e2,e1)', 'Other', 'Component-Whole(e1,e2)', 'Product-Producer(e2,e1)', 'Component-Whole(e2,e1)', 'Entity-Destination(e2,e1)', 'Content-Container(e2,e1)', 'Entity-Destination(e1,e2)', 'Instrument-Agency(e1,e2)', 'Cause-Effect(e2,e1)', 'Entity-Origin(e1,e2)', 'Product-Producer(e1,e2)', 'Cause-Effect(e1,e2)', 'Message-Topic(e1,e2)', 'Content-Container(e1,e2)']\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        import json\n        examples = []\n        for (i, line) in enumerate(lines):\n            guid = \"%s-%s\" % (set_type, i)\n            line = json.loads(line[0])\n            text_a = ' '.join(line['tokens'])\n            label = line['label']\n            entity_pos = line['entities']\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, label=label, entity_pos = entity_pos))\n        return examples\n\n\nclass MnliProcessor(DataProcessor):\n    \"\"\"Processor for the MultiNLI data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev_matched.tsv\")),\n            \"dev_matched\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"contradiction\", \"entailment\", \"neutral\"]\n\n    def 
_create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[8]\n            text_b = line[9]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass MnliMismatchedProcessor(MnliProcessor):\n    \"\"\"Processor for the MultiNLI Mismatched data set (GLUE version).\"\"\"\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev_mismatched.tsv\")),\n            \"dev_matched\")\n\n\nclass ColaProcessor(DataProcessor):\n    \"\"\"Processor for the CoLA data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = line[3]\n            label = line[1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))\n        return examples\n\n\nclass Sst2Processor(DataProcessor):\n    \"\"\"Processor for the SST-2 data set (GLUE 
version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = line[0]\n            label = line[1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))\n        return examples\n\n\nclass StsbProcessor(DataProcessor):\n    \"\"\"Processor for the STS-B data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [None]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[7]\n            text_b = line[8]\n            label = line[-1]\n            examples.append(\n   
             InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass QqpProcessor(DataProcessor):\n    \"\"\"Processor for the QQP data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            try:\n                text_a = line[3]\n                text_b = line[4]\n                label = line[5]\n            except IndexError:\n                continue\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass QnliProcessor(DataProcessor):\n    \"\"\"Processor for the QNLI data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \n            \"dev_matched\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"entailment\", \"not_entailment\"]\n\n    def _create_examples(self, lines, 
set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[1]\n            text_b = line[2]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass RteProcessor(DataProcessor):\n    \"\"\"Processor for the RTE data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"entailment\", \"not_entailment\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[1]\n            text_b = line[2]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass WnliProcessor(DataProcessor):\n    \"\"\"Processor for the WNLI data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base 
class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[1]\n            text_b = line[2]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\ndef convert_examples_to_features(examples, label_list, max_seq_length,\n                                 tokenizer, output_mode):\n    \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n\n    label_map = {label : i for i, label in enumerate(label_list)}\n    features = []\n    for (ex_index, example) in enumerate(examples):\n        if ex_index % 10000 == 0:\n            logger.info(\"Writing example %d of %d\" % (ex_index, len(examples)))\n        old_entity_pos = copy.deepcopy(example.entity_pos)\n        tokens_a, new_entity_pos = tokenizer.tokenize(example.text_a,example.entity_pos)\n        \n        old_entity0 = ''.join(example.text_a.split()[old_entity_pos[0][0]:old_entity_pos[0][1]])\n        old_entity1 = ''.join(example.text_a.split()[old_entity_pos[1][0]:old_entity_pos[1][1]])\n        new_entity0 = ''.join(tokens_a[new_entity_pos[0][0]:new_entity_pos[0][1]])\n        new_entity1 = ''.join(tokens_a[new_entity_pos[1][0]:new_entity_pos[1][1]])\n        \n        old_entity0 = old_entity0.lower()\n        old_entity1 = old_entity1.lower()\n\n        if '##' in new_entity0 or '##' in new_entity1:\n            new_entity0 = new_entity0.replace('#','')\n            new_entity1 = new_entity1.replace('#','')\n        
\n        try:\n            assert(old_entity0 == new_entity0)\n            assert(old_entity1 == new_entity1)\n        except:\n            import pdb;pdb.set_trace()\n        \n        # Entity marker\n        tokens_a_ = copy.deepcopy(tokens_a) \n        new_entity_pos_ = copy.deepcopy(new_entity_pos)\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        \n        tokens_a.insert(entity1_start, '<s1>') \n        new_entity_pos[0][0] = entity1_start\n        tokens_a.insert(entity1_end+1, '<e1>')\n        new_entity_pos[0][1] = entity1_end+1+1\n        tokens_a.insert(entity2_start+2, '<s2>')\n        new_entity_pos[1][0] = entity2_start+2\n        tokens_a.insert(entity2_end+3,'<e2>')\n        new_entity_pos[1][1] = entity2_end+3+1\n\n        if new_entity_pos[1][1] > max_seq_length - 2 - 1:\n            import pdb;pdb.set_trace()\n            \n        tokens_b = None\n        if example.text_b:\n            tokens_b = tokenizer.tokenize(example.text_b)\n            # Modifies `tokens_a` and `tokens_b` in place so that the total\n            # length is less than the specified length.\n            # Account for [CLS], [SEP], [SEP] with \"- 3\"\n            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n        else:\n            # Account for [CLS] and [SEP] with \"- 2\"\n            if len(tokens_a) > max_seq_length - 2:\n                tokens_a = tokens_a[:(max_seq_length - 2)]\n\n        # The convention in BERT is:\n        # (a) For sequence pairs:\n        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1\n        # (b) For single sequences:\n        #  tokens:   [CLS] the dog is hairy . 
[SEP]\n        #  type_ids: 0   0   0   0  0     0 0\n        #\n        # Where \"type_ids\" are used to indicate whether this is the first\n        # sequence or the second sequence. The embedding vectors for `type=0` and\n        # `type=1` were learned during pre-training and are added to the wordpiece\n        # embedding vector (and position vector). This is not *strictly* necessary\n        # since the [SEP] token unambiguously separates the sequences, but it makes\n        # it easier for the model to learn the concept of sequences.\n        #\n        # For classification tasks, the first vector (corresponding to [CLS]) is\n        # used as as the \"sentence vector\". Note that this only makes sense because\n        # the entire model is fine-tuned.\n        tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n        segment_ids = [0] * len(tokens)\n\n        if tokens_b:\n            tokens += tokens_b + [\"[SEP]\"]\n            segment_ids += [1] * (len(tokens_b) + 1)\n\n        input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n        # The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n        # tokens are attended to.\n        input_mask = [1] * len(input_ids)\n\n        # Zero-pad up to the sequence length.\n        padding = [0] * (max_seq_length - len(input_ids))\n        input_ids += padding\n        input_mask += padding\n        segment_ids += padding\n        \n\n        # Used for mention pooling\n        entity_mask_tag = 1\n        entity_mask = [0] * len(input_ids)\n        for entity in new_entity_pos:\n            start, end = entity[0],entity[1]\n            for i in range(start, end):\n                # [CLS], need to +1 offset\n                entity_mask[i+1] = entity_mask_tag\n        \n        \"\"\"\n            Different position embedding\n        \"\"\"\n        # Strategy 1\n        entity1_pos_tag = 1\n        entity2_pos_tag = 2\n\n        entity_seg_pos = [0] * len(input_ids)\n\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        for i in range(entity1_start, entity1_end):\n            entity_seg_pos[i+1] = entity1_pos_tag\n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        for i in range(entity2_start, entity2_end):\n            entity_seg_pos[i+1] = entity2_pos_tag\n        \n        # Strategy 2\n        entity_start_pos_tag = 1\n        entity_seg_pos_ = [0] * len(input_ids)\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        entity_seg_pos_[entity1_start+1] = entity_start_pos_tag\n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        entity_seg_pos_[entity2_start+1] = entity_start_pos_tag\n\n        # Strategy 3\n        entity_span1_pos = [0] * len(input_ids)\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        for i in range(len(entity_span1_pos)):\n            if i < entity1_start:\n                #entity_span1_pos[i] = np.abs(i - entity1_start)\n                entity_span1_pos[i] = i - entity1_start\n           
 elif entity1_start <= i and i < entity1_end:\n                entity_span1_pos[i] = 0\n            elif i >= entity1_end:\n                entity_span1_pos[i] = i - entity1_end + 1\n        \n        entity_span2_pos = [0] * len(input_ids)\n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        for i in range(len(entity_span2_pos)):\n            if i < entity2_start:\n                #entity_span2_pos[i] = np.abs(i - entity2_start)\n                entity_span2_pos[i] = i - entity2_start\n            elif entity2_start <= i and i < entity2_end:\n                entity_span2_pos[i] = 0\n            elif i >= entity2_end:\n                entity_span2_pos[i] = i - entity2_end + 1\n\n        # Avoid to get negative position to fuck the nn.Embedding\n        #entity_span1_pos = [pos+max_seq_length-1 for pos in entity_span1_pos]\n        #entity_span2_pos = [pos+max_seq_length-1 for pos in entity_span2_pos]\n        \n        assert len(input_ids) == max_seq_length\n        assert len(input_mask) == max_seq_length\n        assert len(segment_ids) == max_seq_length\n        assert len(entity_mask) == max_seq_length\n        assert len(entity_seg_pos) == max_seq_length\n        assert len(entity_seg_pos_) == max_seq_length\n        assert len(entity_span1_pos) == max_seq_length\n        assert len(entity_span2_pos) == max_seq_length\n        if output_mode == \"classification\":\n            label_id = label_map[example.label]\n        elif output_mode == \"regression\":\n            label_id = float(example.label)\n        else:\n            raise KeyError(output_mode)\n\n        if ex_index < 5:\n            logger.info(\"*** Example ***\")\n            logger.info(\"guid: %s\" % (example.guid))\n            logger.info(\"tokens: %s\" % \" \".join(\n                    [str(x) for x in tokens]))\n            logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n            logger.info(\"input_mask: %s\" % \" 
\".join([str(x) for x in input_mask]))\n            logger.info(\"entity_mask: %s\" % \" \".join([str(x) for x in entity_mask]))\n            logger.info(\"entity_seg_pos: %s\" % \" \".join([str(x) for x in entity_seg_pos]))\n            logger.info(\"entity_seg_pos_: %s\" % \" \".join([str(x) for x in entity_seg_pos_]))\n            logger.info(\"entity_span1_pos: %s\" % \" \".join([str(x) for x in entity_span1_pos]))\n            logger.info(\"entity_span2_pos: %s\" % \" \".join([str(x) for x in entity_span2_pos]))\n            logger.info(\n                    \"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n            logger.info(\"label: %s (id = %d)\" % (example.label, label_id))\n        \n        #if example.guid == 'train-3':\n        #    import pdb;pdb.set_trace()\n\n        features.append(\n                InputFeatures(input_ids=input_ids,\n                              input_mask=input_mask,\n                              segment_ids=segment_ids,\n                              label_id=label_id,\n                              entity_mask=entity_mask,\n                              entity_seg_pos=entity_seg_pos_,\n                              entity_span1_pos=entity_span1_pos,\n                              entity_span2_pos=entity_span2_pos))\n    return features\n\n\ndef _truncate_seq_pair(tokens_a, tokens_b, max_length):\n    \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n\n    # This is a simple heuristic which will always truncate the longer sequence\n    # one token at a time. 
This makes more sense than truncating an equal percent\n    # of tokens from each, since if one sequence is very short then each token\n    # that's truncated likely contains more information than a longer sequence.\n    while True:\n        total_length = len(tokens_a) + len(tokens_b)\n        if total_length <= max_length:\n            break\n        if len(tokens_a) > len(tokens_b):\n            tokens_a.pop()\n        else:\n            tokens_b.pop()\n\n\ndef simple_accuracy(preds, labels):\n    return (preds == labels).mean()\n\n\ndef acc_and_f1(preds, labels):\n    acc = simple_accuracy(preds, labels)\n    f1 = f1_score(y_true=labels, y_pred=preds,average='micro')\n    report = classification_report(labels, preds)\n    return {\n        \"acc\": acc,\n        \"f1\": f1,\n        \"acc_and_f1\": (acc + f1) / 2,\n        \"report\": report\n    }\n\n\ndef pearson_and_spearman(preds, labels):\n    pearson_corr = pearsonr(preds, labels)[0]\n    spearman_corr = spearmanr(preds, labels)[0]\n    return {\n        \"pearson\": pearson_corr,\n        \"spearmanr\": spearman_corr,\n        \"corr\": (pearson_corr + spearman_corr) / 2,\n    }\n\n\ndef compute_metrics(task_name, preds, labels):\n    assert len(preds) == len(labels)\n    if task_name == \"cola\":\n        return {\"mcc\": matthews_corrcoef(labels, preds)}\n    elif task_name == \"sst-2\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"mrpc\":\n        return acc_and_f1(preds, labels)\n    elif task_name == \"sem\":\n        return acc_and_f1(preds, labels)\n    elif task_name == \"sts-b\":\n        return pearson_and_spearman(preds, labels)\n    elif task_name == \"qqp\":\n        return acc_and_f1(preds, labels)\n    elif task_name == \"mnli\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"mnli-mm\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"qnli\":\n        return {\"acc\": 
simple_accuracy(preds, labels)}\n    elif task_name == \"rte\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"wnli\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    else:\n        raise KeyError(task_name)\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n\n    ## Required parameters\n    parser.add_argument(\"--data_dir\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The input data dir. Should contain the .tsv files (or other data files) for the task.\")\n    parser.add_argument(\"--bert_model\", default=None, type=str, required=True,\n                        help=\"Bert pre-trained model selected in the list: bert-base-uncased, \"\n                        \"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, \"\n                        \"bert-base-multilingual-cased, bert-base-chinese.\")\n    parser.add_argument(\"--task_name\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The name of the task to train.\")\n    parser.add_argument(\"--output_dir\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The output directory where the model predictions and checkpoints will be written.\")\n\n    ## Other parameters\n    parser.add_argument(\"--cache_dir\",\n                        default=\"\",\n                        type=str,\n                        help=\"Where do you want to store the pre-trained models downloaded from s3\")\n    parser.add_argument(\"--max_seq_length\",\n                        default=128,\n                        type=int,\n                        help=\"The maximum total input sequence length after WordPiece tokenization. 
\\n\"\n                             \"Sequences longer than this will be truncated, and sequences shorter \\n\"\n                             \"than this will be padded.\")\n    parser.add_argument(\"--do_train\",\n                        action='store_true',\n                        help=\"Whether to run training.\")\n    parser.add_argument(\"--do_eval\",\n                        action='store_true',\n                        help=\"Whether to run eval on the dev set.\")\n    parser.add_argument(\"--do_lower_case\",\n                        action='store_true',\n                        help=\"Set this flag if you are using an uncased model.\")\n    parser.add_argument(\"--train_batch_size\",\n                        default=32,\n                        type=int,\n                        help=\"Total batch size for training.\")\n    parser.add_argument(\"--eval_batch_size\",\n                        default=8,\n                        type=int,\n                        help=\"Total batch size for eval.\")\n    parser.add_argument(\"--learning_rate\",\n                        default=5e-5,\n                        type=float,\n                        help=\"The initial learning rate for Adam.\")\n    parser.add_argument(\"--num_train_epochs\",\n                        default=3.0,\n                        type=float,\n                        help=\"Total number of training epochs to perform.\")\n    parser.add_argument(\"--warmup_proportion\",\n                        default=0.1,\n                        type=float,\n                        help=\"Proportion of training to perform linear learning rate warmup for. 
\"\n                             \"E.g., 0.1 = 10%% of training.\")\n    parser.add_argument(\"--no_cuda\",\n                        action='store_true',\n                        help=\"Whether not to use CUDA when available\")\n    parser.add_argument(\"--local_rank\",\n                        type=int,\n                        default=-1,\n                        help=\"local_rank for distributed training on gpus\")\n    parser.add_argument('--seed',\n                        type=int,\n                        default=42,\n                        help=\"random seed for initialization\")\n    parser.add_argument('--gradient_accumulation_steps',\n                        type=int,\n                        default=1,\n                        help=\"Number of updates steps to accumulate before performing a backward/update pass.\")\n    parser.add_argument('--fp16',\n                        action='store_true',\n                        help=\"Whether to use 16-bit float precision instead of 32-bit\")\n    parser.add_argument('--loss_scale',\n                        type=float, default=0,\n                        help=\"Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\\n\"\n                             \"0 (default value): dynamic loss scaling.\\n\"\n                             \"Positive power of 2: static loss scaling value.\\n\")\n    parser.add_argument('--server_ip', type=str, default='', help=\"Can be used for distant debugging.\")\n    parser.add_argument('--server_port', type=str, default='', help=\"Can be used for distant debugging.\")\n    args = parser.parse_args()\n\n    if args.server_ip and args.server_port:\n        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script\n        import ptvsd\n        print(\"Waiting for debugger attach\")\n        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)\n        ptvsd.wait_for_attach()\n\n    processors = {\n        \"cola\": ColaProcessor,\n        \"mnli\": MnliProcessor,\n        \"mnli-mm\": MnliMismatchedProcessor,\n        \"mrpc\": MrpcProcessor,\n        \"sem\": SemProcessor,\n        \"sst-2\": Sst2Processor,\n        \"sts-b\": StsbProcessor,\n        \"qqp\": QqpProcessor,\n        \"qnli\": QnliProcessor,\n        \"rte\": RteProcessor,\n        \"wnli\": WnliProcessor,\n    }\n\n    output_modes = {\n        \"cola\": \"classification\",\n        \"mnli\": \"classification\",\n        \"mrpc\": \"classification\",\n        \"sem\": \"classification\",\n        \"sst-2\": \"classification\",\n        \"sts-b\": \"regression\",\n        \"qqp\": \"classification\",\n        \"qnli\": \"classification\",\n        \"rte\": \"classification\",\n        \"wnli\": \"classification\",\n    }\n\n    if args.local_rank == -1 or args.no_cuda:\n        device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n        n_gpu = torch.cuda.device_count()\n    else:\n        torch.cuda.set_device(args.local_rank)\n        device = torch.device(\"cuda\", args.local_rank)\n        n_gpu = 1\n        # Initializes the 
distributed backend which will take care of sychronizing nodes/GPUs\n        torch.distributed.init_process_group(backend='nccl')\n\n    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',\n                        datefmt = '%m/%d/%Y %H:%M:%S',\n                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)\n\n    logger.info(\"device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}\".format(\n        device, n_gpu, bool(args.local_rank != -1), args.fp16))\n\n    if args.gradient_accumulation_steps < 1:\n        raise ValueError(\"Invalid gradient_accumulation_steps parameter: {}, should be >= 1\".format(\n                            args.gradient_accumulation_steps))\n\n    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps\n\n    random.seed(args.seed)\n    np.random.seed(args.seed)\n    torch.manual_seed(args.seed)\n    if n_gpu > 0:\n        torch.cuda.manual_seed_all(args.seed)\n\n    if not args.do_train and not args.do_eval:\n        raise ValueError(\"At least one of `do_train` or `do_eval` must be True.\")\n\n    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:\n        raise ValueError(\"Output directory ({}) already exists and is not empty.\".format(args.output_dir))\n    if not os.path.exists(args.output_dir):\n        os.makedirs(args.output_dir)\n\n    task_name = args.task_name.lower()\n\n    if task_name not in processors:\n        raise ValueError(\"Task not found: %s\" % (task_name))\n\n    processor = processors[task_name]()\n    output_mode = output_modes[task_name]\n\n    label_list = processor.get_labels()\n    num_labels = len(label_list)\n    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)\n    train_examples = None\n    num_train_optimization_steps = None\n    if args.do_train:\n        train_examples = processor.get_train_examples(args.data_dir)\n     
   num_train_optimization_steps = int(\n            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs\n        if args.local_rank != -1:\n            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()\n\n    # Prepare model\n    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))\n    model = BertForSequenceClassification.from_pretrained(args.bert_model,\n              cache_dir=cache_dir,\n              num_labels=num_labels)\n    if args.fp16:\n        model.half()\n    model.to(device)\n    if args.local_rank != -1:\n        try:\n            from apex.parallel import DistributedDataParallel as DDP\n        except ImportError:\n            raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n\n        model = DDP(model)\n    elif n_gpu > 1:\n        model = torch.nn.DataParallel(model)\n\n    # Prepare optimizer\n    if args.do_train:\n        param_optimizer = list(model.named_parameters())\n        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n        optimizer_grouped_parameters = [\n            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n            ]\n        if args.fp16:\n            try:\n                from apex.optimizers import FP16_Optimizer\n                from apex.optimizers import FusedAdam\n            except ImportError:\n                raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n\n            optimizer = FusedAdam(optimizer_grouped_parameters,\n                                  lr=args.learning_rate,\n                        
          bias_correction=False,\n                                  max_grad_norm=1.0)\n            if args.loss_scale == 0:\n                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)\n            else:\n                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)\n            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,\n                                                 t_total=num_train_optimization_steps)\n\n        else:\n            optimizer = BertAdam(optimizer_grouped_parameters,\n                                 lr=args.learning_rate,\n                                 warmup=args.warmup_proportion,\n                                 t_total=num_train_optimization_steps)\n\n    global_step = 0\n    nb_tr_steps = 0\n    tr_loss = 0\n    if args.do_train:\n        train_features = convert_examples_to_features(\n            train_examples, label_list, args.max_seq_length, tokenizer, output_mode)\n        logger.info(\"***** Running training *****\")\n        logger.info(\"  Num examples = %d\", len(train_examples))\n        logger.info(\"  Batch size = %d\", args.train_batch_size)\n        logger.info(\"  Num steps = %d\", num_train_optimization_steps)\n        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)\n        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)\n        # FloatTensor(forward)\n        all_entity_mask = torch.tensor([f.entity_mask for f in train_features], dtype=torch.float)\n        all_entity_seg_pos = torch.tensor([f.entity_seg_pos for f in train_features], dtype=torch.long)\n        all_entity_span1_pos = torch.tensor([f.entity_span1_pos for f in train_features], dtype=torch.float)\n        all_entity_span2_pos = torch.tensor([f.entity_span2_pos for f in train_features], dtype=torch.float)\n        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)\n     
   if output_mode == \"classification\":\n            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)\n        elif output_mode == \"regression\":\n            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)\n\n        train_data = TensorDataset(all_input_ids, all_input_mask, all_entity_mask, all_entity_seg_pos, all_entity_span1_pos, all_entity_span2_pos, all_segment_ids, all_label_ids)\n        if args.local_rank == -1:\n            train_sampler = RandomSampler(train_data)\n        else:\n            train_sampler = DistributedSampler(train_data)\n        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)\n\n        model.train()\n        for _ in trange(int(args.num_train_epochs), desc=\"Epoch\"):\n            tr_loss = 0\n            nb_tr_examples, nb_tr_steps = 0, 0\n            for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")):\n                batch = tuple(t.to(device) for t in batch)\n                input_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, segment_ids, label_ids = batch\n                # define a new function to compute loss values for both output_modes\n                logits = model(input_ids, segment_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, labels=None)\n\n                if output_mode == \"classification\":\n                    loss_fct = CrossEntropyLoss()\n                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))\n                elif output_mode == \"regression\":\n                    loss_fct = MSELoss()\n                    loss = loss_fct(logits.view(-1), label_ids.view(-1))\n\n                if n_gpu > 1:\n                    loss = loss.mean() # mean() to average on multi-gpu.\n                if args.gradient_accumulation_steps > 1:\n                    loss = loss / 
args.gradient_accumulation_steps\n\n                if args.fp16:\n                    optimizer.backward(loss)\n                else:\n                    loss.backward()\n\n                tr_loss += loss.item()\n                nb_tr_examples += input_ids.size(0)\n                nb_tr_steps += 1\n                if (step + 1) % args.gradient_accumulation_steps == 0:\n                    if args.fp16:\n                        # modify learning rate with special warm up BERT uses\n                        # if args.fp16 is False, BertAdam is used that handles this automatically\n                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)\n                        for param_group in optimizer.param_groups:\n                            param_group['lr'] = lr_this_step\n                    optimizer.step()\n                    optimizer.zero_grad()\n                    global_step += 1\n\n    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):\n        # Save a trained model, configuration and tokenizer\n        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self\n\n        # If we save using the predefined names, we can load using `from_pretrained`\n        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)\n        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)\n\n        torch.save(model_to_save.state_dict(), output_model_file)\n        model_to_save.config.to_json_file(output_config_file)\n        tokenizer.save_vocabulary(args.output_dir)\n\n        # Load a trained model and vocabulary that you have fine-tuned\n        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)\n        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)\n    else:\n        model = 
BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)\n    model.to(device)\n\n    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):\n        eval_examples = processor.get_dev_examples(args.data_dir)\n        eval_features = convert_examples_to_features(\n            eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)\n        logger.info(\"***** Running evaluation *****\")\n        logger.info(\"  Num examples = %d\", len(eval_examples))\n        logger.info(\"  Batch size = %d\", args.eval_batch_size)\n        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\n        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\n        all_entity_mask = torch.tensor([f.entity_mask for f in eval_features], dtype=torch.float)\n        all_entity_seg_pos = torch.tensor([f.entity_seg_pos for f in eval_features], dtype=torch.long)\n        all_entity_span1_pos = torch.tensor([f.entity_span1_pos for f in eval_features], dtype=torch.float)\n        all_entity_span2_pos = torch.tensor([f.entity_span2_pos for f in eval_features], dtype=torch.float)\n        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\n\n        if output_mode == \"classification\":\n            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)\n        elif output_mode == \"regression\":\n            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)\n\n        eval_data = TensorDataset(all_input_ids, all_input_mask, all_entity_mask, all_entity_seg_pos, all_entity_span1_pos, all_entity_span2_pos, all_segment_ids, all_label_ids)\n        # Run prediction for full data\n        eval_sampler = SequentialSampler(eval_data)\n        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)\n\n        model.eval()\n      
  eval_loss = 0\n        nb_eval_steps = 0\n        preds = []\n\n        for input_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, segment_ids, label_ids in tqdm(eval_dataloader, desc=\"Evaluating\"):\n            input_ids = input_ids.to(device)\n            input_mask = input_mask.to(device)\n            entity_mask = entity_mask.to(device)\n            entity_seg_pos = entity_seg_pos.to(device)\n            entity_span1_pos = entity_span1_pos.to(device)\n            entity_span2_pos = entity_span2_pos.to(device)\n            segment_ids = segment_ids.to(device)\n            label_ids = label_ids.to(device)\n            with torch.no_grad():\n                logits = model(input_ids, segment_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, labels=None)\n                #logits = model(input_ids, segment_ids, input_mask, labels=None)\n\n            # create eval loss and other metric required by the task\n            if output_mode == \"classification\":\n                loss_fct = CrossEntropyLoss()\n                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))\n            elif output_mode == \"regression\":\n                loss_fct = MSELoss()\n                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))\n            \n            eval_loss += tmp_eval_loss.mean().item()\n            nb_eval_steps += 1\n            if len(preds) == 0:\n                preds.append(logits.detach().cpu().numpy())\n            else:\n                preds[0] = np.append(\n                    preds[0], logits.detach().cpu().numpy(), axis=0)\n\n        eval_loss = eval_loss / nb_eval_steps\n        preds = preds[0]\n        if output_mode == \"classification\":\n            preds = np.argmax(preds, axis=1)\n        elif output_mode == \"regression\":\n            preds = np.squeeze(preds)\n        result = compute_metrics(task_name, preds, all_label_ids.numpy())\n        
loss = tr_loss/global_step if args.do_train else None\n\n        result['eval_loss'] = eval_loss\n        result['global_step'] = global_step\n        result['loss'] = loss\n\n        output_eval_file = os.path.join(args.output_dir, \"eval_results.txt\")\n        with open(output_eval_file, \"w\") as writer:\n            logger.info(\"***** Eval results *****\")\n            for key in sorted(result.keys()):\n                logger.info(\"  %s = %s\", key, str(result[key]))\n                writer.write(\"%s = %s\\n\" % (key, str(result[key])))\n\n        # hack for MNLI-MM\n        if task_name == \"mnli\":\n            task_name = \"mnli-mm\"\n            processor = processors[task_name]()\n\n            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:\n                raise ValueError(\"Output directory ({}) already exists and is not empty.\".format(args.output_dir))\n            if not os.path.exists(args.output_dir + '-MM'):\n                os.makedirs(args.output_dir + '-MM')\n\n            eval_examples = processor.get_dev_examples(args.data_dir)\n            eval_features = convert_examples_to_features(\n                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)\n            logger.info(\"***** Running evaluation *****\")\n            logger.info(\"  Num examples = %d\", len(eval_examples))\n            logger.info(\"  Batch size = %d\", args.eval_batch_size)\n            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\n            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\n            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\n            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)\n\n            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)\n            # Run 
prediction for full data\n            eval_sampler = SequentialSampler(eval_data)\n            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)\n\n            model.eval()\n            eval_loss = 0\n            nb_eval_steps = 0\n            preds = []\n\n            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc=\"Evaluating\"):\n                input_ids = input_ids.to(device)\n                input_mask = input_mask.to(device)\n                segment_ids = segment_ids.to(device)\n                label_ids = label_ids.to(device)\n\n                with torch.no_grad():\n                    logits = model(input_ids, segment_ids, input_mask, labels=None)\n            \n                loss_fct = CrossEntropyLoss()\n                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))\n            \n                eval_loss += tmp_eval_loss.mean().item()\n                nb_eval_steps += 1\n                if len(preds) == 0:\n                    preds.append(logits.detach().cpu().numpy())\n                else:\n                    preds[0] = np.append(\n                        preds[0], logits.detach().cpu().numpy(), axis=0)\n\n            eval_loss = eval_loss / nb_eval_steps\n            preds = preds[0]\n            preds = np.argmax(preds, axis=1)\n            result = compute_metrics(task_name, preds, all_label_ids.numpy())\n            loss = tr_loss/global_step if args.do_train else None\n\n            result['eval_loss'] = eval_loss\n            result['global_step'] = global_step\n            result['loss'] = loss\n\n            output_eval_file = os.path.join(args.output_dir + '-MM', \"eval_results.txt\")\n            with open(output_eval_file, \"w\") as writer:\n                logger.info(\"***** Eval results *****\")\n                for key in sorted(result.keys()):\n                    logger.info(\"  %s = %s\", key, str(result[key]))\n             
       writer.write(\"%s = %s\\n\" % (key, str(result[key])))\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/run_classifier_dataset_utils.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\" BERT classification fine-tuning: utilities to work with GLUE tasks \"\"\"\n\nfrom __future__ import absolute_import, division, print_function\n\nimport csv\nimport logging\nimport os\nimport sys\n\nfrom scipy.stats import pearsonr, spearmanr\nfrom sklearn.metrics import matthews_corrcoef, f1_score\n\nlogger = logging.getLogger(__name__)\n\n\nclass InputExample(object):\n    \"\"\"A single training/test example for simple sequence classification.\"\"\"\n\n    def __init__(self, guid, text_a, text_b=None, label=None):\n        \"\"\"Constructs a InputExample.\n\n        Args:\n            guid: Unique id for the example.\n            text_a: string. The untokenized text of the first sequence. For single\n            sequence tasks, only this sequence must be specified.\n            text_b: (Optional) string. The untokenized text of the second sequence.\n            Only must be specified for sequence pair tasks.\n            label: (Optional) string. The label of the example. 
This should be\n            specified for train and dev examples, but not for test examples.\n        \"\"\"\n        self.guid = guid\n        self.text_a = text_a\n        self.text_b = text_b\n        self.label = label\n\n\nclass InputFeatures(object):\n    \"\"\"A single set of features of data.\"\"\"\n\n    def __init__(self, input_ids, input_mask, segment_ids, label_id):\n        self.input_ids = input_ids\n        self.input_mask = input_mask\n        self.segment_ids = segment_ids\n        self.label_id = label_id\n\n\nclass DataProcessor(object):\n    \"\"\"Base class for data converters for sequence classification data sets.\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"Gets a collection of `InputExample`s for the train set.\"\"\"\n        raise NotImplementedError()\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"Gets a collection of `InputExample`s for the dev set.\"\"\"\n        raise NotImplementedError()\n\n    def get_labels(self):\n        \"\"\"Gets the list of labels for this data set.\"\"\"\n        raise NotImplementedError()\n\n    @classmethod\n    def _read_tsv(cls, input_file, quotechar=None):\n        \"\"\"Reads a tab separated value file.\"\"\"\n        with open(input_file, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.reader(f, delimiter=\"\\t\", quotechar=quotechar)\n            lines = []\n            for line in reader:\n                if sys.version_info[0] == 2:\n                    line = list(unicode(cell, 'utf-8') for cell in line)\n                lines.append(line)\n            return lines\n\n\nclass MrpcProcessor(DataProcessor):\n    \"\"\"Processor for the MRPC data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        logger.info(\"LOOKING AT {}\".format(os.path.join(data_dir, \"train.tsv\")))\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), 
\"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = line[3]\n            text_b = line[4]\n            label = line[0]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass MnliProcessor(DataProcessor):\n    \"\"\"Processor for the MultiNLI data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev_matched.tsv\")),\n            \"dev_matched\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"contradiction\", \"entailment\", \"neutral\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[8]\n            text_b = line[9]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return 
examples\n\n\nclass MnliMismatchedProcessor(MnliProcessor):\n    \"\"\"Processor for the MultiNLI Mismatched data set (GLUE version).\"\"\"\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev_mismatched.tsv\")),\n            \"dev_matched\")\n\n\nclass ColaProcessor(DataProcessor):\n    \"\"\"Processor for the CoLA data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = line[3]\n            label = line[1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))\n        return examples\n\n\nclass Sst2Processor(DataProcessor):\n    \"\"\"Processor for the SST-2 data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", 
\"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = line[0]\n            label = line[1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))\n        return examples\n\n\nclass StsbProcessor(DataProcessor):\n    \"\"\"Processor for the STS-B data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [None]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[7]\n            text_b = line[8]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass QqpProcessor(DataProcessor):\n    \"\"\"Processor for the QQP data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        
return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            try:\n                text_a = line[3]\n                text_b = line[4]\n                label = line[5]\n            except IndexError:\n                continue\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass QnliProcessor(DataProcessor):\n    \"\"\"Processor for the QNLI data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \n            \"dev_matched\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"entailment\", \"not_entailment\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[1]\n            text_b = line[2]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass 
RteProcessor(DataProcessor):\n    \"\"\"Processor for the RTE data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"entailment\", \"not_entailment\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[1]\n            text_b = line[2]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass WnliProcessor(DataProcessor):\n    \"\"\"Processor for the WNLI data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" 
% (set_type, line[0])\n            text_a = line[1]\n            text_b = line[2]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\ndef convert_examples_to_features(examples, label_list, max_seq_length,\n                                 tokenizer, output_mode):\n    \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n\n    label_map = {label : i for i, label in enumerate(label_list)}\n\n    features = []\n    for (ex_index, example) in enumerate(examples):\n        if ex_index % 10000 == 0:\n            logger.info(\"Writing example %d of %d\" % (ex_index, len(examples)))\n\n        tokens_a = tokenizer.tokenize(example.text_a)\n\n        tokens_b = None\n        if example.text_b:\n            tokens_b = tokenizer.tokenize(example.text_b)\n            # Modifies `tokens_a` and `tokens_b` in place so that the total\n            # length is less than the specified length.\n            # Account for [CLS], [SEP], [SEP] with \"- 3\"\n            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n        else:\n            # Account for [CLS] and [SEP] with \"- 2\"\n            if len(tokens_a) > max_seq_length - 2:\n                tokens_a = tokens_a[:(max_seq_length - 2)]\n\n        # The convention in BERT is:\n        # (a) For sequence pairs:\n        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1\n        # (b) For single sequences:\n        #  tokens:   [CLS] the dog is hairy . [SEP]\n        #  type_ids: 0   0   0   0  0     0 0\n        #\n        # Where \"type_ids\" are used to indicate whether this is the first\n        # sequence or the second sequence. 
The embedding vectors for `type=0` and\n        # `type=1` were learned during pre-training and are added to the wordpiece\n        # embedding vector (and position vector). This is not *strictly* necessary\n        # since the [SEP] token unambiguously separates the sequences, but it makes\n        # it easier for the model to learn the concept of sequences.\n        #\n        # For classification tasks, the first vector (corresponding to [CLS]) is\n        # used as as the \"sentence vector\". Note that this only makes sense because\n        # the entire model is fine-tuned.\n        tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n        segment_ids = [0] * len(tokens)\n\n        if tokens_b:\n            tokens += tokens_b + [\"[SEP]\"]\n            segment_ids += [1] * (len(tokens_b) + 1)\n\n        input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n        # The mask has 1 for real tokens and 0 for padding tokens. Only real\n        # tokens are attended to.\n        input_mask = [1] * len(input_ids)\n\n        # Zero-pad up to the sequence length.\n        padding = [0] * (max_seq_length - len(input_ids))\n        input_ids += padding\n        input_mask += padding\n        segment_ids += padding\n\n        assert len(input_ids) == max_seq_length\n        assert len(input_mask) == max_seq_length\n        assert len(segment_ids) == max_seq_length\n\n        if output_mode == \"classification\":\n            label_id = label_map[example.label]\n        elif output_mode == \"regression\":\n            label_id = float(example.label)\n        else:\n            raise KeyError(output_mode)\n\n        if ex_index < 5:\n            logger.info(\"*** Example ***\")\n            logger.info(\"guid: %s\" % (example.guid))\n            logger.info(\"tokens: %s\" % \" \".join(\n                    [str(x) for x in tokens]))\n            logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n            logger.info(\"input_mask: %s\" % \" 
\".join([str(x) for x in input_mask]))\n            logger.info(\n                    \"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n            logger.info(\"label: %s (id = %d)\" % (example.label, label_id))\n\n        features.append(\n                InputFeatures(input_ids=input_ids,\n                              input_mask=input_mask,\n                              segment_ids=segment_ids,\n                              label_id=label_id))\n    return features\n\n\ndef _truncate_seq_pair(tokens_a, tokens_b, max_length):\n    \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n\n    # This is a simple heuristic which will always truncate the longer sequence\n    # one token at a time. This makes more sense than truncating an equal percent\n    # of tokens from each, since if one sequence is very short then each token\n    # that's truncated likely contains more information than a longer sequence.\n    while True:\n        total_length = len(tokens_a) + len(tokens_b)\n        if total_length <= max_length:\n            break\n        if len(tokens_a) > len(tokens_b):\n            tokens_a.pop()\n        else:\n            tokens_b.pop()\n\n\ndef simple_accuracy(preds, labels):\n    return (preds == labels).mean()\n\n\ndef acc_and_f1(preds, labels):\n    acc = simple_accuracy(preds, labels)\n    f1 = f1_score(y_true=labels, y_pred=preds)\n    return {\n        \"acc\": acc,\n        \"f1\": f1,\n        \"acc_and_f1\": (acc + f1) / 2,\n    }\n\n\ndef pearson_and_spearman(preds, labels):\n    pearson_corr = pearsonr(preds, labels)[0]\n    spearman_corr = spearmanr(preds, labels)[0]\n    return {\n        \"pearson\": pearson_corr,\n        \"spearmanr\": spearman_corr,\n        \"corr\": (pearson_corr + spearman_corr) / 2,\n    }\n\n\ndef compute_metrics(task_name, preds, labels):\n    assert len(preds) == len(labels)\n    if task_name == \"cola\":\n        return {\"mcc\": matthews_corrcoef(labels, preds)}\n    elif 
task_name == \"sst-2\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"mrpc\":\n        return acc_and_f1(preds, labels)\n    elif task_name == \"sts-b\":\n        return pearson_and_spearman(preds, labels)\n    elif task_name == \"qqp\":\n        return acc_and_f1(preds, labels)\n    elif task_name == \"mnli\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"mnli-mm\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"qnli\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"rte\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"wnli\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    else:\n        raise KeyError(task_name)\n\nprocessors = {\n    \"cola\": ColaProcessor,\n    \"mnli\": MnliProcessor,\n    \"mnli-mm\": MnliMismatchedProcessor,\n    \"mrpc\": MrpcProcessor,\n    \"sst-2\": Sst2Processor,\n    \"sts-b\": StsbProcessor,\n    \"qqp\": QqpProcessor,\n    \"qnli\": QnliProcessor,\n    \"rte\": RteProcessor,\n    \"wnli\": WnliProcessor,\n}\n\noutput_modes = {\n    \"cola\": \"classification\",\n    \"mnli\": \"classification\",\n    \"mnli-mm\": \"classification\",\n    \"mrpc\": \"classification\",\n    \"sst-2\": \"classification\",\n    \"sts-b\": \"regression\",\n    \"qqp\": \"classification\",\n    \"qnli\": \"classification\",\n    \"rte\": \"classification\",\n    \"wnli\": \"classification\",\n}\n"
  },
  {
    "path": "examples/run_gpt2.py",
    "content": "#!/usr/bin/env python3\n\nimport argparse\nimport logging\nfrom tqdm import trange\n\nimport torch\nimport torch.nn.functional as F\nimport numpy as np\n\nfrom pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer\n\nlogging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',\n                    datefmt = '%m/%d/%Y %H:%M:%S',\n                    level = logging.INFO)\nlogger = logging.getLogger(__name__)\n\ndef top_k_logits(logits, k):\n    \"\"\"\n    Masks everything but the k top entries as -infinity (1e10).\n    Used to mask logits such that e^-infinity -> 0 won't contribute to the\n    sum of the denominator.\n    \"\"\"\n    if k == 0:\n        return logits\n    else:\n        values = torch.topk(logits, k)[0]\n        batch_mins = values[:, -1].view(-1, 1).expand_as(logits)\n        return torch.where(logits < batch_mins, torch.ones_like(logits) * -1e10, logits)\n\ndef sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True):\n    if start_token is None:\n        assert context is not None, 'Specify exactly one of start_token and context!'\n        context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)\n    else:\n        assert context is None, 'Specify exactly one of start_token and context!'\n        context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)\n    prev = context\n    output = context\n    past = None\n    with torch.no_grad():\n        for i in trange(length):\n            logits, past = model(prev, past=past)\n            logits = logits[:, -1, :] / temperature\n            logits = top_k_logits(logits, k=top_k)\n            log_probs = F.softmax(logits, dim=-1)\n            if sample:\n                prev = torch.multinomial(log_probs, num_samples=1)\n            else:\n                _, prev = torch.topk(log_probs, k=1, dim=-1)\n           
 output = torch.cat((output, prev), dim=1)\n    return output\n\ndef run_model():\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint')\n    parser.add_argument(\"--seed\", type=int, default=0)\n    parser.add_argument(\"--nsamples\", type=int, default=1)\n    parser.add_argument(\"--batch_size\", type=int, default=-1)\n    parser.add_argument(\"--length\", type=int, default=-1)\n    parser.add_argument(\"--temperature\", type=float, default=1.0)\n    parser.add_argument(\"--top_k\", type=int, default=0)\n    parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')\n    args = parser.parse_args()\n    print(args)\n\n    if args.batch_size == -1:\n        args.batch_size = 1\n    assert args.nsamples % args.batch_size == 0\n\n    np.random.seed(args.seed)\n    torch.random.manual_seed(args.seed)\n    torch.cuda.manual_seed(args.seed)\n    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n\n    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)\n    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)\n    model.to(device)\n    model.eval()\n\n    if args.length == -1:\n        args.length = model.config.n_ctx // 2\n    elif args.length > model.config.n_ctx:\n        raise ValueError(\"Can't get samples longer than window size: %s\" % model.config.n_ctx)\n\n    while True:\n        context_tokens = []\n        if not args.unconditional:\n            raw_text = input(\"Model prompt >>> \")\n            while not raw_text:\n                print('Prompt should not be empty!')\n                raw_text = input(\"Model prompt >>> \")\n            context_tokens = enc.encode(raw_text)\n            generated = 0\n            for _ in range(args.nsamples // args.batch_size):\n                out = sample_sequence(\n                    model=model, 
length=args.length,\n                    context=context_tokens,\n                    start_token=None,\n                    batch_size=args.batch_size,\n                    temperature=args.temperature, top_k=args.top_k, device=device\n                )\n                out = out[:, len(context_tokens):].tolist()\n                for i in range(args.batch_size):\n                    generated += 1\n                    text = enc.decode(out[i])\n                    print(\"=\" * 40 + \" SAMPLE \" + str(generated) + \" \" + \"=\" * 40)\n                    print(text)\n            print(\"=\" * 80)\n        else:\n            generated = 0\n            for _ in range(args.nsamples // args.batch_size):\n                out = sample_sequence(\n                    model=model, length=args.length,\n                    context=None,\n                    start_token=enc.encoder['<|endoftext|>'],\n                    batch_size=args.batch_size,\n                    temperature=args.temperature, top_k=args.top_k, device=device\n                )\n                out = out[:,1:].tolist()\n                for i in range(args.batch_size):\n                    generated += 1\n                    text = enc.decode(out[i])\n                    print(\"=\" * 40 + \" SAMPLE \" + str(generated) + \" \" + \"=\" * 40)\n                    print(text)\n            print(\"=\" * 80)\n\nif __name__ == '__main__':\n    run_model()\n\n\n"
  },
  {
    "path": "examples/run_openai_gpt.py",
    "content": "# coding=utf-8\n# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\" OpenAI GPT model fine-tuning script.\n    Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py\n    It self adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py\n\n    This script with default values fine-tunes and evaluate a pretrained OpenAI GPT on the RocStories dataset:\n        python run_openai_gpt.py \\\n          --model_name openai-gpt \\\n          --do_train \\\n          --do_eval \\\n          --train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\\ -\\ cloze_test_ALL_val.csv \\\n          --eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\\ -\\ cloze_test_ALL_test.csv \\\n          --output_dir ../log \\\n          --train_batch_size 16 \\\n\"\"\"\nimport argparse\nimport os\nimport csv\nimport random\nimport logging\nfrom tqdm import tqdm, trange\n\nimport numpy as np\nimport torch\nfrom torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,\n                              TensorDataset)\n\nfrom pytorch_pretrained_bert import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,\n                                     OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME)\n\nROCSTORIES_URL = 
\"https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz\"\n\nlogging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',\n                    datefmt = '%m/%d/%Y %H:%M:%S',\n                    level = logging.INFO)\nlogger = logging.getLogger(__name__)\n\ndef accuracy(out, labels):\n    outputs = np.argmax(out, axis=1)\n    return np.sum(outputs == labels)\n\ndef load_rocstories_dataset(dataset_path):\n    \"\"\" Output a list of tuples(story, 1st continuation, 2nd continuation, label) \"\"\"\n    with open(dataset_path, encoding='utf_8') as f:\n        f = csv.reader(f)\n        output = []\n        next(f) # skip the first line\n        for line in tqdm(f):\n            output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))\n    return output\n\ndef pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):\n    \"\"\" Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)\n\n        To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:\n        input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]\n    \"\"\"\n    tensor_datasets = []\n    for dataset in encoded_datasets:\n        n_batch = len(dataset)\n        input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)\n        mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)\n        lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)\n        mc_labels = np.zeros((n_batch,), dtype=np.int64)\n        for i, (story, cont1, cont2, mc_label), in enumerate(dataset):\n            with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]\n            with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]\n            input_ids[i, 
0, :len(with_cont1)] = with_cont1\n            input_ids[i, 1, :len(with_cont2)] = with_cont2\n            mc_token_ids[i, 0] = len(with_cont1) - 1\n            mc_token_ids[i, 1] = len(with_cont2) - 1\n            lm_labels[i, 0, :len(with_cont1)] = with_cont1\n            lm_labels[i, 1, :len(with_cont2)] = with_cont2\n            mc_labels[i] = mc_label\n        all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)\n        tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))\n    return tensor_datasets\n\ndef main():\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--model_name', type=str, default='openai-gpt',\n                        help='pretrained model name')\n    parser.add_argument(\"--do_train\", action='store_true', help=\"Whether to run training.\")\n    parser.add_argument(\"--do_eval\", action='store_true', help=\"Whether to run eval on the dev set.\")\n    parser.add_argument(\"--output_dir\", default=None, type=str, required=True,\n                        help=\"The output directory where the model predictions and checkpoints will be written.\")\n    parser.add_argument('--train_dataset', type=str, default='')\n    parser.add_argument('--eval_dataset', type=str, default='')\n    parser.add_argument('--seed', type=int, default=42)\n    parser.add_argument('--num_train_epochs', type=int, default=3)\n    parser.add_argument('--train_batch_size', type=int, default=8)\n    parser.add_argument('--eval_batch_size', type=int, default=16)\n    parser.add_argument('--max_grad_norm', type=int, default=1)\n    parser.add_argument('--learning_rate', type=float, default=6.25e-5)\n    parser.add_argument('--warmup_proportion', type=float, default=0.002)\n    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')\n    parser.add_argument('--weight_decay', type=float, default=0.01)\n    parser.add_argument('--lm_coef', type=float, default=0.9)\n    parser.add_argument('--n_valid', type=int, default=374)\n\n   
 parser.add_argument('--server_ip', type=str, default='', help=\"Can be used for distant debugging.\")\n    parser.add_argument('--server_port', type=str, default='', help=\"Can be used for distant debugging.\")\n    args = parser.parse_args()\n    print(args)\n\n    if args.server_ip and args.server_port:\n        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script\n        import ptvsd\n        print(\"Waiting for debugger attach\")\n        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)\n        ptvsd.wait_for_attach()\n\n    random.seed(args.seed)\n    np.random.seed(args.seed)\n    torch.manual_seed(args.seed)\n    torch.cuda.manual_seed_all(args.seed)\n\n    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n    n_gpu = torch.cuda.device_count()\n    logger.info(\"device: {}, n_gpu {}\".format(device, n_gpu))\n\n    if not args.do_train and not args.do_eval:\n        raise ValueError(\"At least one of `do_train` or `do_eval` must be True.\")\n\n    if not os.path.exists(args.output_dir):\n        os.makedirs(args.output_dir)\n\n    # Load tokenizer and model\n    # This loading functions also add new tokens and embeddings called `special tokens`\n    # These new embeddings will be fine-tuned on the RocStories dataset\n    special_tokens = ['_start_', '_delimiter_', '_classify_']\n    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)\n    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)\n    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))\n    model.to(device)\n\n    # Load and encode the datasets\n    if not args.train_dataset and not args.eval_dataset:\n        roc_stories = cached_path(ROCSTORIES_URL)\n    def tokenize_and_encode(obj):\n        \"\"\" Tokenize and encode a nested object \"\"\"\n    
    if isinstance(obj, str):\n            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))\n        elif isinstance(obj, int):\n            return obj\n        return list(tokenize_and_encode(o) for o in obj)\n    logger.info(\"Encoding dataset...\")\n    train_dataset = load_rocstories_dataset(args.train_dataset)\n    eval_dataset = load_rocstories_dataset(args.eval_dataset)\n    datasets = (train_dataset, eval_dataset)\n    encoded_datasets = tokenize_and_encode(datasets)\n\n    # Compute the max input length for the Transformer\n    max_length = model.config.n_positions // 2 - 2\n    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \\\n                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)\n    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model\n\n    # Prepare inputs tensors and dataloaders\n    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)\n    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]\n\n    train_data = TensorDataset(*train_tensor_dataset)\n    train_sampler = RandomSampler(train_data)\n    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)\n\n    eval_data = TensorDataset(*eval_tensor_dataset)\n    eval_sampler = SequentialSampler(eval_data)\n    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)\n\n    # Prepare optimizer\n    if args.do_train:\n        param_optimizer = list(model.named_parameters())\n        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n        optimizer_grouped_parameters = [\n            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n            {'params': [p for n, p in param_optimizer if any(nd in n for nd in 
no_decay)], 'weight_decay': 0.0}\n            ]\n        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs\n        optimizer = OpenAIAdam(optimizer_grouped_parameters,\n                               lr=args.learning_rate,\n                               warmup=args.warmup_proportion,\n                               max_grad_norm=args.max_grad_norm,\n                               weight_decay=args.weight_decay,\n                               t_total=num_train_optimization_steps)\n\n    if args.do_train:\n        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None\n        model.train()\n        for _ in trange(int(args.num_train_epochs), desc=\"Epoch\"):\n            tr_loss = 0\n            nb_tr_steps = 0\n            tqdm_bar = tqdm(train_dataloader, desc=\"Training\")\n            for step, batch in enumerate(tqdm_bar):\n                batch = tuple(t.to(device) for t in batch)\n                input_ids, mc_token_ids, lm_labels, mc_labels = batch\n                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)\n                loss = args.lm_coef * losses[0] + losses[1]\n                loss.backward()\n                optimizer.step()\n                optimizer.zero_grad()\n                tr_loss += loss.item()\n                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()\n                nb_tr_steps += 1\n                tqdm_bar.desc = \"Training loss: {:.2e} lr: {:.2e}\".format(exp_average_loss, optimizer.get_lr()[0])\n\n    # Save a trained model\n    if args.do_train:\n        # Save a trained model, configuration and tokenizer\n        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self\n\n        # If we save using the predefined names, we can load using `from_pretrained`\n        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)\n        output_config_file = os.path.join(args.output_dir, 
CONFIG_NAME)\n\n        torch.save(model_to_save.state_dict(), output_model_file)\n        model_to_save.config.to_json_file(output_config_file)\n        tokenizer.save_vocabulary(args.output_dir)\n\n        # Load a trained model and vocabulary that you have fine-tuned\n        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)\n        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)\n        model.to(device)\n\n    if args.do_eval:\n        model.eval()\n        eval_loss, eval_accuracy = 0, 0\n        nb_eval_steps, nb_eval_examples = 0, 0\n        for batch in tqdm(eval_dataloader, desc=\"Evaluating\"):\n            batch = tuple(t.to(device) for t in batch)\n            input_ids, mc_token_ids, lm_labels, mc_labels = batch\n            with torch.no_grad():\n                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)\n                _, mc_logits = model(input_ids, mc_token_ids)\n\n            mc_logits = mc_logits.detach().cpu().numpy()\n            mc_labels = mc_labels.to('cpu').numpy()\n            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)\n\n            eval_loss += mc_loss.mean().item()\n            eval_accuracy += tmp_eval_accuracy\n\n            nb_eval_examples += input_ids.size(0)\n            nb_eval_steps += 1\n\n        eval_loss = eval_loss / nb_eval_steps\n        eval_accuracy = eval_accuracy / nb_eval_examples\n        train_loss = tr_loss/nb_tr_steps if args.do_train else None\n        result = {'eval_loss': eval_loss,\n                  'eval_accuracy': eval_accuracy,\n                  'train_loss': train_loss}\n\n        output_eval_file = os.path.join(args.output_dir, \"eval_results.txt\")\n        with open(output_eval_file, \"w\") as writer:\n            logger.info(\"***** Eval results *****\")\n            for key in sorted(result.keys()):\n                logger.info(\"  %s = %s\", key, str(result[key]))\n                writer.write(\"%s = %s\\n\" % (key, 
str(result[key])))\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/run_squad.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Run BERT on SQuAD.\"\"\"\n\nfrom __future__ import absolute_import, division, print_function\n\nimport argparse\nimport logging\nimport os\nimport random\nimport sys\nfrom io import open\n\nimport numpy as np\nimport torch\nfrom torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,\n                              TensorDataset)\nfrom torch.utils.data.distributed import DistributedSampler\nfrom tqdm import tqdm, trange\n\nfrom tensorboardX import SummaryWriter\n\nfrom pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME\nfrom pytorch_pretrained_bert.modeling import BertForQuestionAnswering\nfrom pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule\nfrom pytorch_pretrained_bert.tokenization import BertTokenizer\n\nfrom run_squad_dataset_utils import read_squad_examples, convert_examples_to_features, RawResult, write_predictions\n\nif sys.version_info[0] == 2:\n    import cPickle as pickle\nelse:\n    import pickle\n\nlogger = logging.getLogger(__name__)\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n\n    ## Required parameters\n    parser.add_argument(\"--bert_model\", default=None, type=str, required=True,\n                        help=\"Bert pre-trained model 
selected in the list: bert-base-uncased, \"\n                        \"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, \"\n                        \"bert-base-multilingual-cased, bert-base-chinese.\")\n    parser.add_argument(\"--output_dir\", default=None, type=str, required=True,\n                        help=\"The output directory where the model checkpoints and predictions will be written.\")\n\n    ## Other parameters\n    parser.add_argument(\"--train_file\", default=None, type=str, help=\"SQuAD json for training. E.g., train-v1.1.json\")\n    parser.add_argument(\"--predict_file\", default=None, type=str,\n                        help=\"SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json\")\n    parser.add_argument(\"--max_seq_length\", default=384, type=int,\n                        help=\"The maximum total input sequence length after WordPiece tokenization. Sequences \"\n                             \"longer than this will be truncated, and sequences shorter than this will be padded.\")\n    parser.add_argument(\"--doc_stride\", default=128, type=int,\n                        help=\"When splitting up a long document into chunks, how much stride to take between chunks.\")\n    parser.add_argument(\"--max_query_length\", default=64, type=int,\n                        help=\"The maximum number of tokens for the question. 
Questions longer than this will \"\n                             \"be truncated to this length.\")\n    parser.add_argument(\"--do_train\", action='store_true', help=\"Whether to run training.\")\n    parser.add_argument(\"--do_predict\", action='store_true', help=\"Whether to run eval on the dev set.\")\n    parser.add_argument(\"--train_batch_size\", default=32, type=int, help=\"Total batch size for training.\")\n    parser.add_argument(\"--predict_batch_size\", default=8, type=int, help=\"Total batch size for predictions.\")\n    parser.add_argument(\"--learning_rate\", default=5e-5, type=float, help=\"The initial learning rate for Adam.\")\n    parser.add_argument(\"--num_train_epochs\", default=3.0, type=float,\n                        help=\"Total number of training epochs to perform.\")\n    parser.add_argument(\"--warmup_proportion\", default=0.1, type=float,\n                        help=\"Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% \"\n                             \"of training.\")\n    parser.add_argument(\"--n_best_size\", default=20, type=int,\n                        help=\"The total number of n-best predictions to generate in the nbest_predictions.json \"\n                             \"output file.\")\n    parser.add_argument(\"--max_answer_length\", default=30, type=int,\n                        help=\"The maximum length of an answer that can be generated. This is needed because the start \"\n                             \"and end predictions are not conditioned on one another.\")\n    parser.add_argument(\"--verbose_logging\", action='store_true',\n                        help=\"If true, all of the warnings related to data processing will be printed. 
\"\n                             \"A number of warnings are expected for a normal SQuAD evaluation.\")\n    parser.add_argument(\"--no_cuda\",\n                        action='store_true',\n                        help=\"Whether not to use CUDA when available\")\n    parser.add_argument('--seed',\n                        type=int,\n                        default=42,\n                        help=\"random seed for initialization\")\n    parser.add_argument('--gradient_accumulation_steps',\n                        type=int,\n                        default=1,\n                        help=\"Number of updates steps to accumulate before performing a backward/update pass.\")\n    parser.add_argument(\"--do_lower_case\",\n                        action='store_true',\n                        help=\"Whether to lower case the input text. True for uncased models, False for cased models.\")\n    parser.add_argument(\"--local_rank\",\n                        type=int,\n                        default=-1,\n                        help=\"local_rank for distributed training on gpus\")\n    parser.add_argument('--fp16',\n                        action='store_true',\n                        help=\"Whether to use 16-bit float precision instead of 32-bit\")\n    parser.add_argument('--overwrite_output_dir',\n                        action='store_true',\n                        help=\"Overwrite the content of the output directory\")\n    parser.add_argument('--loss_scale',\n                        type=float, default=0,\n                        help=\"Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\\n\"\n                             \"0 (default value): dynamic loss scaling.\\n\"\n                             \"Positive power of 2: static loss scaling value.\\n\")\n    parser.add_argument('--version_2_with_negative',\n                        action='store_true',\n                        help='If true, the SQuAD examples contain some that do not have an answer.')\n    parser.add_argument('--null_score_diff_threshold',\n                        type=float, default=0.0,\n                        help=\"If null_score - best_non_null is greater than the threshold predict null.\")\n    parser.add_argument('--server_ip', type=str, default='', help=\"Can be used for distant debugging.\")\n    parser.add_argument('--server_port', type=str, default='', help=\"Can be used for distant debugging.\")\n    args = parser.parse_args()\n    print(args)\n\n    if args.server_ip and args.server_port:\n        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script\n        import ptvsd\n        print(\"Waiting for debugger attach\")\n        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)\n        ptvsd.wait_for_attach()\n\n    if args.local_rank == -1 or args.no_cuda:\n        device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n        n_gpu = torch.cuda.device_count()\n    else:\n        torch.cuda.set_device(args.local_rank)\n        device = torch.device(\"cuda\", args.local_rank)\n        n_gpu = 1\n        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs\n        torch.distributed.init_process_group(backend='nccl')\n\n    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',\n                        datefmt = '%m/%d/%Y %H:%M:%S',\n                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)\n\n    
logger.info(\"device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}\".format(\n        device, n_gpu, bool(args.local_rank != -1), args.fp16))\n\n    if args.gradient_accumulation_steps < 1:\n        raise ValueError(\"Invalid gradient_accumulation_steps parameter: {}, should be >= 1\".format(\n                            args.gradient_accumulation_steps))\n\n    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps\n\n    random.seed(args.seed)\n    np.random.seed(args.seed)\n    torch.manual_seed(args.seed)\n    if n_gpu > 0:\n        torch.cuda.manual_seed_all(args.seed)\n\n    if not args.do_train and not args.do_predict:\n        raise ValueError(\"At least one of `do_train` or `do_predict` must be True.\")\n\n    if args.do_train:\n        if not args.train_file:\n            raise ValueError(\n                \"If `do_train` is True, then `train_file` must be specified.\")\n    if args.do_predict:\n        if not args.predict_file:\n            raise ValueError(\n                \"If `do_predict` is True, then `predict_file` must be specified.\")\n\n    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:\n        raise ValueError(\"Output directory () already exists and is not empty.\")\n    if not os.path.exists(args.output_dir):\n        os.makedirs(args.output_dir)\n\n    if args.local_rank not in [-1, 0]:\n        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab\n    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)\n    model = BertForQuestionAnswering.from_pretrained(args.bert_model)\n    if args.local_rank == 0:\n        torch.distributed.barrier()\n\n    if args.fp16:\n        model.half()\n    model.to(device)\n    if args.local_rank != -1:\n        model = torch.nn.parallel.DistributedDataParallel(model,\n                           
                               device_ids=[args.local_rank],\n                                                          output_device=args.local_rank,\n                                                          find_unused_parameters=True)\n    elif n_gpu > 1:\n        model = torch.nn.DataParallel(model)\n\n    if args.do_train:\n        if args.local_rank in [-1, 0]:\n            tb_writer = SummaryWriter()\n        # Prepare data loader\n        train_examples = read_squad_examples(\n            input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)\n        cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(\n            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))\n        try:\n            with open(cached_train_features_file, \"rb\") as reader:\n                train_features = pickle.load(reader)\n        except:\n            train_features = convert_examples_to_features(\n                examples=train_examples,\n                tokenizer=tokenizer,\n                max_seq_length=args.max_seq_length,\n                doc_stride=args.doc_stride,\n                max_query_length=args.max_query_length,\n                is_training=True)\n            if args.local_rank == -1 or torch.distributed.get_rank() == 0:\n                logger.info(\"  Saving train features into cached file %s\", cached_train_features_file)\n                with open(cached_train_features_file, \"wb\") as writer:\n                    pickle.dump(train_features, writer)\n\n        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)\n        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)\n        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)\n        all_start_positions = torch.tensor([f.start_position for f in 
train_features], dtype=torch.long)\n        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)\n        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,\n                                   all_start_positions, all_end_positions)\n        if args.local_rank == -1:\n            train_sampler = RandomSampler(train_data)\n        else:\n            train_sampler = DistributedSampler(train_data)\n\n        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)\n        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs\n        # if args.local_rank != -1:\n        #     num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()\n\n        # Prepare optimizer\n        param_optimizer = list(model.named_parameters())\n\n        # hack to remove pooler, which is not used\n        # thus it produce None grad that break apex\n        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]\n\n        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n        optimizer_grouped_parameters = [\n            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n            ]\n\n        if args.fp16:\n            try:\n                from apex.optimizers import FP16_Optimizer\n                from apex.optimizers import FusedAdam\n            except ImportError:\n                raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n\n            optimizer = FusedAdam(optimizer_grouped_parameters,\n                                  lr=args.learning_rate,\n                                  bias_correction=False,\n         
                         max_grad_norm=1.0)\n            if args.loss_scale == 0:\n                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)\n            else:\n                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)\n            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,\n                                                 t_total=num_train_optimization_steps)\n        else:\n            optimizer = BertAdam(optimizer_grouped_parameters,\n                                 lr=args.learning_rate,\n                                 warmup=args.warmup_proportion,\n                                 t_total=num_train_optimization_steps)\n\n        global_step = 0\n\n        logger.info(\"***** Running training *****\")\n        logger.info(\"  Num orig examples = %d\", len(train_examples))\n        logger.info(\"  Num split examples = %d\", len(train_features))\n        logger.info(\"  Batch size = %d\", args.train_batch_size)\n        logger.info(\"  Num steps = %d\", num_train_optimization_steps)\n\n        model.train()\n        for epoch in trange(int(args.num_train_epochs), desc=\"Epoch\"):\n            for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\", disable=args.local_rank not in [-1, 0])):\n                if n_gpu == 1:\n                    batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self\n                input_ids, input_mask, segment_ids, start_positions, end_positions = batch\n                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)\n                if n_gpu > 1:\n                    loss = loss.mean() # mean() to average on multi-gpu.\n                if args.gradient_accumulation_steps > 1:\n                    loss = loss / args.gradient_accumulation_steps\n\n                if args.fp16:\n                    optimizer.backward(loss)\n                else:\n                    
loss.backward()\n                if (step + 1) % args.gradient_accumulation_steps == 0:\n                    if args.fp16:\n                        # modify learning rate with special warm up BERT uses\n                        # if args.fp16 is False, BertAdam is used and handles this automatically\n                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)\n                        for param_group in optimizer.param_groups:\n                            param_group['lr'] = lr_this_step\n                    optimizer.step()\n                    optimizer.zero_grad()\n                    global_step += 1\n                    if args.local_rank in [-1, 0]:\n                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)\n                        tb_writer.add_scalar('loss', loss.item(), global_step)\n\n    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):\n        # Save a trained model, configuration and tokenizer\n        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self\n\n        # If we save using the predefined names, we can load using `from_pretrained`\n        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)\n        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)\n\n        torch.save(model_to_save.state_dict(), output_model_file)\n        model_to_save.config.to_json_file(output_config_file)\n        tokenizer.save_vocabulary(args.output_dir)\n\n        # Load a trained model and vocabulary that you have fine-tuned\n        model = BertForQuestionAnswering.from_pretrained(args.output_dir)\n        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)\n\n        # Good practice: save your training arguments together with the trained model\n        output_args_file = os.path.join(args.output_dir, 'training_args.bin')\n        
torch.save(args, output_args_file)\n    else:\n        # Load a trained model and vocabulary that you have fine-tuned\n        model = BertForQuestionAnswering.from_pretrained(args.output_dir)\n        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)\n\n    model.to(device)\n\n    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):\n        eval_examples = read_squad_examples(\n            input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)\n        eval_features = convert_examples_to_features(\n            examples=eval_examples,\n            tokenizer=tokenizer,\n            max_seq_length=args.max_seq_length,\n            doc_stride=args.doc_stride,\n            max_query_length=args.max_query_length,\n            is_training=False)\n\n        logger.info(\"***** Running predictions *****\")\n        logger.info(\"  Num orig examples = %d\", len(eval_examples))\n        logger.info(\"  Num split examples = %d\", len(eval_features))\n        logger.info(\"  Batch size = %d\", args.predict_batch_size)\n\n        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\n        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\n        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\n        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)\n        # Run prediction for full data\n        eval_sampler = SequentialSampler(eval_data)\n        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)\n\n        model.eval()\n        all_results = []\n        logger.info(\"Start evaluating\")\n        for input_ids, input_mask, segment_ids, example_indices in 
tqdm(eval_dataloader, desc=\"Evaluating\", disable=args.local_rank not in [-1, 0]):\n            if len(all_results) % 1000 == 0:\n                logger.info(\"Processing example: %d\" % (len(all_results)))\n            input_ids = input_ids.to(device)\n            input_mask = input_mask.to(device)\n            segment_ids = segment_ids.to(device)\n            with torch.no_grad():\n                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)\n            for i, example_index in enumerate(example_indices):\n                start_logits = batch_start_logits[i].detach().cpu().tolist()\n                end_logits = batch_end_logits[i].detach().cpu().tolist()\n                eval_feature = eval_features[example_index.item()]\n                unique_id = int(eval_feature.unique_id)\n                all_results.append(RawResult(unique_id=unique_id,\n                                             start_logits=start_logits,\n                                             end_logits=end_logits))\n        output_prediction_file = os.path.join(args.output_dir, \"predictions.json\")\n        output_nbest_file = os.path.join(args.output_dir, \"nbest_predictions.json\")\n        output_null_log_odds_file = os.path.join(args.output_dir, \"null_odds.json\")\n        write_predictions(eval_examples, eval_features, all_results,\n                          args.n_best_size, args.max_answer_length,\n                          args.do_lower_case, output_prediction_file,\n                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,\n                          args.version_2_with_negative, args.null_score_diff_threshold)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/run_squad_dataset_utils.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\" Load SQuAD dataset. \"\"\"\n\nfrom __future__ import absolute_import, division, print_function\n\nimport json\nimport logging\nimport math\nimport collections\nfrom io import open\n\nfrom pytorch_pretrained_bert.tokenization import BasicTokenizer, whitespace_tokenize\n\nlogger = logging.getLogger(__name__)\n\n\nclass SquadExample(object):\n    \"\"\"\n    A single training/test example for the Squad dataset.\n    For examples without an answer, the start and end position are -1.\n    \"\"\"\n\n    def __init__(self,\n                 qas_id,\n                 question_text,\n                 doc_tokens,\n                 orig_answer_text=None,\n                 start_position=None,\n                 end_position=None,\n                 is_impossible=None):\n        self.qas_id = qas_id\n        self.question_text = question_text\n        self.doc_tokens = doc_tokens\n        self.orig_answer_text = orig_answer_text\n        self.start_position = start_position\n        self.end_position = end_position\n        self.is_impossible = is_impossible\n\n    def __str__(self):\n        return self.__repr__()\n\n    def __repr__(self):\n        s = \"\"\n        s += \"qas_id: %s\" % (self.qas_id)\n        s += \", question_text: %s\" % 
(\n            self.question_text)\n        s += \", doc_tokens: [%s]\" % (\" \".join(self.doc_tokens))\n        if self.start_position:\n            s += \", start_position: %d\" % (self.start_position)\n        if self.end_position:\n            s += \", end_position: %d\" % (self.end_position)\n        if self.is_impossible:\n            s += \", is_impossible: %r\" % (self.is_impossible)\n        return s\n\n\nclass InputFeatures(object):\n    \"\"\"A single set of features of data.\"\"\"\n\n    def __init__(self,\n                 unique_id,\n                 example_index,\n                 doc_span_index,\n                 tokens,\n                 token_to_orig_map,\n                 token_is_max_context,\n                 input_ids,\n                 input_mask,\n                 segment_ids,\n                 start_position=None,\n                 end_position=None,\n                 is_impossible=None):\n        self.unique_id = unique_id\n        self.example_index = example_index\n        self.doc_span_index = doc_span_index\n        self.tokens = tokens\n        self.token_to_orig_map = token_to_orig_map\n        self.token_is_max_context = token_is_max_context\n        self.input_ids = input_ids\n        self.input_mask = input_mask\n        self.segment_ids = segment_ids\n        self.start_position = start_position\n        self.end_position = end_position\n        self.is_impossible = is_impossible\n\n\ndef read_squad_examples(input_file, is_training, version_2_with_negative):\n    \"\"\"Read a SQuAD json file into a list of SquadExample.\"\"\"\n    with open(input_file, \"r\", encoding='utf-8') as reader:\n        input_data = json.load(reader)[\"data\"]\n\n    def is_whitespace(c):\n        if c == \" \" or c == \"\\t\" or c == \"\\r\" or c == \"\\n\" or ord(c) == 0x202F:\n            return True\n        return False\n\n    examples = []\n    for entry in input_data:\n        for paragraph in entry[\"paragraphs\"]:\n            paragraph_text = 
paragraph[\"context\"]\n            doc_tokens = []\n            char_to_word_offset = []\n            prev_is_whitespace = True\n            for c in paragraph_text:\n                if is_whitespace(c):\n                    prev_is_whitespace = True\n                else:\n                    if prev_is_whitespace:\n                        doc_tokens.append(c)\n                    else:\n                        doc_tokens[-1] += c\n                    prev_is_whitespace = False\n                char_to_word_offset.append(len(doc_tokens) - 1)\n\n            for qa in paragraph[\"qas\"]:\n                qas_id = qa[\"id\"]\n                question_text = qa[\"question\"]\n                start_position = None\n                end_position = None\n                orig_answer_text = None\n                is_impossible = False\n                if is_training:\n                    if version_2_with_negative:\n                        is_impossible = qa[\"is_impossible\"]\n                    if (len(qa[\"answers\"]) != 1) and (not is_impossible):\n                        raise ValueError(\n                            \"For training, each question should have exactly 1 answer.\")\n                    if not is_impossible:\n                        answer = qa[\"answers\"][0]\n                        orig_answer_text = answer[\"text\"]\n                        answer_offset = answer[\"answer_start\"]\n                        answer_length = len(orig_answer_text)\n                        start_position = char_to_word_offset[answer_offset]\n                        end_position = char_to_word_offset[answer_offset + answer_length - 1]\n                        # Only add answers where the text can be exactly recovered from the\n                        # document. 
If this CAN'T happen it's likely due to weird Unicode\n                        # stuff so we will just skip the example.\n                        #\n                        # Note that this means for training mode, every example is NOT\n                        # guaranteed to be preserved.\n                        actual_text = \" \".join(doc_tokens[start_position:(end_position + 1)])\n                        cleaned_answer_text = \" \".join(\n                            whitespace_tokenize(orig_answer_text))\n                        if actual_text.find(cleaned_answer_text) == -1:\n                            logger.warning(\"Could not find answer: '%s' vs. '%s'\",\n                                           actual_text, cleaned_answer_text)\n                            continue\n                    else:\n                        start_position = -1\n                        end_position = -1\n                        orig_answer_text = \"\"\n\n                example = SquadExample(\n                    qas_id=qas_id,\n                    question_text=question_text,\n                    doc_tokens=doc_tokens,\n                    orig_answer_text=orig_answer_text,\n                    start_position=start_position,\n                    end_position=end_position,\n                    is_impossible=is_impossible)\n                examples.append(example)\n    return examples\n\n\ndef convert_examples_to_features(examples, tokenizer, max_seq_length,\n                                 doc_stride, max_query_length, is_training):\n    \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n\n    unique_id = 1000000000\n\n    features = []\n    for (example_index, example) in enumerate(examples):\n        query_tokens = tokenizer.tokenize(example.question_text)\n\n        if len(query_tokens) > max_query_length:\n            query_tokens = query_tokens[0:max_query_length]\n\n        tok_to_orig_index = []\n        orig_to_tok_index = []\n        all_doc_tokens = []\n  
      for (i, token) in enumerate(example.doc_tokens):\n            orig_to_tok_index.append(len(all_doc_tokens))\n            sub_tokens = tokenizer.tokenize(token)\n            for sub_token in sub_tokens:\n                tok_to_orig_index.append(i)\n                all_doc_tokens.append(sub_token)\n\n        tok_start_position = None\n        tok_end_position = None\n        if is_training and example.is_impossible:\n            tok_start_position = -1\n            tok_end_position = -1\n        if is_training and not example.is_impossible:\n            tok_start_position = orig_to_tok_index[example.start_position]\n            if example.end_position < len(example.doc_tokens) - 1:\n                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1\n            else:\n                tok_end_position = len(all_doc_tokens) - 1\n            (tok_start_position, tok_end_position) = _improve_answer_span(\n                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,\n                example.orig_answer_text)\n\n        # The -3 accounts for [CLS], [SEP] and [SEP]\n        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3\n\n        # We can have documents that are longer than the maximum sequence length.\n        # To deal with this we do a sliding window approach, where we take chunks\n        # of the up to our max length with a stride of `doc_stride`.\n        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name\n            \"DocSpan\", [\"start\", \"length\"])\n        doc_spans = []\n        start_offset = 0\n        while start_offset < len(all_doc_tokens):\n            length = len(all_doc_tokens) - start_offset\n            if length > max_tokens_for_doc:\n                length = max_tokens_for_doc\n            doc_spans.append(_DocSpan(start=start_offset, length=length))\n            if start_offset + length == len(all_doc_tokens):\n                break\n            start_offset += min(length, 
doc_stride)\n\n        for (doc_span_index, doc_span) in enumerate(doc_spans):\n            tokens = []\n            token_to_orig_map = {}\n            token_is_max_context = {}\n            segment_ids = []\n            tokens.append(\"[CLS]\")\n            segment_ids.append(0)\n            for token in query_tokens:\n                tokens.append(token)\n                segment_ids.append(0)\n            tokens.append(\"[SEP]\")\n            segment_ids.append(0)\n\n            for i in range(doc_span.length):\n                split_token_index = doc_span.start + i\n                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]\n\n                is_max_context = _check_is_max_context(doc_spans, doc_span_index,\n                                                       split_token_index)\n                token_is_max_context[len(tokens)] = is_max_context\n                tokens.append(all_doc_tokens[split_token_index])\n                segment_ids.append(1)\n            tokens.append(\"[SEP]\")\n            segment_ids.append(1)\n\n            input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n            # The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n            # tokens are attended to.\n            input_mask = [1] * len(input_ids)\n\n            # Zero-pad up to the sequence length.\n            while len(input_ids) < max_seq_length:\n                input_ids.append(0)\n                input_mask.append(0)\n                segment_ids.append(0)\n\n            assert len(input_ids) == max_seq_length\n            assert len(input_mask) == max_seq_length\n            assert len(segment_ids) == max_seq_length\n\n            start_position = None\n            end_position = None\n            if is_training and not example.is_impossible:\n                # For training, if our document chunk does not contain an annotation\n                # we throw it out, since there is nothing to predict.\n                doc_start = doc_span.start\n                doc_end = doc_span.start + doc_span.length - 1\n                out_of_span = False\n                if not (tok_start_position >= doc_start and\n                        tok_end_position <= doc_end):\n                    out_of_span = True\n                if out_of_span:\n                    start_position = 0\n                    end_position = 0\n                else:\n                    doc_offset = len(query_tokens) + 2\n                    start_position = tok_start_position - doc_start + doc_offset\n                    end_position = tok_end_position - doc_start + doc_offset\n            if is_training and example.is_impossible:\n                start_position = 0\n                end_position = 0\n            if example_index < 20:\n                logger.info(\"*** Example ***\")\n                logger.info(\"unique_id: %s\" % (unique_id))\n                logger.info(\"example_index: %s\" % (example_index))\n                logger.info(\"doc_span_index: %s\" % (doc_span_index))\n                logger.info(\"tokens: %s\" % \" \".join(tokens))\n                logger.info(\"token_to_orig_map: %s\" % \" \".join([\n                    \"%d:%d\" % 
(x, y) for (x, y) in token_to_orig_map.items()]))\n                logger.info(\"token_is_max_context: %s\" % \" \".join([\n                    \"%d:%s\" % (x, y) for (x, y) in token_is_max_context.items()\n                ]))\n                logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n                logger.info(\n                    \"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n                logger.info(\n                    \"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n                if is_training and example.is_impossible:\n                    logger.info(\"impossible example\")\n                if is_training and not example.is_impossible:\n                    answer_text = \" \".join(tokens[start_position:(end_position + 1)])\n                    logger.info(\"start_position: %d\" % (start_position))\n                    logger.info(\"end_position: %d\" % (end_position))\n                    logger.info(\n                        \"answer: %s\" % (answer_text))\n\n            features.append(\n                InputFeatures(\n                    unique_id=unique_id,\n                    example_index=example_index,\n                    doc_span_index=doc_span_index,\n                    tokens=tokens,\n                    token_to_orig_map=token_to_orig_map,\n                    token_is_max_context=token_is_max_context,\n                    input_ids=input_ids,\n                    input_mask=input_mask,\n                    segment_ids=segment_ids,\n                    start_position=start_position,\n                    end_position=end_position,\n                    is_impossible=example.is_impossible))\n            unique_id += 1\n\n    return features\n\n\ndef _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,\n                         orig_answer_text):\n    \"\"\"Returns tokenized answer spans that better match the annotated answer.\"\"\"\n\n    # The SQuAD 
annotations are character based. We first project them to\n    # whitespace-tokenized words. But then after WordPiece tokenization, we can\n    # often find a \"better match\". For example:\n    #\n    #   Question: What year was John Smith born?\n    #   Context: The leader was John Smith (1895-1943).\n    #   Answer: 1895\n    #\n    # The original whitespace-tokenized answer will be \"(1895-1943).\". However\n    # after tokenization, our tokens will be \"( 1895 - 1943 ) .\". So we can match\n    # the exact answer, 1895.\n    #\n    # However, this is not always possible. Consider the following:\n    #\n    #   Question: What country is the top exporter of electornics?\n    #   Context: The Japanese electronics industry is the lagest in the world.\n    #   Answer: Japan\n    #\n    # In this case, the annotator chose \"Japan\" as a character sub-span of\n    # the word \"Japanese\". Since our WordPiece tokenizer does not split\n    # \"Japanese\", we just use \"Japanese\" as the annotation. This is fairly rare\n    # in SQuAD, but does happen.\n    tok_answer_text = \" \".join(tokenizer.tokenize(orig_answer_text))\n\n    for new_start in range(input_start, input_end + 1):\n        for new_end in range(input_end, new_start - 1, -1):\n            text_span = \" \".join(doc_tokens[new_start:(new_end + 1)])\n            if text_span == tok_answer_text:\n                return (new_start, new_end)\n\n    return (input_start, input_end)\n\n\ndef _check_is_max_context(doc_spans, cur_span_index, position):\n    \"\"\"Check if this is the 'max context' doc span for the token.\"\"\"\n\n    # Because of the sliding window approach taken to scoring documents, a single\n    # token can appear in multiple documents. 
E.g.\n    #  Doc: the man went to the store and bought a gallon of milk\n    #  Span A: the man went to the\n    #  Span B: to the store and bought\n    #  Span C: and bought a gallon of\n    #  ...\n    #\n    # Now the word 'bought' will have two scores from spans B and C. We only\n    # want to consider the score with \"maximum context\", which we define as\n    # the *minimum* of its left and right context (the *sum* of left and\n    # right context will always be the same, of course).\n    #\n    # In the example the maximum context for 'bought' would be span C since\n    # it has 1 left context and 3 right context, while span B has 4 left context\n    # and 0 right context.\n    best_score = None\n    best_span_index = None\n    for (span_index, doc_span) in enumerate(doc_spans):\n        end = doc_span.start + doc_span.length - 1\n        if position < doc_span.start:\n            continue\n        if position > end:\n            continue\n        num_left_context = position - doc_span.start\n        num_right_context = end - position\n        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length\n        if best_score is None or score > best_score:\n            best_score = score\n            best_span_index = span_index\n\n    return cur_span_index == best_span_index\n\n\nRawResult = collections.namedtuple(\"RawResult\",\n                                   [\"unique_id\", \"start_logits\", \"end_logits\"])\n\n\ndef write_predictions(all_examples, all_features, all_results, n_best_size,\n                      max_answer_length, do_lower_case, output_prediction_file,\n                      output_nbest_file, output_null_log_odds_file, verbose_logging,\n                      version_2_with_negative, null_score_diff_threshold):\n    \"\"\"Write final predictions to the json file and log-odds of null if needed.\"\"\"\n    logger.info(\"Writing predictions to: %s\" % (output_prediction_file))\n    logger.info(\"Writing nbest to: %s\" % 
(output_nbest_file))\n\n    example_index_to_features = collections.defaultdict(list)\n    for feature in all_features:\n        example_index_to_features[feature.example_index].append(feature)\n\n    unique_id_to_result = {}\n    for result in all_results:\n        unique_id_to_result[result.unique_id] = result\n\n    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name\n        \"PrelimPrediction\",\n        [\"feature_index\", \"start_index\", \"end_index\", \"start_logit\", \"end_logit\"])\n\n    all_predictions = collections.OrderedDict()\n    all_nbest_json = collections.OrderedDict()\n    scores_diff_json = collections.OrderedDict()\n\n    for (example_index, example) in enumerate(all_examples):\n        features = example_index_to_features[example_index]\n\n        prelim_predictions = []\n        # keep track of the minimum score of null start+end of position 0\n        score_null = 1000000  # large and positive\n        min_null_feature_index = 0  # the paragraph slice with min null score\n        null_start_logit = 0  # the start logit at the slice with min null score\n        null_end_logit = 0  # the end logit at the slice with min null score\n        for (feature_index, feature) in enumerate(features):\n            result = unique_id_to_result[feature.unique_id]\n            start_indexes = _get_best_indexes(result.start_logits, n_best_size)\n            end_indexes = _get_best_indexes(result.end_logits, n_best_size)\n            # if we could have irrelevant answers, get the min score of irrelevant\n            if version_2_with_negative:\n                feature_null_score = result.start_logits[0] + result.end_logits[0]\n                if feature_null_score < score_null:\n                    score_null = feature_null_score\n                    min_null_feature_index = feature_index\n                    null_start_logit = result.start_logits[0]\n                    null_end_logit = result.end_logits[0]\n            for 
start_index in start_indexes:\n                for end_index in end_indexes:\n                    # We could hypothetically create invalid predictions, e.g., predict\n                    # that the start of the span is in the question. We throw out all\n                    # invalid predictions.\n                    if start_index >= len(feature.tokens):\n                        continue\n                    if end_index >= len(feature.tokens):\n                        continue\n                    if start_index not in feature.token_to_orig_map:\n                        continue\n                    if end_index not in feature.token_to_orig_map:\n                        continue\n                    if not feature.token_is_max_context.get(start_index, False):\n                        continue\n                    if end_index < start_index:\n                        continue\n                    length = end_index - start_index + 1\n                    if length > max_answer_length:\n                        continue\n                    prelim_predictions.append(\n                        _PrelimPrediction(\n                            feature_index=feature_index,\n                            start_index=start_index,\n                            end_index=end_index,\n                            start_logit=result.start_logits[start_index],\n                            end_logit=result.end_logits[end_index]))\n        if version_2_with_negative:\n            prelim_predictions.append(\n                _PrelimPrediction(\n                    feature_index=min_null_feature_index,\n                    start_index=0,\n                    end_index=0,\n                    start_logit=null_start_logit,\n                    end_logit=null_end_logit))\n        prelim_predictions = sorted(\n            prelim_predictions,\n            key=lambda x: (x.start_logit + x.end_logit),\n            reverse=True)\n\n        _NbestPrediction = collections.namedtuple(  # pylint: 
disable=invalid-name\n            \"NbestPrediction\", [\"text\", \"start_logit\", \"end_logit\"])\n\n        seen_predictions = {}\n        nbest = []\n        for pred in prelim_predictions:\n            if len(nbest) >= n_best_size:\n                break\n            feature = features[pred.feature_index]\n            if pred.start_index > 0:  # this is a non-null prediction\n                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]\n                orig_doc_start = feature.token_to_orig_map[pred.start_index]\n                orig_doc_end = feature.token_to_orig_map[pred.end_index]\n                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]\n                tok_text = \" \".join(tok_tokens)\n\n                # De-tokenize WordPieces that have been split off.\n                tok_text = tok_text.replace(\" ##\", \"\")\n                tok_text = tok_text.replace(\"##\", \"\")\n\n                # Clean whitespace\n                tok_text = tok_text.strip()\n                tok_text = \" \".join(tok_text.split())\n                orig_text = \" \".join(orig_tokens)\n\n                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)\n                if final_text in seen_predictions:\n                    continue\n\n                seen_predictions[final_text] = True\n            else:\n                final_text = \"\"\n                seen_predictions[final_text] = True\n\n            nbest.append(\n                _NbestPrediction(\n                    text=final_text,\n                    start_logit=pred.start_logit,\n                    end_logit=pred.end_logit))\n        # if we didn't include the empty option in the n-best, include it\n        if version_2_with_negative:\n            if \"\" not in seen_predictions:\n                nbest.append(\n                    _NbestPrediction(\n                        text=\"\",\n                        
start_logit=null_start_logit,\n                        end_logit=null_end_logit))\n                \n            # In very rare edge cases we could only have single null prediction.\n            # So we just create a nonce prediction in this case to avoid failure.\n            if len(nbest)==1:\n                nbest.insert(0,\n                    _NbestPrediction(text=\"empty\", start_logit=0.0, end_logit=0.0))\n                \n        # In very rare edge cases we could have no valid predictions. So we\n        # just create a nonce prediction in this case to avoid failure.\n        if not nbest:\n            nbest.append(\n                _NbestPrediction(text=\"empty\", start_logit=0.0, end_logit=0.0))\n\n        assert len(nbest) >= 1\n\n        total_scores = []\n        best_non_null_entry = None\n        for entry in nbest:\n            total_scores.append(entry.start_logit + entry.end_logit)\n            if not best_non_null_entry:\n                if entry.text:\n                    best_non_null_entry = entry\n\n        probs = _compute_softmax(total_scores)\n\n        nbest_json = []\n        for (i, entry) in enumerate(nbest):\n            output = collections.OrderedDict()\n            output[\"text\"] = entry.text\n            output[\"probability\"] = probs[i]\n            output[\"start_logit\"] = entry.start_logit\n            output[\"end_logit\"] = entry.end_logit\n            nbest_json.append(output)\n\n        assert len(nbest_json) >= 1\n\n        if not version_2_with_negative:\n            all_predictions[example.qas_id] = nbest_json[0][\"text\"]\n        else:\n            # predict \"\" iff the null score - the score of best non-null > threshold\n            score_diff = score_null - best_non_null_entry.start_logit - (\n                best_non_null_entry.end_logit)\n            scores_diff_json[example.qas_id] = score_diff\n            if score_diff > null_score_diff_threshold:\n                all_predictions[example.qas_id] = \"\"\n  
          else:\n                all_predictions[example.qas_id] = best_non_null_entry.text\n        all_nbest_json[example.qas_id] = nbest_json\n\n    with open(output_prediction_file, \"w\") as writer:\n        writer.write(json.dumps(all_predictions, indent=4) + \"\\n\")\n\n    with open(output_nbest_file, \"w\") as writer:\n        writer.write(json.dumps(all_nbest_json, indent=4) + \"\\n\")\n\n    if version_2_with_negative:\n        with open(output_null_log_odds_file, \"w\") as writer:\n            writer.write(json.dumps(scores_diff_json, indent=4) + \"\\n\")\n\n\ndef get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):\n    \"\"\"Project the tokenized prediction back to the original text.\"\"\"\n\n    # When we created the data, we kept track of the alignment between original\n    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So\n    # now `orig_text` contains the span of our original text corresponding to the\n    # span that we predicted.\n    #\n    # However, `orig_text` may contain extra characters that we don't want in\n    # our prediction.\n    #\n    # For example, let's say:\n    #   pred_text = steve smith\n    #   orig_text = Steve Smith's\n    #\n    # We don't want to return `orig_text` because it contains the extra \"'s\".\n    #\n    # We don't want to return `pred_text` because it's already been normalized\n    # (the SQuAD eval script also does punctuation stripping/lower casing but\n    # our tokenizer does additional normalization like stripping accent\n    # characters).\n    #\n    # What we really want to return is \"Steve Smith\".\n    #\n    # Therefore, we have to apply a semi-complicated alignment heuristic between\n    # `pred_text` and `orig_text` to get a character-to-character alignment. 
This\n    # can fail in certain cases in which case we just return `orig_text`.\n\n    def _strip_spaces(text):\n        ns_chars = []\n        ns_to_s_map = collections.OrderedDict()\n        for (i, c) in enumerate(text):\n            if c == \" \":\n                continue\n            ns_to_s_map[len(ns_chars)] = i\n            ns_chars.append(c)\n        ns_text = \"\".join(ns_chars)\n        return (ns_text, ns_to_s_map)\n\n    # We first tokenize `orig_text`, strip whitespace from the result\n    # and `pred_text`, and check if they are the same length. If they are\n    # NOT the same length, the heuristic has failed. If they are the same\n    # length, we assume the characters are one-to-one aligned.\n    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)\n\n    tok_text = \" \".join(tokenizer.tokenize(orig_text))\n\n    start_position = tok_text.find(pred_text)\n    if start_position == -1:\n        if verbose_logging:\n            logger.info(\n                \"Unable to find text: '%s' in '%s'\" % (pred_text, orig_text))\n        return orig_text\n    end_position = start_position + len(pred_text) - 1\n\n    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)\n    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)\n\n    if len(orig_ns_text) != len(tok_ns_text):\n        if verbose_logging:\n            logger.info(\"Length not equal after stripping spaces: '%s' vs '%s'\",\n                        orig_ns_text, tok_ns_text)\n        return orig_text\n\n    # We then project the characters in `pred_text` back to `orig_text` using\n    # the character-to-character alignment.\n    tok_s_to_ns_map = {}\n    for (i, tok_index) in tok_ns_to_s_map.items():\n        tok_s_to_ns_map[tok_index] = i\n\n    orig_start_position = None\n    if start_position in tok_s_to_ns_map:\n        ns_start_position = tok_s_to_ns_map[start_position]\n        if ns_start_position in orig_ns_to_s_map:\n            orig_start_position = 
orig_ns_to_s_map[ns_start_position]\n\n    if orig_start_position is None:\n        if verbose_logging:\n            logger.info(\"Couldn't map start position\")\n        return orig_text\n\n    orig_end_position = None\n    if end_position in tok_s_to_ns_map:\n        ns_end_position = tok_s_to_ns_map[end_position]\n        if ns_end_position in orig_ns_to_s_map:\n            orig_end_position = orig_ns_to_s_map[ns_end_position]\n\n    if orig_end_position is None:\n        if verbose_logging:\n            logger.info(\"Couldn't map end position\")\n        return orig_text\n\n    output_text = orig_text[orig_start_position:(orig_end_position + 1)]\n    return output_text\n\n\ndef _get_best_indexes(logits, n_best_size):\n    \"\"\"Get the n-best logits from a list.\"\"\"\n    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)\n\n    best_indexes = []\n    for i in range(len(index_and_score)):\n        if i >= n_best_size:\n            break\n        best_indexes.append(index_and_score[i][0])\n    return best_indexes\n\n\ndef _compute_softmax(scores):\n    \"\"\"Compute softmax probability over raw logits.\"\"\"\n    if not scores:\n        return []\n\n    max_score = None\n    for score in scores:\n        if max_score is None or score > max_score:\n            max_score = score\n\n    exp_scores = []\n    total_sum = 0.0\n    for score in scores:\n        x = math.exp(score - max_score)\n        exp_scores.append(x)\n        total_sum += x\n\n    probs = []\n    for score in exp_scores:\n        probs.append(score / total_sum)\n    return probs\n"
  },
  {
    "path": "examples/run_swag.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"BERT finetuning runner.\"\"\"\n\nfrom __future__ import absolute_import\n\nimport argparse\nimport csv\nimport logging\nimport os\nimport random\nimport sys\nfrom io import open\n\nimport numpy as np\nimport torch\nfrom torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,\n                              TensorDataset)\nfrom torch.utils.data.distributed import DistributedSampler\nfrom tqdm import tqdm, trange\n\nfrom pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME\nfrom pytorch_pretrained_bert.modeling import BertForMultipleChoice, BertConfig\nfrom pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule\nfrom pytorch_pretrained_bert.tokenization import BertTokenizer\n\nlogging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',\n                    datefmt = '%m/%d/%Y %H:%M:%S',\n                    level = logging.INFO)\nlogger = logging.getLogger(__name__)\n\n\nclass SwagExample(object):\n    \"\"\"A single training/test example for the SWAG dataset.\"\"\"\n    def __init__(self,\n                 swag_id,\n                 context_sentence,\n                 start_ending,\n                 ending_0,\n                
 ending_1,\n                 ending_2,\n                 ending_3,\n                 label = None):\n        self.swag_id = swag_id\n        self.context_sentence = context_sentence\n        self.start_ending = start_ending\n        self.endings = [\n            ending_0,\n            ending_1,\n            ending_2,\n            ending_3,\n        ]\n        self.label = label\n\n    def __str__(self):\n        return self.__repr__()\n\n    def __repr__(self):\n        l = [\n            \"swag_id: {}\".format(self.swag_id),\n            \"context_sentence: {}\".format(self.context_sentence),\n            \"start_ending: {}\".format(self.start_ending),\n            \"ending_0: {}\".format(self.endings[0]),\n            \"ending_1: {}\".format(self.endings[1]),\n            \"ending_2: {}\".format(self.endings[2]),\n            \"ending_3: {}\".format(self.endings[3]),\n        ]\n\n        if self.label is not None:\n            l.append(\"label: {}\".format(self.label))\n\n        return \", \".join(l)\n\n\nclass InputFeatures(object):\n    def __init__(self,\n                 example_id,\n                 choices_features,\n                 label\n\n    ):\n        self.example_id = example_id\n        self.choices_features = [\n            {\n                'input_ids': input_ids,\n                'input_mask': input_mask,\n                'segment_ids': segment_ids\n            }\n            for _, input_ids, input_mask, segment_ids in choices_features\n        ]\n        self.label = label\n\n\ndef read_swag_examples(input_file, is_training):\n    with open(input_file, 'r', encoding='utf-8') as f:\n        reader = csv.reader(f)\n        lines = []\n        for line in reader:\n            if sys.version_info[0] == 2:\n                line = list(unicode(cell, 'utf-8') for cell in line)\n            lines.append(line)\n\n    if is_training and lines[0][-1] != 'label':\n        raise ValueError(\n            \"For training, the input file must contain a 
label column.\"\n        )\n\n    examples = [\n        SwagExample(\n            swag_id = line[2],\n            context_sentence = line[4],\n            start_ending = line[5], # in the swag dataset, the\n                                         # common beginning of each\n                                         # choice is stored in \"sent2\".\n            ending_0 = line[7],\n            ending_1 = line[8],\n            ending_2 = line[9],\n            ending_3 = line[10],\n            label = int(line[11]) if is_training else None\n        ) for line in lines[1:] # we skip the line with the column names\n    ]\n\n    return examples\n\ndef convert_examples_to_features(examples, tokenizer, max_seq_length,\n                                 is_training):\n    \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n\n    # Swag is a multiple choice task. To perform this task using Bert,\n    # we will use the formatting proposed in \"Improving Language\n    # Understanding by Generative Pre-Training\" and suggested by\n    # @jacobdevlin-google in this issue\n    # https://github.com/google-research/bert/issues/38.\n    #\n    # Each choice will correspond to a sample on which we run the\n    # inference. For a given Swag example, we will create the 4\n    # following inputs:\n    # - [CLS] context [SEP] choice_1 [SEP]\n    # - [CLS] context [SEP] choice_2 [SEP]\n    # - [CLS] context [SEP] choice_3 [SEP]\n    # - [CLS] context [SEP] choice_4 [SEP]\n    # The model will output a single value for each input. 
To get the\n    # final decision of the model, we will run a softmax over these 4\n    # outputs.\n    features = []\n    for example_index, example in enumerate(examples):\n        context_tokens = tokenizer.tokenize(example.context_sentence)\n        start_ending_tokens = tokenizer.tokenize(example.start_ending)\n\n        choices_features = []\n        for ending_index, ending in enumerate(example.endings):\n            # We create a copy of the context tokens in order to be\n            # able to shrink it according to ending_tokens\n            context_tokens_choice = context_tokens[:]\n            ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)\n            # Modifies `context_tokens_choice` and `ending_tokens` in\n            # place so that the total length is less than the\n            # specified length.  Account for [CLS], [SEP], [SEP] with\n            # \"- 3\"\n            _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)\n\n            tokens = [\"[CLS]\"] + context_tokens_choice + [\"[SEP]\"] + ending_tokens + [\"[SEP]\"]\n            segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)\n\n            input_ids = tokenizer.convert_tokens_to_ids(tokens)\n            input_mask = [1] * len(input_ids)\n\n            # Zero-pad up to the sequence length.\n            padding = [0] * (max_seq_length - len(input_ids))\n            input_ids += padding\n            input_mask += padding\n            segment_ids += padding\n\n            assert len(input_ids) == max_seq_length\n            assert len(input_mask) == max_seq_length\n            assert len(segment_ids) == max_seq_length\n\n            choices_features.append((tokens, input_ids, input_mask, segment_ids))\n\n        label = example.label\n        if example_index < 5:\n            logger.info(\"*** Example ***\")\n            logger.info(\"swag_id: {}\".format(example.swag_id))\n            for choice_idx, (tokens, 
input_ids, input_mask, segment_ids) in enumerate(choices_features):\n                logger.info(\"choice: {}\".format(choice_idx))\n                logger.info(\"tokens: {}\".format(' '.join(tokens)))\n                logger.info(\"input_ids: {}\".format(' '.join(map(str, input_ids))))\n                logger.info(\"input_mask: {}\".format(' '.join(map(str, input_mask))))\n                logger.info(\"segment_ids: {}\".format(' '.join(map(str, segment_ids))))\n            if is_training:\n                logger.info(\"label: {}\".format(label))\n\n        features.append(\n            InputFeatures(\n                example_id = example.swag_id,\n                choices_features = choices_features,\n                label = label\n            )\n        )\n\n    return features\n\ndef _truncate_seq_pair(tokens_a, tokens_b, max_length):\n    \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n\n    # This is a simple heuristic which will always truncate the longer sequence\n    # one token at a time. 
This makes more sense than truncating an equal percent\n    # of tokens from each, since if one sequence is very short then each token\n    # that's truncated likely contains more information than a longer sequence.\n    while True:\n        total_length = len(tokens_a) + len(tokens_b)\n        if total_length <= max_length:\n            break\n        if len(tokens_a) > len(tokens_b):\n            tokens_a.pop()\n        else:\n            tokens_b.pop()\n\ndef accuracy(out, labels):\n    outputs = np.argmax(out, axis=1)\n    return np.sum(outputs == labels)\n\ndef select_field(features, field):\n    return [\n        [\n            choice[field]\n            for choice in feature.choices_features\n        ]\n        for feature in features\n    ]\n\ndef main():\n    parser = argparse.ArgumentParser()\n\n    ## Required parameters\n    parser.add_argument(\"--data_dir\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The input data dir. 
Should contain the .csv files (or other data files) for the task.\")\n    parser.add_argument(\"--bert_model\", default=None, type=str, required=True,\n                        help=\"Bert pre-trained model selected in the list: bert-base-uncased, \"\n                        \"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, \"\n                        \"bert-base-multilingual-cased, bert-base-chinese.\")\n    parser.add_argument(\"--output_dir\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The output directory where the model checkpoints will be written.\")\n\n    ## Other parameters\n    parser.add_argument(\"--max_seq_length\",\n                        default=128,\n                        type=int,\n                        help=\"The maximum total input sequence length after WordPiece tokenization. \\n\"\n                             \"Sequences longer than this will be truncated, and sequences shorter \\n\"\n                             \"than this will be padded.\")\n    parser.add_argument(\"--do_train\",\n                        action='store_true',\n                        help=\"Whether to run training.\")\n    parser.add_argument(\"--do_eval\",\n                        action='store_true',\n                        help=\"Whether to run eval on the dev set.\")\n    parser.add_argument(\"--do_lower_case\",\n                        action='store_true',\n                        help=\"Set this flag if you are using an uncased model.\")\n    parser.add_argument(\"--train_batch_size\",\n                        default=32,\n                        type=int,\n                        help=\"Total batch size for training.\")\n    parser.add_argument(\"--eval_batch_size\",\n                        default=8,\n                        type=int,\n                        help=\"Total batch size for eval.\")\n    
parser.add_argument(\"--learning_rate\",\n                        default=5e-5,\n                        type=float,\n                        help=\"The initial learning rate for Adam.\")\n    parser.add_argument(\"--num_train_epochs\",\n                        default=3.0,\n                        type=float,\n                        help=\"Total number of training epochs to perform.\")\n    parser.add_argument(\"--warmup_proportion\",\n                        default=0.1,\n                        type=float,\n                        help=\"Proportion of training to perform linear learning rate warmup for. \"\n                             \"E.g., 0.1 = 10%% of training.\")\n    parser.add_argument(\"--no_cuda\",\n                        action='store_true',\n                        help=\"Whether not to use CUDA when available\")\n    parser.add_argument(\"--local_rank\",\n                        type=int,\n                        default=-1,\n                        help=\"local_rank for distributed training on gpus\")\n    parser.add_argument('--seed',\n                        type=int,\n                        default=42,\n                        help=\"random seed for initialization\")\n    parser.add_argument('--gradient_accumulation_steps',\n                        type=int,\n                        default=1,\n                        help=\"Number of updates steps to accumulate before performing a backward/update pass.\")\n    parser.add_argument('--fp16',\n                        action='store_true',\n                        help=\"Whether to use 16-bit float precision instead of 32-bit\")\n    parser.add_argument('--loss_scale',\n                        type=float, default=0,\n                        help=\"Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\\n\"\n                             \"0 (default value): dynamic loss scaling.\\n\"\n                             \"Positive power of 2: static loss scaling value.\\n\")\n\n    args = parser.parse_args()\n\n    if args.local_rank == -1 or args.no_cuda:\n        device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n        n_gpu = torch.cuda.device_count()\n    else:\n        torch.cuda.set_device(args.local_rank)\n        device = torch.device(\"cuda\", args.local_rank)\n        n_gpu = 1\n        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs\n        torch.distributed.init_process_group(backend='nccl')\n    logger.info(\"device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}\".format(\n        device, n_gpu, bool(args.local_rank != -1), args.fp16))\n\n    if args.gradient_accumulation_steps < 1:\n        raise ValueError(\"Invalid gradient_accumulation_steps parameter: {}, should be >= 1\".format(\n                            args.gradient_accumulation_steps))\n\n    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps\n\n    random.seed(args.seed)\n    np.random.seed(args.seed)\n    torch.manual_seed(args.seed)\n    if n_gpu > 0:\n        torch.cuda.manual_seed_all(args.seed)\n\n    if not args.do_train and not args.do_eval:\n        raise ValueError(\"At least one of `do_train` or `do_eval` must be True.\")\n\n    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):\n        raise ValueError(\"Output directory ({}) already exists and is not empty.\".format(args.output_dir))\n    if not os.path.exists(args.output_dir):\n        os.makedirs(args.output_dir)\n\n    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)\n\n    # Prepare model\n    model = BertForMultipleChoice.from_pretrained(args.bert_model,\n        
cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)),\n        num_choices=4)\n    if args.fp16:\n        model.half()\n    model.to(device)\n    if args.local_rank != -1:\n        try:\n            from apex.parallel import DistributedDataParallel as DDP\n        except ImportError:\n            raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n\n        model = DDP(model)\n    elif n_gpu > 1:\n        model = torch.nn.DataParallel(model)\n\n    if args.do_train:\n\n        # Prepare data loader\n\n        train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)\n        train_features = convert_examples_to_features(\n            train_examples, tokenizer, args.max_seq_length, True)\n        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)\n        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)\n        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)\n        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)\n        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)\n        if args.local_rank == -1:\n            train_sampler = RandomSampler(train_data)\n        else:\n            train_sampler = DistributedSampler(train_data)\n        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)\n\n        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs\n        if args.local_rank != -1:\n            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()\n\n        # Prepare optimizer\n\n        param_optimizer = list(model.named_parameters())\n\n        # hack to 
remove pooler, which is not used\n        # thus it produce None grad that break apex\n        param_optimizer = [n for n in param_optimizer]\n\n        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n        optimizer_grouped_parameters = [\n            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n            ]\n        if args.fp16:\n            try:\n                from apex.optimizers import FP16_Optimizer\n                from apex.optimizers import FusedAdam\n            except ImportError:\n                raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n\n            optimizer = FusedAdam(optimizer_grouped_parameters,\n                                  lr=args.learning_rate,\n                                  bias_correction=False,\n                                  max_grad_norm=1.0)\n            if args.loss_scale == 0:\n                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)\n            else:\n                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)\n            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,\n                                                 t_total=num_train_optimization_steps)\n        else:\n            optimizer = BertAdam(optimizer_grouped_parameters,\n                                 lr=args.learning_rate,\n                                 warmup=args.warmup_proportion,\n                                 t_total=num_train_optimization_steps)\n\n        global_step = 0\n\n        logger.info(\"***** Running training *****\")\n        logger.info(\"  Num examples = %d\", len(train_examples))\n        logger.info(\"  Batch size = %d\", args.train_batch_size)\n        logger.info(\"  Num steps = %d\", 
num_train_optimization_steps)\n\n        model.train()\n        for _ in trange(int(args.num_train_epochs), desc=\"Epoch\"):\n            tr_loss = 0\n            nb_tr_examples, nb_tr_steps = 0, 0\n            for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")):\n                batch = tuple(t.to(device) for t in batch)\n                input_ids, input_mask, segment_ids, label_ids = batch\n                loss = model(input_ids, segment_ids, input_mask, label_ids)\n                if n_gpu > 1:\n                    loss = loss.mean() # mean() to average on multi-gpu.\n                if args.fp16 and args.loss_scale != 1.0:\n                    # rescale loss for fp16 training\n                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html\n                    loss = loss * args.loss_scale\n                if args.gradient_accumulation_steps > 1:\n                    loss = loss / args.gradient_accumulation_steps\n                tr_loss += loss.item()\n                nb_tr_examples += input_ids.size(0)\n                nb_tr_steps += 1\n\n                if args.fp16:\n                    optimizer.backward(loss)\n                else:\n                    loss.backward()\n                if (step + 1) % args.gradient_accumulation_steps == 0:\n                    if args.fp16:\n                        # modify learning rate with special warm up BERT uses\n                        # if args.fp16 is False, BertAdam is used that handles this automatically\n                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)\n                        for param_group in optimizer.param_groups:\n                            param_group['lr'] = lr_this_step\n                    optimizer.step()\n                    optimizer.zero_grad()\n                    global_step += 1\n\n\n    if args.do_train:\n        # Save a trained model, configuration and tokenizer\n 
       model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self\n\n        # If we save using the predefined names, we can load using `from_pretrained`\n        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)\n        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)\n\n        torch.save(model_to_save.state_dict(), output_model_file)\n        model_to_save.config.to_json_file(output_config_file)\n        tokenizer.save_vocabulary(args.output_dir)\n\n        # Load a trained model and vocabulary that you have fine-tuned\n        model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4)\n        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)\n    else:\n        model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)\n    model.to(device)\n\n\n    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):\n        eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)\n        eval_features = convert_examples_to_features(\n            eval_examples, tokenizer, args.max_seq_length, True)\n        logger.info(\"***** Running evaluation *****\")\n        logger.info(\"  Num examples = %d\", len(eval_examples))\n        logger.info(\"  Batch size = %d\", args.eval_batch_size)\n        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)\n        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)\n        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)\n        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)\n        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)\n        # Run prediction for full data\n        eval_sampler = SequentialSampler(eval_data)\n        
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)\n\n        model.eval()\n        eval_loss, eval_accuracy = 0, 0\n        nb_eval_steps, nb_eval_examples = 0, 0\n        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc=\"Evaluating\"):\n            input_ids = input_ids.to(device)\n            input_mask = input_mask.to(device)\n            segment_ids = segment_ids.to(device)\n            label_ids = label_ids.to(device)\n\n            with torch.no_grad():\n                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)\n                logits = model(input_ids, segment_ids, input_mask)\n\n            logits = logits.detach().cpu().numpy()\n            label_ids = label_ids.to('cpu').numpy()\n            tmp_eval_accuracy = accuracy(logits, label_ids)\n\n            eval_loss += tmp_eval_loss.mean().item()\n            eval_accuracy += tmp_eval_accuracy\n\n            nb_eval_examples += input_ids.size(0)\n            nb_eval_steps += 1\n\n        eval_loss = eval_loss / nb_eval_steps\n        eval_accuracy = eval_accuracy / nb_eval_examples\n\n        result = {'eval_loss': eval_loss,\n                  'eval_accuracy': eval_accuracy,\n                  'global_step': global_step,\n                  'loss': tr_loss/global_step}\n\n        output_eval_file = os.path.join(args.output_dir, \"eval_results.txt\")\n        with open(output_eval_file, \"w\") as writer:\n            logger.info(\"***** Eval results *****\")\n            for key in sorted(result.keys()):\n                logger.info(\"  %s = %s\", key, str(result[key]))\n                writer.write(\"%s = %s\\n\" % (key, str(result[key])))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/run_transfo_xl.py",
    "content": "# coding=utf-8\n# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\" PyTorch Transformer XL model evaluation script.\n    Adapted from https://github.com/kimiyoung/transformer-xl.\n    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py\n\n    This script with default values evaluates a pretrained Transformer-XL on WikiText 103\n\"\"\"\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport argparse\nimport logging\nimport time\nimport math\n\nimport torch\n\nfrom pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer\n\nlogging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',\n                    datefmt = '%m/%d/%Y %H:%M:%S',\n                    level = logging.INFO)\nlogger = logging.getLogger(__name__)\n\ndef main():\n    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')\n    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',\n                        help='pretrained model name')\n    parser.add_argument('--split', type=str, default='test',\n                        choices=['all', 'valid', 'test'],\n                        help='which split to evaluate')\n    
parser.add_argument('--batch_size', type=int, default=10,\n                        help='batch size')\n    parser.add_argument('--tgt_len', type=int, default=128,\n                        help='number of tokens to predict')\n    parser.add_argument('--ext_len', type=int, default=0,\n                        help='length of the extended context')\n    parser.add_argument('--mem_len', type=int, default=1600,\n                        help='length of the retained previous heads')\n    parser.add_argument('--clamp_len', type=int, default=1000,\n                        help='max positional embedding index')\n    parser.add_argument('--no_cuda', action='store_true',\n                        help='Do not use CUDA even though CUA is available')\n    parser.add_argument('--work_dir', type=str, required=True,\n                        help='path to the work_dir')\n    parser.add_argument('--no_log', action='store_true',\n                        help='do not log the eval result')\n    parser.add_argument('--same_length', action='store_true',\n                        help='set same length attention with masking')\n    parser.add_argument('--server_ip', type=str, default='', help=\"Can be used for distant debugging.\")\n    parser.add_argument('--server_port', type=str, default='', help=\"Can be used for distant debugging.\")\n    args = parser.parse_args()\n    assert args.ext_len >= 0, 'extended context length must be non-negative'\n\n    if args.server_ip and args.server_port:\n        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script\n        import ptvsd\n        print(\"Waiting for debugger attach\")\n        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)\n        ptvsd.wait_for_attach()\n\n    device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n    logger.info(\"device: {}\".format(device))\n\n    # Load a pre-processed dataset\n    # You 
can also build the corpus yourself using TransfoXLCorpus methods\n    # The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax\n    # and tokenizing the dataset\n    # The pre-processed corpus is a convertion (using the conversion script )\n    tokenizer = TransfoXLTokenizer.from_pretrained(args.model_name)\n    corpus = TransfoXLCorpus.from_pretrained(args.model_name)\n    ntokens = len(corpus.vocab)\n\n    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,\n        device=device, ext_len=args.ext_len)\n    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,\n        device=device, ext_len=args.ext_len)\n\n    # Load a pre-trained model\n    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)\n    model = model.to(device)\n\n    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(\n        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))\n\n    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)\n    if args.clamp_len > 0:\n        model.clamp_len = args.clamp_len\n    if args.same_length:\n        model.same_length = True\n\n    ###############################################################################\n    # Evaluation code\n    ###############################################################################\n    def evaluate(eval_iter):\n        # Turn on evaluation mode which disables dropout.\n        model.eval()\n        total_len, total_loss = 0, 0.\n        start_time = time.time()\n        with torch.no_grad():\n            mems = None\n            for idx, (data, target, seq_len) in enumerate(eval_iter):\n                ret = model(data, target, mems)\n                loss, mems = ret\n                loss = loss.mean()\n                total_loss += seq_len * loss.item()\n                total_len += seq_len\n            total_time = time.time() - start_time\n        logger.info('Time : 
{:.2f}s, {:.2f}ms/segment'.format(\n                total_time, 1000 * total_time / (idx+1)))\n        return total_loss / total_len\n\n    # Run on test data.\n    if args.split == 'all':\n        test_loss = evaluate(te_iter)\n        valid_loss = evaluate(va_iter)\n    elif args.split == 'valid':\n        valid_loss = evaluate(va_iter)\n        test_loss = None\n    elif args.split == 'test':\n        test_loss = evaluate(te_iter)\n        valid_loss = None\n\n    def format_log(loss, split):\n        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(\n            split, loss, math.exp(loss))\n        return log_str\n\n    log_str = ''\n    if valid_loss is not None:\n        log_str += format_log(valid_loss, 'valid')\n    if test_loss is not None:\n        log_str += format_log(test_loss, 'test')\n\n    logger.info('=' * 100)\n    logger.info(log_str)\n    logger.info('=' * 100)\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/sem_run_classifier.py",
    "content": "#coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"BERT finetuning runner.\"\"\"\n\nfrom __future__ import absolute_import, division, print_function\n\nimport argparse\nimport csv\nimport logging\nimport os\nimport random\n\nimport sys\nsys.path.append('..')\n\nimport copy\n\nimport numpy as np\nimport torch\nfrom torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,\n                              TensorDataset)\nfrom torch.utils.data.distributed import DistributedSampler\nfrom tqdm import tqdm, trange\n\nfrom torch.nn import CrossEntropyLoss, MSELoss\nfrom scipy.stats import pearsonr, spearmanr\nfrom sklearn.metrics import matthews_corrcoef, f1_score, classification_report\n\n\nfrom pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME\nfrom pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig\nfrom pytorch_pretrained_bert.tokenization import BertTokenizer\nfrom pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule\n\nlogger = logging.getLogger(__name__)\n\n\nclass InputExample(object):\n    \"\"\"A single training/test example for simple sequence classification.\"\"\"\n\n    def __init__(self, guid, text_a, text_b=None, label=None, entity_pos=None):\n        
\"\"\"Constructs a InputExample.\n\n        Args:\n            guid: Unique id for the example.\n            text_a: string. The untokenized text of the first sequence. For single\n            sequence tasks, only this sequence must be specified.\n            text_b: (Optional) string. The untokenized text of the second sequence.\n            Only must be specified for sequence pair tasks.\n            label: (Optional) string. The label of the example. This should be\n            specified for train and dev examples, but not for test examples.\n        \"\"\"\n        self.guid = guid\n        self.text_a = text_a\n        self.text_b = text_b\n        self.label = label\n        self.entity_pos = entity_pos\n\nclass InputFeatures(object):\n    \"\"\"A single set of features of data.\"\"\"\n\n    def __init__(self, input_ids, input_mask, segment_ids, label_id, entity_mask=None, entity_seg_pos=None, entity_span1_pos=None, entity_span2_pos=None):\n        self.input_ids = input_ids\n        self.input_mask = input_mask\n        self.segment_ids = segment_ids\n        self.label_id = label_id\n        self.entity_mask = entity_mask\n        self.entity_seg_pos = entity_seg_pos\n        self.entity_span1_pos = entity_span1_pos\n        self.entity_span2_pos = entity_span2_pos\n\n\nclass DataProcessor(object):\n    \"\"\"Base class for data converters for sequence classification data sets.\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"Gets a collection of `InputExample`s for the train set.\"\"\"\n        raise NotImplementedError()\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"Gets a collection of `InputExample`s for the dev set.\"\"\"\n        raise NotImplementedError()\n\n    def get_labels(self):\n        \"\"\"Gets the list of labels for this data set.\"\"\"\n        raise NotImplementedError()\n\n    @classmethod\n    def _read_tsv(cls, input_file, quotechar=None):\n        \"\"\"Reads a tab separated value file.\"\"\"\n        
with open(input_file, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.reader(f, delimiter=\"\\t\", quotechar=quotechar)\n            lines = []\n            for line in reader:\n                if sys.version_info[0] == 2:\n                    line = list(unicode(cell, 'utf-8') for cell in line)\n                lines.append(line)\n            return lines\n\n\nclass MrpcProcessor(DataProcessor):\n    \"\"\"Processor for the MRPC data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        logger.info(\"LOOKING AT {}\".format(os.path.join(data_dir, \"train.tsv\")))\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = line[3]\n            text_b = line[4]\n            label = line[0]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\nclass SemProcessor(DataProcessor):\n    \"\"\"Processor for the SemEval 2010 Task 8 dataset.\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        logger.info(\"LOOKING AT {}\".format(os.path.join(data_dir, \"train.jsonl\")))\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.jsonl\")), \"train\")\n\n    def get_dev_examples(self, 
data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"test.jsonl\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return ['Message-Topic(e2,e1)', 'Instrument-Agency(e2,e1)', 'Entity-Origin(e2,e1)', 'Member-Collection(e1,e2)', 'Member-Collection(e2,e1)', 'Other', 'Component-Whole(e1,e2)', 'Product-Producer(e2,e1)', 'Component-Whole(e2,e1)', 'Entity-Destination(e2,e1)', 'Content-Container(e2,e1)', 'Entity-Destination(e1,e2)', 'Instrument-Agency(e1,e2)', 'Cause-Effect(e2,e1)', 'Entity-Origin(e1,e2)', 'Product-Producer(e1,e2)', 'Cause-Effect(e1,e2)', 'Message-Topic(e1,e2)', 'Content-Container(e1,e2)']\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        import json\n        examples = []\n        for (i, line) in enumerate(lines):\n            guid = \"%s-%s\" % (set_type, i)\n            line = json.loads(line[0])\n            text_a = ' '.join(line['tokens'])\n            label = line['label']\n            entity_pos = line['entities']\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, label=label, entity_pos = entity_pos))\n        return examples\n\n\nclass MnliProcessor(DataProcessor):\n    \"\"\"Processor for the MultiNLI data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev_matched.tsv\")),\n            \"dev_matched\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"contradiction\", \"entailment\", \"neutral\"]\n\n    def 
_create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[8]\n            text_b = line[9]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass MnliMismatchedProcessor(MnliProcessor):\n    \"\"\"Processor for the MultiNLI Mismatched data set (GLUE version).\"\"\"\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev_mismatched.tsv\")),\n            \"dev_matched\")\n\n\nclass ColaProcessor(DataProcessor):\n    \"\"\"Processor for the CoLA data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = line[3]\n            label = line[1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))\n        return examples\n\n\nclass Sst2Processor(DataProcessor):\n    \"\"\"Processor for the SST-2 data set (GLUE 
version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = line[0]\n            label = line[1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))\n        return examples\n\n\nclass StsbProcessor(DataProcessor):\n    \"\"\"Processor for the STS-B data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [None]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[7]\n            text_b = line[8]\n            label = line[-1]\n            examples.append(\n   
             InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass QqpProcessor(DataProcessor):\n    \"\"\"Processor for the QQP data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            try:\n                text_a = line[3]\n                text_b = line[4]\n                label = line[5]\n            except IndexError:\n                continue\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass QnliProcessor(DataProcessor):\n    \"\"\"Processor for the QNLI data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \n            \"dev_matched\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"entailment\", \"not_entailment\"]\n\n    def _create_examples(self, lines, 
set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[1]\n            text_b = line[2]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass RteProcessor(DataProcessor):\n    \"\"\"Processor for the RTE data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"entailment\", \"not_entailment\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[1]\n            text_b = line[2]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass WnliProcessor(DataProcessor):\n    \"\"\"Processor for the WNLI data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base 
class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[1]\n            text_b = line[2]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\ndef convert_examples_to_features(examples, label_list, max_seq_length,\n                                 tokenizer, output_mode):\n    \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n\n    label_map = {label : i for i, label in enumerate(label_list)}\n    features = []\n    for (ex_index, example) in enumerate(examples):\n        if ex_index % 10000 == 0:\n            logger.info(\"Writing example %d of %d\" % (ex_index, len(examples)))\n        old_entity_pos = copy.deepcopy(example.entity_pos)\n        tokens_a, new_entity_pos = tokenizer.tokenize(example.text_a,example.entity_pos)\n        \n        old_entity0 = ''.join(example.text_a.split()[old_entity_pos[0][0]:old_entity_pos[0][1]])\n        old_entity1 = ''.join(example.text_a.split()[old_entity_pos[1][0]:old_entity_pos[1][1]])\n        new_entity0 = ''.join(tokens_a[new_entity_pos[0][0]:new_entity_pos[0][1]])\n        new_entity1 = ''.join(tokens_a[new_entity_pos[1][0]:new_entity_pos[1][1]])\n        \n        old_entity0 = old_entity0.lower()\n        old_entity1 = old_entity1.lower()\n\n        if '##' in new_entity0 or '##' in new_entity1:\n            new_entity0 = new_entity0.replace('#','')\n            new_entity1 = new_entity1.replace('#','')\n        
\n        try:\n            assert(old_entity0 == new_entity0)\n            assert(old_entity1 == new_entity1)\n        except:\n            import pdb;pdb.set_trace()\n        \n        # Entity marker\n        tokens_a_ = copy.deepcopy(tokens_a) \n        new_entity_pos_ = copy.deepcopy(new_entity_pos)\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        \n        tokens_a.insert(entity1_start, '<s1>') \n        new_entity_pos[0][0] = entity1_start\n        tokens_a.insert(entity1_end+1, '<e1>')\n        new_entity_pos[0][1] = entity1_end+1+1\n        tokens_a.insert(entity2_start+2, '<s2>')\n        new_entity_pos[1][0] = entity2_start+2\n        tokens_a.insert(entity2_end+3,'<e2>')\n        new_entity_pos[1][1] = entity2_end+3+1\n\n        if new_entity_pos[1][1] > max_seq_length - 2 - 1:\n            import pdb;pdb.set_trace()\n            \n        tokens_b = None\n        if example.text_b:\n            tokens_b = tokenizer.tokenize(example.text_b)\n            # Modifies `tokens_a` and `tokens_b` in place so that the total\n            # length is less than the specified length.\n            # Account for [CLS], [SEP], [SEP] with \"- 3\"\n            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n        else:\n            # Account for [CLS] and [SEP] with \"- 2\"\n            if len(tokens_a) > max_seq_length - 2:\n                tokens_a = tokens_a[:(max_seq_length - 2)]\n\n        # The convention in BERT is:\n        # (a) For sequence pairs:\n        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1\n        # (b) For single sequences:\n        #  tokens:   [CLS] the dog is hairy . 
[SEP]\n        #  type_ids: 0   0   0   0  0     0 0\n        #\n        # Where \"type_ids\" are used to indicate whether this is the first\n        # sequence or the second sequence. The embedding vectors for `type=0` and\n        # `type=1` were learned during pre-training and are added to the wordpiece\n        # embedding vector (and position vector). This is not *strictly* necessary\n        # since the [SEP] token unambiguously separates the sequences, but it makes\n        # it easier for the model to learn the concept of sequences.\n        #\n        # For classification tasks, the first vector (corresponding to [CLS]) is\n        # used as as the \"sentence vector\". Note that this only makes sense because\n        # the entire model is fine-tuned.\n        tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n        segment_ids = [0] * len(tokens)\n\n        if tokens_b:\n            tokens += tokens_b + [\"[SEP]\"]\n            segment_ids += [1] * (len(tokens_b) + 1)\n\n        input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n        # The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n        # tokens are attended to.\n        input_mask = [1] * len(input_ids)\n\n        # Zero-pad up to the sequence length.\n        padding = [0] * (max_seq_length - len(input_ids))\n        input_ids += padding\n        input_mask += padding\n        segment_ids += padding\n        \n\n        # Used for mention pooling\n        entity_mask_tag = 1\n        entity_mask = [0] * len(input_ids)\n        for entity in new_entity_pos:\n            start, end = entity[0],entity[1]\n            for i in range(start, end):\n                # [CLS], need to +1 offset\n                entity_mask[i+1] = entity_mask_tag\n        \n        \"\"\"\n            Different position embedding\n        \"\"\"\n        # Strategy 1\n        entity1_pos_tag = 1\n        entity2_pos_tag = 2\n\n        entity_seg_pos = [0] * len(input_ids)\n\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        for i in range(entity1_start, entity1_end):\n            entity_seg_pos[i+1] = entity1_pos_tag\n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        for i in range(entity2_start, entity2_end):\n            entity_seg_pos[i+1] = entity2_pos_tag\n        \n        # Strategy 2\n        entity_start_pos_tag = 1\n        entity_seg_pos_ = [0] * len(input_ids)\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        entity_seg_pos_[entity1_start+1] = entity_start_pos_tag\n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        entity_seg_pos_[entity2_start+1] = entity_start_pos_tag\n\n        # Strategy 3\n        entity_span1_pos = [0] * len(input_ids)\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        for i in range(len(entity_span1_pos)):\n            if i < entity1_start:\n                #entity_span1_pos[i] = np.abs(i - entity1_start)\n                entity_span1_pos[i] = i - entity1_start\n           
 elif entity1_start <= i and i < entity1_end:\n                entity_span1_pos[i] = 0\n            elif i >= entity1_end:\n                entity_span1_pos[i] = i - entity1_end + 1\n        \n        entity_span2_pos = [0] * len(input_ids)\n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        for i in range(len(entity_span2_pos)):\n            if i < entity2_start:\n                #entity_span2_pos[i] = np.abs(i - entity2_start)\n                entity_span2_pos[i] = i - entity2_start\n            elif entity2_start <= i and i < entity2_end:\n                entity_span2_pos[i] = 0\n            elif i >= entity2_end:\n                entity_span2_pos[i] = i - entity2_end + 1\n\n        # Avoid to get negative position to fuck the nn.Embedding\n        #entity_span1_pos = [pos+max_seq_length-1 for pos in entity_span1_pos]\n        #entity_span2_pos = [pos+max_seq_length-1 for pos in entity_span2_pos]\n        \n        assert len(input_ids) == max_seq_length\n        assert len(input_mask) == max_seq_length\n        assert len(segment_ids) == max_seq_length\n        assert len(entity_mask) == max_seq_length\n        assert len(entity_seg_pos) == max_seq_length\n        assert len(entity_seg_pos_) == max_seq_length\n        assert len(entity_span1_pos) == max_seq_length\n        assert len(entity_span2_pos) == max_seq_length\n        if output_mode == \"classification\":\n            label_id = label_map[example.label]\n        elif output_mode == \"regression\":\n            label_id = float(example.label)\n        else:\n            raise KeyError(output_mode)\n\n        if ex_index < 5:\n            logger.info(\"*** Example ***\")\n            logger.info(\"guid: %s\" % (example.guid))\n            logger.info(\"tokens: %s\" % \" \".join(\n                    [str(x) for x in tokens]))\n            logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n            logger.info(\"input_mask: %s\" % \" 
\".join([str(x) for x in input_mask]))\n            logger.info(\"entity_mask: %s\" % \" \".join([str(x) for x in entity_mask]))\n            logger.info(\"entity_seg_pos: %s\" % \" \".join([str(x) for x in entity_seg_pos]))\n            logger.info(\"entity_seg_pos_: %s\" % \" \".join([str(x) for x in entity_seg_pos_]))\n            logger.info(\"entity_span1_pos: %s\" % \" \".join([str(x) for x in entity_span1_pos]))\n            logger.info(\"entity_span2_pos: %s\" % \" \".join([str(x) for x in entity_span2_pos]))\n            logger.info(\n                    \"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n            logger.info(\"label: %s (id = %d)\" % (example.label, label_id))\n        \n        #if example.guid == 'train-3':\n        #    import pdb;pdb.set_trace()\n\n        features.append(\n                InputFeatures(input_ids=input_ids,\n                              input_mask=input_mask,\n                              segment_ids=segment_ids,\n                              label_id=label_id,\n                              entity_mask=entity_mask,\n                              entity_seg_pos=entity_seg_pos_,\n                              entity_span1_pos=entity_span1_pos,\n                              entity_span2_pos=entity_span2_pos))\n    return features\n\n\ndef _truncate_seq_pair(tokens_a, tokens_b, max_length):\n    \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n\n    # This is a simple heuristic which will always truncate the longer sequence\n    # one token at a time. 
This makes more sense than truncating an equal percent\n    # of tokens from each, since if one sequence is very short then each token\n    # that's truncated likely contains more information than a longer sequence.\n    while True:\n        total_length = len(tokens_a) + len(tokens_b)\n        if total_length <= max_length:\n            break\n        if len(tokens_a) > len(tokens_b):\n            tokens_a.pop()\n        else:\n            tokens_b.pop()\n\n\ndef simple_accuracy(preds, labels):\n    return (preds == labels).mean()\n\n\ndef acc_and_f1(preds, labels):\n    acc = simple_accuracy(preds, labels)\n    f1 = f1_score(y_true=labels, y_pred=preds,average='micro')\n    report = classification_report(labels, preds)\n    return {\n        \"acc\": acc,\n        \"f1\": f1,\n        \"acc_and_f1\": (acc + f1) / 2,\n        \"report\": report\n    }\n\n\ndef pearson_and_spearman(preds, labels):\n    pearson_corr = pearsonr(preds, labels)[0]\n    spearman_corr = spearmanr(preds, labels)[0]\n    return {\n        \"pearson\": pearson_corr,\n        \"spearmanr\": spearman_corr,\n        \"corr\": (pearson_corr + spearman_corr) / 2,\n    }\n\n\ndef compute_metrics(task_name, preds, labels):\n    assert len(preds) == len(labels)\n    if task_name == \"cola\":\n        return {\"mcc\": matthews_corrcoef(labels, preds)}\n    elif task_name == \"sst-2\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"mrpc\":\n        return acc_and_f1(preds, labels)\n    elif task_name == \"sem\":\n        return acc_and_f1(preds, labels)\n    elif task_name == \"sts-b\":\n        return pearson_and_spearman(preds, labels)\n    elif task_name == \"qqp\":\n        return acc_and_f1(preds, labels)\n    elif task_name == \"mnli\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"mnli-mm\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"qnli\":\n        return {\"acc\": 
simple_accuracy(preds, labels)}\n    elif task_name == \"rte\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"wnli\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    else:\n        raise KeyError(task_name)\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n\n    ## Required parameters\n    parser.add_argument(\"--data_dir\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The input data dir. Should contain the .tsv files (or other data files) for the task.\")\n    parser.add_argument(\"--bert_model\", default=None, type=str, required=True,\n                        help=\"Bert pre-trained model selected in the list: bert-base-uncased, \"\n                        \"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, \"\n                        \"bert-base-multilingual-cased, bert-base-chinese.\")\n    parser.add_argument(\"--task_name\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The name of the task to train.\")\n    parser.add_argument(\"--output_dir\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The output directory where the model predictions and checkpoints will be written.\")\n\n    ## Other parameters\n    parser.add_argument(\"--cache_dir\",\n                        default=\"\",\n                        type=str,\n                        help=\"Where do you want to store the pre-trained models downloaded from s3\")\n    parser.add_argument(\"--max_seq_length\",\n                        default=128,\n                        type=int,\n                        help=\"The maximum total input sequence length after WordPiece tokenization. 
\\n\"\n                             \"Sequences longer than this will be truncated, and sequences shorter \\n\"\n                             \"than this will be padded.\")\n    parser.add_argument(\"--do_train\",\n                        action='store_true',\n                        help=\"Whether to run training.\")\n    parser.add_argument(\"--do_eval\",\n                        action='store_true',\n                        help=\"Whether to run eval on the dev set.\")\n    parser.add_argument(\"--do_lower_case\",\n                        action='store_true',\n                        help=\"Set this flag if you are using an uncased model.\")\n    parser.add_argument(\"--train_batch_size\",\n                        default=32,\n                        type=int,\n                        help=\"Total batch size for training.\")\n    parser.add_argument(\"--eval_batch_size\",\n                        default=8,\n                        type=int,\n                        help=\"Total batch size for eval.\")\n    parser.add_argument(\"--learning_rate\",\n                        default=5e-5,\n                        type=float,\n                        help=\"The initial learning rate for Adam.\")\n    parser.add_argument(\"--num_train_epochs\",\n                        default=3.0,\n                        type=float,\n                        help=\"Total number of training epochs to perform.\")\n    parser.add_argument(\"--warmup_proportion\",\n                        default=0.1,\n                        type=float,\n                        help=\"Proportion of training to perform linear learning rate warmup for. 
\"\n                             \"E.g., 0.1 = 10%% of training.\")\n    parser.add_argument(\"--no_cuda\",\n                        action='store_true',\n                        help=\"Whether not to use CUDA when available\")\n    parser.add_argument(\"--local_rank\",\n                        type=int,\n                        default=-1,\n                        help=\"local_rank for distributed training on gpus\")\n    parser.add_argument('--seed',\n                        type=int,\n                        default=42,\n                        help=\"random seed for initialization\")\n    parser.add_argument('--gradient_accumulation_steps',\n                        type=int,\n                        default=1,\n                        help=\"Number of updates steps to accumulate before performing a backward/update pass.\")\n    parser.add_argument('--fp16',\n                        action='store_true',\n                        help=\"Whether to use 16-bit float precision instead of 32-bit\")\n    parser.add_argument('--loss_scale',\n                        type=float, default=0,\n                        help=\"Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\\n\"\n                             \"0 (default value): dynamic loss scaling.\\n\"\n                             \"Positive power of 2: static loss scaling value.\\n\")\n    parser.add_argument('--server_ip', type=str, default='', help=\"Can be used for distant debugging.\")\n    parser.add_argument('--server_port', type=str, default='', help=\"Can be used for distant debugging.\")\n    args = parser.parse_args()\n\n    if args.server_ip and args.server_port:\n        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script\n        import ptvsd\n        print(\"Waiting for debugger attach\")\n        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)\n        ptvsd.wait_for_attach()\n\n    processors = {\n        \"cola\": ColaProcessor,\n        \"mnli\": MnliProcessor,\n        \"mnli-mm\": MnliMismatchedProcessor,\n        \"mrpc\": MrpcProcessor,\n        \"sem\": SemProcessor,\n        \"sst-2\": Sst2Processor,\n        \"sts-b\": StsbProcessor,\n        \"qqp\": QqpProcessor,\n        \"qnli\": QnliProcessor,\n        \"rte\": RteProcessor,\n        \"wnli\": WnliProcessor,\n    }\n\n    output_modes = {\n        \"cola\": \"classification\",\n        \"mnli\": \"classification\",\n        \"mrpc\": \"classification\",\n        \"sem\": \"classification\",\n        \"sst-2\": \"classification\",\n        \"sts-b\": \"regression\",\n        \"qqp\": \"classification\",\n        \"qnli\": \"classification\",\n        \"rte\": \"classification\",\n        \"wnli\": \"classification\",\n    }\n\n    if args.local_rank == -1 or args.no_cuda:\n        device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n        n_gpu = torch.cuda.device_count()\n    else:\n        torch.cuda.set_device(args.local_rank)\n        device = torch.device(\"cuda\", args.local_rank)\n        n_gpu = 1\n        # Initializes the 
distributed backend which will take care of sychronizing nodes/GPUs\n        torch.distributed.init_process_group(backend='nccl')\n\n    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',\n                        datefmt = '%m/%d/%Y %H:%M:%S',\n                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)\n\n    logger.info(\"device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}\".format(\n        device, n_gpu, bool(args.local_rank != -1), args.fp16))\n\n    if args.gradient_accumulation_steps < 1:\n        raise ValueError(\"Invalid gradient_accumulation_steps parameter: {}, should be >= 1\".format(\n                            args.gradient_accumulation_steps))\n\n    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps\n\n    random.seed(args.seed)\n    np.random.seed(args.seed)\n    torch.manual_seed(args.seed)\n    if n_gpu > 0:\n        torch.cuda.manual_seed_all(args.seed)\n\n    if not args.do_train and not args.do_eval:\n        raise ValueError(\"At least one of `do_train` or `do_eval` must be True.\")\n\n    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:\n        raise ValueError(\"Output directory ({}) already exists and is not empty.\".format(args.output_dir))\n    if not os.path.exists(args.output_dir):\n        os.makedirs(args.output_dir)\n\n    task_name = args.task_name.lower()\n\n    if task_name not in processors:\n        raise ValueError(\"Task not found: %s\" % (task_name))\n\n    processor = processors[task_name]()\n    output_mode = output_modes[task_name]\n\n    label_list = processor.get_labels()\n    num_labels = len(label_list)\n    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)\n    train_examples = None\n    num_train_optimization_steps = None\n    if args.do_train:\n        train_examples = processor.get_train_examples(args.data_dir)\n     
   num_train_optimization_steps = int(\n            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs\n        if args.local_rank != -1:\n            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()\n\n    # Prepare model\n    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))\n    model = BertForSequenceClassification.from_pretrained(args.bert_model,\n              cache_dir=cache_dir,\n              num_labels=num_labels)\n    if args.fp16:\n        model.half()\n    model.to(device)\n    if args.local_rank != -1:\n        try:\n            from apex.parallel import DistributedDataParallel as DDP\n        except ImportError:\n            raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n\n        model = DDP(model)\n    elif n_gpu > 1:\n        model = torch.nn.DataParallel(model)\n\n    # Prepare optimizer\n    if args.do_train:\n        param_optimizer = list(model.named_parameters())\n        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n        optimizer_grouped_parameters = [\n            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n            ]\n        if args.fp16:\n            try:\n                from apex.optimizers import FP16_Optimizer\n                from apex.optimizers import FusedAdam\n            except ImportError:\n                raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n\n            optimizer = FusedAdam(optimizer_grouped_parameters,\n                                  lr=args.learning_rate,\n                        
          bias_correction=False,\n                                  max_grad_norm=1.0)\n            if args.loss_scale == 0:\n                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)\n            else:\n                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)\n            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,\n                                                 t_total=num_train_optimization_steps)\n\n        else:\n            optimizer = BertAdam(optimizer_grouped_parameters,\n                                 lr=args.learning_rate,\n                                 warmup=args.warmup_proportion,\n                                 t_total=num_train_optimization_steps)\n\n    global_step = 0\n    nb_tr_steps = 0\n    tr_loss = 0\n    if args.do_train:\n        train_features = convert_examples_to_features(\n            train_examples, label_list, args.max_seq_length, tokenizer, output_mode)\n        logger.info(\"***** Running training *****\")\n        logger.info(\"  Num examples = %d\", len(train_examples))\n        logger.info(\"  Batch size = %d\", args.train_batch_size)\n        logger.info(\"  Num steps = %d\", num_train_optimization_steps)\n        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)\n        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)\n        # FloatTensor(forward)\n        all_entity_mask = torch.tensor([f.entity_mask for f in train_features], dtype=torch.float)\n        all_entity_seg_pos = torch.tensor([f.entity_seg_pos for f in train_features], dtype=torch.long)\n        all_entity_span1_pos = torch.tensor([f.entity_span1_pos for f in train_features], dtype=torch.float)\n        all_entity_span2_pos = torch.tensor([f.entity_span2_pos for f in train_features], dtype=torch.float)\n        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)\n     
   if output_mode == \"classification\":\n            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)\n        elif output_mode == \"regression\":\n            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)\n\n        train_data = TensorDataset(all_input_ids, all_input_mask, all_entity_mask, all_entity_seg_pos, all_entity_span1_pos, all_entity_span2_pos, all_segment_ids, all_label_ids)\n        if args.local_rank == -1:\n            train_sampler = RandomSampler(train_data)\n        else:\n            train_sampler = DistributedSampler(train_data)\n        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)\n\n        model.train()\n        for _ in trange(int(args.num_train_epochs), desc=\"Epoch\"):\n            tr_loss = 0\n            nb_tr_examples, nb_tr_steps = 0, 0\n            for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")):\n                batch = tuple(t.to(device) for t in batch)\n                input_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, segment_ids, label_ids = batch\n                # define a new function to compute loss values for both output_modes\n                logits = model(input_ids, segment_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, labels=None)\n\n                if output_mode == \"classification\":\n                    loss_fct = CrossEntropyLoss()\n                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))\n                elif output_mode == \"regression\":\n                    loss_fct = MSELoss()\n                    loss = loss_fct(logits.view(-1), label_ids.view(-1))\n\n                if n_gpu > 1:\n                    loss = loss.mean() # mean() to average on multi-gpu.\n                if args.gradient_accumulation_steps > 1:\n                    loss = loss / 
args.gradient_accumulation_steps\n\n                if args.fp16:\n                    optimizer.backward(loss)\n                else:\n                    loss.backward()\n\n                tr_loss += loss.item()\n                nb_tr_examples += input_ids.size(0)\n                nb_tr_steps += 1\n                if (step + 1) % args.gradient_accumulation_steps == 0:\n                    if args.fp16:\n                        # modify learning rate with special warm up BERT uses\n                        # if args.fp16 is False, BertAdam is used that handles this automatically\n                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)\n                        for param_group in optimizer.param_groups:\n                            param_group['lr'] = lr_this_step\n                    optimizer.step()\n                    optimizer.zero_grad()\n                    global_step += 1\n\n    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):\n        # Save a trained model, configuration and tokenizer\n        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self\n\n        # If we save using the predefined names, we can load using `from_pretrained`\n        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)\n        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)\n\n        torch.save(model_to_save.state_dict(), output_model_file)\n        model_to_save.config.to_json_file(output_config_file)\n        tokenizer.save_vocabulary(args.output_dir)\n\n        # Load a trained model and vocabulary that you have fine-tuned\n        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)\n        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)\n    else:\n        model = 
BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)\n    model.to(device)\n\n    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):\n        eval_examples = processor.get_dev_examples(args.data_dir)\n        eval_features = convert_examples_to_features(\n            eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)\n        logger.info(\"***** Running evaluation *****\")\n        logger.info(\"  Num examples = %d\", len(eval_examples))\n        logger.info(\"  Batch size = %d\", args.eval_batch_size)\n        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\n        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\n        all_entity_mask = torch.tensor([f.entity_mask for f in eval_features], dtype=torch.float)\n        all_entity_seg_pos = torch.tensor([f.entity_seg_pos for f in eval_features], dtype=torch.long)\n        all_entity_span1_pos = torch.tensor([f.entity_span1_pos for f in eval_features], dtype=torch.float)\n        all_entity_span2_pos = torch.tensor([f.entity_span2_pos for f in eval_features], dtype=torch.float)\n        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\n\n        if output_mode == \"classification\":\n            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)\n        elif output_mode == \"regression\":\n            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)\n\n        eval_data = TensorDataset(all_input_ids, all_input_mask, all_entity_mask, all_entity_seg_pos, all_entity_span1_pos, all_entity_span2_pos, all_segment_ids, all_label_ids)\n        # Run prediction for full data\n        eval_sampler = SequentialSampler(eval_data)\n        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)\n\n        model.eval()\n      
  eval_loss = 0\n        nb_eval_steps = 0\n        preds = []\n\n        for input_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, segment_ids, label_ids in tqdm(eval_dataloader, desc=\"Evaluating\"):\n            input_ids = input_ids.to(device)\n            input_mask = input_mask.to(device)\n            entity_mask = entity_mask.to(device)\n            entity_seg_pos = entity_seg_pos.to(device)\n            entity_span1_pos = entity_span1_pos.to(device)\n            entity_span2_pos = entity_span2_pos.to(device)\n            segment_ids = segment_ids.to(device)\n            label_ids = label_ids.to(device)\n            with torch.no_grad():\n                logits = model(input_ids, segment_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, labels=None)\n                #logits = model(input_ids, segment_ids, input_mask, labels=None)\n\n            # create eval loss and other metric required by the task\n            if output_mode == \"classification\":\n                loss_fct = CrossEntropyLoss()\n                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))\n            elif output_mode == \"regression\":\n                loss_fct = MSELoss()\n                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))\n            \n            eval_loss += tmp_eval_loss.mean().item()\n            nb_eval_steps += 1\n            if len(preds) == 0:\n                preds.append(logits.detach().cpu().numpy())\n            else:\n                preds[0] = np.append(\n                    preds[0], logits.detach().cpu().numpy(), axis=0)\n\n        eval_loss = eval_loss / nb_eval_steps\n        preds = preds[0]\n        if output_mode == \"classification\":\n            preds = np.argmax(preds, axis=1)\n        elif output_mode == \"regression\":\n            preds = np.squeeze(preds)\n        result = compute_metrics(task_name, preds, all_label_ids.numpy())\n        
loss = tr_loss/global_step if args.do_train else None\n\n        result['eval_loss'] = eval_loss\n        result['global_step'] = global_step\n        result['loss'] = loss\n\n        output_eval_file = os.path.join(args.output_dir, \"eval_results.txt\")\n        with open(output_eval_file, \"w\") as writer:\n            logger.info(\"***** Eval results *****\")\n            for key in sorted(result.keys()):\n                logger.info(\"  %s = %s\", key, str(result[key]))\n                writer.write(\"%s = %s\\n\" % (key, str(result[key])))\n\n        # hack for MNLI-MM\n        if task_name == \"mnli\":\n            task_name = \"mnli-mm\"\n            processor = processors[task_name]()\n\n            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:\n                raise ValueError(\"Output directory ({}) already exists and is not empty.\".format(args.output_dir))\n            if not os.path.exists(args.output_dir + '-MM'):\n                os.makedirs(args.output_dir + '-MM')\n\n            eval_examples = processor.get_dev_examples(args.data_dir)\n            eval_features = convert_examples_to_features(\n                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)\n            logger.info(\"***** Running evaluation *****\")\n            logger.info(\"  Num examples = %d\", len(eval_examples))\n            logger.info(\"  Batch size = %d\", args.eval_batch_size)\n            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\n            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\n            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\n            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)\n\n            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)\n            # Run 
prediction for full data\n            eval_sampler = SequentialSampler(eval_data)\n            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)\n\n            model.eval()\n            eval_loss = 0\n            nb_eval_steps = 0\n            preds = []\n\n            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc=\"Evaluating\"):\n                input_ids = input_ids.to(device)\n                input_mask = input_mask.to(device)\n                segment_ids = segment_ids.to(device)\n                label_ids = label_ids.to(device)\n\n                with torch.no_grad():\n                    logits = model(input_ids, segment_ids, input_mask, labels=None)\n            \n                loss_fct = CrossEntropyLoss()\n                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))\n            \n                eval_loss += tmp_eval_loss.mean().item()\n                nb_eval_steps += 1\n                if len(preds) == 0:\n                    preds.append(logits.detach().cpu().numpy())\n                else:\n                    preds[0] = np.append(\n                        preds[0], logits.detach().cpu().numpy(), axis=0)\n\n            eval_loss = eval_loss / nb_eval_steps\n            preds = preds[0]\n            preds = np.argmax(preds, axis=1)\n            result = compute_metrics(task_name, preds, all_label_ids.numpy())\n            loss = tr_loss/global_step if args.do_train else None\n\n            result['eval_loss'] = eval_loss\n            result['global_step'] = global_step\n            result['loss'] = loss\n\n            output_eval_file = os.path.join(args.output_dir + '-MM', \"eval_results.txt\")\n            with open(output_eval_file, \"w\") as writer:\n                logger.info(\"***** Eval results *****\")\n                for key in sorted(result.keys()):\n                    logger.info(\"  %s = %s\", key, str(result[key]))\n             
       writer.write(\"%s = %s\\n\" % (key, str(result[key])))\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/tacred_run_classifier.py",
    "content": "#coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"BERT finetuning runner.\"\"\"\n\nfrom __future__ import absolute_import, division, print_function\n\nimport argparse\nimport csv\nimport logging\nimport os\nimport random\n\nimport sys\nsys.path.append('..')\n\nimport copy\nimport json\nimport numpy as np\nimport torch\nfrom torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,\n                              TensorDataset)\nfrom torch.utils.data.distributed import DistributedSampler\nfrom tqdm import tqdm, trange\n\nfrom torch.nn import CrossEntropyLoss, MSELoss\nfrom scipy.stats import pearsonr, spearmanr\nfrom sklearn.metrics import matthews_corrcoef, f1_score, classification_report,precision_recall_fscore_support\n\n\nfrom pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME\nfrom pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig\nfrom pytorch_pretrained_bert.tokenization import BertTokenizer\nfrom pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule\n\nlogger = logging.getLogger(__name__)\n\n\nclass InputExample(object):\n    \"\"\"A single training/test example for simple sequence classification.\"\"\"\n\n    def __init__(self, guid, text_a, text_b=None, 
label=None, entity_pos=None):\n        \"\"\"Constructs a InputExample.\n\n        Args:\n            guid: Unique id for the example.\n            text_a: string. The untokenized text of the first sequence. For single\n            sequence tasks, only this sequence must be specified.\n            text_b: (Optional) string. The untokenized text of the second sequence.\n            Only must be specified for sequence pair tasks.\n            label: (Optional) string. The label of the example. This should be\n            specified for train and dev examples, but not for test examples.\n        \"\"\"\n        self.guid = guid\n        self.text_a = text_a\n        self.text_b = text_b\n        self.label = label\n        self.entity_pos = entity_pos\n\nclass InputFeatures(object):\n    \"\"\"A single set of features of data.\"\"\"\n\n    def __init__(self, input_ids, input_mask, segment_ids, label_id, entity_mask=None, entity_seg_pos=None, entity_span1_pos=None, entity_span2_pos=None):\n        self.input_ids = input_ids\n        self.input_mask = input_mask\n        self.segment_ids = segment_ids\n        self.label_id = label_id\n        self.entity_mask = entity_mask\n        self.entity_seg_pos = entity_seg_pos\n        self.entity_span1_pos = entity_span1_pos\n        self.entity_span2_pos = entity_span2_pos\n\n\nclass DataProcessor(object):\n    \"\"\"Base class for data converters for sequence classification data sets.\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"Gets a collection of `InputExample`s for the train set.\"\"\"\n        raise NotImplementedError()\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"Gets a collection of `InputExample`s for the dev set.\"\"\"\n        raise NotImplementedError()\n\n    def get_labels(self):\n        \"\"\"Gets the list of labels for this data set.\"\"\"\n        raise NotImplementedError()\n\n    @classmethod\n    def _read_tsv(cls, input_file, quotechar=None):\n        \"\"\"Reads a 
tab separated value file.\"\"\"\n        with open(input_file, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.reader(f, delimiter=\"\\t\", quotechar=quotechar)\n            lines = []\n            for line in reader:\n                if sys.version_info[0] == 2:\n                    line = list(unicode(cell, 'utf-8') for cell in line)\n                lines.append(line)\n            return lines\n\n\nclass MrpcProcessor(DataProcessor):\n    \"\"\"Processor for the MRPC data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        logger.info(\"LOOKING AT {}\".format(os.path.join(data_dir, \"train.tsv\")))\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = line[3]\n            text_b = line[4]\n            label = line[0]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\nclass SemProcessor(DataProcessor):\n    \"\"\"Processor for the SemEval 2010 Task 8 dataset.\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        logger.info(\"LOOKING AT {}\".format(os.path.join(data_dir, \"train.jsonl\")))\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.jsonl\")), 
\"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"test.jsonl\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return ['Message-Topic(e2,e1)', 'Instrument-Agency(e2,e1)', 'Entity-Origin(e2,e1)', 'Member-Collection(e1,e2)', 'Member-Collection(e2,e1)', 'Other', 'Component-Whole(e1,e2)', 'Product-Producer(e2,e1)', 'Component-Whole(e2,e1)', 'Entity-Destination(e2,e1)', 'Content-Container(e2,e1)', 'Entity-Destination(e1,e2)', 'Instrument-Agency(e1,e2)', 'Cause-Effect(e2,e1)', 'Entity-Origin(e1,e2)', 'Product-Producer(e1,e2)', 'Cause-Effect(e1,e2)', 'Message-Topic(e1,e2)', 'Content-Container(e1,e2)']\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            guid = \"%s-%s\" % (set_type, i)\n            line = json.loads(line[0])\n            text_a = ' '.join(line['tokens'])\n            label = line['label']\n            entity_pos = line['entities']\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, label=label, entity_pos = entity_pos))\n        return examples\n\nclass TacredProcessor(DataProcessor):\n    \"\"\"Processor for the TACRED dataset.\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        logger.info(\"LOOKING AT {}\".format(os.path.join(data_dir, \"train.jsonl\")))\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train_dev.jsonl\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.jsonl\")), \"dev\")\n    \n    def get_test_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n 
       return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"test.jsonl\")), \"test\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return ['per:parents', 'per:country_of_birth', 'org:political/religious_affiliation', 'org:parents', 'org:members', 'per:schools_attended', 'org:shareholders', 'per:stateorprovince_of_death', 'per:age', 'per:city_of_death', 'per:siblings', 'per:date_of_birth', 'org:founded', 'per:stateorprovince_of_birth', 'per:origin', 'per:charges', 'per:children', 'per:title', 'per:countries_of_residence', 'org:top_members/employees', 'per:religion', 'per:country_of_death', 'per:employee_of', 'no_relation', 'per:stateorprovinces_of_residence', 'org:city_of_headquarters', 'org:dissolved', 'per:date_of_death', 'per:other_family', 'per:alternate_names', 'org:number_of_employees/members', 'per:spouse', 'per:cause_of_death', 'org:alternate_names', 'org:founded_by', 'org:stateorprovince_of_headquarters', 'per:city_of_birth', 'org:subsidiaries', 'org:website', 'org:member_of', 'per:cities_of_residence', 'org:country_of_headquarters']\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            guid = \"%s-%s\" % (set_type, i)\n            line = json.loads(line[0])\n            text_a = ' '.join(line['tokens'])\n            label = line['label']\n            entity_pos = line['entities']\n            # 假设entity之间不重叠\n            entity_pos = sorted(entity_pos) \n            examples.append(\n                InputExample(guid=guid, text_a=text_a, label=label, entity_pos = entity_pos))\n        return examples\n\n\nclass MnliProcessor(DataProcessor):\n    \"\"\"Processor for the MultiNLI data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            
self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev_matched.tsv\")),\n            \"dev_matched\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"contradiction\", \"entailment\", \"neutral\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[8]\n            text_b = line[9]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass MnliMismatchedProcessor(MnliProcessor):\n    \"\"\"Processor for the MultiNLI Mismatched data set (GLUE version).\"\"\"\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev_mismatched.tsv\")),\n            \"dev_matched\")\n\n\nclass ColaProcessor(DataProcessor):\n    \"\"\"Processor for the CoLA data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the 
training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = line[3]\n            label = line[1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))\n        return examples\n\n\nclass Sst2Processor(DataProcessor):\n    \"\"\"Processor for the SST-2 data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = line[0]\n            label = line[1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))\n        return examples\n\n\nclass StsbProcessor(DataProcessor):\n    \"\"\"Processor for the STS-B data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n      
  return [None]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[7]\n            text_b = line[8]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass QqpProcessor(DataProcessor):\n    \"\"\"Processor for the QQP data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            try:\n                text_a = line[3]\n                text_b = line[4]\n                label = line[5]\n            except IndexError:\n                continue\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass QnliProcessor(DataProcessor):\n    \"\"\"Processor for the QNLI data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            
self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \n            \"dev_matched\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"entailment\", \"not_entailment\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[1]\n            text_b = line[2]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass RteProcessor(DataProcessor):\n    \"\"\"Processor for the RTE data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"entailment\", \"not_entailment\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[1]\n            text_b = line[2]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, 
text_a=text_a, text_b=text_b, label=label))\n        return examples\n\n\nclass WnliProcessor(DataProcessor):\n    \"\"\"Processor for the WNLI data set (GLUE version).\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return [\"0\", \"1\"]\n\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            if i == 0:\n                continue\n            guid = \"%s-%s\" % (set_type, line[0])\n            text_a = line[1]\n            text_b = line[2]\n            label = line[-1]\n            examples.append(\n                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n        return examples\n\ndef convert_examples_to_features(examples, label_list, max_seq_length,\n                                 tokenizer, output_mode):\n    \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n\n    label_map = {label : i for i, label in enumerate(label_list)}\n    features = []\n    for (ex_index, example) in enumerate(examples):\n        if ex_index % 10000 == 0:\n            logger.info(\"Writing example %d of %d\" % (ex_index, len(examples)))\n        old_entity_pos = copy.deepcopy(example.entity_pos)\n        tokens_a, new_entity_pos = tokenizer.tokenize(example.text_a,example.entity_pos)\n        \n        old_entity0 = ''.join(example.text_a.split()[old_entity_pos[0][0]:old_entity_pos[0][1]])\n        old_entity1 = 
''.join(example.text_a.split()[old_entity_pos[1][0]:old_entity_pos[1][1]])\n        new_entity0 = ''.join(tokens_a[new_entity_pos[0][0]:new_entity_pos[0][1]])\n        new_entity1 = ''.join(tokens_a[new_entity_pos[1][0]:new_entity_pos[1][1]])\n        \n        old_entity0 = old_entity0.lower()\n        old_entity1 = old_entity1.lower()\n\n        if '##' in new_entity0 or '##' in new_entity1:\n            new_entity0 = new_entity0.replace('#','')\n            new_entity1 = new_entity1.replace('#','')\n        \n        try:\n            assert(old_entity0 == new_entity0)\n            assert(old_entity1 == new_entity1)\n        except:\n            continue\n            #import pdb;pdb.set_trace()\n        \n        # Entity marker\n        tokens_a_ = copy.deepcopy(tokens_a) \n        new_entity_pos_ = copy.deepcopy(new_entity_pos)\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        \n        tokens_a.insert(entity1_start, '<s1>') \n        new_entity_pos[0][0] = entity1_start\n        tokens_a.insert(entity1_end+1, '<e1>')\n        new_entity_pos[0][1] = entity1_end+1+1\n        tokens_a.insert(entity2_start+2, '<s2>')\n        new_entity_pos[1][0] = entity2_start+2\n        tokens_a.insert(entity2_end+3,'<e2>')\n        new_entity_pos[1][1] = entity2_end+3+1\n\n        if new_entity_pos[1][1] > max_seq_length - 2 - 1:\n            continue\n            #import pdb;pdb.set_trace()\n            \n        tokens_b = None\n        if example.text_b:\n            tokens_b = tokenizer.tokenize(example.text_b)\n            # Modifies `tokens_a` and `tokens_b` in place so that the total\n            # length is less than the specified length.\n            # Account for [CLS], [SEP], [SEP] with \"- 3\"\n            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n        else:\n            # Account for [CLS] and [SEP] with \"- 2\"\n         
   if len(tokens_a) > max_seq_length - 2:\n                tokens_a = tokens_a[:(max_seq_length - 2)]\n\n        # The convention in BERT is:\n        # (a) For sequence pairs:\n        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1\n        # (b) For single sequences:\n        #  tokens:   [CLS] the dog is hairy . [SEP]\n        #  type_ids: 0   0   0   0  0     0 0\n        #\n        # Where \"type_ids\" are used to indicate whether this is the first\n        # sequence or the second sequence. The embedding vectors for `type=0` and\n        # `type=1` were learned during pre-training and are added to the wordpiece\n        # embedding vector (and position vector). This is not *strictly* necessary\n        # since the [SEP] token unambiguously separates the sequences, but it makes\n        # it easier for the model to learn the concept of sequences.\n        #\n        # For classification tasks, the first vector (corresponding to [CLS]) is\n        # used as as the \"sentence vector\". Note that this only makes sense because\n        # the entire model is fine-tuned.\n        tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n        segment_ids = [0] * len(tokens)\n\n        if tokens_b:\n            tokens += tokens_b + [\"[SEP]\"]\n            segment_ids += [1] * (len(tokens_b) + 1)\n\n        input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n        # The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n        # tokens are attended to.\n        input_mask = [1] * len(input_ids)\n\n        # Zero-pad up to the sequence length.\n        padding = [0] * (max_seq_length - len(input_ids))\n        input_ids += padding\n        input_mask += padding\n        segment_ids += padding\n        \n\n        # Used for mention pooling\n        entity_mask_tag = 1\n        entity_mask = [0] * len(input_ids)\n        for entity in new_entity_pos:\n            start, end = entity[0],entity[1]\n            for i in range(start, end):\n                # [CLS], need to +1 offset\n                entity_mask[i+1] = entity_mask_tag\n        \n        \"\"\"\n            Different position embedding\n        \"\"\"\n        # Strategy 1\n        entity1_pos_tag = 1\n        entity2_pos_tag = 2\n\n        entity_seg_pos = [0] * len(input_ids)\n\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        for i in range(entity1_start, entity1_end):\n            entity_seg_pos[i+1] = entity1_pos_tag\n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        for i in range(entity2_start, entity2_end):\n            entity_seg_pos[i+1] = entity2_pos_tag\n        \n        # Strategy 2\n        entity_start_pos_tag = 1\n        entity_seg_pos_ = [0] * len(input_ids)\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        entity_seg_pos_[entity1_start+1] = entity_start_pos_tag\n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        entity_seg_pos_[entity2_start+1] = entity_start_pos_tag\n\n        # Strategy 3\n        entity_span1_pos = [0] * len(input_ids)\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        for i in range(len(entity_span1_pos)):\n            if i < entity1_start:\n                #entity_span1_pos[i] = np.abs(i - entity1_start)\n                entity_span1_pos[i] = i - entity1_start\n           
 elif entity1_start <= i and i < entity1_end:\n                entity_span1_pos[i] = 0\n            elif i >= entity1_end:\n                entity_span1_pos[i] = i - entity1_end + 1\n        \n        entity_span2_pos = [0] * len(input_ids)\n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        for i in range(len(entity_span2_pos)):\n            if i < entity2_start:\n                #entity_span2_pos[i] = np.abs(i - entity2_start)\n                entity_span2_pos[i] = i - entity2_start\n            elif entity2_start <= i and i < entity2_end:\n                entity_span2_pos[i] = 0\n            elif i >= entity2_end:\n                entity_span2_pos[i] = i - entity2_end + 1\n\n        # Avoid to get negative position to fuck the nn.Embedding\n        #entity_span1_pos = [pos+max_seq_length-1 for pos in entity_span1_pos]\n        #entity_span2_pos = [pos+max_seq_length-1 for pos in entity_span2_pos]\n        \n        assert len(input_ids) == max_seq_length\n        assert len(input_mask) == max_seq_length\n        assert len(segment_ids) == max_seq_length\n        assert len(entity_mask) == max_seq_length\n        assert len(entity_seg_pos) == max_seq_length\n        assert len(entity_seg_pos_) == max_seq_length\n        assert len(entity_span1_pos) == max_seq_length\n        assert len(entity_span2_pos) == max_seq_length\n        if output_mode == \"classification\":\n            label_id = label_map[example.label]\n        elif output_mode == \"regression\":\n            label_id = float(example.label)\n        else:\n            raise KeyError(output_mode)\n\n        if ex_index < 5:\n            logger.info(\"*** Example ***\")\n            logger.info(\"guid: %s\" % (example.guid))\n            logger.info(\"tokens: %s\" % \" \".join(\n                    [str(x) for x in tokens]))\n            logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n            logger.info(\"input_mask: %s\" % \" 
\".join([str(x) for x in input_mask]))\n            logger.info(\"entity_mask: %s\" % \" \".join([str(x) for x in entity_mask]))\n            logger.info(\"entity_seg_pos: %s\" % \" \".join([str(x) for x in entity_seg_pos]))\n            logger.info(\"entity_seg_pos_: %s\" % \" \".join([str(x) for x in entity_seg_pos_]))\n            logger.info(\"entity_span1_pos: %s\" % \" \".join([str(x) for x in entity_span1_pos]))\n            logger.info(\"entity_span2_pos: %s\" % \" \".join([str(x) for x in entity_span2_pos]))\n            logger.info(\n                    \"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n            logger.info(\"label: %s (id = %d)\" % (example.label, label_id))\n        \n        #if example.guid == 'train-3':\n        #    import pdb;pdb.set_trace()\n\n        features.append(\n                InputFeatures(input_ids=input_ids,\n                              input_mask=input_mask,\n                              segment_ids=segment_ids,\n                              label_id=label_id,\n                              entity_mask=entity_mask,\n                              entity_seg_pos=entity_seg_pos_,\n                              entity_span1_pos=entity_span1_pos,\n                              entity_span2_pos=entity_span2_pos))\n    return features\n\ndef _truncate_seq_pair(tokens_a, tokens_b, max_length):\n    \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n\n    # This is a simple heuristic which will always truncate the longer sequence\n    # one token at a time. 
This makes more sense than truncating an equal percent\n    # of tokens from each, since if one sequence is very short then each token\n    # that's truncated likely contains more information than a longer sequence.\n    while True:\n        total_length = len(tokens_a) + len(tokens_b)\n        if total_length <= max_length:\n            break\n        if len(tokens_a) > len(tokens_b):\n            tokens_a.pop()\n        else:\n            tokens_b.pop()\n\n\ndef simple_accuracy(preds, labels):\n    return (preds == labels).mean()\n\n\ndef acc_and_f1(preds, labels):\n    \n    # 删除no_relation的样本\n    class_num = 42\n    no_relation_label = 23\n    labels_ = [i for i in range(class_num)]\n    labels_.remove(no_relation_label)\n\n    report = classification_report(labels, preds, labels=labels_)\n    return {\n        \"report\": report\n    }\n\n\ndef pearson_and_spearman(preds, labels):\n    pearson_corr = pearsonr(preds, labels)[0]\n    spearman_corr = spearmanr(preds, labels)[0]\n    return {\n        \"pearson\": pearson_corr,\n        \"spearmanr\": spearman_corr,\n        \"corr\": (pearson_corr + spearman_corr) / 2,\n    }\n\n\ndef compute_metrics(task_name, preds, labels):\n    assert len(preds) == len(labels)\n    if task_name == \"cola\":\n        return {\"mcc\": matthews_corrcoef(labels, preds)}\n    elif task_name == \"sst-2\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"mrpc\":\n        return acc_and_f1(preds, labels)\n    elif task_name == \"sem\":\n        return acc_and_f1(preds, labels)\n    elif task_name == \"tacred\":\n        return acc_and_f1(preds, labels)\n    elif task_name == \"sts-b\":\n        return pearson_and_spearman(preds, labels)\n    elif task_name == \"qqp\":\n        return acc_and_f1(preds, labels)\n    elif task_name == \"mnli\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"mnli-mm\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    
elif task_name == \"qnli\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"rte\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    elif task_name == \"wnli\":\n        return {\"acc\": simple_accuracy(preds, labels)}\n    else:\n        raise KeyError(task_name)\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n\n    ## Required parameters\n    parser.add_argument(\"--data_dir\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The input data dir. Should contain the .tsv files (or other data files) for the task.\")\n    parser.add_argument(\"--bert_model\", default=None, type=str, required=True,\n                        help=\"Bert pre-trained model selected in the list: bert-base-uncased, \"\n                        \"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, \"\n                        \"bert-base-multilingual-cased, bert-base-chinese.\")\n    parser.add_argument(\"--task_name\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The name of the task to train.\")\n    parser.add_argument(\"--output_dir\",\n                        default=None,\n                        type=str,\n                        required=True,\n                        help=\"The output directory where the model predictions and checkpoints will be written.\")\n\n    ## Other parameters\n    parser.add_argument(\"--cache_dir\",\n                        default=\"\",\n                        type=str,\n                        help=\"Where do you want to store the pre-trained models downloaded from s3\")\n    parser.add_argument(\"--max_seq_length\",\n                        default=128,\n                        type=int,\n                        help=\"The maximum total input sequence length after 
WordPiece tokenization. \\n\"\n                             \"Sequences longer than this will be truncated, and sequences shorter \\n\"\n                             \"than this will be padded.\")\n    parser.add_argument(\"--do_train\",\n                        action='store_true',\n                        help=\"Whether to run training.\")\n    parser.add_argument(\"--do_eval\",\n                        action='store_true',\n                        help=\"Whether to run eval on the dev set.\")\n    parser.add_argument(\"--do_test\",\n                        action='store_true',\n                        help=\"Whether to run eval on the test set.\")\n    parser.add_argument(\"--do_lower_case\",\n                        action='store_true',\n                        help=\"Set this flag if you are using an uncased model.\")\n    parser.add_argument(\"--train_batch_size\",\n                        default=32,\n                        type=int,\n                        help=\"Total batch size for training.\")\n    parser.add_argument(\"--eval_batch_size\",\n                        default=8,\n                        type=int,\n                        help=\"Total batch size for eval.\")\n    parser.add_argument(\"--learning_rate\",\n                        default=5e-5,\n                        type=float,\n                        help=\"The initial learning rate for Adam.\")\n    parser.add_argument(\"--num_train_epochs\",\n                        default=3.0,\n                        type=float,\n                        help=\"Total number of training epochs to perform.\")\n    parser.add_argument(\"--warmup_proportion\",\n                        default=0.1,\n                        type=float,\n                        help=\"Proportion of training to perform linear learning rate warmup for. 
\"\n                             \"E.g., 0.1 = 10%% of training.\")\n    parser.add_argument(\"--no_cuda\",\n                        action='store_true',\n                        help=\"Whether not to use CUDA when available\")\n    parser.add_argument(\"--local_rank\",\n                        type=int,\n                        default=-1,\n                        help=\"local_rank for distributed training on gpus\")\n    parser.add_argument('--seed',                            type=int,\n                        default=42,\n                        help=\"random seed for initialization\")\n    parser.add_argument('--gradient_accumulation_steps',\n                        type=int,\n                        default=1,\n                        help=\"Number of updates steps to accumulate before performing a backward/update pass.\")\n    parser.add_argument('--fp16',\n                        action='store_true',\n                        help=\"Whether to use 16-bit float precision instead of 32-bit\")\n    parser.add_argument('--loss_scale',\n                        type=float, default=0,\n                        help=\"Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\\n\"\n                             \"0 (default value): dynamic loss scaling.\\n\"\n                             \"Positive power of 2: static loss scaling value.\\n\")\n    parser.add_argument('--server_ip', type=str, default='', help=\"Can be used for distant debugging.\")\n    parser.add_argument('--server_port', type=str, default='', help=\"Can be used for distant debugging.\")\n    args = parser.parse_args()\n\n    if args.server_ip and args.server_port:\n        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script\n        import ptvsd\n        print(\"Waiting for debugger attach\")\n        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)\n        ptvsd.wait_for_attach()\n\n    processors = {\n        \"cola\": ColaProcessor,\n        \"mnli\": MnliProcessor,\n        \"mnli-mm\": MnliMismatchedProcessor,\n        \"mrpc\": MrpcProcessor,\n        \"sem\": SemProcessor,\n        \"tacred\": TacredProcessor,\n        \"sst-2\": Sst2Processor,\n        \"sts-b\": StsbProcessor,\n        \"qqp\": QqpProcessor,\n        \"qnli\": QnliProcessor,\n        \"rte\": RteProcessor,\n        \"wnli\": WnliProcessor,\n    }\n\n    output_modes = {\n        \"cola\": \"classification\",\n        \"mnli\": \"classification\",\n        \"mrpc\": \"classification\",\n        \"sem\": \"classification\",\n        \"tacred\": \"classification\",\n        \"sst-2\": \"classification\",\n        \"sts-b\": \"regression\",\n        \"qqp\": \"classification\",\n        \"qnli\": \"classification\",\n        \"rte\": \"classification\",\n        \"wnli\": \"classification\",\n    }\n\n    if args.local_rank == -1 or args.no_cuda:\n        device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n        n_gpu = torch.cuda.device_count()\n    else:\n        torch.cuda.set_device(args.local_rank)\n        device = 
torch.device(\"cuda\", args.local_rank)\n        n_gpu = 1\n        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs\n        torch.distributed.init_process_group(backend='nccl')\n\n    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',\n                        datefmt = '%m/%d/%Y %H:%M:%S',\n                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)\n\n    logger.info(\"device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}\".format(\n        device, n_gpu, bool(args.local_rank != -1), args.fp16))\n\n    if args.gradient_accumulation_steps < 1:\n        raise ValueError(\"Invalid gradient_accumulation_steps parameter: {}, should be >= 1\".format(\n                            args.gradient_accumulation_steps))\n\n    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps\n\n    random.seed(args.seed)\n    np.random.seed(args.seed)\n    torch.manual_seed(args.seed)\n    if n_gpu > 0:\n        torch.cuda.manual_seed_all(args.seed)\n\n    if not args.do_train and not args.do_eval and not args.do_test:\n        raise ValueError(\"At least one of `do_train` or `do_eval` must be True.\")\n\n    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:\n        raise ValueError(\"Output directory ({}) already exists and is not empty.\".format(args.output_dir))\n    if not os.path.exists(args.output_dir):\n        os.makedirs(args.output_dir)\n\n    task_name = args.task_name.lower()\n\n    if task_name not in processors:\n        raise ValueError(\"Task not found: %s\" % (task_name))\n\n    processor = processors[task_name]()\n    output_mode = output_modes[task_name]\n\n    label_list = processor.get_labels()\n    num_labels = len(label_list)\n    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)\n    train_examples = None\n    
num_train_optimization_steps = None\n    if args.do_train:\n        train_examples = processor.get_train_examples(args.data_dir)\n        num_train_optimization_steps = int(\n            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs\n        if args.local_rank != -1:\n            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()\n\n    # Prepare model\n    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))\n    model = BertForSequenceClassification.from_pretrained(args.bert_model,\n              cache_dir=cache_dir,\n              num_labels=num_labels)\n    if args.fp16:\n        model.half()\n    model.to(device)\n    if args.local_rank != -1:\n        try:\n            from apex.parallel import DistributedDataParallel as DDP\n        except ImportError:\n            raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n\n        model = DDP(model)\n    elif n_gpu > 1:\n        model = torch.nn.DataParallel(model)\n\n    # Prepare optimizer\n    if args.do_train:\n        param_optimizer = list(model.named_parameters())\n        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n        optimizer_grouped_parameters = [\n            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n            ]\n        if args.fp16:\n            try:\n                from apex.optimizers import FP16_Optimizer\n                from apex.optimizers import FusedAdam\n            except ImportError:\n                raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.\")\n\n            
optimizer = FusedAdam(optimizer_grouped_parameters,\n                                  lr=args.learning_rate,\n                                  bias_correction=False,\n                                  max_grad_norm=1.0)\n            if args.loss_scale == 0:\n                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)\n            else:\n                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)\n            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,\n                                                 t_total=num_train_optimization_steps)\n\n        else:\n            optimizer = BertAdam(optimizer_grouped_parameters,\n                                 lr=args.learning_rate,\n                                 warmup=args.warmup_proportion,\n                                 t_total=num_train_optimization_steps)\n\n    global_step = 0\n    nb_tr_steps = 0\n    tr_loss = 0\n    \n    if args.do_train:\n        train_features = convert_examples_to_features(\n            train_examples, label_list, args.max_seq_length, tokenizer, output_mode)\n        logger.info(\"***** Running training *****\")\n        logger.info(\"  Num examples = %d\", len(train_examples))\n        logger.info(\"  Batch size = %d\", args.train_batch_size)\n        logger.info(\"  Num steps = %d\", num_train_optimization_steps)\n        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)\n        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)\n        # FloatTensor(forward)\n        all_entity_mask = torch.tensor([f.entity_mask for f in train_features], dtype=torch.float)\n        all_entity_seg_pos = torch.tensor([f.entity_seg_pos for f in train_features], dtype=torch.long)\n        all_entity_span1_pos = torch.tensor([f.entity_span1_pos for f in train_features], dtype=torch.float)\n        all_entity_span2_pos = torch.tensor([f.entity_span2_pos for f in 
train_features], dtype=torch.float)\n        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)\n        if output_mode == \"classification\":\n            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)\n        elif output_mode == \"regression\":\n            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)\n\n        train_data = TensorDataset(all_input_ids, all_input_mask, all_entity_mask, all_entity_seg_pos, all_entity_span1_pos, all_entity_span2_pos, all_segment_ids, all_label_ids)\n        if args.local_rank == -1:\n            train_sampler = RandomSampler(train_data)\n        else:\n            train_sampler = DistributedSampler(train_data)\n        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)\n\n        model.train()\n        for epoch_num in trange(int(args.num_train_epochs), desc=\"Epoch\"):\n            tr_loss = 0\n            nb_tr_examples, nb_tr_steps = 0, 0\n            for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")):\n                batch = tuple(t.to(device) for t in batch)\n                input_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, segment_ids, label_ids = batch\n                # define a new function to compute loss values for both output_modes\n                logits = model(input_ids, segment_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, labels=None)\n\n                if output_mode == \"classification\":\n                    loss_fct = CrossEntropyLoss()\n                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))\n                elif output_mode == \"regression\":\n                    loss_fct = MSELoss()\n                    loss = loss_fct(logits.view(-1), label_ids.view(-1))\n\n                if n_gpu > 1:\n                    loss = 
loss.mean() # mean() to average on multi-gpu.\n                if args.gradient_accumulation_steps > 1:\n                    loss = loss / args.gradient_accumulation_steps\n\n                if args.fp16:\n                    optimizer.backward(loss)\n                else:\n                    loss.backward()\n\n                tr_loss += loss.item()\n                nb_tr_examples += input_ids.size(0)\n                nb_tr_steps += 1\n                if (step + 1) % args.gradient_accumulation_steps == 0:\n                    if args.fp16:\n                        # modify learning rate with special warm up BERT uses\n                        # if args.fp16 is False, BertAdam is used that handles this automatically\n                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)\n                        for param_group in optimizer.param_groups:\n                            param_group['lr'] = lr_this_step\n                    optimizer.step()\n                    optimizer.zero_grad()\n                    global_step += 1\n\n            # Save a trained model, configuration and tokenizer\n            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self\n\n            # If we save using the predefined names, we can load using `from_pretrained`\n            output_dir_ = args.output_dir+str(epoch_num)\n            if not os.path.exists(output_dir_):\n                os.makedirs(output_dir_)\n            \n            output_model_file = os.path.join(output_dir_, WEIGHTS_NAME)\n            output_config_file = os.path.join(output_dir_, CONFIG_NAME)\n\n            torch.save(model_to_save.state_dict(), output_model_file)\n           \n            model_to_save.config.to_json_file(output_config_file)\n            tokenizer.save_vocabulary(output_dir_)\n           \n            # Save latest model to load\n            output_model_file = os.path.join(args.output_dir, 
WEIGHTS_NAME)\n            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)\n            \n            torch.save(model_to_save.state_dict(), output_model_file)\n            model_to_save.config.to_json_file(output_config_file)\n            tokenizer.save_vocabulary(args.output_dir)\n           \n\n    if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0):\n        # Load a trained model and vocabulary that you have fine-tuned\n        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)\n        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)\n        model.to(device)\n        \n        eval_examples = processor.get_test_examples(args.data_dir)\n        eval_features = convert_examples_to_features(\n            eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)\n        logger.info(\"***** Running evaluation *****\")\n        logger.info(\"  Num examples = %d\", len(eval_examples))\n        logger.info(\"  Batch size = %d\", args.eval_batch_size)\n        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\n        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\n        all_entity_mask = torch.tensor([f.entity_mask for f in eval_features], dtype=torch.float)\n        all_entity_seg_pos = torch.tensor([f.entity_seg_pos for f in eval_features], dtype=torch.long)\n        all_entity_span1_pos = torch.tensor([f.entity_span1_pos for f in eval_features], dtype=torch.float)\n        all_entity_span2_pos = torch.tensor([f.entity_span2_pos for f in eval_features], dtype=torch.float)\n        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\n\n        if output_mode == \"classification\":\n            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)\n        elif output_mode == 
\"regression\":\n            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)\n\n        eval_data = TensorDataset(all_input_ids, all_input_mask, all_entity_mask, all_entity_seg_pos, all_entity_span1_pos, all_entity_span2_pos, all_segment_ids, all_label_ids)\n        # Run prediction for full data\n        eval_sampler = SequentialSampler(eval_data)\n        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)\n\n        model.eval()\n        eval_loss = 0\n        nb_eval_steps = 0\n        preds = []\n\n        for input_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, segment_ids, label_ids in tqdm(eval_dataloader, desc=\"Evaluating\"):\n            input_ids = input_ids.to(device)\n            input_mask = input_mask.to(device)\n            entity_mask = entity_mask.to(device)\n            entity_seg_pos = entity_seg_pos.to(device)\n            entity_span1_pos = entity_span1_pos.to(device)\n            entity_span2_pos = entity_span2_pos.to(device)\n            segment_ids = segment_ids.to(device)\n            label_ids = label_ids.to(device)\n            with torch.no_grad():\n                logits = model(input_ids, segment_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, labels=None)\n                #logits = model(input_ids, segment_ids, input_mask, labels=None)\n\n            # create eval loss and other metric required by the task\n            if output_mode == \"classification\":\n                loss_fct = CrossEntropyLoss()\n                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))\n            elif output_mode == \"regression\":\n                loss_fct = MSELoss()\n                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))\n            \n            eval_loss += tmp_eval_loss.mean().item()\n            nb_eval_steps += 1\n            if len(preds) == 0:\n 
               preds.append(logits.detach().cpu().numpy())\n            else:\n                preds[0] = np.append(\n                    preds[0], logits.detach().cpu().numpy(), axis=0)\n\n        eval_loss = eval_loss / nb_eval_steps\n        preds = preds[0]\n        if output_mode == \"classification\":\n            preds = np.argmax(preds, axis=1)\n        elif output_mode == \"regression\":\n            preds = np.squeeze(preds)\n        result = compute_metrics(task_name, preds, all_label_ids.numpy())\n        loss = tr_loss/global_step if args.do_train else None\n\n        result['eval_loss'] = eval_loss\n        result['global_step'] = global_step\n        result['loss'] = loss\n\n        output_eval_file = os.path.join(args.output_dir, \"eval_results.txt\")\n        with open(output_eval_file, \"w\") as writer:\n            logger.info(\"***** Eval results *****\")\n            for key in sorted(result.keys()):\n                logger.info(\"  %s = %s\", key, str(result[key]))\n                writer.write(\"%s = %s\\n\" % (key, str(result[key])))\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/tacred_run_infer.py",
    "content": "from __future__ import absolute_import, division, print_function\n\nimport argparse\nimport csv\nimport logging\nimport os\nimport random\n\nimport sys\nsys.path.append('..')\nos.environ['CUDA_VISIBLE_DEVICES']='0'\n\nimport copy\nimport json\nimport numpy as np\nimport torch\nfrom torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,\n                              TensorDataset)\nfrom torch.utils.data.distributed import DistributedSampler\nfrom tqdm import tqdm, trange\n\nfrom scipy.stats import pearsonr, spearmanr\nfrom sklearn.metrics import matthews_corrcoef, f1_score, classification_report,precision_recall_fscore_support\n\nfrom pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig\nfrom pytorch_pretrained_bert.tokenization import BertTokenizer\n\nfrom pprint import pprint\nimport textdistance\nimport neuralcoref\nimport en_core_web_sm\nfrom itertools import groupby, combinations\nfrom utils import get_candidate_input\n\nfrom flask import Flask,request,jsonify\napp = Flask(__name__)\n\n\nclass InputExample(object):\n    \"\"\"A single training/test example for simple sequence classification.\"\"\"\n\n    def __init__(self, guid, text_a, text_b=None, label=None, entity_pos=None):\n        \"\"\"Constructs a InputExample.\n\n        Args:\n            guid: Unique id for the example.\n            text_a: string. The untokenized text of the first sequence. For single\n            sequence tasks, only this sequence must be specified.\n            text_b: (Optional) string. The untokenized text of the second sequence.\n            Only must be specified for sequence pair tasks.\n            label: (Optional) string. The label of the example. 
This should be\n            specified for train and dev examples, but not for test examples.\n        \"\"\"\n        self.guid = guid\n        self.text_a = text_a\n        self.text_b = text_b\n        self.label = label\n        self.entity_pos = entity_pos\n\nclass InputFeatures(object):\n    \"\"\"A single set of features of data.\"\"\"\n\n    def __init__(self,input_ids, input_mask, segment_ids, label_id, entity_mask=None, entity_seg_pos=None, entity_span1_pos=None, entity_span2_pos=None):\n        self.input_ids = input_ids\n        self.input_mask = input_mask\n        self.segment_ids = segment_ids\n        self.label_id = label_id\n        self.entity_mask = entity_mask\n        self.entity_seg_pos = entity_seg_pos\n        self.entity_span1_pos = entity_span1_pos\n        self.entity_span2_pos = entity_span2_pos\n\n\nclass DataProcessor(object):\n    \"\"\"Base class for data converters for sequence classification data sets.\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"Gets a collection of `InputExample`s for the train set.\"\"\"\n        raise NotImplementedError()\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"Gets a collection of `InputExample`s for the dev set.\"\"\"\n        raise NotImplementedError()\n\n    def get_labels(self):\n        \"\"\"Gets the list of labels for this data set.\"\"\"\n        raise NotImplementedError()\n\n    @classmethod\n    def _read_tsv(cls, input_file, quotechar=None):\n        \"\"\"Reads a tab separated value file.\"\"\"\n        with open(input_file, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.reader(f, delimiter=\"\\t\", quotechar=quotechar)\n            lines = []\n            for line in reader:\n                if sys.version_info[0] == 2:\n                    line = list(unicode(cell, 'utf-8') for cell in line)\n                lines.append(line)\n            return lines\n\nclass TacredProcessor(DataProcessor):\n    \"\"\"Processor for the TACRED 
dataset.\"\"\"\n\n    def get_train_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        logger.info(\"LOOKING AT {}\".format(os.path.join(data_dir, \"train.jsonl\")))\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"train_dev.jsonl\")), \"train\")\n\n    def get_dev_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"dev.jsonl\")), \"dev\")\n    \n    def get_test_examples(self, data_dir):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(\n            self._read_tsv(os.path.join(data_dir, \"test.jsonl\")), \"test\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return ['per:parents', 'per:country_of_birth', 'org:political/religious_affiliation', 'org:parents', 'org:members', 'per:schools_attended', 'org:shareholders', 'per:stateorprovince_of_death', 'per:age', 'per:city_of_death', 'per:siblings', 'per:date_of_birth', 'org:founded', 'per:stateorprovince_of_birth', 'per:origin', 'per:charges', 'per:children', 'per:title', 'per:countries_of_residence', 'org:top_members/employees', 'per:religion', 'per:country_of_death', 'per:employee_of', 'no_relation', 'per:stateorprovinces_of_residence', 'org:city_of_headquarters', 'org:dissolved', 'per:date_of_death', 'per:other_family', 'per:alternate_names', 'org:number_of_employees/members', 'per:spouse', 'per:cause_of_death', 'org:alternate_names', 'org:founded_by', 'org:stateorprovince_of_headquarters', 'per:city_of_birth', 'org:subsidiaries', 'org:website', 'org:member_of', 'per:cities_of_residence', 'org:country_of_headquarters']\n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            guid = \"%s-%s\" % (set_type, i)\n            line = json.loads(line[0])\n           
 text_a = ' '.join(line['tokens'])\n            label = line['label']\n            entity_pos = line['entities']\n            # 假设entity之间不重叠\n            entity_pos = sorted(entity_pos) \n            examples.append(\n                InputExample(guid=guid, text_a=text_a, label=label, entity_pos = entity_pos))\n        return examples\n\nclass _TacredProcessor(DataProcessor):\n    \"\"\"Processor for the TACRED dataset.\"\"\"\n\n    def get_test_examples(self, lines):\n        \"\"\"See base class.\"\"\"\n        return self._create_examples(lines, \"test\")\n\n    def get_labels(self):\n        \"\"\"See base class.\"\"\"\n        return ['per:parents', 'per:country_of_birth', 'org:political/religious_affiliation', 'org:parents', 'org:members', 'per:schools_attended', 'org:shareholders', 'per:stateorprovince_of_death', 'per:age', 'per:city_of_death', 'per:siblings', 'per:date_of_birth', 'org:founded', 'per:stateorprovince_of_birth', 'per:origin', 'per:charges', 'per:children', 'per:title', 'per:countries_of_residence', 'org:top_members/employees', 'per:religion', 'per:country_of_death', 'per:employee_of', 'no_relation', 'per:stateorprovinces_of_residence', 'org:city_of_headquarters', 'org:dissolved', 'per:date_of_death', 'per:other_family', 'per:alternate_names', 'org:number_of_employees/members', 'per:spouse', 'per:cause_of_death', 'org:alternate_names', 'org:founded_by', 'org:stateorprovince_of_headquarters', 'per:city_of_birth', 'org:subsidiaries', 'org:website', 'org:member_of', 'per:cities_of_residence', 'org:country_of_headquarters']\n    \n    def _create_examples(self, lines, set_type):\n        \"\"\"Creates examples for the training and dev sets.\"\"\"\n        examples = []\n        for (i, line) in enumerate(lines):\n            guid = \"%s-%s\" % (set_type, i)\n            text_a = ' '.join(line['tokens'])\n            label = line['label']\n            entity_pos = line['entities']\n            # 假设entity之间不重叠\n            entity_pos = 
sorted(entity_pos) \n            examples.append(\n                InputExample(guid=guid, text_a=text_a, label=label, entity_pos = entity_pos))\n        return examples\n\n\ndef convert_examples_to_features(examples, label_list, max_seq_length,\n                                 tokenizer, output_mode):\n    \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n\n    label_map = {label : i for i, label in enumerate(label_list)}\n    reverse_label_map = {i : label for i, label in enumerate(label_list)}\n    samples = []\n    features = []\n    for (ex_index, example) in enumerate(examples):\n        old_entity_pos = copy.deepcopy(example.entity_pos)\n        tokens_a, new_entity_pos = tokenizer.tokenize(example.text_a,example.entity_pos)\n        \n        old_entity0_ = ' '.join(example.text_a.split()[old_entity_pos[0][0]:old_entity_pos[0][1]])\n        old_entity1_ = ' '.join(example.text_a.split()[old_entity_pos[1][0]:old_entity_pos[1][1]])\n        \n        old_entity0 = ''.join(example.text_a.split()[old_entity_pos[0][0]:old_entity_pos[0][1]])\n        old_entity1 = ''.join(example.text_a.split()[old_entity_pos[1][0]:old_entity_pos[1][1]])\n        new_entity0 = ''.join(tokens_a[new_entity_pos[0][0]:new_entity_pos[0][1]])\n        new_entity1 = ''.join(tokens_a[new_entity_pos[1][0]:new_entity_pos[1][1]])\n        \n        old_entity0 = old_entity0.lower()\n        old_entity1 = old_entity1.lower()\n        if '##' in new_entity0 or '##' in new_entity1:\n            new_entity0 = new_entity0.replace('#','')\n            new_entity1 = new_entity1.replace('#','')\n        \n        try:\n            assert(old_entity0 == new_entity0)\n            assert(old_entity1 == new_entity1)\n        except:\n            continue\n        # Entity marker\n        tokens_a_ = copy.deepcopy(tokens_a) \n        new_entity_pos_ = copy.deepcopy(new_entity_pos)\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        entity2_start, 
entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        \n        tokens_a.insert(entity1_start, '<s1>') \n        new_entity_pos[0][0] = entity1_start\n        tokens_a.insert(entity1_end+1, '<e1>')\n        new_entity_pos[0][1] = entity1_end+1+1\n        tokens_a.insert(entity2_start+2, '<s2>')\n        new_entity_pos[1][0] = entity2_start+2\n        tokens_a.insert(entity2_end+3,'<e2>')\n        new_entity_pos[1][1] = entity2_end+3+1\n\n        if new_entity_pos[1][1] > max_seq_length - 2 - 1:\n            continue\n            \n        tokens_b = None\n        if example.text_b:\n            tokens_b = tokenizer.tokenize(example.text_b)\n            # Modifies `tokens_a` and `tokens_b` in place so that the total\n            # length is less than the specified length.\n            # Account for [CLS], [SEP], [SEP] with \"- 3\"\n            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n        else:\n            # Account for [CLS] and [SEP] with \"- 2\"\n            if len(tokens_a) > max_seq_length - 2:\n                tokens_a = tokens_a[:(max_seq_length - 2)]\n\n        # The convention in BERT is:\n        # (a) For sequence pairs:\n        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1\n        # (b) For single sequences:\n        #  tokens:   [CLS] the dog is hairy . [SEP]\n        #  type_ids: 0   0   0   0  0     0 0\n        #\n        # Where \"type_ids\" are used to indicate whether this is the first\n        # sequence or the second sequence. The embedding vectors for `type=0` and\n        # `type=1` were learned during pre-training and are added to the wordpiece\n        # embedding vector (and position vector). 
This is not *strictly* necessary\n        # since the [SEP] token unambiguously separates the sequences, but it makes\n        # it easier for the model to learn the concept of sequences.\n        #\n        # For classification tasks, the first vector (corresponding to [CLS]) is\n        # used as as the \"sentence vector\". Note that this only makes sense because\n        # the entire model is fine-tuned.\n        tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n        segment_ids = [0] * len(tokens)\n\n        if tokens_b:\n            tokens += tokens_b + [\"[SEP]\"]\n            segment_ids += [1] * (len(tokens_b) + 1)\n\n        input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n        # The mask has 1 for real tokens and 0 for padding tokens. Only real\n        # tokens are attended to.\n        input_mask = [1] * len(input_ids)\n\n        # Zero-pad up to the sequence length.\n        padding = [0] * (max_seq_length - len(input_ids))\n        input_ids += padding\n        input_mask += padding\n        segment_ids += padding\n        \n\n        # Used for mention pooling\n        entity_mask_tag = 1\n        entity_mask = [0] * len(input_ids)\n        for entity in new_entity_pos:\n            start, end = entity[0],entity[1]\n            for i in range(start, end):\n                # [CLS], need to +1 offset\n                entity_mask[i+1] = entity_mask_tag\n        \n        \"\"\"\n            Different position embedding\n        \"\"\"\n        # Strategy 1\n        entity1_pos_tag = 1\n        entity2_pos_tag = 2\n\n        entity_seg_pos = [0] * len(input_ids)\n\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        for i in range(entity1_start, entity1_end):\n            entity_seg_pos[i+1] = entity1_pos_tag\n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        for i in range(entity2_start, entity2_end):\n            entity_seg_pos[i+1] = entity2_pos_tag\n        \n      
  # Strategy 2\n        entity_start_pos_tag = 1\n        entity_seg_pos_ = [0] * len(input_ids)\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        entity_seg_pos_[entity1_start+1] = entity_start_pos_tag\n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        entity_seg_pos_[entity2_start+1] = entity_start_pos_tag\n\n        # Strategy 3\n        entity_span1_pos = [0] * len(input_ids)\n        entity1_start, entity1_end = new_entity_pos[0][0], new_entity_pos[0][1] \n        for i in range(len(entity_span1_pos)):\n            if i < entity1_start:\n                #entity_span1_pos[i] = np.abs(i - entity1_start)\n                entity_span1_pos[i] = i - entity1_start\n            elif entity1_start <= i and i < entity1_end:\n                entity_span1_pos[i] = 0\n            elif i >= entity1_end:\n                entity_span1_pos[i] = i - entity1_end + 1\n        \n        entity_span2_pos = [0] * len(input_ids)\n        entity2_start, entity2_end = new_entity_pos[1][0], new_entity_pos[1][1] \n        for i in range(len(entity_span2_pos)):\n            if i < entity2_start:\n                #entity_span2_pos[i] = np.abs(i - entity2_start)\n                entity_span2_pos[i] = i - entity2_start\n            elif entity2_start <= i and i < entity2_end:\n                entity_span2_pos[i] = 0\n            elif i >= entity2_end:\n                entity_span2_pos[i] = i - entity2_end + 1\n\n        # Avoid to get negative position to fuck the nn.Embedding\n        #entity_span1_pos = [pos+max_seq_length-1 for pos in entity_span1_pos]\n        #entity_span2_pos = [pos+max_seq_length-1 for pos in entity_span2_pos]\n        \n        assert len(input_ids) == max_seq_length\n        assert len(input_mask) == max_seq_length\n        assert len(segment_ids) == max_seq_length\n        assert len(entity_mask) == max_seq_length\n        assert len(entity_seg_pos) == max_seq_length\n        assert 
len(entity_seg_pos_) == max_seq_length\n        assert len(entity_span1_pos) == max_seq_length\n        assert len(entity_span2_pos) == max_seq_length\n        if output_mode == \"classification\":\n            label_id = label_map[example.label]\n        else:\n            raise KeyError(output_mode)\n\n        print(\"tokens: %s\" % \" \".join([str(x) for x in tokens]))\n        print(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n        print(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n        print(\"entity_mask: %s\" % \" \".join([str(x) for x in entity_mask]))\n        print(\"entity_seg_pos: %s\" % \" \".join([str(x) for x in entity_seg_pos]))\n        print(\"entity_seg_pos_: %s\" % \" \".join([str(x) for x in entity_seg_pos_]))\n        print(\"entity_span1_pos: %s\" % \" \".join([str(x) for x in entity_span1_pos]))\n        print(\"entity_span2_pos: %s\" % \" \".join([str(x) for x in entity_span2_pos]))\n        print(\"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n        print(\"label: %s (id = %d)\" % (example.label, label_id))\n\n        samples.append([example.text_a, (old_entity0_,old_entity1_)])\n        features.append(\n                InputFeatures(input_ids=input_ids,\n                              input_mask=input_mask,\n                              segment_ids=segment_ids,\n                              label_id=label_id,\n                              entity_mask=entity_mask,\n                              entity_seg_pos=entity_seg_pos_,\n                              entity_span1_pos=entity_span1_pos,\n                              entity_span2_pos=entity_span2_pos))\n    return features, samples, reverse_label_map\n\ndef _truncate_seq_pair(tokens_a, tokens_b, max_length):\n    \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n\n    # This is a simple heuristic which will always truncate the longer sequence\n    # one token at a time. 
This makes more sense than truncating an equal percent\n    # of tokens from each, since if one sequence is very short then each token\n    # that's truncated likely contains more information than a longer sequence.\n    while True:\n        total_length = len(tokens_a) + len(tokens_b)\n        if total_length <= max_length:\n            break\n        if len(tokens_a) > len(tokens_b):\n            tokens_a.pop()\n        else:\n            tokens_b.pop()\n\ndef load_model():\n    parser = argparse.ArgumentParser()\n\n    ## Required parameters\n    parser.add_argument(\"--data_dir\",\n                        default='/data/share/zhanghaipeng/tre/datasets/data/tacred/',\n                        type=str,\n                        help=\"The input data dir. Should contain the .tsv files (or other data files) for the task.\")\n    parser.add_argument(\"--bert_model\",\n                        default='model/bert-base-uncased',\n                        type=str,\n                        help=\"Bert pre-trained model selected in the list: bert-base-uncased, \"\"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, \"\"bert-base-multilingual-cased, bert-base-chinese.\")\n    parser.add_argument(\"--task_name\",\n                        default='tacred',\n                        type=str,\n                        help=\"The name of the task to train.\")\n    parser.add_argument(\"--output_dir\",\n                        default='train/23/tacred3.0',\n                        type=str,\n                        help=\"The output directory where the model predictions and checkpoints will be written.\")\n\n    ## Other parameters\n    parser.add_argument(\"--cache_dir\",\n                        default=\"cache/\",\n                        type=str,\n                        help=\"Where do you want to store the pre-trained models downloaded from s3\")\n    parser.add_argument(\"--max_seq_length\",\n                        default=128,\n         
               type=int,\n                        help=\"The maximum total input sequence length after WordPiece tokenization. \\n\"\n                             \"Sequences longer than this will be truncated, and sequences shorter \\n\"\n                             \"than this will be padded.\")\n    parser.add_argument(\"--do_lower_case\",\n                        action='store_true',\n                        help=\"Set this flag if you are using an uncased model.\")\n    parser.add_argument(\"--test_batch_size\",\n                        default=1,\n                        type=int,\n                        help=\"Total batch size for eval.\")\n    parser.add_argument(\"--no_cuda\",\n                        action='store_true',\n                        help=\"Whether not to use CUDA when available\")\n    parser.add_argument(\"--local_rank\",\n                        type=int,\n                        default=-1,\n                        help=\"local_rank for distributed training on gpus\")\n    parser.add_argument('--seed',                            type=int,\n                        default=42,\n                        help=\"random seed for initialization\")\n    parser.add_argument('--fp16',\n                        action='store_true',\n                        help=\"Whether to use 16-bit float precision instead of 32-bit\")\n    args = parser.parse_args()\n\n\n    if args.local_rank == -1 or args.no_cuda:\n        device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n        n_gpu = torch.cuda.device_count()\n    else:\n        torch.cuda.set_device(args.local_rank)\n        device = torch.device(\"cuda\", args.local_rank)\n        n_gpu = 1\n\n    task_name = args.task_name.lower()\n    processor = _TacredProcessor()\n    output_mode = 'classification'\n\n    label_list = processor.get_labels()\n    num_labels = len(label_list)\n    # Load a trained model and vocabulary that you have fine-tuned\n    model = 
BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)\n    tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)\n    if n_gpu > 1:\n        model = torch.nn.DataParallel(model)\n    model.to(device)\n    model.eval()\n    return model, tokenizer, label_list, args, output_mode, processor, device\n\ndef get_helper_model(spacy_used=False):\n    from simplex_sdk import SimplexClient\n    if spacy_used:\n        model = en_core_web_sm.load() \n    else:\n        model = SimplexClient('BertNerApi-tmp')\n    return model\n\n# 载入模型\nmodel, tokenizer, label_list, args, output_mode, processor, device = load_model()\nner_model = get_helper_model(spacy_used=False)\n\n@app.route('/nre')\ndef predict():\n    data = request.args\n    if 'text' not in data.keys():\n        warning_str = 'pleasen input right arg!\\n'\n        return jsonify(warning_str)\n    else:\n        line = data['text']\n    \n    relation = []\n    candidate_input_list, entity = get_candidate_input(line, ner_model,spacy_used=False)\n    # 实体未找到\n    if not candidate_input_list:\n        return jsonify({'relations':relation,'entities':entity})\n        \n    test_examples = processor.get_test_examples(candidate_input_list)\n    \n    test_features,samples,reverse_label_map = convert_examples_to_features(\n        test_examples, label_list, args.max_seq_length, tokenizer, output_mode)\n    \n    test_num = len(candidate_input_list)\n    for i in range(test_num):\n        f,sample = test_features[i],samples[i]\n        input_ids = torch.tensor(f.input_ids,dtype=torch.long).reshape(1, args.max_seq_length).to(device)\n        segment_ids = torch.tensor(f.segment_ids,dtype=torch.long).reshape(1, args.max_seq_length).to(device)\n        input_mask = torch.tensor(f.input_mask,dtype=torch.long).reshape(1, args.max_seq_length).to(device)\n        entity_mask = torch.tensor(f.entity_mask,dtype=torch.float).reshape(1, 
args.max_seq_length).to(device)\n        entity_seg_pos = torch.tensor(f.entity_seg_pos,dtype=torch.long).reshape(1, args.max_seq_length).to(device)\n        entity_span1_pos = torch.tensor(f.entity_span1_pos,dtype=torch.float).reshape(1, args.max_seq_length).to(device)\n        entity_span2_pos = torch.tensor(f.entity_span2_pos,dtype=torch.float).reshape(1, args.max_seq_length).to(device)\n        \n        with torch.no_grad():\n            logits = model(input_ids, segment_ids, input_mask, entity_mask, entity_seg_pos, entity_span1_pos, entity_span2_pos, labels=None)\n        \"\"\"\n            np.argmax v.s. torch.argmax\n        \"\"\"\n        pred_id = np.argmax(logits.detach().cpu().numpy()[0].tolist()) \n        pred_label = reverse_label_map[pred_id]\n        text,entity0, entity1 = sample[0], sample[1][0], sample[1][1]\n        relation.append({'text':text,'entity pair':[entity0,entity1],'relation':pred_label})\n    return jsonify({'relations':relation,'entities':entity})\n\nif __name__ == \"__main__\":\n    app.run(host='0.0.0.0',port='5050',debug=True)\n"
  },
  {
    "path": "examples/test.sh",
    "content": "#export GLUE_DIR=/data/share/zhanghaipeng/pytorch-pretrained-BERT/examples/general_ner_test\nexport GLUE_DIR=/data/share/zhanghaipeng/tre/datasets/data\nexport TASK_NAME=tacred\n\nEXPR=23\nBS=64\nCUDA=0\nEPOCH=3.0\n\nCUDA_VISIBLE_DEVICES=$CUDA python tacred_run_classifier.py \\\n\t--task_name $TASK_NAME \\\n\t--do_test \\\n\t--do_lower_case \\\n\t--data_dir $GLUE_DIR/tacred/ \\\n\t--max_seq_length 128 \\\n\t--eval_batch_size $BS \\\n\t--output_dir train/$EXPR/$TASK_NAME$EPOCH \\\n\t--bert_model model/bert-large-uncased\n"
  },
  {
    "path": "examples/train.sh",
    "content": "export GLUE_DIR=/data/share/zhanghaipeng/tre/datasets/data\nexport TASK_NAME=tacred\n\nEXPR=25\nBS=16\nCUDA=2\nLR=3e-5\nEPOCH=4.0\n\nCUDA_VISIBLE_DEVICES=$CUDA python tacred_run_classifier.py \\\n\t--task_name $TASK_NAME \\\n\t--do_train \\\n\t--do_lower_case \\\n\t--data_dir $GLUE_DIR/tacred/ \\\n\t--max_seq_length 128 \\\n\t--train_batch_size $BS \\\n\t--learning_rate $LR \\\n\t--num_train_epochs $EPOCH \\\n\t--output_dir train/$EXPR/$TASK_NAME$EPOCH \\\n\t--bert_model model/bert-base-uncased\n"
  },
  {
    "path": "hubconf.py",
    "content": "dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex']\n\nfrom hubconfs.bert_hubconf import (\n    bertTokenizer,\n    bertModel,\n    bertForNextSentencePrediction,\n    bertForPreTraining,\n    bertForMaskedLM,\n    bertForSequenceClassification,\n    bertForMultipleChoice,\n    bertForQuestionAnswering,\n    bertForTokenClassification\n)\nfrom hubconfs.gpt_hubconf import (\n    openAIGPTTokenizer,\n    openAIGPTModel,\n    openAIGPTLMHeadModel,\n    openAIGPTDoubleHeadsModel\n)\nfrom hubconfs.gpt2_hubconf import (\n    gpt2Tokenizer,\n    gpt2Model,\n    gpt2LMHeadModel,\n    gpt2DoubleHeadsModel\n)\nfrom hubconfs.transformer_xl_hubconf import (\n    transformerXLTokenizer,\n    transformerXLModel,\n    transformerXLLMHeadModel\n)\n"
  },
  {
    "path": "hubconfs/bert_hubconf.py",
    "content": "from pytorch_pretrained_bert.tokenization import BertTokenizer\nfrom pytorch_pretrained_bert.modeling import (\n        BertModel,\n        BertForNextSentencePrediction,\n        BertForMaskedLM,\n        BertForMultipleChoice,\n        BertForPreTraining,\n        BertForQuestionAnswering,\n        BertForSequenceClassification,\n        BertForTokenClassification,\n        )\n\n# A lot of models share the same param doc. Use a decorator\n# to save typing\nbert_docstring = \"\"\"\n    Params:\n        pretrained_model_name_or_path: either:\n            - a str with the name of a pre-trained model to load\n                . `bert-base-uncased`\n                . `bert-large-uncased`\n                . `bert-base-cased`\n                . `bert-large-cased`\n                . `bert-base-multilingual-uncased`\n                . `bert-base-multilingual-cased`\n                . `bert-base-chinese`\n                . `bert-base-german-cased`\n                . `bert-large-uncased-whole-word-masking`\n                . `bert-large-cased-whole-word-masking`\n            - a path or url to a pretrained model archive containing:\n                . `bert_config.json` a configuration file for the model\n                . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining\n                  instance\n            - a path or url to a pretrained model archive containing:\n                . `bert_config.json` a configuration file for the model\n                . 
`model.chkpt` a TensorFlow checkpoint\n        from_tf: should we load the weights from a locally saved TensorFlow\n                 checkpoint\n        cache_dir: an optional path to a folder in which the pre-trained models\n                   will be cached.\n        state_dict: an optional state dictionnary\n                    (collections.OrderedDict object) to use instead of Google\n                    pre-trained models\n        *inputs, **kwargs: additional input for the specific Bert class\n            (ex: num_labels for BertForSequenceClassification)\n\"\"\"\n\n\ndef _append_from_pretrained_docstring(docstr):\n    def docstring_decorator(fn):\n        fn.__doc__ = fn.__doc__ + docstr\n        return fn\n    return docstring_decorator\n\n\ndef bertTokenizer(*args, **kwargs):\n    \"\"\"\n    Instantiate a BertTokenizer from a pre-trained/customized vocab file\n    Args:\n    pretrained_model_name_or_path: Path to pretrained model archive\n                                   or one of pre-trained vocab configs below.\n                                       * bert-base-uncased\n                                       * bert-large-uncased\n                                       * bert-base-cased\n                                       * bert-large-cased\n                                       * bert-base-multilingual-uncased\n                                       * bert-base-multilingual-cased\n                                       * bert-base-chinese\n    Keyword args:\n    cache_dir: an optional path to a specific directory to download and cache\n               the pre-trained model weights.\n               Default: None\n    do_lower_case: Whether to lower case the input.\n                   Only has an effect when do_wordpiece_only=False\n                   Default: True\n    do_basic_tokenize: Whether to do basic tokenization before wordpiece.\n                       Default: True\n    max_len: An artificial maximum length to truncate tokenized 
sequences to;\n             Effective maximum length is always the minimum of this\n             value (if specified) and the underlying BERT model's\n             sequence length.\n             Default: None\n    never_split: List of tokens which will never be split during tokenization.\n                 Only has an effect when do_wordpiece_only=False\n                 Default: [\"[UNK]\", \"[SEP]\", \"[PAD]\", \"[CLS]\", \"[MASK]\"]\n\n    Example:\n        >>> import torch\n        >>> sentence = 'Hello, World!'\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)\n        >>> toks = tokenizer.tokenize(sentence)\n        ['Hello', '##,', 'World', '##!']\n        >>> ids = tokenizer.convert_tokens_to_ids(toks)\n        [8667, 28136, 1291, 28125]\n    \"\"\"\n    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)\n    return tokenizer\n\n\n@_append_from_pretrained_docstring(bert_docstring)\ndef bertModel(*args, **kwargs):\n    \"\"\"\n    BertModel is the basic BERT Transformer model with a layer of summed token,\n    position and sequence embeddings followed by a series of identical\n    self-attention blocks (12 for BERT-base, 24 for BERT-large).\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)\n        #  Prepare tokenized input\n        >>> text = \"[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]\"\n        >>> tokenized_text = tokenizer.tokenize(text)\n        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n        >>> tokens_tensor = torch.tensor([indexed_tokens])\n        >>> segments_tensors = torch.tensor([segments_ids])\n        # Load bertModel\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased')\n        >>> model.eval()\n        # Predict hidden states features for each layer\n        >>> with torch.no_grad():\n                encoded_layers, _ = model(tokens_tensor, segments_tensors)\n    \"\"\"\n    model = BertModel.from_pretrained(*args, **kwargs)\n    return model\n\n\n@_append_from_pretrained_docstring(bert_docstring)\ndef bertForNextSentencePrediction(*args, **kwargs):\n    \"\"\"\n    BERT model with next sentence prediction head.\n    This module comprises the BERT model followed by the next sentence\n    classification head.\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)\n        #  Prepare tokenized input\n        >>> text = \"[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]\"\n        >>> tokenized_text = tokenizer.tokenize(text)\n        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n        >>> tokens_tensor = torch.tensor([indexed_tokens])\n        >>> segments_tensors = torch.tensor([segments_ids])\n        # Load bertForNextSentencePrediction\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForNextSentencePrediction', 'bert-base-cased')\n        >>> model.eval()\n        # Predict the next sentence classification logits\n        >>> with torch.no_grad():\n                next_sent_classif_logits = model(tokens_tensor, segments_tensors)\n    \"\"\"\n    model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)\n    return model\n\n\n@_append_from_pretrained_docstring(bert_docstring)\ndef bertForPreTraining(*args, **kwargs):\n    \"\"\"\n    BERT model with pre-training heads.\n    This module comprises the BERT model followed by the two pre-training heads\n        - the masked language modeling head, and\n        - the next sentence classification head.\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)\n        #  Prepare tokenized input\n        >>> text = \"[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]\"\n        >>> tokenized_text = tokenizer.tokenize(text)\n        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n        >>> tokens_tensor = torch.tensor([indexed_tokens])\n        >>> segments_tensors = torch.tensor([segments_ids])\n        # Load bertForPreTraining\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForPreTraining', 'bert-base-cased')\n        >>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)\n    \"\"\"\n    model = BertForPreTraining.from_pretrained(*args, **kwargs)\n    return model\n\n\n@_append_from_pretrained_docstring(bert_docstring)\ndef bertForMaskedLM(*args, **kwargs):\n    \"\"\"\n    BertForMaskedLM includes the BertModel Transformer followed by the\n    (possibly) pre-trained masked language modeling head.\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)\n        #  Prepare tokenized input\n        >>> text = \"[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]\"\n        >>> tokenized_text = tokenizer.tokenize(text)\n        >>> masked_index = 8\n        >>> tokenized_text[masked_index] = '[MASK]'\n        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n        >>> tokens_tensor = torch.tensor([indexed_tokens])\n        >>> segments_tensors = torch.tensor([segments_ids])\n        # Load bertForMaskedLM\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased')\n        >>> model.eval()\n        # Predict all tokens\n        >>> with torch.no_grad():\n                predictions = model(tokens_tensor, segments_tensors)\n        >>> predicted_index = torch.argmax(predictions[0, masked_index]).item()\n        >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]\n        'henson'\n    \"\"\"\n    model = BertForMaskedLM.from_pretrained(*args, **kwargs)\n    return model\n\n\n@_append_from_pretrained_docstring(bert_docstring)\ndef bertForSequenceClassification(*args, **kwargs):\n    \"\"\"\n    BertForSequenceClassification is a fine-tuning model that includes\n    BertModel and a sequence-level (sequence or pair of sequences) classifier\n    on top of the BertModel. Note that the classification head is only initialized\n    and has to be trained.\n\n    The sequence-level classifier is a linear layer that takes as input the\n    last hidden state of the first character in the input sequence\n    (see Figures 3a and 3b in the BERT paper).\n\n    Args:\n    num_labels: the number (>=2) of classes for the classifier.\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)\n        #  Prepare tokenized input\n        >>> text = \"[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]\"\n        >>> tokenized_text = tokenizer.tokenize(text)\n        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n        >>> tokens_tensor = torch.tensor([indexed_tokens])\n        >>> segments_tensors = torch.tensor([segments_ids])\n        # Load bertForSequenceClassification\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)\n        >>> model.eval()\n        # Predict the sequence classification logits\n        >>> with torch.no_grad():\n                seq_classif_logits = model(tokens_tensor, segments_tensors)\n        # Or get the sequence classification loss\n        >>> labels = torch.tensor([1])\n        >>> seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss\n    \"\"\"\n    model = BertForSequenceClassification.from_pretrained(*args, **kwargs)\n    return model\n\n\n@_append_from_pretrained_docstring(bert_docstring)\ndef bertForMultipleChoice(*args, **kwargs):\n    \"\"\"\n    BertForMultipleChoice is a fine-tuning model that includes BertModel and a\n    linear layer on top of the BertModel. Note that the multiple choice head is\n    only initialized and has to be trained.\n\n    Args:\n    num_choices: the number (>=2) of classes for the classifier.\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)\n        #  Prepare tokenized input\n        >>> text = \"[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]\"\n        >>> tokenized_text = tokenizer.tokenize(text)\n        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n        >>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)\n        >>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)\n        # Load bertForMultipleChoice\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)\n        >>> model.eval()\n        # Predict the multiple choice logits\n        >>> with torch.no_grad():\n                multiple_choice_logits = model(tokens_tensor, segments_tensors)\n        # Or get the multiple choice loss\n        >>> labels = torch.tensor([1])\n        >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss\n    \"\"\"\n    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)\n    return model\n\n\n@_append_from_pretrained_docstring(bert_docstring)\ndef bertForQuestionAnswering(*args, **kwargs):\n    \"\"\"\n    BertForQuestionAnswering is a fine-tuning model that includes BertModel\n    with a token-level classifiers on top of the full sequence of last hidden\n    states. Note that the classification head is only initialized\n    and has to be trained.\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)\n        #  Prepare tokenized input\n        >>> text = \"[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]\"\n        >>> tokenized_text = tokenizer.tokenize(text)\n        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n        >>> tokens_tensor = torch.tensor([indexed_tokens])\n        >>> segments_tensors = torch.tensor([segments_ids])\n        # Load bertForQuestionAnswering\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForQuestionAnswering', 'bert-base-cased')\n        >>> model.eval()\n        # Predict the start and end positions logits\n        >>> with torch.no_grad():\n                start_logits, end_logits = model(tokens_tensor, segments_tensors)\n        # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions\n        >>> start_positions, end_positions = torch.tensor([12]), torch.tensor([14])\n        # set model.train() before if training this loss\n        >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)\n    \"\"\"\n    model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)\n    return model\n\n\n@_append_from_pretrained_docstring(bert_docstring)\ndef bertForTokenClassification(*args, **kwargs):\n    \"\"\"\n    BertForTokenClassification is a fine-tuning model that includes BertModel\n    and a token-level classifier on top of the BertModel. 
Note that the classification\n    head is only initialized and has to be trained.\n\n    The token-level classifier is a linear layer that takes as input the last\n    hidden state of the sequence.\n\n    Args:\n    num_labels: the number (>=2) of classes for the classifier.\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)\n        #  Prepare tokenized input\n        >>> text = \"[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]\"\n        >>> tokenized_text = tokenizer.tokenize(text)\n        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n        >>> tokens_tensor = torch.tensor([indexed_tokens])\n        >>> segments_tensors = torch.tensor([segments_ids])\n        # Load bertForTokenClassification\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)\n        >>> model.eval()\n        # Predict the token classification logits\n        >>> with torch.no_grad():\n                classif_logits = model(tokens_tensor, segments_tensors)\n        # Or get the token classification loss\n        >>> labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])\n        >>> classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss\n    \"\"\"\n    model = BertForTokenClassification.from_pretrained(*args, **kwargs)\n    return model\n"
  },
  {
    "path": "hubconfs/gpt2_hubconf.py",
    "content": "from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer\nfrom pytorch_pretrained_bert.modeling_gpt2 import (\n    GPT2Model,\n    GPT2LMHeadModel,\n    GPT2DoubleHeadsModel\n)\n\n# A lot of models share the same param doc. Use a decorator\n# to save typing\ngpt2_docstring = \"\"\"\n    Params:\n        pretrained_model_name_or_path: either:\n            - a str with the name of a pre-trained model to load selected in the list of:\n                . `gpt2`, `gpt2-medium`\n            - a path or url to a pretrained model archive containing:\n                . `gpt2_config.json` a configuration file for the model\n                . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance\n            - a path or url to a pretrained model archive containing:\n                . `gpt2_config.json` a configuration file for the model\n                . a TensorFlow checkpoint with trained weights\n        from_tf: should we load the weights from a locally saved TensorFlow checkpoint\n        cache_dir: an optional path to a folder in which the pre-trained models will be cached.\n        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models\n        *inputs, **kwargs: additional input for the specific GPT-2 class\n\"\"\"\n\n\ndef _append_from_pretrained_docstring(docstr):\n    def docstring_decorator(fn):\n        fn.__doc__ = fn.__doc__ + docstr\n        return fn\n    return docstring_decorator\n\n\ndef gpt2Tokenizer(*args, **kwargs):\n    \"\"\"\n    Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.\n    Peculiarities:\n        - Byte-level BPE\n\n    Args:\n    pretrained_model_name_or_path: Path to pretrained model archive\n                                   or one of pre-trained vocab configs below.\n                                       * gpt2\n    Keyword args:\n    special_tokens: Special tokens in vocabulary that are not pretrained 
([SEP], [CLS]...)\n                    Default: None\n    max_len: An artificial maximum length to truncate tokenized sequences to;\n             Effective maximum length is always the minimum of this\n             value (if specified) and the underlying BERT model's\n             sequence length.\n             Default: None\n\n    Example:\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')\n\n        >>> text = \"Who was Jim Henson ?\"\n        >>> indexed_tokens = tokenizer.encode(tokenized_text)\n    \"\"\"\n    tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)\n    return tokenizer\n\n\n@_append_from_pretrained_docstring(gpt2_docstring)\ndef gpt2Model(*args, **kwargs):\n    \"\"\"\n    gpt2Model is the basic OpenAI GPT-2 Transformer model based on\n    identical stacked masked self-attention blocks and pre-trained\n    on large scale dataset using language modeling signal.\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')\n\n        #  Prepare tokenized input\n        >>> text_1 = \"Who was Jim Henson ?\"\n        >>> text_2 = \"Jim Henson was a puppeteer\"\n        >>> indexed_tokens_1 = tokenizer.encode(text_1)\n        >>> indexed_tokens_2 = tokenizer.encode(text_2)\n        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])\n        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])\n\n        # Load gpt2Model\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Model', 'gpt2')\n        >>> model.eval()\n\n        # Predict hidden states features for each layer\n        # past can be used to reuse precomputed hidden state in a subsequent predictions\n        >>> with torch.no_grad():\n                hidden_states_1, past = model(tokens_tensor_1)\n                hidden_states_2, past = model(tokens_tensor_2, 
past=past)\n    \"\"\"\n    model = GPT2Model.from_pretrained(*args, **kwargs)\n    return model\n\n\n@_append_from_pretrained_docstring(gpt2_docstring)\ndef gpt2LMHeadModel(*args, **kwargs):\n    \"\"\"\n    gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the\n    tied (pre-trained) language modeling head on top.\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')\n\n        #  Prepare tokenized input\n        >>> text_1 = \"Who was Jim Henson ?\"\n        >>> text_2 = \"Jim Henson was a puppeteer\"\n        >>> indexed_tokens_1 = tokenizer.encode(text_1)\n        >>> indexed_tokens_2 = tokenizer.encode(text_2)\n        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])\n        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])\n\n        # Load gpt2LMHeadModel\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2LMHeadModel', 'gpt2')\n        >>> model.eval()\n\n        # Predict hidden states features for each layer\n        # past can be used to reuse precomputed hidden state in a subsequent predictions\n        >>> with torch.no_grad():\n                predictions_1, past = model(tokens_tensor_1)\n                predictions_2, past = model(tokens_tensor_2, past=past)\n\n        # Get the predicted last token\n        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()\n        >>> predicted_token = tokenizer.decode([predicted_index])\n        >>> assert predicted_token == ' who'\n    \"\"\"\n    model = GPT2LMHeadModel.from_pretrained(*args, **kwargs)\n    return model\n\n\n@_append_from_pretrained_docstring(gpt2_docstring)\ndef gpt2DoubleHeadsModel(*args, **kwargs):\n    \"\"\"\n    gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the\n    tied (pre-trained) language modeling head and a multiple choice\n    classification head (only initialized, not 
pre-trained).\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')\n\n        #  Prepare tokenized input\n        >>> text1 = \"Who was Jim Henson ? Jim Henson was a puppeteer\"\n        >>> text2 = \"Who was Jim Henson ? Jim Henson was a mysterious young man\"\n        >>> tokenized_text1 = tokenizer.tokenize(text1)\n        >>> tokenized_text2 = tokenizer.tokenize(text2)\n        >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)\n        >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)\n        >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])\n        >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])\n\n        # Load gpt2DoubleHeadsModel\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2')\n        >>> model.eval()\n\n        # Predict hidden states features for each layer\n        >>> with torch.no_grad():\n                lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids)\n    \"\"\"\n    model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs)\n    return model\n"
  },
  {
    "path": "hubconfs/gpt_hubconf.py",
    "content": "from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer\nfrom pytorch_pretrained_bert.modeling_openai import (\n\tOpenAIGPTModel,\n\tOpenAIGPTLMHeadModel,\n\tOpenAIGPTDoubleHeadsModel\n)\n\n# Dependecies that are not specified in global hubconf.py\nspecific_dependencies = ['spacy', 'ftfy']\n\n# A lot of models share the same param doc. Use a decorator\n# to save typing\ngpt_docstring = \"\"\"\n    OpenAI GPT use a single embedding matrix to store the word and special embeddings.\n    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...\n    Special tokens need to be trained during the fine-tuning if you use them.\n    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.\n\n    The embeddings are ordered as follow in the token embeddings matrice:\n        [0,                                                         ----------------------\n         ...                                                        -> word embeddings\n         config.vocab_size - 1,                                     ______________________\n         config.vocab_size,\n         ...                                                        -> special embeddings\n         config.vocab_size + config.n_special - 1]                  ______________________\n\n    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:\n        total_tokens_embeddings = config.vocab_size + config.n_special\n    You should use the associate indices to index the embeddings.\n\n    Params:\n\t\tpretrained_model_name_or_path: either:\n\t\t\t- a str with the name of a pre-trained model to load selected in the list of:\n\t\t\t\t. `openai-gpt`\n\t\t\t- a path or url to a pretrained model archive containing:\n\t\t\t\t. `openai_gpt_config.json` a configuration file for the model\n\t\t\t\t. 
`pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance\n\t\t\t- a path or url to a pretrained model archive containing:\n\t\t\t\t. `openai-gpt-config.json` a configuration file for the model\n\t\t\t\t. a series of NumPy files containing OpenAI TensorFlow trained weights\n\t\tfrom_tf: should we load the weights from a locally saved TensorFlow checkpoint\n\t\tcache_dir: an optional path to a folder in which the pre-trained models will be cached.\n\t\tstate_dict: an optional state dictionnary (collections.OrderedDict object)\n\t\t        \tto use instead of pre-trained models\n\t\t*inputs, **kwargs: additional input for the specific OpenAI-GPT class\n\"\"\"\n\n\ndef _append_from_pretrained_docstring(docstr):\n    def docstring_decorator(fn):\n        fn.__doc__ = fn.__doc__ + docstr\n        return fn\n    return docstring_decorator\n\n\ndef openAIGPTTokenizer(*args, **kwargs):\n    \"\"\"\n    Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file.\n\tPeculiarities:\n        - lower case all inputs\n        - uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.\n        - argument special_tokens and function set_special_tokens:\n            can be used to add additional symbols (ex: \"__classify__\") to a vocabulary.\n\n    Args:\n    pretrained_model_name_or_path: Path to pretrained model archive\n                                   or one of pre-trained vocab configs below.\n                                       * openai-gpt\n    Keyword args:\n\tspecial_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)\n\t\t\t\t\tDefault: None\n\tmax_len: An artificial maximum length to truncate tokenized sequences to;\n        \t Effective maximum length is always the minimum of this\n             value (if specified) and the underlying BERT model's\n             sequence length.\n\t\t\t Default: None\n\n    Example:\n\t\t>>> import 
torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')\n\t\t\n\t\t>>> text = \"Who was Jim Henson ? Jim Henson was a puppeteer\"\n        >>> tokenized_text = tokenizer.tokenize(text)\n        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n        [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]\n    \"\"\"\n    tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs)\n    return tokenizer\n\n\n@_append_from_pretrained_docstring(gpt_docstring)\ndef openAIGPTModel(*args, **kwargs):\n    \"\"\"\n    OpenAIGPTModel is the basic OpenAI GPT Transformer model based on\n\tidentical stacked masked self-attention blocks and pre-trained\n\ton large scale dataset using language modeling signal.\n\n    Example:\n        # Load the tokenizer\n\t\t>>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')\n\n        #  Prepare tokenized input\n        >>> text = \"Who was Jim Henson ? 
Jim Henson was a puppeteer\"\n        >>> tokenized_text = tokenizer.tokenize(text)\n        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n        >>> tokens_tensor = torch.tensor([indexed_tokens])\n\n        # Load openAIGPTModel\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTModel', 'openai-gpt')\n        >>> model.eval()\n\n        # Predict hidden states features for each layer\n        >>> with torch.no_grad():\n                hidden_states = model(tokens_tensor)\n    \"\"\"\n    model = OpenAIGPTModel.from_pretrained(*args, **kwargs)\n    return model\n\n\n@_append_from_pretrained_docstring(gpt_docstring)\ndef openAIGPTLMHeadModel(*args, **kwargs):\n    \"\"\"\n    OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the\n\ttied (pre-trained) language modeling head on top.\n\n\tExample:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')\n\n        #  Prepare tokenized input\n        >>> text = \"Who was Jim Henson ? 
Jim Henson was a puppeteer\"\n        >>> tokenized_text = tokenizer.tokenize(text)\n        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n        >>> tokens_tensor = torch.tensor([indexed_tokens])\n\n        # Load openAIGPTLMHeadModel\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTLMHeadModel', 'openai-gpt')\n        >>> model.eval()\n\n        # Predict hidden states features for each layer\n        >>> with torch.no_grad():\n                predictions = model(tokens_tensor)\n\n\t\t# Get the predicted last token\n\t\t>>> predicted_index = torch.argmax(predictions[0, -1, :]).item()\n\t\t>>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]\n        '.</w>'\n    \"\"\"\n    model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)\n    return model\n\n\n@_append_from_pretrained_docstring(gpt_docstring)\ndef openAIGPTDoubleHeadsModel(*args, **kwargs):\n    \"\"\"\n    OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the\n\ttied (pre-trained) language modeling head and a multiple choice\n\tclassification head (only initialized, not pre-trained).\n\n\tExample:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')\n\n        #  Prepare tokenized input\n        >>> text1 = \"Who was Jim Henson ? Jim Henson was a puppeteer\"\n        >>> text2 = \"Who was Jim Henson ? 
Jim Henson was a mysterious young man\"\n        >>> tokenized_text1 = tokenizer.tokenize(text1)\n        >>> tokenized_text2 = tokenizer.tokenize(text2)\n        >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)\n        >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)\n        >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])\n        >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])\n\n        # Load openAIGPTDoubleHeadsModel\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt')\n        >>> model.eval()\n\n        # Predict hidden states features for each layer\n        >>> with torch.no_grad():\n                lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)\n    \"\"\"\n    model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs)\n    return model\n"
  },
  {
    "path": "hubconfs/transformer_xl_hubconf.py",
    "content": "from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer\nfrom pytorch_pretrained_bert.modeling_transfo_xl import (\n    TransfoXLModel,\n    TransfoXLLMHeadModel\n)\n\n# A lot of models share the same param doc. Use a decorator\n# to save typing\ntransformer_xl_docstring = \"\"\"\n    Transformer XL uses a relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:\n    - you don't need to specify positioning embeddings indices\n    - the tokens in the vocabulary have to be sorted to decreasing frequency.\n\n    Params:\n        pretrained_model_name_or_path: either:\n            - a str with the name of a pre-trained model to load selected in the list of:\n                . `transfo-xl-wt103`\n            - a path or url to a pretrained model archive containing:\n                . `transfo_xl_config.json` a configuration file for the model\n                . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance\n            - a path or url to a pretrained model archive containing:\n                . `transfo_xl_config.json` a configuration file for the model\n                . 
`model.chkpt` a TensorFlow checkpoint\n        from_tf: should we load the weights from a locally saved TensorFlow checkpoint\n        cache_dir: an optional path to a folder in which the pre-trained models will be cached.\n        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models\n        *inputs, **kwargs: additional input for the specific TransformerXL class\n\"\"\"\n\n\ndef _append_from_pretrained_docstring(docstr):\n    def docstring_decorator(fn):\n        fn.__doc__ = fn.__doc__ + docstr\n        return fn\n    return docstring_decorator\n\n\ndef transformerXLTokenizer(*args, **kwargs):\n    \"\"\"\n    Instantiate a Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl\n\n    Args:\n    pretrained_model_name_or_path: Path to pretrained model archive\n                                   or one of pre-trained vocab configs below.\n                                       * transfo-xl-wt103\n\n    Example:\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')\n        \n        >>> text = \"Who was Jim Henson ?\"\n        >>> tokenized_text = tokenizer.tokenize(text)\n        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n    \"\"\"\n    tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs)\n    return tokenizer\n\n\n@_append_from_pretrained_docstring(transformer_xl_docstring)\ndef transformerXLModel(*args, **kwargs):\n    \"\"\"\n    transformerXLModel is the basic Transformer XL model.\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')\n\n        #  Prepare tokenized input\n        >>> text_1 = \"Who was Jim Henson ?\"\n        >>> text_2 = \"Jim Henson was a puppeteer\"\n      
  >>> tokenized_text_1 = tokenizer.tokenize(text_1)\n        >>> tokenized_text_2 = tokenizer.tokenize(text_2)\n        >>> indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)\n        >>> indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)\n        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])\n        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])\n\n        # Load transformerXLModel\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLModel', 'transfo-xl-wt103')\n        >>> model.eval()\n\n        # Predict hidden states features for each layer\n        # We can re-use the memory cells in a subsequent call to attend a longer context\n        >>> with torch.no_grad():\n                hidden_states_1, mems_1 = model(tokens_tensor_1)\n                hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)\n    \"\"\"\n    model = TransfoXLModel.from_pretrained(*args, **kwargs)\n    return model\n\n\n@_append_from_pretrained_docstring(transformer_xl_docstring)\ndef transformerXLLMHeadModel(*args, **kwargs):\n    \"\"\"\n    transformerXLLMHeadModel is the Transformer XL model with the\n    tied (pre-trained) language modeling head on top.\n\n    Example:\n        # Load the tokenizer\n        >>> import torch\n        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')\n\n        #  Prepare tokenized input\n        >>> text_1 = \"Who was Jim Henson ?\"\n        >>> text_2 = \"Jim Henson was a puppeteer\"\n        >>> tokenized_text_1 = tokenizer.tokenize(text_1)\n        >>> tokenized_text_2 = tokenizer.tokenize(text_2)\n        >>> indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)\n        >>> indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)\n        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])\n        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])\n\n 
       # Load transformerXLLMHeadModel\n        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLLMHeadModel', 'transfo-xl-wt103')\n        >>> model.eval()\n\n        # Predict hidden states features for each layer\n        # We can re-use the memory cells in a subsequent call to attend a longer context\n        >>> with torch.no_grad():\n                predictions_1, mems_1 = model(tokens_tensor_1)\n                predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)\n\n        # Get the predicted last token\n        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()\n        >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]\n        >>> assert predicted_token == 'who'\n    \"\"\"\n    model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs)\n    return model\n"
  },
  {
    "path": "notebooks/Comparing-PT-and-TF-models.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Pytorch to Tensorflow Conversion Test Notebook\\n\",\n    \"\\n\",\n    \"To run this notebook follow these steps, modifying the **Config** section as necessary:\\n\",\n    \"\\n\",\n    \"1. Point `pt_model_dir` to your local directory containing the pytorch Bert model to be converted.\\n\",\n    \"2. Point `tf_bert_dir` to your clone of Google's Bert implementation which can be found here: https://github.com/google-research/bert.\\n\",\n    \"\\n\",\n    \"Note: \\n\",\n    \"1. This feature currently only supports the base BERT models (uncased/cased).\\n\",\n    \"2. Tensorflow model will be dumped in `tf_model_dir`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Config\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"import sys\\n\",\n    \"\\n\",\n    \"model_cls  = 'BertModel'\\n\",\n    \"model_typ  = 'bert-base-uncased'\\n\",\n    \"token_cls  = 'BertTokenizer'\\n\",\n    \"max_seq    = 12\\n\",\n    \"CLS        = \\\"[CLS]\\\"\\n\",\n    \"SEP        = \\\"[SEP]\\\"\\n\",\n    \"MASK       = \\\"[MASK]\\\"\\n\",\n    \"CLS_IDX    = 0\\n\",\n    \"layer_idxs = tuple(range(12))\\n\",\n    \"input_text = \\\"jim henson was a puppeteer\\\"\\n\",\n    \"\\n\",\n    \"pt_model_dir = \\\"/home/ubuntu/.pytorch-pretrained-BERT-cache/{}\\\".format(model_typ)\\n\",\n    \"tf_bert_dir  = \\\"/home/ubuntu/bert\\\"\\n\",\n    \"\\n\",\n    \"pt_vocab_file  = os.path.join(pt_model_dir, \\\"vocab.txt\\\")\\n\",\n    \"pt_init_ckpt   = os.path.join(pt_model_dir, model_typ.replace(\\\"-\\\", \\\"_\\\") + \\\".bin\\\")\\n\",\n    \"tf_model_dir   = os.path.join(pt_model_dir, 'tf')\\n\",\n    \"tf_vocab_file  = os.path.join(tf_model_dir, \\\"vocab.txt\\\")\\n\",\n    \"tf_init_ckpt   = 
os.path.join(tf_model_dir, model_typ.replace(\\\"-\\\", \\\"_\\\") + \\\".ckpt\\\")\\n\",\n    \"tf_config_file = os.path.join(tf_model_dir, \\\"bert_config.json\\\")\\n\",\n    \"\\n\",\n    \"if not os.path.isdir(tf_model_dir): \\n\",\n    \"    os.makedirs(tf_model_dir, exist_ok=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Tokenization\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def tokenize(text, tokenizer):\\n\",\n    \"    text = text.strip().lower()\\n\",\n    \"    tok_ids = tokenizer.tokenize(text)\\n\",\n    \"    if len(tok_ids) > max_seq - 2:\\n\",\n    \"        tok_ids = tok_ids[:max_seq - 2]\\n\",\n    \"    tok_ids.insert(CLS_IDX, CLS)\\n\",\n    \"    tok_ids.append(SEP)\\n\",\n    \"    input_ids = tokenizer.convert_tokens_to_ids(tok_ids)\\n\",\n    \"    mask_ids = [1] * len(input_ids)\\n\",\n    \"    seg_ids = [0] * len(input_ids)\\n\",\n    \"    padding = [0] * (max_seq - len(input_ids))\\n\",\n    \"    input_ids += padding\\n\",\n    \"    mask_ids += padding\\n\",\n    \"    seg_ids += padding\\n\",\n    \"    return input_ids, mask_ids, seg_ids\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Pytorch execution\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 231508/231508 [00:00<00:00, 41092464.26B/s]\\n\",\n      \"100%|██████████| 407873900/407873900 [00:07<00:00, 58092479.52B/s]\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Pytorch embedding shape: (1, 768)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"import numpy as np\\n\",\n    \"import torch\\n\",\n    \"from 
pytorch_pretrained_bert import (BertConfig,\\n\",\n    \"                                     BertModel, \\n\",\n    \"                                     BertTokenizer, \\n\",\n    \"                                     BertForSequenceClassification)\\n\",\n    \"\\n\",\n    \"# Save Vocab\\n\",\n    \"pt_tokenizer = BertTokenizer.from_pretrained(\\n\",\n    \"    pretrained_model_name_or_path=model_typ, \\n\",\n    \"    cache_dir=pt_model_dir)\\n\",\n    \"pt_tokenizer.save_vocabulary(pt_model_dir)\\n\",\n    \"pt_tokenizer.save_vocabulary(tf_model_dir)\\n\",\n    \"\\n\",\n    \"# Save Model\\n\",\n    \"pt_model = BertModel.from_pretrained(\\n\",\n    \"    pretrained_model_name_or_path=model_typ, \\n\",\n    \"    cache_dir=pt_model_dir).to('cpu')\\n\",\n    \"pt_model.eval()\\n\",\n    \"pt_model.config.hidden_dropout_prob = 0.0\\n\",\n    \"pt_model.config.attention_probs_dropout_prob = 0.0\\n\",\n    \"pt_model.config.to_json_file(tf_config_file)\\n\",\n    \"torch.save(pt_model.state_dict(), pt_init_ckpt)\\n\",\n    \"\\n\",\n    \"# Inputs\\n\",\n    \"input_ids_pt, mask_ids_pt, seg_ids_pt = tokenize(input_text, pt_tokenizer)\\n\",\n    \"\\n\",\n    \"# PT Embedding\\n\",\n    \"tok_tensor = torch.tensor(input_ids_pt).to('cpu').unsqueeze(0)\\n\",\n    \"seg_tensor = torch.tensor(seg_ids_pt).to('cpu').unsqueeze(0)\\n\",\n    \"msk_tensor = torch.tensor(mask_ids_pt).to('cpu').unsqueeze(0)\\n\",\n    \"attn_blks, nsp_logits = pt_model(tok_tensor, seg_tensor, msk_tensor)\\n\",\n    \"pt_embedding = nsp_logits.detach().numpy() \\n\",\n    \"print(\\\"Pytorch embedding shape: {}\\\".format(pt_embedding.shape))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Pytorch &rarr; Tensorflow conversion\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      
\"WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"Colocations handled automatically by placer.\\n\",\n      \"bert/embeddings/word_embeddings                             initialized\\n\",\n      \"bert/embeddings/position_embeddings                         initialized\\n\",\n      \"bert/embeddings/token_type_embeddings                       initialized\\n\",\n      \"bert/embeddings/LayerNorm/gamma                             initialized\\n\",\n      \"bert/embeddings/LayerNorm/beta                              initialized\\n\",\n      \"bert/encoder/layer_0/attention/self/query/kernel            initialized\\n\",\n      \"bert/encoder/layer_0/attention/self/query/bias              initialized\\n\",\n      \"bert/encoder/layer_0/attention/self/key/kernel              initialized\\n\",\n      \"bert/encoder/layer_0/attention/self/key/bias                initialized\\n\",\n      \"bert/encoder/layer_0/attention/self/value/kernel            initialized\\n\",\n      \"bert/encoder/layer_0/attention/self/value/bias              initialized\\n\",\n      \"bert/encoder/layer_0/attention/output/dense/kernel          initialized\\n\",\n      \"bert/encoder/layer_0/attention/output/dense/bias            initialized\\n\",\n      \"bert/encoder/layer_0/attention/output/LayerNorm/gamma       initialized\\n\",\n      \"bert/encoder/layer_0/attention/output/LayerNorm/beta        initialized\\n\",\n      \"bert/encoder/layer_0/intermediate/dense/kernel              initialized\\n\",\n      \"bert/encoder/layer_0/intermediate/dense/bias                initialized\\n\",\n      \"bert/encoder/layer_0/output/dense/kernel                    initialized\\n\",\n      \"bert/encoder/layer_0/output/dense/bias                      
initialized\\n\",\n      \"bert/encoder/layer_0/output/LayerNorm/gamma                 initialized\\n\",\n      \"bert/encoder/layer_0/output/LayerNorm/beta                  initialized\\n\",\n      \"bert/encoder/layer_1/attention/self/query/kernel            initialized\\n\",\n      \"bert/encoder/layer_1/attention/self/query/bias              initialized\\n\",\n      \"bert/encoder/layer_1/attention/self/key/kernel              initialized\\n\",\n      \"bert/encoder/layer_1/attention/self/key/bias                initialized\\n\",\n      \"bert/encoder/layer_1/attention/self/value/kernel            initialized\\n\",\n      \"bert/encoder/layer_1/attention/self/value/bias              initialized\\n\",\n      \"bert/encoder/layer_1/attention/output/dense/kernel          initialized\\n\",\n      \"bert/encoder/layer_1/attention/output/dense/bias            initialized\\n\",\n      \"bert/encoder/layer_1/attention/output/LayerNorm/gamma       initialized\\n\",\n      \"bert/encoder/layer_1/attention/output/LayerNorm/beta        initialized\\n\",\n      \"bert/encoder/layer_1/intermediate/dense/kernel              initialized\\n\",\n      \"bert/encoder/layer_1/intermediate/dense/bias                initialized\\n\",\n      \"bert/encoder/layer_1/output/dense/kernel                    initialized\\n\",\n      \"bert/encoder/layer_1/output/dense/bias                      initialized\\n\",\n      \"bert/encoder/layer_1/output/LayerNorm/gamma                 initialized\\n\",\n      \"bert/encoder/layer_1/output/LayerNorm/beta                  initialized\\n\",\n      \"bert/encoder/layer_2/attention/self/query/kernel            initialized\\n\",\n      \"bert/encoder/layer_2/attention/self/query/bias              initialized\\n\",\n      \"bert/encoder/layer_2/attention/self/key/kernel              initialized\\n\",\n      \"bert/encoder/layer_2/attention/self/key/bias                initialized\\n\",\n      \"bert/encoder/layer_2/attention/self/value/kernel           
 initialized\\n\",\n      \"bert/encoder/layer_2/attention/self/value/bias              initialized\\n\",\n      \"bert/encoder/layer_2/attention/output/dense/kernel          initialized\\n\",\n      \"bert/encoder/layer_2/attention/output/dense/bias            initialized\\n\",\n      \"bert/encoder/layer_2/attention/output/LayerNorm/gamma       initialized\\n\",\n      \"bert/encoder/layer_2/attention/output/LayerNorm/beta        initialized\\n\",\n      \"bert/encoder/layer_2/intermediate/dense/kernel              initialized\\n\",\n      \"bert/encoder/layer_2/intermediate/dense/bias                initialized\\n\",\n      \"bert/encoder/layer_2/output/dense/kernel                    initialized\\n\",\n      \"bert/encoder/layer_2/output/dense/bias                      initialized\\n\",\n      \"bert/encoder/layer_2/output/LayerNorm/gamma                 initialized\\n\",\n      \"bert/encoder/layer_2/output/LayerNorm/beta                  initialized\\n\",\n      \"bert/encoder/layer_3/attention/self/query/kernel            initialized\\n\",\n      \"bert/encoder/layer_3/attention/self/query/bias              initialized\\n\",\n      \"bert/encoder/layer_3/attention/self/key/kernel              initialized\\n\",\n      \"bert/encoder/layer_3/attention/self/key/bias                initialized\\n\",\n      \"bert/encoder/layer_3/attention/self/value/kernel            initialized\\n\",\n      \"bert/encoder/layer_3/attention/self/value/bias              initialized\\n\",\n      \"bert/encoder/layer_3/attention/output/dense/kernel          initialized\\n\",\n      \"bert/encoder/layer_3/attention/output/dense/bias            initialized\\n\",\n      \"bert/encoder/layer_3/attention/output/LayerNorm/gamma       initialized\\n\",\n      \"bert/encoder/layer_3/attention/output/LayerNorm/beta        initialized\\n\",\n      \"bert/encoder/layer_3/intermediate/dense/kernel              initialized\\n\",\n      \"bert/encoder/layer_3/intermediate/dense/bias              
  initialized\\n\",\n      \"bert/encoder/layer_3/output/dense/kernel                    initialized\\n\",\n      \"bert/encoder/layer_3/output/dense/bias                      initialized\\n\",\n      \"bert/encoder/layer_3/output/LayerNorm/gamma                 initialized\\n\",\n      \"bert/encoder/layer_3/output/LayerNorm/beta                  initialized\\n\",\n      \"bert/encoder/layer_4/attention/self/query/kernel            initialized\\n\",\n      \"bert/encoder/layer_4/attention/self/query/bias              initialized\\n\",\n      \"bert/encoder/layer_4/attention/self/key/kernel              initialized\\n\",\n      \"bert/encoder/layer_4/attention/self/key/bias                initialized\\n\",\n      \"bert/encoder/layer_4/attention/self/value/kernel            initialized\\n\",\n      \"bert/encoder/layer_4/attention/self/value/bias              initialized\\n\",\n      \"bert/encoder/layer_4/attention/output/dense/kernel          initialized\\n\",\n      \"bert/encoder/layer_4/attention/output/dense/bias            initialized\\n\",\n      \"bert/encoder/layer_4/attention/output/LayerNorm/gamma       initialized\\n\",\n      \"bert/encoder/layer_4/attention/output/LayerNorm/beta        initialized\\n\",\n      \"bert/encoder/layer_4/intermediate/dense/kernel              initialized\\n\",\n      \"bert/encoder/layer_4/intermediate/dense/bias                initialized\\n\",\n      \"bert/encoder/layer_4/output/dense/kernel                    initialized\\n\",\n      \"bert/encoder/layer_4/output/dense/bias                      initialized\\n\",\n      \"bert/encoder/layer_4/output/LayerNorm/gamma                 initialized\\n\",\n      \"bert/encoder/layer_4/output/LayerNorm/beta                  initialized\\n\",\n      \"bert/encoder/layer_5/attention/self/query/kernel            initialized\\n\",\n      \"bert/encoder/layer_5/attention/self/query/bias              initialized\\n\",\n      \"bert/encoder/layer_5/attention/self/key/kernel           
   initialized\\n\",\n      \"bert/encoder/layer_5/attention/self/key/bias                initialized\\n\",\n      \"bert/encoder/layer_5/attention/self/value/kernel            initialized\\n\",\n      \"bert/encoder/layer_5/attention/self/value/bias              initialized\\n\",\n      \"bert/encoder/layer_5/attention/output/dense/kernel          initialized\\n\",\n      \"bert/encoder/layer_5/attention/output/dense/bias            initialized\\n\",\n      \"bert/encoder/layer_5/attention/output/LayerNorm/gamma       initialized\\n\",\n      \"bert/encoder/layer_5/attention/output/LayerNorm/beta        initialized\\n\",\n      \"bert/encoder/layer_5/intermediate/dense/kernel              initialized\\n\",\n      \"bert/encoder/layer_5/intermediate/dense/bias                initialized\\n\",\n      \"bert/encoder/layer_5/output/dense/kernel                    initialized\\n\",\n      \"bert/encoder/layer_5/output/dense/bias                      initialized\\n\",\n      \"bert/encoder/layer_5/output/LayerNorm/gamma                 initialized\\n\",\n      \"bert/encoder/layer_5/output/LayerNorm/beta                  initialized\\n\",\n      \"bert/encoder/layer_6/attention/self/query/kernel            initialized\\n\",\n      \"bert/encoder/layer_6/attention/self/query/bias              initialized\\n\",\n      \"bert/encoder/layer_6/attention/self/key/kernel              initialized\\n\",\n      \"bert/encoder/layer_6/attention/self/key/bias                initialized\\n\",\n      \"bert/encoder/layer_6/attention/self/value/kernel            initialized\\n\",\n      \"bert/encoder/layer_6/attention/self/value/bias              initialized\\n\",\n      \"bert/encoder/layer_6/attention/output/dense/kernel          initialized\\n\",\n      \"bert/encoder/layer_6/attention/output/dense/bias            initialized\\n\",\n      \"bert/encoder/layer_6/attention/output/LayerNorm/gamma       initialized\\n\",\n      \"bert/encoder/layer_6/attention/output/LayerNorm/beta    
    initialized\\n\",\n      \"bert/encoder/layer_6/intermediate/dense/kernel              initialized\\n\",\n      \"bert/encoder/layer_6/intermediate/dense/bias                initialized\\n\",\n      \"bert/encoder/layer_6/output/dense/kernel                    initialized\\n\",\n      \"bert/encoder/layer_6/output/dense/bias                      initialized\\n\",\n      \"bert/encoder/layer_6/output/LayerNorm/gamma                 initialized\\n\",\n      \"bert/encoder/layer_6/output/LayerNorm/beta                  initialized\\n\",\n      \"bert/encoder/layer_7/attention/self/query/kernel            initialized\\n\",\n      \"bert/encoder/layer_7/attention/self/query/bias              initialized\\n\",\n      \"bert/encoder/layer_7/attention/self/key/kernel              initialized\\n\",\n      \"bert/encoder/layer_7/attention/self/key/bias                initialized\\n\",\n      \"bert/encoder/layer_7/attention/self/value/kernel            initialized\\n\",\n      \"bert/encoder/layer_7/attention/self/value/bias              initialized\\n\",\n      \"bert/encoder/layer_7/attention/output/dense/kernel          initialized\\n\",\n      \"bert/encoder/layer_7/attention/output/dense/bias            initialized\\n\",\n      \"bert/encoder/layer_7/attention/output/LayerNorm/gamma       initialized\\n\",\n      \"bert/encoder/layer_7/attention/output/LayerNorm/beta        initialized\\n\",\n      \"bert/encoder/layer_7/intermediate/dense/kernel              initialized\\n\",\n      \"bert/encoder/layer_7/intermediate/dense/bias                initialized\\n\",\n      \"bert/encoder/layer_7/output/dense/kernel                    initialized\\n\",\n      \"bert/encoder/layer_7/output/dense/bias                      initialized\\n\",\n      \"bert/encoder/layer_7/output/LayerNorm/gamma                 initialized\\n\",\n      \"bert/encoder/layer_7/output/LayerNorm/beta                  initialized\\n\",\n      \"bert/encoder/layer_8/attention/self/query/kernel       
     initialized\\n\",\n      \"bert/encoder/layer_8/attention/self/query/bias              initialized\\n\",\n      \"bert/encoder/layer_8/attention/self/key/kernel              initialized\\n\",\n      \"bert/encoder/layer_8/attention/self/key/bias                initialized\\n\",\n      \"bert/encoder/layer_8/attention/self/value/kernel            initialized\\n\",\n      \"bert/encoder/layer_8/attention/self/value/bias              initialized\\n\",\n      \"bert/encoder/layer_8/attention/output/dense/kernel          initialized\\n\",\n      \"bert/encoder/layer_8/attention/output/dense/bias            initialized\\n\",\n      \"bert/encoder/layer_8/attention/output/LayerNorm/gamma       initialized\\n\",\n      \"bert/encoder/layer_8/attention/output/LayerNorm/beta        initialized\\n\",\n      \"bert/encoder/layer_8/intermediate/dense/kernel              initialized\\n\",\n      \"bert/encoder/layer_8/intermediate/dense/bias                initialized\\n\",\n      \"bert/encoder/layer_8/output/dense/kernel                    initialized\\n\",\n      \"bert/encoder/layer_8/output/dense/bias                      initialized\\n\",\n      \"bert/encoder/layer_8/output/LayerNorm/gamma                 initialized\\n\",\n      \"bert/encoder/layer_8/output/LayerNorm/beta                  initialized\\n\",\n      \"bert/encoder/layer_9/attention/self/query/kernel            initialized\\n\",\n      \"bert/encoder/layer_9/attention/self/query/bias              initialized\\n\",\n      \"bert/encoder/layer_9/attention/self/key/kernel              initialized\\n\",\n      \"bert/encoder/layer_9/attention/self/key/bias                initialized\\n\",\n      \"bert/encoder/layer_9/attention/self/value/kernel            initialized\\n\",\n      \"bert/encoder/layer_9/attention/self/value/bias              initialized\\n\",\n      \"bert/encoder/layer_9/attention/output/dense/kernel          initialized\\n\",\n      \"bert/encoder/layer_9/attention/output/dense/bias      
      initialized\\n\",\n      \"bert/encoder/layer_9/attention/output/LayerNorm/gamma       initialized\\n\",\n      \"bert/encoder/layer_9/attention/output/LayerNorm/beta        initialized\\n\",\n      \"bert/encoder/layer_9/intermediate/dense/kernel              initialized\\n\",\n      \"bert/encoder/layer_9/intermediate/dense/bias                initialized\\n\",\n      \"bert/encoder/layer_9/output/dense/kernel                    initialized\\n\",\n      \"bert/encoder/layer_9/output/dense/bias                      initialized\\n\",\n      \"bert/encoder/layer_9/output/LayerNorm/gamma                 initialized\\n\",\n      \"bert/encoder/layer_9/output/LayerNorm/beta                  initialized\\n\",\n      \"bert/encoder/layer_10/attention/self/query/kernel           initialized\\n\",\n      \"bert/encoder/layer_10/attention/self/query/bias             initialized\\n\",\n      \"bert/encoder/layer_10/attention/self/key/kernel             initialized\\n\",\n      \"bert/encoder/layer_10/attention/self/key/bias               initialized\\n\",\n      \"bert/encoder/layer_10/attention/self/value/kernel           initialized\\n\",\n      \"bert/encoder/layer_10/attention/self/value/bias             initialized\\n\",\n      \"bert/encoder/layer_10/attention/output/dense/kernel         initialized\\n\",\n      \"bert/encoder/layer_10/attention/output/dense/bias           initialized\\n\",\n      \"bert/encoder/layer_10/attention/output/LayerNorm/gamma      initialized\\n\",\n      \"bert/encoder/layer_10/attention/output/LayerNorm/beta       initialized\\n\",\n      \"bert/encoder/layer_10/intermediate/dense/kernel             initialized\\n\",\n      \"bert/encoder/layer_10/intermediate/dense/bias               initialized\\n\",\n      \"bert/encoder/layer_10/output/dense/kernel                   initialized\\n\",\n      \"bert/encoder/layer_10/output/dense/bias                     initialized\\n\",\n      \"bert/encoder/layer_10/output/LayerNorm/gamma         
       initialized\\n\",\n      \"bert/encoder/layer_10/output/LayerNorm/beta                 initialized\\n\",\n      \"bert/encoder/layer_11/attention/self/query/kernel           initialized\\n\",\n      \"bert/encoder/layer_11/attention/self/query/bias             initialized\\n\",\n      \"bert/encoder/layer_11/attention/self/key/kernel             initialized\\n\",\n      \"bert/encoder/layer_11/attention/self/key/bias               initialized\\n\",\n      \"bert/encoder/layer_11/attention/self/value/kernel           initialized\\n\",\n      \"bert/encoder/layer_11/attention/self/value/bias             initialized\\n\",\n      \"bert/encoder/layer_11/attention/output/dense/kernel         initialized\\n\",\n      \"bert/encoder/layer_11/attention/output/dense/bias           initialized\\n\",\n      \"bert/encoder/layer_11/attention/output/LayerNorm/gamma      initialized\\n\",\n      \"bert/encoder/layer_11/attention/output/LayerNorm/beta       initialized\\n\",\n      \"bert/encoder/layer_11/intermediate/dense/kernel             initialized\\n\",\n      \"bert/encoder/layer_11/intermediate/dense/bias               initialized\\n\",\n      \"bert/encoder/layer_11/output/dense/kernel                   initialized\\n\",\n      \"bert/encoder/layer_11/output/dense/bias                     initialized\\n\",\n      \"bert/encoder/layer_11/output/LayerNorm/gamma                initialized\\n\",\n      \"bert/encoder/layer_11/output/LayerNorm/beta                 initialized\\n\",\n      \"bert/pooler/dense/kernel                                    initialized\\n\",\n      \"bert/pooler/dense/bias                                      initialized\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from pytorch_pretrained_bert.convert_pytorch_checkpoint_to_tf import main\\n\",\n    \"\\n\",\n    \"main([\\n\",\n    \"    '--model_name', model_typ, \\n\",\n    \"    '--pytorch_model_path', pt_init_ckpt,\\n\",\n    \"    '--tf_cache_dir', tf_model_dir,\\n\",\n    \"    
'--cache_dir', pt_model_dir\\n\",\n    \"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Tensorflow execution\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.\\n\",\n      \"For more information, please see:\\n\",\n      \"  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\\n\",\n      \"  * https://github.com/tensorflow/addons\\n\",\n      \"If you depend on functionality not listed there, please file an issue.\\n\",\n      \"\\n\",\n      \"WARNING:tensorflow:From /home/ubuntu/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"Use keras.layers.dense instead.\\n\",\n      \"WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"Use standard file APIs to check for files with this prefix.\\n\",\n      \"INFO:tensorflow:Restoring parameters from /home/ubuntu/.pytorch-pretrained-BERT-cache/bert-base-uncased/tf/bert_base_uncased.ckpt\\n\",\n      \"Tensorflow embedding shape: (1, 768)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"import tensorflow as tf\\n\",\n    \"sys.path.insert(0, tf_bert_dir)\\n\",\n    \"import modeling\\n\",\n    \"import tokenization\\n\",\n    \"\\n\",\n    \"tf.reset_default_graph()\\n\",\n    \"\\n\",\n    \"# Process text\\n\",\n    \"tf_tokenizer = tokenization.FullTokenizer(vocab_file=tf_vocab_file)\\n\",\n    
\"\\n\",\n    \"# Graph inputs\\n\",\n    \"input_ids_tf, mask_ids_tf, seg_ids_tf = tokenize(input_text, tf_tokenizer)\\n\",\n    \"config = modeling.BertConfig.from_json_file(\\n\",\n    \"    os.path.join(tf_model_dir, 'bert_config.json'))\\n\",\n    \"input_tensor = tf.placeholder(\\n\",\n    \"    dtype=tf.int32,\\n\",\n    \"    shape=[1, None],\\n\",\n    \"    name='input_ids')\\n\",\n    \"mask_tensor = tf.placeholder(\\n\",\n    \"    dtype=tf.int32,\\n\",\n    \"    shape=[1, None],\\n\",\n    \"    name='mask_ids')\\n\",\n    \"seg_tensor = tf.placeholder(\\n\",\n    \"    dtype=tf.int32,\\n\",\n    \"    shape=[1, None],\\n\",\n    \"    name='seg_ids')\\n\",\n    \"tf_model = modeling.BertModel(\\n\",\n    \"    config=config,\\n\",\n    \"    is_training=False,\\n\",\n    \"    input_ids=input_tensor,\\n\",\n    \"    input_mask=mask_tensor,\\n\",\n    \"    token_type_ids=seg_tensor,\\n\",\n    \"    use_one_hot_embeddings=False)\\n\",\n    \"output_layer = tf_model.get_pooled_output()\\n\",\n    \"\\n\",\n    \"# Load tf model\\n\",\n    \"session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\\n\",\n    \"vars_to_load = [v for v in tf.global_variables()]\\n\",\n    \"session.run(tf.variables_initializer(var_list=vars_to_load))\\n\",\n    \"saver = tf.train.Saver(vars_to_load)\\n\",\n    \"saver.restore(session, save_path=tf_init_ckpt)\\n\",\n    \"\\n\",\n    \"# TF Embedding\\n\",\n    \"fetches = output_layer\\n\",\n    \"feed_dict  = {\\n\",\n    \"    input_tensor: [input_ids_tf],\\n\",\n    \"    mask_tensor: [mask_ids_tf],\\n\",\n    \"    seg_tensor: [seg_ids_tf]\\n\",\n    \"}\\n\",\n    \"tf_embedding = session.run(fetches=fetches, feed_dict=feed_dict)\\n\",\n    \"print(\\\"Tensorflow embedding shape: {}\\\".format(tf_embedding.shape))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Compare Tokenization\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   
\"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"TOKEN_IDS_PT: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\\n\",\n      \"TOKEN_IDS_TF: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\\n\",\n      \"SEG_IDS_PT:   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\\n\",\n      \"SEG_IDS_TF:   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\\n\",\n      \"MASK_IDS_PT:  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\\n\",\n      \"MASK_IDS_TF:  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(\\\"TOKEN_IDS_PT: {}\\\".format(input_ids_pt))\\n\",\n    \"print(\\\"TOKEN_IDS_TF: {}\\\".format(input_ids_tf))\\n\",\n    \"print(\\\"SEG_IDS_PT:   {}\\\".format(seg_ids_pt))\\n\",\n    \"print(\\\"SEG_IDS_TF:   {}\\\".format(seg_ids_tf))\\n\",\n    \"print(\\\"MASK_IDS_PT:  {}\\\".format(mask_ids_pt))\\n\",\n    \"print(\\\"MASK_IDS_TF:  {}\\\".format(mask_ids_tf))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Compare Model Weights\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"bert/embeddings/word_embeddings\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608   0.00116716]\\n\",\n      \"TF: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608   0.00116716]\\n\",\n      \"\\n\",\n      \"bert/embeddings/token_type_embeddings\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\\n\",\n      \"TF: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\\n\",\n      \"\\n\",\n      
\"bert/embeddings/position_embeddings\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613  0.00797095]\\n\",\n      \"TF: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613  0.00797095]\\n\",\n      \"\\n\",\n      \"bert/embeddings/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.02591471 -0.0195513   0.02423946  0.08904593 -0.06281059]\\n\",\n      \"TF: shape: (768,) values: [-0.02591471 -0.0195513   0.02423946  0.08904593 -0.06281059]\\n\",\n      \"\\n\",\n      \"bert/embeddings/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.9260566  0.8851115  0.85807985 0.8616906  0.8937205 ]\\n\",\n      \"TF: shape: (768,) values: [0.9260566  0.8851115  0.85807985 0.8616906  0.8937205 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/attention/self/query/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.01640572 -0.03257025  0.01046295 -0.04442816 -0.02256124]\\n\",\n      \"TF: shape: (768, 768) values: [-0.01640572 -0.03257025  0.01046295 -0.04442816 -0.02256124]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/attention/self/query/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.58488506 -0.3312432  -0.43010172  0.37446147 -0.29811692]\\n\",\n      \"TF: shape: (768,) values: [ 0.58488506 -0.3312432  -0.43010172  0.37446147 -0.29811692]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/attention/self/key/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.00807745  0.02652155 -0.01866494  0.01797846  0.00450485]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.00807745  0.02652155 -0.01866494  0.01797846  0.00450485]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/attention/self/key/bias\\n\",\n    
  \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.00104306  0.00035106 -0.0024626  -0.00010567 -0.00119283]\\n\",\n      \"TF: shape: (768,) values: [ 0.00104306  0.00035106 -0.0024626  -0.00010567 -0.00119283]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/attention/self/value/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.01144261 -0.02663044  0.01911472 -0.02206182 -0.00287949]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.01144261 -0.02663044  0.01911472 -0.02206182 -0.00287949]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/attention/self/value/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847  0.01736802  0.00449983]\\n\",\n      \"TF: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847  0.01736802  0.00449983]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/attention/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.00581949  0.03170148 -0.06135742 -0.01706108 -0.00759045]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.00581949  0.03170148 -0.06135742 -0.01706108 -0.00759045]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/attention/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.00511063 -0.0166625   0.02812938 -0.01166061  0.01942627]\\n\",\n      \"TF: shape: (768,) values: [ 0.00511063 -0.0166625   0.02812938 -0.01166061  0.01942627]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/attention/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697  -0.38847703  0.36841765]\\n\",\n      \"TF: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697  -0.38847703  0.36841765]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_0/attention/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.9803408  0.959969   0.96368986 0.9603653  0.9801324 ]\\n\",\n      \"TF: shape: (768,) values: [0.9803408  0.959969   0.96368986 0.9603653  0.9801324 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/intermediate/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 3072) values: [-0.01010427 -0.060398   -0.01468864  0.00311493  0.02862451]\\n\",\n      \"TF: shape: (768, 3072) values: [-0.01010427 -0.060398   -0.01468864  0.00311493  0.02862451]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/intermediate/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036   -0.06369043]\\n\",\n      \"TF: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036   -0.06369043]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072, 768) values: [-0.03710171  0.0648794   0.00758566 -0.05224452 -0.04348791]\\n\",\n      \"TF: shape: (3072, 768) values: [-0.03710171  0.0648794   0.00758566 -0.05224452 -0.04348791]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.04801027  0.19766568  0.02154854  0.02880666  0.0444298 ]\\n\",\n      \"TF: shape: (768,) values: [-0.04801027  0.19766568  0.02154854  0.02880666  0.0444298 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_0/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.10142924 -0.00499344  0.04274083  0.09324206 -0.10700516]\\n\",\n      \"TF: shape: (768,) values: [-0.10142924 -0.00499344  0.04274083  0.09324206 -0.10700516]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_0/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.7835125  0.8072406  0.7670588  0.73706394 0.76303864]\\n\",\n      \"TF: shape: (768,) values: [0.7835125  0.8072406  0.7670588  0.73706394 0.76303864]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/attention/self/query/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582  0.0655639  -0.00337808]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582  0.0655639  -0.00337808]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/attention/self/query/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.27827993  0.17387655 -0.2497937  -0.8809636   0.41262135]\\n\",\n      \"TF: shape: (768,) values: [-0.27827993  0.17387655 -0.2497937  -0.8809636   0.41262135]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/attention/self/key/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.03353037  0.04007257  0.05320328 -0.02166729 -0.03581231]\\n\",\n      \"TF: shape: (768, 768) values: [-0.03353037  0.04007257  0.05320328 -0.02166729 -0.03581231]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/attention/self/key/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.00504407  0.00136887 -0.00394336  0.00646125 -0.00148919]\\n\",\n      \"TF: shape: (768,) values: [-0.00504407  0.00136887 -0.00394336  0.00646125 -0.00148919]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/attention/self/value/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.00464159  0.06674305 -0.00970626 -0.0276653  -0.01597566]\\n\",\n      \"TF: shape: (768, 768) values: [-0.00464159  0.06674305 -0.00970626 -0.0276653  -0.01597566]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_1/attention/self/value/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.00381288  0.02650839 -0.0059689  -0.00508269 -0.01293722]\\n\",\n      \"TF: shape: (768,) values: [ 0.00381288  0.02650839 -0.0059689  -0.00508269 -0.01293722]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/attention/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.01390745 -0.01100563  0.01303005 -0.01969771  0.0125082 ]\\n\",\n      \"TF: shape: (768, 768) values: [-0.01390745 -0.01100563  0.01303005 -0.01969771  0.0125082 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/attention/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\\n\",\n      \"TF: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/attention/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.08583715  0.14199966 -0.0856637  -0.18797271  0.21056814]\\n\",\n      \"TF: shape: (768,) values: [ 0.08583715  0.14199966 -0.0856637  -0.18797271  0.21056814]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/attention/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.896962   0.87148863 0.8531161  0.8690647  0.9488987 ]\\n\",\n      \"TF: shape: (768,) values: [0.896962   0.87148863 0.8531161  0.8690647  0.9488987 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/intermediate/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\\n\",\n      \"TF: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_1/intermediate/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\\n\",\n      \"TF: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072, 768) values: [-0.02372648  0.03326349  0.08291997 -0.01519038  0.01868557]\\n\",\n      \"TF: shape: (3072, 768) values: [-0.02372648  0.03326349  0.08291997 -0.01519038  0.01868557]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.02514724  0.09868994 -0.027811    0.03749462  0.01086514]\\n\",\n      \"TF: shape: (768,) values: [-0.02514724  0.09868994 -0.027811    0.03749462  0.01086514]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.07662535 -0.10506564  0.03191236  0.07633785 -0.11187791]\\n\",\n      \"TF: shape: (768,) values: [-0.07662535 -0.10506564  0.03191236  0.07633785 -0.11187791]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_1/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.9017883  0.8868776  0.8862677  0.85865664 0.87496454]\\n\",\n      \"TF: shape: (768,) values: [0.9017883  0.8868776  0.8862677  0.85865664 0.87496454]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/attention/self/query/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.08433672  0.09580533  0.07543895 -0.01126779 -0.01354045]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.08433672  0.09580533  0.07543895 -0.01126779 -0.01354045]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_2/attention/self/query/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.0371241   0.03406003  0.27713948 -0.21613775 -0.05275448]\\n\",\n      \"TF: shape: (768,) values: [ 0.0371241   0.03406003  0.27713948 -0.21613775 -0.05275448]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/attention/self/key/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.04794507  0.02517631 -0.01319554 -0.02094732  0.09073472]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.04794507  0.02517631 -0.01319554 -0.02094732  0.09073472]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/attention/self/key/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741  0.00037122]\\n\",\n      \"TF: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741  0.00037122]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/attention/self/value/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914  0.04746444  0.00428481]\\n\",\n      \"TF: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914  0.04746444  0.00428481]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/attention/self/value/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.02728729  0.04979054  0.08326469  0.04150949  0.600959  ]\\n\",\n      \"TF: shape: (768,) values: [-0.02728729  0.04979054  0.08326469  0.04150949  0.600959  ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/attention/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.00517425  0.01197957  0.0393172  -0.0063884  -0.02673388]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.00517425  0.01197957  0.0393172  -0.0063884  -0.02673388]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_2/attention/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.01754025  0.1226335  -0.05733554  0.06844623  0.00879776]\\n\",\n      \"TF: shape: (768,) values: [ 0.01754025  0.1226335  -0.05733554  0.06844623  0.00879776]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/attention/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.1490809   0.12386955 -0.19382021 -0.26515856  0.32723007]\\n\",\n      \"TF: shape: (768,) values: [ 0.1490809   0.12386955 -0.19382021 -0.26515856  0.32723007]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/attention/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8983343  0.88877076 0.86283594 0.8584952  0.9587886 ]\\n\",\n      \"TF: shape: (768,) values: [0.8983343  0.88877076 0.86283594 0.8584952  0.9587886 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/intermediate/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 3072) values: [-0.01619919  0.00662888  0.01492284 -0.01280748  0.01318596]\\n\",\n      \"TF: shape: (768, 3072) values: [-0.01619919  0.00662888  0.01492284 -0.01280748  0.01318596]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/intermediate/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\\n\",\n      \"TF: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072, 768) values: [-0.07225161 -0.0129784   0.00618811 -0.01593373 -0.02160194]\\n\",\n      \"TF: shape: (3072, 768) values: [-0.07225161 -0.0129784   0.00618811 -0.01593373 -0.02160194]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_2/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.06319264  0.06169628 -0.03041368  0.00924282  0.06277442]\\n\",\n      \"TF: shape: (768,) values: [-0.06319264  0.06169628 -0.03041368  0.00924282  0.06277442]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.1139038  -0.11665309  0.07883061  0.07796711 -0.14219187]\\n\",\n      \"TF: shape: (768,) values: [-0.1139038  -0.11665309  0.07883061  0.07796711 -0.14219187]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_2/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8813261  0.85744697 0.8511922  0.85261875 0.8329574 ]\\n\",\n      \"TF: shape: (768,) values: [0.8813261  0.85744697 0.8511922  0.85261875 0.8329574 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/attention/self/query/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963  0.04117409 -0.07591715]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963  0.04117409 -0.07591715]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/attention/self/query/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.09740101 -0.19290674  0.04332267  0.17937997 -0.08023558]\\n\",\n      \"TF: shape: (768,) values: [ 0.09740101 -0.19290674  0.04332267  0.17937997 -0.08023558]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/attention/self/key/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.02562077  0.02507281 -0.03361562  0.05613289 -0.05435724]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.02562077  0.02507281 -0.03361562  0.05613289 -0.05435724]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_3/attention/self/key/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415  0.00969649 -0.00094182]\\n\",\n      \"TF: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415  0.00969649 -0.00094182]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/attention/self/value/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.00539032  0.00959642  0.01325458  0.00490616  0.0129908 ]\\n\",\n      \"TF: shape: (768, 768) values: [-0.00539032  0.00959642  0.01325458  0.00490616  0.0129908 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/attention/self/value/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\\n\",\n      \"TF: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/attention/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.01850341  0.03148198  0.02705758 -0.0004669   0.01367511]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.01850341  0.03148198  0.02705758 -0.0004669   0.01367511]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/attention/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.01981483  0.03566506 -0.05016088  0.02958186  0.04989756]\\n\",\n      \"TF: shape: (768,) values: [ 0.01981483  0.03566506 -0.05016088  0.02958186  0.04989756]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/attention/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.09815404  0.00063774 -0.01257733 -0.26485074  0.22568701]\\n\",\n      \"TF: shape: (768,) values: [ 0.09815404  0.00063774 -0.01257733 -0.26485074  0.22568701]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_3/attention/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.91457725 0.88453823 0.8340887  0.84203583 0.95247847]\\n\",\n      \"TF: shape: (768,) values: [0.91457725 0.88453823 0.8340887  0.84203583 0.95247847]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/intermediate/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 3072) values: [-0.02733567  0.03307878 -0.01331292 -0.00032527  0.03252084]\\n\",\n      \"TF: shape: (768, 3072) values: [-0.02733567  0.03307878 -0.01331292 -0.00032527  0.03252084]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/intermediate/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971  0.01335877 -0.09492484]\\n\",\n      \"TF: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971  0.01335877 -0.09492484]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072, 768) values: [-0.01751153  0.01631314 -0.02660011  0.03569947 -0.01394763]\\n\",\n      \"TF: shape: (3072, 768) values: [-0.01751153  0.01631314 -0.02660011  0.03569947 -0.01394763]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.03873252  0.08414765 -0.0399323   0.01997361  0.12924597]\\n\",\n      \"TF: shape: (768,) values: [-0.03873252  0.08414765 -0.0399323   0.01997361  0.12924597]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_3/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155  0.05231095 -0.09717073]\\n\",\n      \"TF: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155  0.05231095 -0.09717073]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_3/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.827748   0.83012533 0.82399255 0.81772    0.80794513]\\n\",\n      \"TF: shape: (768,) values: [0.827748   0.83012533 0.82399255 0.81772    0.80794513]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/attention/self/query/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.08296382  0.02076941  0.06525186 -0.02659729  0.03491377]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.08296382  0.02076941  0.06525186 -0.02659729  0.03491377]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/attention/self/query/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146   0.00061329  0.1248519 ]\\n\",\n      \"TF: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146   0.00061329  0.1248519 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/attention/self/key/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.06941643  0.08133814 -0.0453992   0.0668715  -0.06014847]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.06941643  0.08133814 -0.0453992   0.0668715  -0.06014847]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/attention/self/key/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.00588725 -0.00235185  0.00281131  0.00173088 -0.00546653]\\n\",\n      \"TF: shape: (768,) values: [-0.00588725 -0.00235185  0.00281131  0.00173088 -0.00546653]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/attention/self/value/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.06889665  0.06645385  0.01232084  0.0132611  -0.01595679]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.06889665  0.06645385  0.01232084  0.0132611  -0.01595679]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_4/attention/self/value/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.01126871 -0.02704018  0.0301532   0.02332082 -0.04233487]\\n\",\n      \"TF: shape: (768,) values: [-0.01126871 -0.02704018  0.0301532   0.02332082 -0.04233487]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/attention/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292   0.04862929 -0.0442014 ]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292   0.04862929 -0.0442014 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/attention/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.03054528  0.00479777 -0.02729505 -0.0325212  -0.00525727]\\n\",\n      \"TF: shape: (768,) values: [ 0.03054528  0.00479777 -0.02729505 -0.0325212  -0.00525727]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/attention/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.00903359  0.0052285  -0.02841488 -0.22355485  0.28281343]\\n\",\n      \"TF: shape: (768,) values: [ 0.00903359  0.0052285  -0.02841488 -0.22355485  0.28281343]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/attention/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8849676  0.86927813 0.8114595  0.80269504 0.94864094]\\n\",\n      \"TF: shape: (768,) values: [0.8849676  0.86927813 0.8114595  0.80269504 0.94864094]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/intermediate/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 3072) values: [-0.00639783  0.06198016 -0.03184223  0.00485356 -0.02453273]\\n\",\n      \"TF: shape: (768, 3072) values: [-0.00639783  0.06198016 -0.03184223  0.00485356 -0.02453273]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_4/intermediate/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\\n\",\n      \"TF: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072, 768) values: [-0.05421264  0.0221118  -0.02674172  0.03672203 -0.02399626]\\n\",\n      \"TF: shape: (3072, 768) values: [-0.05421264  0.0221118  -0.02674172  0.03672203 -0.02399626]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.05068972  0.04838871  0.01156022  0.05381602  0.08857913]\\n\",\n      \"TF: shape: (768,) values: [-0.05068972  0.04838871  0.01156022  0.05381602  0.08857913]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.04338909 -0.0781464  -0.01518662  0.04936362 -0.12378412]\\n\",\n      \"TF: shape: (768,) values: [-0.04338909 -0.0781464  -0.01518662  0.04936362 -0.12378412]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_4/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\\n\",\n      \"TF: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/attention/self/query/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.00858843 -0.03920127  0.02552994 -0.02786552  0.02436485]\\n\",\n      \"TF: shape: (768, 768) values: [-0.00858843 -0.03920127  0.02552994 -0.02786552  0.02436485]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_5/attention/self/query/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079  0.01085692  0.02925887]\\n\",\n      \"TF: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079  0.01085692  0.02925887]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/attention/self/key/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.00352847  0.02330176 -0.00369894 -0.03904612  0.00294574]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.00352847  0.02330176 -0.00369894 -0.03904612  0.00294574]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/attention/self/key/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.01087186 -0.01176561  0.00016575 -0.01163023  0.00946616]\\n\",\n      \"TF: shape: (768,) values: [-0.01087186 -0.01176561  0.00016575 -0.01163023  0.00946616]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/attention/self/value/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.06134222  0.04238288  0.02796064 -0.01284983  0.03683741]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.06134222  0.04238288  0.02796064 -0.01284983  0.03683741]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/attention/self/value/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053  -0.00025261  0.0437019 ]\\n\",\n      \"TF: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053  -0.00025261  0.0437019 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/attention/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.00739815  0.0533964  -0.03736389 -0.04999201  0.01693069]\\n\",\n      \"TF: shape: (768, 768) values: [-0.00739815  0.0533964  -0.03736389 -0.04999201  0.01693069]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_5/attention/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.0021682   0.01711399 -0.04201518  0.01605333  0.00552063]\\n\",\n      \"TF: shape: (768,) values: [-0.0021682   0.01711399 -0.04201518  0.01605333  0.00552063]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/attention/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.06841327 -0.0146848   0.09792476 -0.23284538  0.2785602 ]\\n\",\n      \"TF: shape: (768,) values: [-0.06841327 -0.0146848   0.09792476 -0.23284538  0.2785602 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/attention/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8908311  0.87884724 0.81637293 0.8047641  0.96539867]\\n\",\n      \"TF: shape: (768,) values: [0.8908311  0.87884724 0.81637293 0.8047641  0.96539867]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/intermediate/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 3072) values: [-0.03246041  0.07251058 -0.08201726  0.00772481  0.02532209]\\n\",\n      \"TF: shape: (768, 3072) values: [-0.03246041  0.07251058 -0.08201726  0.00772481  0.02532209]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/intermediate/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\\n\",\n      \"TF: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072, 768) values: [ 0.0642072  -0.01738782 -0.05095377  0.00523853  0.04425264]\\n\",\n      \"TF: shape: (3072, 768) values: [ 0.0642072  -0.01738782 -0.05095377  0.00523853  0.04425264]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_5/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.0007217   0.06006297  0.0016595   0.03848181  0.06703516]\\n\",\n      \"TF: shape: (768,) values: [-0.0007217   0.06006297  0.0016595   0.03848181  0.06703516]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047   0.06023621 -0.18672828]\\n\",\n      \"TF: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047   0.06023621 -0.18672828]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_5/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8621183  0.8515807  0.82654256 0.81729776 0.7985204 ]\\n\",\n      \"TF: shape: (768,) values: [0.8621183  0.8515807  0.82654256 0.81729776 0.7985204 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/attention/self/query/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.02527807 -0.01429243  0.01467054  0.08624706 -0.00188593]\\n\",\n      \"TF: shape: (768, 768) values: [-0.02527807 -0.01429243  0.01467054  0.08624706 -0.00188593]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/attention/self/query/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.17319514  0.27564248  0.16801168 -0.10946485  0.1643271 ]\\n\",\n      \"TF: shape: (768,) values: [-0.17319514  0.27564248  0.16801168 -0.10946485  0.1643271 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/attention/self/key/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.05886372  0.00706217  0.0398422   0.00882155 -0.04571463]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.05886372  0.00706217  0.0398422   0.00882155 -0.04571463]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_6/attention/self/key/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.00424696 -0.0001192   0.0046079  -0.00315606  0.00434314]\\n\",\n      \"TF: shape: (768,) values: [-0.00424696 -0.0001192   0.0046079  -0.00315606  0.00434314]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/attention/self/value/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.01720381  0.01170722  0.02346902 -0.02284313 -0.03173028]\\n\",\n      \"TF: shape: (768, 768) values: [-0.01720381  0.01170722  0.02346902 -0.02284313 -0.03173028]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/attention/self/value/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.03492057  0.01813157 -0.00182878 -0.01420629 -0.00508944]\\n\",\n      \"TF: shape: (768,) values: [-0.03492057  0.01813157 -0.00182878 -0.01420629 -0.00508944]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/attention/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.0323688  -0.00689882  0.07379091  0.01121114 -0.02059202]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.0323688  -0.00689882  0.07379091  0.01121114 -0.02059202]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/attention/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\\n\",\n      \"TF: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/attention/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.06793639  0.03157783  0.15647687 -0.15025291  0.14727171]\\n\",\n      \"TF: shape: (768,) values: [-0.06793639  0.03157783  0.15647687 -0.15025291  0.14727171]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_6/attention/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8882361  0.8704905  0.80289173 0.77365315 0.92333615]\\n\",\n      \"TF: shape: (768,) values: [0.8882361  0.8704905  0.80289173 0.77365315 0.92333615]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/intermediate/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 3072) values: [ 0.04492201  0.05160861  0.09041415 -0.00742628  0.048133  ]\\n\",\n      \"TF: shape: (768, 3072) values: [ 0.04492201  0.05160861  0.09041415 -0.00742628  0.048133  ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/intermediate/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072,) values: [-0.09301704 -0.158612   -0.10633879 -0.09706812 -0.17319229]\\n\",\n      \"TF: shape: (3072,) values: [-0.09301704 -0.158612   -0.10633879 -0.09706812 -0.17319229]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072, 768) values: [-0.00085372 -0.00974195  0.00684915  0.00038686  0.06610142]\\n\",\n      \"TF: shape: (3072, 768) values: [-0.00085372 -0.00974195  0.00684915  0.00038686  0.06610142]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.03254414  0.05681704  0.03720434  0.01936359  0.09134153]\\n\",\n      \"TF: shape: (768,) values: [-0.03254414  0.05681704  0.03720434  0.01936359  0.09134153]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_6/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.0117129  -0.03209404 -0.08646043  0.03760341 -0.13841423]\\n\",\n      \"TF: shape: (768,) values: [-0.0117129  -0.03209404 -0.08646043  0.03760341 -0.13841423]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_6/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8674175  0.8657014  0.8151861  0.82301307 0.8305737 ]\\n\",\n      \"TF: shape: (768,) values: [0.8674175  0.8657014  0.8151861  0.82301307 0.8305737 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/attention/self/query/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.00075523 -0.01501983  0.04090893  0.01884826  0.04670674]\\n\",\n      \"TF: shape: (768, 768) values: [-0.00075523 -0.01501983  0.04090893  0.01884826  0.04670674]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/attention/self/query/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.0010344  -0.00423982  0.3117479   0.04494623 -0.01260845]\\n\",\n      \"TF: shape: (768,) values: [ 0.0010344  -0.00423982  0.3117479   0.04494623 -0.01260845]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/attention/self/key/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.02781927 -0.00906972  0.02121989  0.0298591   0.05854786]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.02781927 -0.00906972  0.02121989  0.0298591   0.05854786]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/attention/self/key/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.00074918  0.00731079  0.00089338  0.00345652  0.00043817]\\n\",\n      \"TF: shape: (768,) values: [-0.00074918  0.00731079  0.00089338  0.00345652  0.00043817]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/attention/self/value/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.01080035 -0.03468366  0.03167168  0.01583073  0.0327719 ]\\n\",\n      \"TF: shape: (768, 768) values: [-0.01080035 -0.03468366  0.03167168  0.01583073  0.0327719 ]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_7/attention/self/value/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.02824226  0.01605172  0.00067929 -0.04553111  0.0076044 ]\\n\",\n      \"TF: shape: (768,) values: [-0.02824226  0.01605172  0.00067929 -0.04553111  0.0076044 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/attention/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.05496112  0.01006968  0.02206531 -0.01873116  0.02149118]\\n\",\n      \"TF: shape: (768, 768) values: [-0.05496112  0.01006968  0.02206531 -0.01873116  0.02149118]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/attention/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084  -0.0342187   0.02965918]\\n\",\n      \"TF: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084  -0.0342187   0.02965918]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/attention/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.02826844  0.04427591  0.05678326 -0.0475907   0.16136196]\\n\",\n      \"TF: shape: (768,) values: [-0.02826844  0.04427591  0.05678326 -0.0475907   0.16136196]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/attention/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8742141  0.870608   0.79147685 0.7595279  0.9223656 ]\\n\",\n      \"TF: shape: (768,) values: [0.8742141  0.870608   0.79147685 0.7595279  0.9223656 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/intermediate/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 3072) values: [ 0.03598932 -0.12225644  0.03019998  0.05691092  0.03717208]\\n\",\n      \"TF: shape: (768, 3072) values: [ 0.03598932 -0.12225644  0.03019998  0.05691092  0.03717208]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_7/intermediate/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\\n\",\n      \"TF: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072, 768) values: [-0.02190432 -0.02279165  0.03279508  0.01011065 -0.07793335]\\n\",\n      \"TF: shape: (3072, 768) values: [-0.02190432 -0.02279165  0.03279508  0.01011065 -0.07793335]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.04282642  0.03700675  0.06142357 -0.04787201  0.02958163]\\n\",\n      \"TF: shape: (768,) values: [-0.04282642  0.03700675  0.06142357 -0.04787201  0.02958163]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\\n\",\n      \"TF: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_7/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.83858097 0.8179645  0.80693793 0.81225365 0.7844832 ]\\n\",\n      \"TF: shape: (768,) values: [0.83858097 0.8179645  0.80693793 0.81225365 0.7844832 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/attention/self/query/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [0.0448719  0.02289526 0.03083764 0.03048073 0.02436891]\\n\",\n      \"TF: shape: (768, 768) values: [0.0448719  0.02289526 0.03083764 0.03048073 0.02436891]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_8/attention/self/query/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.25132924 -0.23753347  0.02581017  0.00901509  0.18424493]\\n\",\n      \"TF: shape: (768,) values: [-0.25132924 -0.23753347  0.02581017  0.00901509  0.18424493]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/attention/self/key/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.01999719  0.00711403  0.03949134 -0.0102224   0.03152475]\\n\",\n      \"TF: shape: (768, 768) values: [-0.01999719  0.00711403  0.03949134 -0.0102224   0.03152475]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/attention/self/key/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 5.5668897e-05  3.4638541e-03 -1.7605867e-03 -6.1321147e-03\\n\",\n      \" -4.4074579e-04]\\n\",\n      \"TF: shape: (768,) values: [ 5.5668897e-05  3.4638541e-03 -1.7605867e-03 -6.1321147e-03\\n\",\n      \" -4.4074579e-04]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/attention/self/value/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.00736056 -0.01795213  0.00104576 -0.00034653  0.03190543]\\n\",\n      \"TF: shape: (768, 768) values: [-0.00736056 -0.01795213  0.00104576 -0.00034653  0.03190543]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/attention/self/value/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.02892835  0.00642501 -0.03608712  0.00264269 -0.0245198 ]\\n\",\n      \"TF: shape: (768,) values: [ 0.02892835  0.00642501 -0.03608712  0.00264269 -0.0245198 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/attention/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.03971623  0.05307067 -0.01298818  0.00946693 -0.00121235]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.03971623  0.05307067 
-0.01298818  0.00946693 -0.00121235]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/attention/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103  0.004484    0.0240819 ]\\n\",\n      \"TF: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103  0.004484    0.0240819 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/attention/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.06004262  0.0457275   0.08688109 -0.14416659 -0.05500487]\\n\",\n      \"TF: shape: (768,) values: [-0.06004262  0.0457275   0.08688109 -0.14416659 -0.05500487]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/attention/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8907534  0.89116573 0.811639   0.7810443  0.9045574 ]\\n\",\n      \"TF: shape: (768,) values: [0.8907534  0.89116573 0.811639   0.7810443  0.9045574 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/intermediate/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624  0.03397145  0.02457482]\\n\",\n      \"TF: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624  0.03397145  0.02457482]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/intermediate/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072,) values: [-0.08129632 -0.1691108  -0.10681771 -0.10392351 -0.13120006]\\n\",\n      \"TF: shape: (3072,) values: [-0.08129632 -0.1691108  -0.10681771 -0.10392351 -0.13120006]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072, 768) values: [-0.04683433 -0.02690669  0.02979059  0.02223369 -0.00130287]\\n\",\n      \"TF: shape: (3072, 768) values: [-0.04683433 -0.02690669  0.02979059  
0.02223369 -0.00130287]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.09155537 -0.04465394  0.05649116 -0.09628641  0.11875238]\\n\",\n      \"TF: shape: (768,) values: [-0.09155537 -0.04465394  0.05649116 -0.09628641  0.11875238]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\\n\",\n      \"TF: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_8/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\\n\",\n      \"TF: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/attention/self/query/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.08004542 -0.0143706  -0.04219061 -0.05175152 -0.01147588]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.08004542 -0.0143706  -0.04219061 -0.05175152 -0.01147588]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/attention/self/query/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.14508031  0.40926442 -0.3281781  -0.02869792 -0.26104516]\\n\",\n      \"TF: shape: (768,) values: [-0.14508031  0.40926442 -0.3281781  -0.02869792 -0.26104516]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/attention/self/key/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.01337681  0.00615428 -0.0455939   0.03379053 -0.01992556]\\n\",\n      \"TF: shape: (768, 768) values: [-0.01337681  0.00615428 -0.0455939   0.03379053 -0.01992556]\\n\",\n      
\"\\n\",\n      \"bert/encoder/layer_9/attention/self/key/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.0051302   0.0083288   0.00377641  0.00928865 -0.00418182]\\n\",\n      \"TF: shape: (768,) values: [-0.0051302   0.0083288   0.00377641  0.00928865 -0.00418182]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/attention/self/value/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.02485976 -0.0301923   0.00984638 -0.02495162  0.01074037]\\n\",\n      \"TF: shape: (768, 768) values: [-0.02485976 -0.0301923   0.00984638 -0.02495162  0.01074037]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/attention/self/value/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.04229928 -0.02636711  0.0060447   0.00222829  0.04979481]\\n\",\n      \"TF: shape: (768,) values: [-0.04229928 -0.02636711  0.0060447   0.00222829  0.04979481]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/attention/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.01258144  0.00871274  0.00482882 -0.00675888 -0.04390825]\\n\",\n      \"TF: shape: (768, 768) values: [-0.01258144  0.00871274  0.00482882 -0.00675888 -0.04390825]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/attention/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.02457753  0.05051134 -0.06890804 -0.00962795  0.00864793]\\n\",\n      \"TF: shape: (768,) values: [ 0.02457753  0.05051134 -0.06890804 -0.00962795  0.00864793]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/attention/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.08963391 -0.06362236  0.0676669  -0.09895685  0.08318913]\\n\",\n      \"TF: shape: (768,) values: [-0.08963391 -0.06362236  0.0676669  -0.09895685  0.08318913]\\n\",\n      
\"\\n\",\n      \"bert/encoder/layer_9/attention/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.85100883 0.82569736 0.7927931  0.7660444  0.8912934 ]\\n\",\n      \"TF: shape: (768,) values: [0.85100883 0.82569736 0.7927931  0.7660444  0.8912934 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/intermediate/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 3072) values: [ 0.06290598  0.0203122  -0.05384256  0.05442941  0.00484769]\\n\",\n      \"TF: shape: (768, 3072) values: [ 0.06290598  0.0203122  -0.05384256  0.05442941  0.00484769]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/intermediate/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\\n\",\n      \"TF: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072, 768) values: [ 0.05487705  0.01644666  0.00436198 -0.00490768 -0.03238423]\\n\",\n      \"TF: shape: (3072, 768) values: [ 0.05487705  0.01644666  0.00436198 -0.00490768 -0.03238423]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438  0.09897955]\\n\",\n      \"TF: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438  0.09897955]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_9/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\\n\",\n      \"TF: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_9/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8250572  0.83477134 0.7794141  0.81264955 0.7827918 ]\\n\",\n      \"TF: shape: (768,) values: [0.8250572  0.83477134 0.7794141  0.81264955 0.7827918 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/attention/self/query/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.00071212 -0.00853064  0.01776993  0.03189976  0.02183623]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.00071212 -0.00853064  0.01776993  0.03189976  0.02183623]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/attention/self/query/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913  0.00118343 -0.05489838]\\n\",\n      \"TF: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913  0.00118343 -0.05489838]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/attention/self/key/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.0494106   0.05531096 -0.02459413 -0.06019118 -0.02829785]\\n\",\n      \"TF: shape: (768, 768) values: [-0.0494106   0.05531096 -0.02459413 -0.06019118 -0.02829785]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/attention/self/key/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.00692997  0.00855893  0.00670777 -0.0052475  -0.00017074]\\n\",\n      \"TF: shape: (768,) values: [-0.00692997  0.00855893  0.00670777 -0.0052475  -0.00017074]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/attention/self/value/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.01911842  0.04858809 -0.02608485  0.00794924 -0.02246636]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.01911842  0.04858809 -0.02608485  0.00794924 -0.02246636]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_10/attention/self/value/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.0133503  -0.01224133 -0.0051834  -0.00232528  0.00148614]\\n\",\n      \"TF: shape: (768,) values: [-0.0133503  -0.01224133 -0.0051834  -0.00232528  0.00148614]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/attention/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.05904732  0.02616     0.00794104 -0.02889086 -0.03692576]\\n\",\n      \"TF: shape: (768, 768) values: [-0.05904732  0.02616     0.00794104 -0.02889086 -0.03692576]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/attention/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267  0.00907548]\\n\",\n      \"TF: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267  0.00907548]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/attention/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.10986238 -0.04332284  0.02603893 -0.06236923  0.14469369]\\n\",\n      \"TF: shape: (768,) values: [-0.10986238 -0.04332284  0.02603893 -0.06236923  0.14469369]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/attention/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8515822  0.81392974 0.836747   0.78040504 0.88091415]\\n\",\n      \"TF: shape: (768,) values: [0.8515822  0.81392974 0.836747   0.78040504 0.88091415]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/intermediate/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 3072) values: [-0.07061081  0.06997397  0.01433633  0.04150929  0.02865192]\\n\",\n      \"TF: shape: (768, 3072) values: [-0.07061081  0.06997397  0.01433633  0.04150929  0.02865192]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_10/intermediate/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043  -0.15043251 -0.10193057]\\n\",\n      \"TF: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043  -0.15043251 -0.10193057]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072, 768) values: [ 0.02918765  0.02609882 -0.02259856  0.01636725 -0.00038442]\\n\",\n      \"TF: shape: (3072, 768) values: [ 0.02918765  0.02609882 -0.02259856  0.01636725 -0.00038442]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.01799502  0.10970547 -0.02384165 -0.03350981  0.10491351]\\n\",\n      \"TF: shape: (768,) values: [-0.01799502  0.10970547 -0.02384165 -0.03350981  0.10491351]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.00999107 -0.0217309  -0.0854177  -0.01109101 -0.07902174]\\n\",\n      \"TF: shape: (768,) values: [ 0.00999107 -0.0217309  -0.0854177  -0.01109101 -0.07902174]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_10/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.8272796  0.8597452  0.79116803 0.81267637 0.8273501 ]\\n\",\n      \"TF: shape: (768,) values: [0.8272796  0.8597452  0.79116803 0.81267637 0.8273501 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/attention/self/query/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523  0.06226195  0.02193764]\\n\",\n      \"TF: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523  0.06226195  0.02193764]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_11/attention/self/query/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.0501296   0.11886728  0.2186807   0.08720991 -0.20476632]\\n\",\n      \"TF: shape: (768,) values: [ 0.0501296   0.11886728  0.2186807   0.08720991 -0.20476632]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/attention/self/key/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496  0.04210597  0.01783857]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496  0.04210597  0.01783857]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/attention/self/key/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.0007798  -0.00065806 -0.00010521  0.00119144 -0.00180091]\\n\",\n      \"TF: shape: (768,) values: [-0.0007798  -0.00065806 -0.00010521  0.00119144 -0.00180091]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/attention/self/value/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515  0.04519828]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515  0.04519828]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/attention/self/value/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.01502306 -0.00530942  0.00023572  0.00205218 -0.00578036]\\n\",\n      \"TF: shape: (768,) values: [ 0.01502306 -0.00530942  0.00023572  0.00205218 -0.00578036]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/attention/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [ 0.02361419  0.03112707 -0.00063031  0.04209773 -0.02434015]\\n\",\n      \"TF: shape: (768, 768) values: [ 0.02361419  0.03112707 -0.00063031  0.04209773 -0.02434015]\\n\",\n      \"\\n\",\n    
  \"bert/encoder/layer_11/attention/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [ 0.02566087  0.0028438  -0.00475678  0.02149458 -0.01755187]\\n\",\n      \"TF: shape: (768,) values: [ 0.02566087  0.0028438  -0.00475678  0.02149458 -0.01755187]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/attention/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.03134411  0.01207957 -0.04636396 -0.03013046  0.07944281]\\n\",\n      \"TF: shape: (768,) values: [-0.03134411  0.01207957 -0.04636396 -0.03013046  0.07944281]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/attention/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.85203767 0.8020145  0.8554237  0.8150477  0.8441815 ]\\n\",\n      \"TF: shape: (768,) values: [0.85203767 0.8020145  0.8554237  0.8150477  0.8441815 ]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/intermediate/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 3072) values: [ 0.05871898 -0.01124212  0.00206979 -0.04366514 -0.00716808]\\n\",\n      \"TF: shape: (768, 3072) values: [ 0.05871898 -0.01124212  0.00206979 -0.04366514 -0.00716808]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/intermediate/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\\n\",\n      \"TF: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/output/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (3072, 768) values: [-0.022382    0.01073206 -0.01357213  0.02484621  0.01403091]\\n\",\n      \"TF: shape: (3072, 768) values: [-0.022382    0.01073206 -0.01357213  0.02484621  0.01403091]\\n\",\n      \"\\n\",\n      
\"bert/encoder/layer_11/output/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.06574099  0.04207807  0.01201084  0.00229322  0.05551811]\\n\",\n      \"TF: shape: (768,) values: [-0.06574099  0.04207807  0.01201084  0.00229322  0.05551811]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/output/LayerNorm/beta\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.00634605 -0.01989403  0.04628465  0.01585056 -0.04256899]\\n\",\n      \"TF: shape: (768,) values: [-0.00634605 -0.01989403  0.04628465  0.01585056 -0.04256899]\\n\",\n      \"\\n\",\n      \"bert/encoder/layer_11/output/LayerNorm/gamma\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [0.6384234  0.6300364  0.66570055 0.6126921  0.63756436]\\n\",\n      \"TF: shape: (768,) values: [0.6384234  0.6300364  0.66570055 0.6126921  0.63756436]\\n\",\n      \"\\n\",\n      \"bert/pooler/dense/kernel\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768, 768) values: [-0.00127425  0.00199868 -0.03863145 -0.00139355  0.00691627]\\n\",\n      \"TF: shape: (768, 768) values: [-0.00127425  0.00199868 -0.03863145 -0.00139355  0.00691627]\\n\",\n      \"\\n\",\n      \"bert/pooler/dense/bias\\n\",\n      \"|sum(pt_wts - tf_wts)| = 0.0\\n\",\n      \"PT: shape: (768,) values: [-0.03597581 -0.00389536  0.05181352  0.02224747 -0.00493723]\\n\",\n      \"TF: shape: (768,) values: [-0.03597581 -0.00389536  0.05181352  0.02224747 -0.00493723]\\n\",\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"tensors_to_transopse = (\\n\",\n    \"    \\\"dense.weight\\\",\\n\",\n    \"    \\\"attention.self.query\\\",\\n\",\n    \"    \\\"attention.self.key\\\",\\n\",\n    \"    \\\"attention.self.value\\\"\\n\",\n    \")\\n\",\n    \"var_map = (\\n\",\n    \"    ('layer.', 'layer_'),\\n\",\n    \"    ('word_embeddings.weight', 'word_embeddings'),\\n\",\n    \"    
('position_embeddings.weight', 'position_embeddings'),\\n\",\n    \"    ('token_type_embeddings.weight', 'token_type_embeddings'),\\n\",\n    \"    ('.', '/'),\\n\",\n    \"    ('LayerNorm/weight', 'LayerNorm/gamma'),\\n\",\n    \"    ('LayerNorm/bias', 'LayerNorm/beta'),\\n\",\n    \"    ('weight', 'kernel')\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"def to_tf_var_name(name:str):\\n\",\n    \"    for patt, repl in iter(var_map):\\n\",\n    \"        name = name.replace(patt, repl)\\n\",\n    \"    return 'bert/{}'.format(name)\\n\",\n    \"\\n\",\n    \"tf_vars = {v.name: session.run(fetches=v) for v in tf.global_variables()}\\n\",\n    \"pt_vars = {}\\n\",\n    \"for v, T in pt_model.state_dict().items():\\n\",\n    \"    T = T.detach().numpy()\\n\",\n    \"    if any([x in v for x in tensors_to_transopse]):\\n\",\n    \"        T = T.T\\n\",\n    \"    pt_vars.update({to_tf_var_name(v): T})\\n\",\n    \"\\n\",\n    \"for var_name in tf_vars:\\n\",\n    \"    \\n\",\n    \"    pt = pt_vars[var_name.strip(\\\":0\\\")]\\n\",\n    \"    tf = tf_vars[var_name]\\n\",\n    \"\\n\",\n    \"    print(var_name.strip(\\\":0\\\"))\\n\",\n    \"    \\n\",\n    \"    # Assert equivalence\\n\",\n    \"    print(\\\"|sum(pt_wts - tf_wts)| = {}\\\".format(\\n\",\n    \"        np.abs(np.sum(pt - tf, keepdims=False))\\n\",\n    \"    ))\\n\",\n    \"    assert not np.sum(pt - tf, keepdims=False)\\n\",\n    \"    \\n\",\n    \"    if len(pt.shape) == 2:\\n\",\n    \"        print(\\\"PT: shape: {0} values: {1}\\\".format(pt.shape, pt[0, :5]))\\n\",\n    \"        print(\\\"TF: shape: {0} values: {1}\\\".format(tf.shape, tf[0, :5]))\\n\",\n    \"    else:\\n\",\n    \"        print(\\\"PT: shape: {0} values: {1}\\\".format(pt.shape, pt[:5]))\\n\",\n    \"        print(\\\"TF: shape: {0} values: {1}\\\".format(tf.shape, tf[:5]))\\n\",\n    \"    print()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Compare Layer-12 
Projections\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"MSE: 2.7155439966009e-05\\n\",\n      \"PT-values: [-0.876663   -0.41088238 -0.12200808  0.44941     0.19445966]\\n\",\n      \"TF-values: [-0.8742865  -0.40621698 -0.10585472  0.444904    0.1825743 ]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Mean Squared Error (MSE) between last projection of each model\\n\",\n    \"MSE = np.mean((pt_embedding - tf_embedding) ** 2, keepdims=False)\\n\",\n    \"print(\\\"MSE: {}\\\".format(MSE))\\n\",\n    \"print(\\\"PT-values: {}\\\".format(pt_embedding[0, :5]))\\n\",\n    \"print(\\\"TF-values: {}\\\".format(tf_embedding[0, :5]))\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"nlp\",\n   \"language\": \"python\",\n   \"name\": \"nlp\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Comparing TensorFlow (original) and PyTorch models\\n\",\n    \"\\n\",\n    \"You can use this small notebook to check the conversion of the model's weights from the TensorFlow model to the PyTorch model. In the following, we compare the weights of the last layer on a simple example (in `input.txt`) but both models return all the hidden layers so you can check every stage of the model.\\n\",\n    \"\\n\",\n    \"To run this notebook, follow these instructions:\\n\",\n    \"- make sure that your Python environment has both TensorFlow and PyTorch installed,\\n\",\n    \"- download the original TensorFlow implementation,\\n\",\n    \"- download a pre-trained TensorFlow model as indicated in the TensorFlow implementation readme,\\n\",\n    \"- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.\\n\",\n    \"\\n\",\n    \"If needed change the relative paths indicated in this notebook (at the beginning of Sections 1 and 2) to point to the relevant models and code.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:26.999106Z\",\n     \"start_time\": \"2018-11-16T10:02:26.985709Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"os.chdir('../')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1/ TensorFlow code\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:27.664528Z\",\n     \"start_time\": \"2018-11-16T10:02:27.651019Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"original_tf_inplem_dir = \\\"./tensorflow_code/\\\"\\n\",\n    \"model_dir = 
\\\"../google_models/uncased_L-12_H-768_A-12/\\\"\\n\",\n    \"\\n\",\n    \"vocab_file = model_dir + \\\"vocab.txt\\\"\\n\",\n    \"bert_config_file = model_dir + \\\"bert_config.json\\\"\\n\",\n    \"init_checkpoint = model_dir + \\\"bert_model.ckpt\\\"\\n\",\n    \"\\n\",\n    \"input_file = \\\"./samples/input.txt\\\"\\n\",\n    \"max_seq_length = 128\\n\",\n    \"max_predictions_per_seq = 20\\n\",\n    \"\\n\",\n    \"masked_lm_positions = [6]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:30.202182Z\",\n     \"start_time\": \"2018-11-16T10:02:28.112570Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import importlib.util\\n\",\n    \"import sys\\n\",\n    \"import tensorflow as tf\\n\",\n    \"import pytorch_pretrained_bert as ppb\\n\",\n    \"\\n\",\n    \"def del_all_flags(FLAGS):\\n\",\n    \"    flags_dict = FLAGS._flags()    \\n\",\n    \"    keys_list = [keys for keys in flags_dict]    \\n\",\n    \"    for keys in keys_list:\\n\",\n    \"        FLAGS.__delattr__(keys)\\n\",\n    \"\\n\",\n    \"del_all_flags(tf.flags.FLAGS)\\n\",\n    \"import tensorflow_code.extract_features as ef\\n\",\n    \"del_all_flags(tf.flags.FLAGS)\\n\",\n    \"import tensorflow_code.modeling as tfm\\n\",\n    \"del_all_flags(tf.flags.FLAGS)\\n\",\n    \"import tensorflow_code.tokenization as tft\\n\",\n    \"del_all_flags(tf.flags.FLAGS)\\n\",\n    \"import tensorflow_code.run_pretraining as rp\\n\",\n    \"del_all_flags(tf.flags.FLAGS)\\n\",\n    \"import tensorflow_code.create_pretraining_data as cpp\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:30.238027Z\",\n     \"start_time\": \"2018-11-16T10:02:30.204943Z\"\n    },\n    \"code_folding\": [\n     15\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import re\\n\",\n    
\"class InputExample(object):\\n\",\n    \"    \\\"\\\"\\\"A single instance example.\\\"\\\"\\\"\\n\",\n    \"\\n\",\n    \"    def __init__(self, tokens, segment_ids, masked_lm_positions,\\n\",\n    \"                 masked_lm_labels, is_random_next):\\n\",\n    \"        self.tokens = tokens\\n\",\n    \"        self.segment_ids = segment_ids\\n\",\n    \"        self.masked_lm_positions = masked_lm_positions\\n\",\n    \"        self.masked_lm_labels = masked_lm_labels\\n\",\n    \"        self.is_random_next = is_random_next\\n\",\n    \"    def __repr__(self):\\n\",\n    \"        return '\\\\n'.join(k + \\\":\\\" + str(v) for k, v in self.__dict__.items())\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def read_examples(input_file, tokenizer, masked_lm_positions):\\n\",\n    \"    \\\"\\\"\\\"Read a list of `InputExample`s from an input file.\\\"\\\"\\\"\\n\",\n    \"    examples = []\\n\",\n    \"    unique_id = 0\\n\",\n    \"    with tf.gfile.GFile(input_file, \\\"r\\\") as reader:\\n\",\n    \"        while True:\\n\",\n    \"            line = reader.readline()\\n\",\n    \"            if not line:\\n\",\n    \"                break\\n\",\n    \"            line = line.strip()\\n\",\n    \"            text_a = None\\n\",\n    \"            text_b = None\\n\",\n    \"            m = re.match(r\\\"^(.*) \\\\|\\\\|\\\\| (.*)$\\\", line)\\n\",\n    \"            if m is None:\\n\",\n    \"                text_a = line\\n\",\n    \"            else:\\n\",\n    \"                text_a = m.group(1)\\n\",\n    \"                text_b = m.group(2)\\n\",\n    \"            tokens_a = tokenizer.tokenize(text_a)\\n\",\n    \"            tokens_b = None\\n\",\n    \"            if text_b:\\n\",\n    \"                tokens_b = tokenizer.tokenize(text_b)\\n\",\n    \"            tokens = tokens_a + tokens_b\\n\",\n    \"            masked_lm_labels = []\\n\",\n    \"            for m_pos in masked_lm_positions:\\n\",\n    \"                
masked_lm_labels.append(tokens[m_pos])\\n\",\n    \"                tokens[m_pos] = '[MASK]'\\n\",\n    \"            examples.append(\\n\",\n    \"                InputExample(\\n\",\n    \"                    tokens = tokens,\\n\",\n    \"                    segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b),\\n\",\n    \"                    masked_lm_positions = masked_lm_positions,\\n\",\n    \"                    masked_lm_labels = masked_lm_labels,\\n\",\n    \"                    is_random_next = False))\\n\",\n    \"            unique_id += 1\\n\",\n    \"    return examples\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:30.304018Z\",\n     \"start_time\": \"2018-11-16T10:02:30.240189Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"tokens:['who', 'was', 'jim', 'henson', '?', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer']\\n\",\n      \"segment_ids:[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]\\n\",\n      \"masked_lm_positions:[6]\\n\",\n      \"masked_lm_labels:['henson']\\n\",\n      \"is_random_next:False\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"bert_config = tfm.BertConfig.from_json_file(bert_config_file)\\n\",\n    \"tokenizer = ppb.BertTokenizer(\\n\",\n    \"    vocab_file=vocab_file, do_lower_case=True)\\n\",\n    \"examples = read_examples(input_file, tokenizer, masked_lm_positions=masked_lm_positions)\\n\",\n    \"\\n\",\n    \"print(examples[0])\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:33.324167Z\",\n     \"start_time\": \"2018-11-16T10:02:33.291909Z\"\n    },\n    \"code_folding\": [\n     16\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"class InputFeatures(object):\\n\",\n    \"    \\\"\\\"\\\"A single set of 
features of data.\\\"\\\"\\\"\\n\",\n    \"\\n\",\n    \"    def __init__(self, input_ids, input_mask, segment_ids, masked_lm_positions,\\n\",\n    \"                 masked_lm_ids, masked_lm_weights, next_sentence_label):\\n\",\n    \"        self.input_ids = input_ids\\n\",\n    \"        self.input_mask = input_mask\\n\",\n    \"        self.segment_ids = segment_ids\\n\",\n    \"        self.masked_lm_positions = masked_lm_positions\\n\",\n    \"        self.masked_lm_ids = masked_lm_ids\\n\",\n    \"        self.masked_lm_weights = masked_lm_weights\\n\",\n    \"        self.next_sentence_labels = next_sentence_label\\n\",\n    \"\\n\",\n    \"    def __repr__(self):\\n\",\n    \"        return '\\\\n'.join(k + \\\":\\\" + str(v) for k, v in self.__dict__.items())\\n\",\n    \"\\n\",\n    \"def pretraining_convert_examples_to_features(instances, tokenizer, max_seq_length,\\n\",\n    \"                                 max_predictions_per_seq):\\n\",\n    \"    \\\"\\\"\\\"Create TF example files from `TrainingInstance`s.\\\"\\\"\\\"\\n\",\n    \"    features = []\\n\",\n    \"    for (inst_index, instance) in enumerate(instances):\\n\",\n    \"        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)\\n\",\n    \"        input_mask = [1] * len(input_ids)\\n\",\n    \"        segment_ids = list(instance.segment_ids)\\n\",\n    \"        assert len(input_ids) <= max_seq_length\\n\",\n    \"\\n\",\n    \"        while len(input_ids) < max_seq_length:\\n\",\n    \"            input_ids.append(0)\\n\",\n    \"            input_mask.append(0)\\n\",\n    \"            segment_ids.append(0)\\n\",\n    \"\\n\",\n    \"        assert len(input_ids) == max_seq_length\\n\",\n    \"        assert len(input_mask) == max_seq_length\\n\",\n    \"        assert len(segment_ids) == max_seq_length\\n\",\n    \"\\n\",\n    \"        masked_lm_positions = list(instance.masked_lm_positions)\\n\",\n    \"        masked_lm_ids = 
tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)\\n\",\n    \"        masked_lm_weights = [1.0] * len(masked_lm_ids)\\n\",\n    \"\\n\",\n    \"        while len(masked_lm_positions) < max_predictions_per_seq:\\n\",\n    \"            masked_lm_positions.append(0)\\n\",\n    \"            masked_lm_ids.append(0)\\n\",\n    \"            masked_lm_weights.append(0.0)\\n\",\n    \"\\n\",\n    \"        next_sentence_label = 1 if instance.is_random_next else 0\\n\",\n    \"\\n\",\n    \"        features.append(\\n\",\n    \"            InputFeatures(input_ids, input_mask, segment_ids,\\n\",\n    \"                          masked_lm_positions, masked_lm_ids,\\n\",\n    \"                          masked_lm_weights, next_sentence_label))\\n\",\n    \"\\n\",\n    \"        if inst_index < 5:\\n\",\n    \"            tf.logging.info(\\\"*** Example ***\\\")\\n\",\n    \"            tf.logging.info(\\\"tokens: %s\\\" % \\\" \\\".join(\\n\",\n    \"                [str(x) for x in instance.tokens]))\\n\",\n    \"            tf.logging.info(\\\"features: %s\\\" % str(features[-1]))\\n\",\n    \"    return features\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:34.185367Z\",\n     \"start_time\": \"2018-11-16T10:02:34.155046Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:*** Example ***\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:34 - INFO - tensorflow -   *** Example ***\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:tokens: who was jim henson ? 
jim [MASK] was a puppet ##eer\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:34 - INFO - tensorflow -   tokens: who was jim henson ? jim [MASK] was a puppet ##eer\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:features: input_ids:[2040, 2001, 3958, 27227, 1029, 3958, 103, 2001, 1037, 13997, 11510, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\\n\",\n      \"input_mask:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\\n\",\n      \"segment_ids:[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\\n\",\n      \"masked_lm_positions:[6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\\n\",\n      \"masked_lm_ids:[27227, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\\n\",\n      \"masked_lm_weights:[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\\n\",\n      
\"next_sentence_labels:0\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:34 - INFO - tensorflow -   features: input_ids:[2040, 2001, 3958, 27227, 1029, 3958, 103, 2001, 1037, 13997, 11510, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\\n\",\n      \"input_mask:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\\n\",\n      \"segment_ids:[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\\n\",\n      \"masked_lm_positions:[6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\\n\",\n      \"masked_lm_ids:[27227, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\\n\",\n      \"masked_lm_weights:[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\\n\",\n      \"next_sentence_labels:0\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"features = pretraining_convert_examples_to_features(\\n\",\n    \"    instances=examples, max_seq_length=max_seq_length, \\n\",\n    \" 
   max_predictions_per_seq=max_predictions_per_seq, tokenizer=tokenizer)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:34.912005Z\",\n     \"start_time\": \"2018-11-16T10:02:34.882111Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"def input_fn_builder(features, seq_length, max_predictions_per_seq, tokenizer):\\n\",\n    \"    \\\"\\\"\\\"Creates an `input_fn` closure to be passed to TPUEstimator.\\\"\\\"\\\"\\n\",\n    \"\\n\",\n    \"    all_input_ids = []\\n\",\n    \"    all_input_mask = []\\n\",\n    \"    all_segment_ids = []\\n\",\n    \"    all_masked_lm_positions = []\\n\",\n    \"    all_masked_lm_ids = []\\n\",\n    \"    all_masked_lm_weights = []\\n\",\n    \"    all_next_sentence_labels = []\\n\",\n    \"\\n\",\n    \"    for feature in features:\\n\",\n    \"        all_input_ids.append(feature.input_ids)\\n\",\n    \"        all_input_mask.append(feature.input_mask)\\n\",\n    \"        all_segment_ids.append(feature.segment_ids)\\n\",\n    \"        all_masked_lm_positions.append(feature.masked_lm_positions)\\n\",\n    \"        all_masked_lm_ids.append(feature.masked_lm_ids)\\n\",\n    \"        all_masked_lm_weights.append(feature.masked_lm_weights)\\n\",\n    \"        all_next_sentence_labels.append(feature.next_sentence_labels)\\n\",\n    \"\\n\",\n    \"    def input_fn(params):\\n\",\n    \"        \\\"\\\"\\\"The actual input function.\\\"\\\"\\\"\\n\",\n    \"        batch_size = params[\\\"batch_size\\\"]\\n\",\n    \"\\n\",\n    \"        num_examples = len(features)\\n\",\n    \"\\n\",\n    \"        # This is for demo purposes and does NOT scale to large data sets. We do\\n\",\n    \"        # not use Dataset.from_generator() because that uses tf.py_func which is\\n\",\n    \"        # not TPU compatible. 
The right way to load data is with TFRecordReader.\\n\",\n    \"        d = tf.data.Dataset.from_tensor_slices({\\n\",\n    \"            \\\"input_ids\\\":\\n\",\n    \"                tf.constant(\\n\",\n    \"                    all_input_ids, shape=[num_examples, seq_length],\\n\",\n    \"                    dtype=tf.int32),\\n\",\n    \"            \\\"input_mask\\\":\\n\",\n    \"                tf.constant(\\n\",\n    \"                    all_input_mask,\\n\",\n    \"                    shape=[num_examples, seq_length],\\n\",\n    \"                    dtype=tf.int32),\\n\",\n    \"            \\\"segment_ids\\\":\\n\",\n    \"                tf.constant(\\n\",\n    \"                    all_segment_ids,\\n\",\n    \"                    shape=[num_examples, seq_length],\\n\",\n    \"                    dtype=tf.int32),\\n\",\n    \"            \\\"masked_lm_positions\\\":\\n\",\n    \"                tf.constant(\\n\",\n    \"                    all_masked_lm_positions,\\n\",\n    \"                    shape=[num_examples, max_predictions_per_seq],\\n\",\n    \"                    dtype=tf.int32),\\n\",\n    \"        \\\"masked_lm_ids\\\":\\n\",\n    \"                tf.constant(\\n\",\n    \"                    all_masked_lm_ids,\\n\",\n    \"                    shape=[num_examples, max_predictions_per_seq],\\n\",\n    \"                    dtype=tf.int32),\\n\",\n    \"        \\\"masked_lm_weights\\\":\\n\",\n    \"                tf.constant(\\n\",\n    \"                    all_masked_lm_weights,\\n\",\n    \"                    shape=[num_examples, max_predictions_per_seq],\\n\",\n    \"                    dtype=tf.float32),\\n\",\n    \"        \\\"next_sentence_labels\\\":\\n\",\n    \"                tf.constant(\\n\",\n    \"                    all_next_sentence_labels,\\n\",\n    \"                    shape=[num_examples, 1],\\n\",\n    \"                    dtype=tf.int32),\\n\",\n    \"        })\\n\",\n    \"\\n\",\n    \"        d = 
d.batch(batch_size=batch_size, drop_remainder=False)\\n\",\n    \"        return d\\n\",\n    \"\\n\",\n    \"    return input_fn\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:35.671603Z\",\n     \"start_time\": \"2018-11-16T10:02:35.626167Z\"\n    },\n    \"code_folding\": [\n     64,\n     77\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"def model_fn_builder(bert_config, init_checkpoint, learning_rate,\\n\",\n    \"                     num_train_steps, num_warmup_steps, use_tpu,\\n\",\n    \"                     use_one_hot_embeddings):\\n\",\n    \"    \\\"\\\"\\\"Returns `model_fn` closure for TPUEstimator.\\\"\\\"\\\"\\n\",\n    \"\\n\",\n    \"    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\\n\",\n    \"        \\\"\\\"\\\"The `model_fn` for TPUEstimator.\\\"\\\"\\\"\\n\",\n    \"\\n\",\n    \"        tf.logging.info(\\\"*** Features ***\\\")\\n\",\n    \"        for name in sorted(features.keys()):\\n\",\n    \"            tf.logging.info(\\\"  name = %s, shape = %s\\\" % (name, features[name].shape))\\n\",\n    \"\\n\",\n    \"        input_ids = features[\\\"input_ids\\\"]\\n\",\n    \"        input_mask = features[\\\"input_mask\\\"]\\n\",\n    \"        segment_ids = features[\\\"segment_ids\\\"]\\n\",\n    \"        masked_lm_positions = features[\\\"masked_lm_positions\\\"]\\n\",\n    \"        masked_lm_ids = features[\\\"masked_lm_ids\\\"]\\n\",\n    \"        masked_lm_weights = features[\\\"masked_lm_weights\\\"]\\n\",\n    \"        next_sentence_labels = features[\\\"next_sentence_labels\\\"]\\n\",\n    \"\\n\",\n    \"        is_training = (mode == tf.estimator.ModeKeys.TRAIN)\\n\",\n    \"\\n\",\n    \"        model = tfm.BertModel(\\n\",\n    \"            config=bert_config,\\n\",\n    \"            is_training=is_training,\\n\",\n    \"            input_ids=input_ids,\\n\",\n   
 \"            input_mask=input_mask,\\n\",\n    \"            token_type_ids=segment_ids,\\n\",\n    \"            use_one_hot_embeddings=use_one_hot_embeddings)\\n\",\n    \"\\n\",\n    \"        (masked_lm_loss,\\n\",\n    \"         masked_lm_example_loss, masked_lm_log_probs) = rp.get_masked_lm_output(\\n\",\n    \"            bert_config, model.get_sequence_output(), model.get_embedding_table(),\\n\",\n    \"            masked_lm_positions, masked_lm_ids, masked_lm_weights)\\n\",\n    \"\\n\",\n    \"        (next_sentence_loss, next_sentence_example_loss,\\n\",\n    \"         next_sentence_log_probs) = rp.get_next_sentence_output(\\n\",\n    \"            bert_config, model.get_pooled_output(), next_sentence_labels)\\n\",\n    \"\\n\",\n    \"        total_loss = masked_lm_loss + next_sentence_loss\\n\",\n    \"\\n\",\n    \"        tvars = tf.trainable_variables()\\n\",\n    \"\\n\",\n    \"        initialized_variable_names = {}\\n\",\n    \"        scaffold_fn = None\\n\",\n    \"        if init_checkpoint:\\n\",\n    \"            (assignment_map,\\n\",\n    \"             initialized_variable_names) = tfm.get_assigment_map_from_checkpoint(\\n\",\n    \"                tvars, init_checkpoint)\\n\",\n    \"            if use_tpu:\\n\",\n    \"\\n\",\n    \"                def tpu_scaffold():\\n\",\n    \"                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\\n\",\n    \"                    return tf.train.Scaffold()\\n\",\n    \"\\n\",\n    \"                scaffold_fn = tpu_scaffold\\n\",\n    \"            else:\\n\",\n    \"                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\\n\",\n    \"\\n\",\n    \"        tf.logging.info(\\\"**** Trainable Variables ****\\\")\\n\",\n    \"        for var in tvars:\\n\",\n    \"            init_string = \\\"\\\"\\n\",\n    \"            if var.name in initialized_variable_names:\\n\",\n    \"                init_string = \\\", *INIT_FROM_CKPT*\\\"\\n\",\n    
\"            tf.logging.info(\\\"  name = %s, shape = %s%s\\\", var.name, var.shape,\\n\",\n    \"                            init_string)\\n\",\n    \"\\n\",\n    \"        output_spec = None\\n\",\n    \"        if mode == tf.estimator.ModeKeys.TRAIN:\\n\",\n    \"            masked_lm_positions = features[\\\"masked_lm_positions\\\"]\\n\",\n    \"            masked_lm_ids = features[\\\"masked_lm_ids\\\"]\\n\",\n    \"            masked_lm_weights = features[\\\"masked_lm_weights\\\"]\\n\",\n    \"            next_sentence_labels = features[\\\"next_sentence_labels\\\"]\\n\",\n    \"            train_op = optimization.create_optimizer(\\n\",\n    \"                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)\\n\",\n    \"\\n\",\n    \"            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\\n\",\n    \"                mode=mode,\\n\",\n    \"                loss=total_loss,\\n\",\n    \"                train_op=train_op,\\n\",\n    \"                scaffold_fn=scaffold_fn)\\n\",\n    \"        elif mode == tf.estimator.ModeKeys.EVAL:\\n\",\n    \"            masked_lm_positions = features[\\\"masked_lm_positions\\\"]\\n\",\n    \"            masked_lm_ids = features[\\\"masked_lm_ids\\\"]\\n\",\n    \"            masked_lm_weights = features[\\\"masked_lm_weights\\\"]\\n\",\n    \"            next_sentence_labels = features[\\\"next_sentence_labels\\\"]\\n\",\n    \"\\n\",\n    \"            def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,\\n\",\n    \"                          masked_lm_weights, next_sentence_example_loss,\\n\",\n    \"                          next_sentence_log_probs, next_sentence_labels):\\n\",\n    \"                \\\"\\\"\\\"Computes the loss and accuracy of the model.\\\"\\\"\\\"\\n\",\n    \"                masked_lm_log_probs = tf.reshape(masked_lm_log_probs,\\n\",\n    \"                                                 [-1, masked_lm_log_probs.shape[-1]])\\n\",\n    \"           
     masked_lm_predictions = tf.argmax(\\n\",\n    \"                    masked_lm_log_probs, axis=-1, output_type=tf.int32)\\n\",\n    \"                masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])\\n\",\n    \"                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])\\n\",\n    \"                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])\\n\",\n    \"                masked_lm_accuracy = tf.metrics.accuracy(\\n\",\n    \"                    labels=masked_lm_ids,\\n\",\n    \"                    predictions=masked_lm_predictions,\\n\",\n    \"                    weights=masked_lm_weights)\\n\",\n    \"                masked_lm_mean_loss = tf.metrics.mean(\\n\",\n    \"                    values=masked_lm_example_loss, weights=masked_lm_weights)\\n\",\n    \"\\n\",\n    \"                next_sentence_log_probs = tf.reshape(\\n\",\n    \"                    next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])\\n\",\n    \"                next_sentence_predictions = tf.argmax(\\n\",\n    \"                    next_sentence_log_probs, axis=-1, output_type=tf.int32)\\n\",\n    \"                next_sentence_labels = tf.reshape(next_sentence_labels, [-1])\\n\",\n    \"                next_sentence_accuracy = tf.metrics.accuracy(\\n\",\n    \"                    labels=next_sentence_labels, predictions=next_sentence_predictions)\\n\",\n    \"                next_sentence_mean_loss = tf.metrics.mean(\\n\",\n    \"                    values=next_sentence_example_loss)\\n\",\n    \"\\n\",\n    \"                return {\\n\",\n    \"                    \\\"masked_lm_accuracy\\\": masked_lm_accuracy,\\n\",\n    \"                    \\\"masked_lm_loss\\\": masked_lm_mean_loss,\\n\",\n    \"                    \\\"next_sentence_accuracy\\\": next_sentence_accuracy,\\n\",\n    \"                    \\\"next_sentence_loss\\\": next_sentence_mean_loss,\\n\",\n    \"                }\\n\",\n    \"\\n\",\n    \"            
eval_metrics = (metric_fn, [\\n\",\n    \"                masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,\\n\",\n    \"                masked_lm_weights, next_sentence_example_loss,\\n\",\n    \"                next_sentence_log_probs, next_sentence_labels\\n\",\n    \"            ])\\n\",\n    \"            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\\n\",\n    \"                mode=mode,\\n\",\n    \"                loss=total_loss,\\n\",\n    \"                eval_metrics=eval_metrics,\\n\",\n    \"                scaffold_fn=scaffold_fn)\\n\",\n    \"        elif mode == tf.estimator.ModeKeys.PREDICT:\\n\",\n    \"            masked_lm_log_probs = tf.reshape(masked_lm_log_probs,\\n\",\n    \"                                                [-1, masked_lm_log_probs.shape[-1]])\\n\",\n    \"            masked_lm_predictions = tf.argmax(\\n\",\n    \"                masked_lm_log_probs, axis=-1, output_type=tf.int32)\\n\",\n    \"\\n\",\n    \"            next_sentence_log_probs = tf.reshape(\\n\",\n    \"                next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])\\n\",\n    \"            next_sentence_predictions = tf.argmax(\\n\",\n    \"                next_sentence_log_probs, axis=-1, output_type=tf.int32)\\n\",\n    \"\\n\",\n    \"            masked_lm_predictions = tf.reshape(masked_lm_predictions,\\n\",\n    \"                                                [1, masked_lm_positions.shape[-1]])\\n\",\n    \"            next_sentence_predictions = tf.reshape(next_sentence_predictions,\\n\",\n    \"                                                [1, 1])\\n\",\n    \"\\n\",\n    \"            predictions = {\\n\",\n    \"                \\\"masked_lm_predictions\\\": masked_lm_predictions,\\n\",\n    \"                \\\"next_sentence_predictions\\\": next_sentence_predictions\\n\",\n    \"            }\\n\",\n    \"\\n\",\n    \"            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\\n\",\n    \"                
mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)\\n\",\n    \"            return output_spec\\n\",\n    \"        else:\\n\",\n    \"            raise ValueError(\\\"Only TRAIN, EVAL and PREDICT modes are supported: %s\\\" % (mode))\\n\",\n    \"\\n\",\n    \"        return output_spec\\n\",\n    \"\\n\",\n    \"    return model_fn\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:40.328700Z\",\n     \"start_time\": \"2018-11-16T10:02:36.289676Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12a864ae8>) includes params argument, but params are not passed to Estimator.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - WARNING - tensorflow -   Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12a864ae8>) includes params argument, but params are not passed to Estimator.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - WARNING - tensorflow -   Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d', '_tf_random_seed': None, '_save_summary_steps': 100, 
'_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\\n\",\n      \"graph_options {\\n\",\n      \"  rewrite_options {\\n\",\n      \"    meta_optimizer_iterations: ONE\\n\",\n      \"  }\\n\",\n      \"}\\n\",\n      \", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12dbb5ac8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -   Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\\n\",\n      \"graph_options {\\n\",\n      \"  rewrite_options {\\n\",\n      \"    meta_optimizer_iterations: ONE\\n\",\n      \"  }\\n\",\n      \"}\\n\",\n      \", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12dbb5ac8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, 
'_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - WARNING - tensorflow -   Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:_TPUContext: eval_on_tpu True\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -   _TPUContext: eval_on_tpu True\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - WARNING - tensorflow -   eval_on_tpu ignored because use_tpu is False.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\\n\",\n    \"run_config = tf.contrib.tpu.RunConfig(\\n\",\n    \"    master=None,\\n\",\n    \"    tpu_config=tf.contrib.tpu.TPUConfig(\\n\",\n    \"        num_shards=1,\\n\",\n    \"        per_host_input_for_training=is_per_host))\\n\",\n    \"\\n\",\n    
\"model_fn = model_fn_builder(\\n\",\n    \"    bert_config=bert_config,\\n\",\n    \"    init_checkpoint=init_checkpoint,\\n\",\n    \"    learning_rate=0,\\n\",\n    \"    num_train_steps=1,\\n\",\n    \"    num_warmup_steps=1,\\n\",\n    \"    use_tpu=False,\\n\",\n    \"    use_one_hot_embeddings=False)\\n\",\n    \"\\n\",\n    \"# If TPU is not available, this will fall back to normal Estimator on CPU\\n\",\n    \"# or GPU.\\n\",\n    \"estimator = tf.contrib.tpu.TPUEstimator(\\n\",\n    \"    use_tpu=False,\\n\",\n    \"    model_fn=model_fn,\\n\",\n    \"    config=run_config,\\n\",\n    \"    predict_batch_size=1)\\n\",\n    \"\\n\",\n    \"input_fn = input_fn_builder(\\n\",\n    \"    features=features, seq_length=max_seq_length, max_predictions_per_seq=max_predictions_per_seq,\\n\",\n    \"tokenizer=tokenizer)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:46.596956Z\",\n     \"start_time\": \"2018-11-16T10:02:40.331008Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d, running initialization to predict.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -   Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d, running initialization to predict.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:Calling model_fn.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -   Calling model_fn.\\n\"\n     ]\n    
},\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:Running infer on CPU\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -   Running infer on CPU\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:*** Features ***\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -   *** Features ***\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = input_ids, shape = (?, 128)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -     name = input_ids, shape = (?, 128)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = input_mask, shape = (?, 128)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -     name = input_mask, shape = (?, 128)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = masked_lm_ids, shape = (?, 20)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -     name = masked_lm_ids, shape = (?, 20)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = masked_lm_positions, shape = (?, 20)\\n\"\n     ]\n    },\n    {\n     \"name\": 
\"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -     name = masked_lm_positions, shape = (?, 20)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = masked_lm_weights, shape = (?, 20)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -     name = masked_lm_weights, shape = (?, 20)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = next_sentence_labels, shape = (?, 1)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -     name = next_sentence_labels, shape = (?, 1)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = segment_ids, shape = (?, 128)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:40 - INFO - tensorflow -     name = segment_ids, shape = (?, 128)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:**** Trainable Variables ****\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -   **** Trainable Variables ****\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      
\"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     
\"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     
\"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/kernel:0, 
shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     
\"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      
\"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     
name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/dense/kernel:0, 
shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n   
  \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      
\"INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), 
*INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     
\"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      
\"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - 
tensorflow -     name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/dense/bias:0, shape = 
(768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": 
\"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": 
[\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name 
= bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n   
  ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": 
\"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 
- INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = 
bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    
},\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": 
\"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow: 
 name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    
{\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": 
\"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 
11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = 
bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n 
    \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n  
   \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name 
= bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/kernel:0, 
shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     
\"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      
\"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     
name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/dense/kernel:0, 
shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n 
    \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n    
  \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/bias:0, shape = 
(768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": 
\"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     
\"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 
11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = 
bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     
\"text\": [\n      \"INFO:tensorflow:  name = cls/predictions/transform/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = cls/predictions/transform/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = cls/predictions/transform/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = cls/predictions/transform/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = cls/predictions/output_bias:0, shape = (30522,), *INIT_FROM_CKPT*\\n\"\n     ]\n    
},\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/output_bias:0, shape = (30522,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = cls/seq_relationship/output_weights:0, shape = (2, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/seq_relationship/output_weights:0, shape = (2, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = cls/seq_relationship/output_bias:0, shape = (2,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/seq_relationship/output_bias:0, shape = (2,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:Done calling model_fn.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:43 - INFO - tensorflow -   Done calling model_fn.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:Graph was finalized.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:44 - INFO - tensorflow -   Graph was finalized.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:Running local_init_op.\\n\"\n     ]\n    },\n    {\n     
\"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:45 - INFO - tensorflow -   Running local_init_op.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:Done running local_init_op.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:45 - INFO - tensorflow -   Done running local_init_op.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:prediction_loop marked as finished\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:46 - INFO - tensorflow -   prediction_loop marked as finished\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:prediction_loop marked as finished\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:02:46 - INFO - tensorflow -   prediction_loop marked as finished\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"tensorflow_all_out = []\\n\",\n    \"for result in estimator.predict(input_fn, yield_single_examples=True):\\n\",\n    \"    tensorflow_all_out.append(result)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:46.634304Z\",\n     \"start_time\": \"2018-11-16T10:02:46.598800Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"1\\n\",\n      \"2\\n\",\n      \"dict_keys(['masked_lm_predictions', 'next_sentence_predictions'])\\n\",\n      \"masked_lm_predictions [27227  1010  1010  1010  1010  1010  1010  1010  1010  
1010  1010  1010\\n\",\n      \"  1010  1010  1010  1010  1010  1010  1010  1010]\\n\",\n      \"predicted token ['henson', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(len(tensorflow_all_out))\\n\",\n    \"print(len(tensorflow_all_out[0]))\\n\",\n    \"print(tensorflow_all_out[0].keys())\\n\",\n    \"print(\\\"masked_lm_predictions\\\", tensorflow_all_out[0]['masked_lm_predictions'])\\n\",\n    \"print(\\\"predicted token\\\", tokenizer.convert_ids_to_tokens(tensorflow_all_out[0]['masked_lm_predictions']))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:02:46.671229Z\",\n     \"start_time\": \"2018-11-16T10:02:46.637102Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"tensorflow_output: ['henson']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"tensorflow_outputs = tokenizer.convert_ids_to_tokens(tensorflow_all_out[0]['masked_lm_predictions'])[:len(masked_lm_positions)]\\n\",\n    \"print(\\\"tensorflow_output:\\\", tensorflow_outputs)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2/ PyTorch code\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:03:03.556557Z\",\n     \"start_time\": \"2018-11-16T10:03:03.519654Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"from examples import extract_features\\n\",\n    \"from examples.extract_features import *\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:03:03.952710Z\",\n     \"start_time\": \"2018-11-16T10:03:03.921917Z\"\n    }\n   },\n   
\"outputs\": [],\n   \"source\": [\n    \"init_checkpoint_pt = \\\"../google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:03:12.307673Z\",\n     \"start_time\": \"2018-11-16T10:03:04.439317Z\"\n    },\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\\n\",\n      \"11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\\n\",\n      \"11/16/2018 11:03:08 - INFO - pytorch_pretrained_bert.modeling -   Model config {\\n\",\n      \"  \\\"attention_probs_dropout_prob\\\": 0.1,\\n\",\n      \"  \\\"hidden_act\\\": \\\"gelu\\\",\\n\",\n      \"  \\\"hidden_dropout_prob\\\": 0.1,\\n\",\n      \"  \\\"hidden_size\\\": 768,\\n\",\n      \"  \\\"initializer_range\\\": 0.02,\\n\",\n      \"  \\\"intermediate_size\\\": 3072,\\n\",\n      \"  \\\"max_position_embeddings\\\": 512,\\n\",\n      \"  \\\"num_attention_heads\\\": 12,\\n\",\n      \"  \\\"num_hidden_layers\\\": 12,\\n\",\n      \"  \\\"type_vocab_size\\\": 2,\\n\",\n      \"  \\\"vocab_size\\\": 30522\\n\",\n      \"}\\n\",\n      \"\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"BertForPreTraining(\\n\",\n  
     \"  (bert): BertModel(\\n\",\n       \"    (embeddings): BertEmbeddings(\\n\",\n       \"      (word_embeddings): Embedding(30522, 768)\\n\",\n       \"      (position_embeddings): Embedding(512, 768)\\n\",\n       \"      (token_type_embeddings): Embedding(2, 768)\\n\",\n       \"      (LayerNorm): BertLayerNorm()\\n\",\n       \"      (dropout): Dropout(p=0.1)\\n\",\n       \"    )\\n\",\n       \"    (encoder): BertEncoder(\\n\",\n       \"      (layer): ModuleList(\\n\",\n       \"        (0): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (1): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): 
Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (2): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): 
BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (3): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (4): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, 
bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (5): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            
(dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (6): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (7): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): 
Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (8): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, 
out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (9): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (10): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, 
bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (11): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          
)\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"    )\\n\",\n       \"    (pooler): BertPooler(\\n\",\n       \"      (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"      (activation): Tanh()\\n\",\n       \"    )\\n\",\n       \"  )\\n\",\n       \"  (cls): BertPreTrainingHeads(\\n\",\n       \"    (predictions): BertLMPredictionHead(\\n\",\n       \"      (transform): BertPredictionHeadTransform(\\n\",\n       \"        (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"        (LayerNorm): BertLayerNorm()\\n\",\n       \"      )\\n\",\n       \"      (decoder): Linear(in_features=768, out_features=30522, bias=False)\\n\",\n       \"    )\\n\",\n       \"    (seq_relationship): Linear(in_features=768, out_features=2, bias=True)\\n\",\n       \"  )\\n\",\n       \")\"\n      ]\n     },\n     \"execution_count\": 16,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"device = torch.device(\\\"cpu\\\")\\n\",\n    \"model = ppb.BertForPreTraining.from_pretrained('bert-base-uncased')\\n\",\n    \"model.to(device)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:03:12.351625Z\",\n     \"start_time\": \"2018-11-16T10:03:12.310736Z\"\n    },\n    \"code_folding\": []\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"BertForPreTraining(\\n\",\n       \"  (bert): BertModel(\\n\",\n       \"    (embeddings): BertEmbeddings(\\n\",\n       \"      (word_embeddings): Embedding(30522, 768)\\n\",\n       \"      (position_embeddings): 
Embedding(512, 768)\\n\",\n       \"      (token_type_embeddings): Embedding(2, 768)\\n\",\n       \"      (LayerNorm): BertLayerNorm()\\n\",\n       \"      (dropout): Dropout(p=0.1)\\n\",\n       \"    )\\n\",\n       \"    (encoder): BertEncoder(\\n\",\n       \"      (layer): ModuleList(\\n\",\n       \"        (0): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (1): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, 
out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (2): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            
(dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (3): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (4): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              
(dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (5): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, 
bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (6): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (7): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            
)\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (8): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): 
BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (9): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (10): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): 
BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (11): BertLayer(\\n\",\n       \"          (attention): BertAttention(\\n\",\n       \"            (self): BertSelfAttention(\\n\",\n       \"              (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"            (output): BertSelfOutput(\\n\",\n       \"              (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"              (LayerNorm): BertLayerNorm()\\n\",\n       \"              (dropout): Dropout(p=0.1)\\n\",\n       \"            )\\n\",\n       \"          )\\n\",\n       \"          (intermediate): BertIntermediate(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"          )\\n\",\n       \"          (output): BertOutput(\\n\",\n       \"            (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            
(dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"    )\\n\",\n       \"    (pooler): BertPooler(\\n\",\n       \"      (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"      (activation): Tanh()\\n\",\n       \"    )\\n\",\n       \"  )\\n\",\n       \"  (cls): BertPreTrainingHeads(\\n\",\n       \"    (predictions): BertLMPredictionHead(\\n\",\n       \"      (transform): BertPredictionHeadTransform(\\n\",\n       \"        (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"        (LayerNorm): BertLayerNorm()\\n\",\n       \"      )\\n\",\n       \"      (decoder): Linear(in_features=768, out_features=30522, bias=False)\\n\",\n       \"    )\\n\",\n       \"    (seq_relationship): Linear(in_features=768, out_features=2, bias=True)\\n\",\n       \"  )\\n\",\n       \")\"\n      ]\n     },\n     \"execution_count\": 17,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\\n\",\n    \"all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\\n\",\n    \"all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)\\n\",\n    \"all_masked_lm_positions = torch.tensor([f.masked_lm_positions for f in features], dtype=torch.long)\\n\",\n    \"\\n\",\n    \"eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_positions)\\n\",\n    \"eval_sampler = SequentialSampler(eval_data)\\n\",\n    \"eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\\n\",\n    \"\\n\",\n    \"model.eval()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:03:12.792741Z\",\n     \"start_time\": \"2018-11-16T10:03:12.354253Z\"\n    }\n   
},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"tensor([[ 2040,  2001,  3958, 27227,  1029,  3958,   103,  2001,  1037, 13997,\\n\",\n      \"         11510,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0]])\\n\",\n      \"tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0]])\\n\",\n      \"tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0]])\\n\",\n      \"(1, 20, 30522)\\n\",\n      \"[27227, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"import numpy as np\\n\",\n    \"pytorch_all_out = []\\n\",\n    \"for input_ids, input_mask, segment_ids, tensor_masked_lm_positions in eval_dataloader:\\n\",\n    \"    print(input_ids)\\n\",\n    \"    print(input_mask)\\n\",\n    \"    print(segment_ids)\\n\",\n    \"    input_ids = input_ids.to(device)\\n\",\n    \"    input_mask = input_mask.to(device)\\n\",\n    \"    segment_ids = segment_ids.to(device)\\n\",\n    \"\\n\",\n    \"    prediction_scores, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)\\n\",\n    \"    prediction_scores = prediction_scores[0, tensor_masked_lm_positions].detach().cpu().numpy()\\n\",\n    \"    print(prediction_scores.shape)\\n\",\n    \"    masked_lm_predictions = np.argmax(prediction_scores, axis=-1).squeeze().tolist()\\n\",\n    \"    print(masked_lm_predictions)\\n\",\n    \"    pytorch_all_out.append(masked_lm_predictions)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-16T10:03:12.828439Z\",\n     \"start_time\": \"2018-11-16T10:03:12.795420Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"pytorch_output: ['henson']\\n\",\n      \"tensorflow_output: ['henson']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"pytorch_outputs = 
tokenizer.convert_ids_to_tokens(pytorch_all_out[0])[:len(masked_lm_positions)]\\n\",\n    \"print(\\\"pytorch_output:\\\", pytorch_outputs)\\n\",\n    \"print(\\\"tensorflow_output:\\\", tensorflow_outputs)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"hide_input\": false,\n  \"kernelspec\": {\n   \"display_name\": \"Python [default]\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.7\"\n  },\n  \"toc\": {\n   \"colors\": {\n    \"hover_highlight\": \"#DAA520\",\n    \"running_highlight\": \"#FF0000\",\n    \"selected_highlight\": \"#FFD700\"\n   },\n   \"moveMenuLeft\": true,\n   \"nav_menu\": {\n    \"height\": \"48px\",\n    \"width\": \"252px\"\n   },\n   \"navigate_menu\": true,\n   \"number_sections\": true,\n   \"sideBar\": true,\n   \"threshold\": 4,\n   \"toc_cell\": false,\n   \"toc_section_display\": \"block\",\n   \"toc_window_display\": false\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Comparing TensorFlow (original) and PyTorch model on the SQuAD task\\n\",\n    \"\\n\",\n    \"You can use this small notebook to check the loss computation from the TensorFlow model to the PyTorch model. In the following, we compare the total loss computed by the models starting from identical initializations (position prediction linear layers with weights at 1 and bias at 0).\\n\",\n    \"\\n\",\n    \"To run this notebook, follow these instructions:\\n\",\n    \"- make sure that your Python environment has both TensorFlow and PyTorch installed,\\n\",\n    \"- download the original TensorFlow implementation,\\n\",\n    \"- download a pre-trained TensorFlow model as indicaded in the TensorFlow implementation readme,\\n\",\n    \"- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.\\n\",\n    \"\\n\",\n    \"If needed change the relative paths indicated in this notebook (at the beggining of Sections 1 and 2) to point to the relevent models and code.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:33.636911Z\",\n     \"start_time\": \"2018-11-06T10:11:33.623091Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"os.chdir('../')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1/ TensorFlow code\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:33.651792Z\",\n     \"start_time\": \"2018-11-06T10:11:33.638984Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"original_tf_inplem_dir = \\\"./tensorflow_code/\\\"\\n\",\n    \"model_dir = 
\\\"../google_models/uncased_L-12_H-768_A-12/\\\"\\n\",\n    \"\\n\",\n    \"vocab_file = model_dir + \\\"vocab.txt\\\"\\n\",\n    \"bert_config_file = model_dir + \\\"bert_config.json\\\"\\n\",\n    \"init_checkpoint = model_dir + \\\"bert_model.ckpt\\\"\\n\",\n    \"\\n\",\n    \"input_file = \\\"../data/squad_data/train-v1.1.json\\\"\\n\",\n    \"max_seq_length = 384\\n\",\n    \"outside_pos = max_seq_length + 10\\n\",\n    \"doc_stride = 128\\n\",\n    \"max_query_length = 64\\n\",\n    \"max_answer_length = 30\\n\",\n    \"output_dir = \\\"/tmp/squad_base/\\\"\\n\",\n    \"learning_rate = 3e-5\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:35.165788Z\",\n     \"start_time\": \"2018-11-06T10:11:33.653401Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import importlib.util\\n\",\n    \"import sys\\n\",\n    \"\\n\",\n    \"spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/modeling.py')\\n\",\n    \"module = importlib.util.module_from_spec(spec)\\n\",\n    \"spec.loader.exec_module(module)\\n\",\n    \"sys.modules['modeling_tensorflow'] = module\\n\",\n    \"\\n\",\n    \"spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/run_squad.py')\\n\",\n    \"module = importlib.util.module_from_spec(spec)\\n\",\n    \"spec.loader.exec_module(module)\\n\",\n    \"sys.modules['run_squad_tensorflow'] = module\\n\",\n    \"import modeling_tensorflow\\n\",\n    \"from run_squad_tensorflow import *\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:37.494391Z\",\n     \"start_time\": \"2018-11-06T10:11:35.168615Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:*** Example ***\\n\",\n      
\"INFO:tensorflow:unique_id: 1000000000\\n\",\n      \"INFO:tensorflow:example_index: 0\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] to whom did the virgin mary allegedly appear in 1858 in lou ##rdes france ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \\\" ve ##ni ##te ad me om ##nes \\\" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . 
[SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 17:0 18:0 19:0 20:1 21:2 22:3 23:4 24:5 25:6 26:6 27:7 28:8 29:9 30:10 31:10 32:10 33:11 34:12 35:13 36:14 37:15 38:16 39:17 40:18 41:19 42:20 43:20 44:21 45:22 46:23 47:24 48:25 49:26 50:27 51:28 52:29 53:30 54:30 55:31 56:32 57:33 58:34 59:35 60:36 61:37 62:38 63:39 64:39 65:39 66:40 67:41 68:42 69:43 70:43 71:43 72:43 73:44 74:45 75:46 76:46 77:46 78:46 79:47 80:48 81:49 82:50 83:51 84:52 85:53 86:54 87:55 88:56 89:57 90:58 91:58 92:59 93:60 94:61 95:62 96:63 97:64 98:65 99:65 100:65 101:66 102:67 103:68 104:69 105:70 106:71 107:72 108:72 109:73 110:74 111:75 112:76 113:77 114:78 115:79 116:79 117:80 118:81 119:81 120:81 121:82 122:83 123:84 124:85 125:86 126:87 127:87 128:88 129:89 130:90 131:91 132:91 133:91 134:92 135:92 136:92 137:92 138:93 139:94 140:94 141:95 142:96 143:97 144:98 145:99 146:100 147:101 148:102 149:102 150:103 151:104 152:105 153:106 154:107 155:108 156:109 157:110 158:111 159:112 160:113 161:114 162:115 163:115 164:115 165:116 166:117 167:118 168:118 169:119 170:120 171:121 172:122 173:123 174:123\\n\",\n      \"INFO:tensorflow:token_is_max_context: 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 
120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2000 3183 2106 1996 6261 2984 9382 3711 1999 8517 1999 10223 26371 2605 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 130\\n\",\n      \"INFO:tensorflow:end_position: 137\\n\",\n      \"INFO:tensorflow:answer: saint bern ##ade ##tte so ##ub ##iro ##us\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000001\\n\",\n      \"INFO:tensorflow:example_index: 1\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] what is in front of the notre dame main building ? 
[SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \\\" ve ##ni ##te ad me om ##nes \\\" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . [SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 13:0 14:0 15:0 16:1 17:2 18:3 19:4 20:5 21:6 22:6 23:7 24:8 25:9 26:10 27:10 28:10 29:11 30:12 31:13 32:14 33:15 34:16 35:17 36:18 37:19 38:20 39:20 40:21 41:22 42:23 43:24 44:25 45:26 46:27 47:28 48:29 49:30 50:30 51:31 52:32 53:33 54:34 55:35 56:36 57:37 58:38 59:39 60:39 61:39 62:40 63:41 64:42 65:43 66:43 67:43 68:43 69:44 70:45 71:46 72:46 73:46 74:46 75:47 76:48 77:49 78:50 79:51 80:52 81:53 82:54 83:55 84:56 85:57 86:58 87:58 88:59 89:60 90:61 91:62 92:63 93:64 94:65 95:65 96:65 97:66 98:67 99:68 100:69 101:70 102:71 103:72 104:72 105:73 106:74 107:75 108:76 109:77 110:78 111:79 112:79 113:80 114:81 115:81 116:81 117:82 118:83 119:84 120:85 121:86 122:87 123:87 124:88 125:89 126:90 127:91 128:91 129:91 130:92 131:92 132:92 133:92 134:93 135:94 136:94 137:95 138:96 139:97 140:98 141:99 142:100 143:101 144:102 145:102 146:103 147:104 148:105 149:106 150:107 151:108 152:109 153:110 154:111 155:112 156:113 157:114 158:115 159:115 160:115 161:116 162:117 163:118 164:118 165:119 166:120 167:121 168:122 169:123 170:123\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      
\"INFO:tensorflow:token_is_max_context: 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2054 2003 1999 2392 1997 1996 10289 8214 2364 2311 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 
10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 52\\n\",\n      \"INFO:tensorflow:end_position: 56\\n\",\n      \"INFO:tensorflow:answer: a copper statue of christ\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000002\\n\",\n      \"INFO:tensorflow:example_index: 2\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] the basilica of the sacred heart at notre dame is beside to which structure ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \\\" ve ##ni ##te ad me om ##nes \\\" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . 
[SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 17:0 18:0 19:0 20:1 21:2 22:3 23:4 24:5 25:6 26:6 27:7 28:8 29:9 30:10 31:10 32:10 33:11 34:12 35:13 36:14 37:15 38:16 39:17 40:18 41:19 42:20 43:20 44:21 45:22 46:23 47:24 48:25 49:26 50:27 51:28 52:29 53:30 54:30 55:31 56:32 57:33 58:34 59:35 60:36 61:37 62:38 63:39 64:39 65:39 66:40 67:41 68:42 69:43 70:43 71:43 72:43 73:44 74:45 75:46 76:46 77:46 78:46 79:47 80:48 81:49 82:50 83:51 84:52 85:53 86:54 87:55 88:56 89:57 90:58 91:58 92:59 93:60 94:61 95:62 96:63 97:64 98:65 99:65 100:65 101:66 102:67 103:68 104:69 105:70 106:71 107:72 108:72 109:73 110:74 111:75 112:76 113:77 114:78 115:79 116:79 117:80 118:81 119:81 120:81 121:82 122:83 123:84 124:85 125:86 126:87 127:87 128:88 129:89 130:90 131:91 132:91 133:91 134:92 135:92 136:92 137:92 138:93 139:94 140:94 141:95 142:96 143:97 144:98 145:99 146:100 147:101 148:102 149:102 150:103 151:104 152:105 153:106 154:107 155:108 156:109 157:110 158:111 159:112 160:113 161:114 162:115 163:115 164:115 165:116 166:117 167:118 168:118 169:119 170:120 171:121 172:122 173:123 174:123\\n\",\n      \"INFO:tensorflow:token_is_max_context: 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 
120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 1996 13546 1997 1996 6730 2540 2012 10289 8214 2003 3875 2000 2029 3252 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:input_mask: 1 1 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 81\\n\",\n      \"INFO:tensorflow:end_position: 83\\n\",\n      \"INFO:tensorflow:answer: the main building\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000003\\n\",\n      \"INFO:tensorflow:example_index: 3\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] what is the gr ##otto at 
notre dame ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \\\" ve ##ni ##te ad me om ##nes \\\" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . [SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 11:0 12:0 13:0 14:1 15:2 16:3 17:4 18:5 19:6 20:6 21:7 22:8 23:9 24:10 25:10 26:10 27:11 28:12 29:13 30:14 31:15 32:16 33:17 34:18 35:19 36:20 37:20 38:21 39:22 40:23 41:24 42:25 43:26 44:27 45:28 46:29 47:30 48:30 49:31 50:32 51:33 52:34 53:35 54:36 55:37 56:38 57:39 58:39 59:39 60:40 61:41 62:42 63:43 64:43 65:43 66:43 67:44 68:45 69:46 70:46 71:46 72:46 73:47 74:48 75:49 76:50 77:51 78:52 79:53 80:54 81:55 82:56 83:57 84:58 85:58 86:59 87:60 88:61 89:62 90:63 91:64 92:65 93:65 94:65 95:66 96:67 97:68 98:69 99:70 100:71 101:72 102:72 103:73 104:74 105:75 106:76 107:77 108:78 109:79 110:79 111:80 112:81 113:81 114:81 115:82 116:83 117:84 118:85 119:86 120:87 121:87 122:88 123:89 124:90 125:91 126:91 127:91 128:92 129:92 130:92 131:92 132:93 133:94 134:94 135:95 136:96 137:97 138:98 139:99 140:100 141:101 142:102 143:102 144:103 145:104 146:105 147:106 148:107 149:108 150:109 151:110 152:111 153:112 154:113 155:114 156:115 157:115 158:115 159:116 160:117 161:118 162:118 163:119 164:120 165:121 166:122 167:123 168:123\\n\",\n      \"INFO:tensorflow:token_is_max_context: 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 
20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2054 2003 1996 24665 23052 2012 10289 8214 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 
1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 95\\n\",\n      \"INFO:tensorflow:end_position: 101\\n\",\n      \"INFO:tensorflow:answer: a marian place of prayer and reflection\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:unique_id: 1000000004\\n\",\n      \"INFO:tensorflow:example_index: 4\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] what sits on top of the main building at notre dame ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \\\" ve ##ni ##te ad me om ##nes \\\" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . 
[SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 14:0 15:0 16:0 17:1 18:2 19:3 20:4 21:5 22:6 23:6 24:7 25:8 26:9 27:10 28:10 29:10 30:11 31:12 32:13 33:14 34:15 35:16 36:17 37:18 38:19 39:20 40:20 41:21 42:22 43:23 44:24 45:25 46:26 47:27 48:28 49:29 50:30 51:30 52:31 53:32 54:33 55:34 56:35 57:36 58:37 59:38 60:39 61:39 62:39 63:40 64:41 65:42 66:43 67:43 68:43 69:43 70:44 71:45 72:46 73:46 74:46 75:46 76:47 77:48 78:49 79:50 80:51 81:52 82:53 83:54 84:55 85:56 86:57 87:58 88:58 89:59 90:60 91:61 92:62 93:63 94:64 95:65 96:65 97:65 98:66 99:67 100:68 101:69 102:70 103:71 104:72 105:72 106:73 107:74 108:75 109:76 110:77 111:78 112:79 113:79 114:80 115:81 116:81 117:81 118:82 119:83 120:84 121:85 122:86 123:87 124:87 125:88 126:89 127:90 128:91 129:91 130:91 131:92 132:92 133:92 134:92 135:93 136:94 137:94 138:95 139:96 140:97 141:98 142:99 143:100 144:101 145:102 146:102 147:103 148:104 149:105 150:106 151:107 152:108 153:109 154:110 155:111 156:112 157:113 158:114 159:115 160:115 161:115 162:116 163:117 164:118 165:118 166:119 167:120 168:121 169:122 170:123 171:123\\n\",\n      \"INFO:tensorflow:token_is_max_context: 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 
118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2054 7719 2006 2327 1997 1996 2364 2311 2012 10289 8214 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 33\\n\",\n      \"INFO:tensorflow:end_position: 39\\n\",\n      \"INFO:tensorflow:answer: a golden statue of the virgin mary\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000005\\n\",\n      \"INFO:tensorflow:example_index: 5\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] when did the scholastic magazine of notre dame begin publishing ? 
[SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . 
[SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 13:0 14:1 15:2 16:3 17:4 18:4 19:5 20:6 21:6 22:6 23:7 24:8 25:9 26:10 27:11 28:12 29:13 30:14 31:14 32:15 33:16 34:17 35:17 36:17 37:18 38:19 39:20 40:21 41:21 42:22 43:23 44:24 45:25 46:26 47:27 48:27 49:28 50:29 51:30 52:31 53:32 54:32 55:33 56:34 57:35 58:36 59:36 60:36 61:37 62:38 63:39 64:40 65:40 66:41 67:42 68:43 69:44 70:45 71:46 72:47 73:48 74:49 75:50 76:51 77:52 78:53 79:54 80:55 81:56 82:57 83:58 84:59 85:60 86:60 87:61 88:62 89:63 90:63 91:64 92:65 93:65 94:65 95:66 96:67 97:68 98:69 99:70 100:71 101:72 102:73 103:74 104:75 105:76 106:77 107:77 108:78 109:79 110:80 111:81 112:82 113:83 114:83 115:84 116:85 117:86 118:87 119:88 120:89 121:89 122:90 123:91 124:92 125:93 126:94 127:95 128:96 129:97 130:98 131:99 132:100 133:101 134:101 135:102 136:103 137:104 138:105 139:106 140:107 141:108 142:109 143:110 144:111 145:112 146:112 147:112 148:113 149:113 150:114 151:115 152:116 153:117 154:118 155:118 156:119 157:120 158:121 159:122 160:123 161:124 162:125 163:126 164:127 165:128 166:129 167:130 168:131 169:132 170:133 171:134 172:135 173:136 174:137 175:138 176:138 177:139 178:140 179:140 180:141 181:142 182:143 183:144 184:145 185:146 186:147 187:148 188:149 189:150 190:151 191:152 192:153 193:153 194:154 195:155 196:156 197:156 198:157 199:158 200:159 201:160 202:160 203:161 204:161 205:162 206:163 207:163 208:164 209:165 210:166 211:167 212:168 213:169 214:170 215:171 216:172 217:173 218:174 219:174 220:175 221:176 222:177 223:178 224:179 225:180 226:181 227:182 228:182 229:183 230:184 231:185 232:186 233:187 234:188 235:189 236:190 237:191 238:191 239:192 240:192 241:193 242:194 243:195 244:196 245:197 246:198 247:199 248:199 249:200 250:200 251:201 252:202 253:203 254:204 255:205 256:206 257:207 258:208 259:209 260:210 261:210 262:211 263:212 264:212 265:213 266:214 267:215 268:215\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n     
 \"INFO:tensorflow:token_is_max_context: 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 239:True 
240:True 241:True 242:True 243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2043 2106 1996 24105 2932 1997 10289 8214 4088 4640 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 63\\n\",\n      \"INFO:tensorflow:end_position: 64\\n\",\n      \"INFO:tensorflow:answer: september 1876\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000006\\n\",\n      \"INFO:tensorflow:example_index: 6\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] how often is notre dame ' s the jug ##gler published ? 
[SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . 
[SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 14:0 15:1 16:2 17:3 18:4 19:4 20:5 21:6 22:6 23:6 24:7 25:8 26:9 27:10 28:11 29:12 30:13 31:14 32:14 33:15 34:16 35:17 36:17 37:17 38:18 39:19 40:20 41:21 42:21 43:22 44:23 45:24 46:25 47:26 48:27 49:27 50:28 51:29 52:30 53:31 54:32 55:32 56:33 57:34 58:35 59:36 60:36 61:36 62:37 63:38 64:39 65:40 66:40 67:41 68:42 69:43 70:44 71:45 72:46 73:47 74:48 75:49 76:50 77:51 78:52 79:53 80:54 81:55 82:56 83:57 84:58 85:59 86:60 87:60 88:61 89:62 90:63 91:63 92:64 93:65 94:65 95:65 96:66 97:67 98:68 99:69 100:70 101:71 102:72 103:73 104:74 105:75 106:76 107:77 108:77 109:78 110:79 111:80 112:81 113:82 114:83 115:83 116:84 117:85 118:86 119:87 120:88 121:89 122:89 123:90 124:91 125:92 126:93 127:94 128:95 129:96 130:97 131:98 132:99 133:100 134:101 135:101 136:102 137:103 138:104 139:105 140:106 141:107 142:108 143:109 144:110 145:111 146:112 147:112 148:112 149:113 150:113 151:114 152:115 153:116 154:117 155:118 156:118 157:119 158:120 159:121 160:122 161:123 162:124 163:125 164:126 165:127 166:128 167:129 168:130 169:131 170:132 171:133 172:134 173:135 174:136 175:137 176:138 177:138 178:139 179:140 180:140 181:141 182:142 183:143 184:144 185:145 186:146 187:147 188:148 189:149 190:150 191:151 192:152 193:153 194:153 195:154 196:155 197:156 198:156 199:157 200:158 201:159 202:160 203:160 204:161 205:161 206:162 207:163 208:163 209:164 210:165 211:166 212:167 213:168 214:169 215:170 216:171 217:172 218:173 219:174 220:174 221:175 222:176 223:177 224:178 225:179 226:180 227:181 228:182 229:182 230:183 231:184 232:185 233:186 234:187 235:188 236:189 237:190 238:191 239:191 240:192 241:192 242:193 243:194 244:195 245:196 246:197 247:198 248:199 249:199 250:200 251:200 252:201 253:202 254:203 255:204 256:205 257:206 258:207 259:208 260:209 261:210 262:210 263:211 264:212 265:212 266:213 267:214 268:215 269:215\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n    
  \"INFO:tensorflow:token_is_max_context: 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 239:True 240:True 
241:True 242:True 243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True 269:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2129 2411 2003 10289 8214 1005 1055 1996 26536 17420 2405 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 98\\n\",\n      \"INFO:tensorflow:end_position: 98\\n\",\n      \"INFO:tensorflow:answer: twice\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000007\\n\",\n      \"INFO:tensorflow:example_index: 7\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] what is the daily student paper at notre dame called ? 
[SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . 
[SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 13:0 14:1 15:2 16:3 17:4 18:4 19:5 20:6 21:6 22:6 23:7 24:8 25:9 26:10 27:11 28:12 29:13 30:14 31:14 32:15 33:16 34:17 35:17 36:17 37:18 38:19 39:20 40:21 41:21 42:22 43:23 44:24 45:25 46:26 47:27 48:27 49:28 50:29 51:30 52:31 53:32 54:32 55:33 56:34 57:35 58:36 59:36 60:36 61:37 62:38 63:39 64:40 65:40 66:41 67:42 68:43 69:44 70:45 71:46 72:47 73:48 74:49 75:50 76:51 77:52 78:53 79:54 80:55 81:56 82:57 83:58 84:59 85:60 86:60 87:61 88:62 89:63 90:63 91:64 92:65 93:65 94:65 95:66 96:67 97:68 98:69 99:70 100:71 101:72 102:73 103:74 104:75 105:76 106:77 107:77 108:78 109:79 110:80 111:81 112:82 113:83 114:83 115:84 116:85 117:86 118:87 119:88 120:89 121:89 122:90 123:91 124:92 125:93 126:94 127:95 128:96 129:97 130:98 131:99 132:100 133:101 134:101 135:102 136:103 137:104 138:105 139:106 140:107 141:108 142:109 143:110 144:111 145:112 146:112 147:112 148:113 149:113 150:114 151:115 152:116 153:117 154:118 155:118 156:119 157:120 158:121 159:122 160:123 161:124 162:125 163:126 164:127 165:128 166:129 167:130 168:131 169:132 170:133 171:134 172:135 173:136 174:137 175:138 176:138 177:139 178:140 179:140 180:141 181:142 182:143 183:144 184:145 185:146 186:147 187:148 188:149 189:150 190:151 191:152 192:153 193:153 194:154 195:155 196:156 197:156 198:157 199:158 200:159 201:160 202:160 203:161 204:161 205:162 206:163 207:163 208:164 209:165 210:166 211:167 212:168 213:169 214:170 215:171 216:172 217:173 218:174 219:174 220:175 221:176 222:177 223:178 224:179 225:180 226:181 227:182 228:182 229:183 230:184 231:185 232:186 233:187 234:188 235:189 236:190 237:191 238:191 239:192 240:192 241:193 242:194 243:195 244:196 245:197 246:198 247:199 248:199 249:200 250:200 251:201 252:202 253:203 254:204 255:205 256:206 257:207 258:208 259:209 260:210 261:210 262:211 263:212 264:212 265:213 266:214 267:215 268:215\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n     
 \"INFO:tensorflow:token_is_max_context: 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 239:True 
240:True 241:True 242:True 243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2054 2003 1996 3679 3076 3259 2012 10289 8214 2170 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 123\\n\",\n      \"INFO:tensorflow:end_position: 124\\n\",\n      \"INFO:tensorflow:answer: the observer\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000008\\n\",\n      \"INFO:tensorflow:example_index: 8\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] how many student news papers are found at notre dame ? 
[SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . 
[SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 13:0 14:1 15:2 16:3 17:4 18:4 19:5 20:6 21:6 22:6 23:7 24:8 25:9 26:10 27:11 28:12 29:13 30:14 31:14 32:15 33:16 34:17 35:17 36:17 37:18 38:19 39:20 40:21 41:21 42:22 43:23 44:24 45:25 46:26 47:27 48:27 49:28 50:29 51:30 52:31 53:32 54:32 55:33 56:34 57:35 58:36 59:36 60:36 61:37 62:38 63:39 64:40 65:40 66:41 67:42 68:43 69:44 70:45 71:46 72:47 73:48 74:49 75:50 76:51 77:52 78:53 79:54 80:55 81:56 82:57 83:58 84:59 85:60 86:60 87:61 88:62 89:63 90:63 91:64 92:65 93:65 94:65 95:66 96:67 97:68 98:69 99:70 100:71 101:72 102:73 103:74 104:75 105:76 106:77 107:77 108:78 109:79 110:80 111:81 112:82 113:83 114:83 115:84 116:85 117:86 118:87 119:88 120:89 121:89 122:90 123:91 124:92 125:93 126:94 127:95 128:96 129:97 130:98 131:99 132:100 133:101 134:101 135:102 136:103 137:104 138:105 139:106 140:107 141:108 142:109 143:110 144:111 145:112 146:112 147:112 148:113 149:113 150:114 151:115 152:116 153:117 154:118 155:118 156:119 157:120 158:121 159:122 160:123 161:124 162:125 163:126 164:127 165:128 166:129 167:130 168:131 169:132 170:133 171:134 172:135 173:136 174:137 175:138 176:138 177:139 178:140 179:140 180:141 181:142 182:143 183:144 184:145 185:146 186:147 187:148 188:149 189:150 190:151 191:152 192:153 193:153 194:154 195:155 196:156 197:156 198:157 199:158 200:159 201:160 202:160 203:161 204:161 205:162 206:163 207:163 208:164 209:165 210:166 211:167 212:168 213:169 214:170 215:171 216:172 217:173 218:174 219:174 220:175 221:176 222:177 223:178 224:179 225:180 226:181 227:182 228:182 229:183 230:184 231:185 232:186 233:187 234:188 235:189 236:190 237:191 238:191 239:192 240:192 241:193 242:194 243:195 244:196 245:197 246:198 247:199 248:199 249:200 250:200 251:201 252:202 253:203 254:204 255:205 256:206 257:207 258:208 259:209 260:210 261:210 262:211 263:212 264:212 265:213 266:214 267:215 268:215\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n     
 \"INFO:tensorflow:token_is_max_context: 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 239:True 
240:True 241:True 242:True 243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2129 2116 3076 2739 4981 2024 2179 2012 10289 8214 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 39\\n\",\n      \"INFO:tensorflow:end_position: 39\\n\",\n      \"INFO:tensorflow:answer: three\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000009\\n\",\n      \"INFO:tensorflow:example_index: 9\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] in what year did the student paper common sense begin publication at notre dame ? 
[SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . 
[SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 17:0 18:1 19:2 20:3 21:4 22:4 23:5 24:6 25:6 26:6 27:7 28:8 29:9 30:10 31:11 32:12 33:13 34:14 35:14 36:15 37:16 38:17 39:17 40:17 41:18 42:19 43:20 44:21 45:21 46:22 47:23 48:24 49:25 50:26 51:27 52:27 53:28 54:29 55:30 56:31 57:32 58:32 59:33 60:34 61:35 62:36 63:36 64:36 65:37 66:38 67:39 68:40 69:40 70:41 71:42 72:43 73:44 74:45 75:46 76:47 77:48 78:49 79:50 80:51 81:52 82:53 83:54 84:55 85:56 86:57 87:58 88:59 89:60 90:60 91:61 92:62 93:63 94:63 95:64 96:65 97:65 98:65 99:66 100:67 101:68 102:69 103:70 104:71 105:72 106:73 107:74 108:75 109:76 110:77 111:77 112:78 113:79 114:80 115:81 116:82 117:83 118:83 119:84 120:85 121:86 122:87 123:88 124:89 125:89 126:90 127:91 128:92 129:93 130:94 131:95 132:96 133:97 134:98 135:99 136:100 137:101 138:101 139:102 140:103 141:104 142:105 143:106 144:107 145:108 146:109 147:110 148:111 149:112 150:112 151:112 152:113 153:113 154:114 155:115 156:116 157:117 158:118 159:118 160:119 161:120 162:121 163:122 164:123 165:124 166:125 167:126 168:127 169:128 170:129 171:130 172:131 173:132 174:133 175:134 176:135 177:136 178:137 179:138 180:138 181:139 182:140 183:140 184:141 185:142 186:143 187:144 188:145 189:146 190:147 191:148 192:149 193:150 194:151 195:152 196:153 197:153 198:154 199:155 200:156 201:156 202:157 203:158 204:159 205:160 206:160 207:161 208:161 209:162 210:163 211:163 212:164 213:165 214:166 215:167 216:168 217:169 218:170 219:171 220:172 221:173 222:174 223:174 224:175 225:176 226:177 227:178 228:179 229:180 230:181 231:182 232:182 233:183 234:184 235:185 236:186 237:187 238:188 239:189 240:190 241:191 242:191 243:192 244:192 245:193 246:194 247:195 248:196 249:197 250:198 251:199 252:199 253:200 254:200 255:201 256:202 257:203 258:204 259:205 260:206 261:207 262:208 263:209 264:210 265:210 266:211 267:212 268:212 269:213 270:214 271:215 272:215\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n 
     \"INFO:tensorflow:token_is_max_context: 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 239:True 240:True 241:True 242:True 
243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True 269:True 270:True 271:True 272:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 1999 2054 2095 2106 1996 3076 3259 2691 3168 4088 4772 2012 10289 8214 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 182\\n\",\n      \"INFO:tensorflow:end_position: 182\\n\",\n      \"INFO:tensorflow:answer: 1987\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000010\\n\",\n      \"INFO:tensorflow:example_index: 10\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] where is the headquarters of the congregation of the holy cross ? 
[SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 14:0 15:1 16:2 17:3 18:4 19:5 20:6 21:7 22:8 23:9 24:10 25:11 26:12 27:12 28:13 29:14 30:15 31:16 32:16 33:17 34:18 35:19 36:20 37:20 38:20 39:21 40:22 41:23 42:23 43:24 44:24 45:25 46:25 47:26 48:27 49:28 50:29 51:30 52:31 53:32 54:32 55:33 56:34 57:35 58:36 59:37 60:38 61:38 62:39 63:40 64:40 65:41 66:42 67:43 68:44 69:45 70:46 71:47 72:48 73:49 74:50 75:51 76:52 77:52 78:53 79:54 80:54 81:55 82:56 83:57 84:57 85:57 86:58 87:59 88:60 89:61 90:62 91:63 92:64 93:65 94:66 95:66 96:67 97:68 98:69 99:69 100:69 101:70 102:71 103:72 104:72 105:73 106:74 107:75 108:76 109:76 110:76 111:77 112:78 113:79 114:80 115:80 116:80 117:81 118:82 119:83 120:84 121:85 122:85 123:86 124:87 125:88 126:89 127:90 128:91 129:92 130:92 131:92 132:92 133:93 134:94 135:95 136:95 137:96 138:96 139:96 140:97 141:98 142:99 143:100 144:101 145:102 146:103 147:104 148:104 149:105 150:106 151:107 152:108 153:108 154:108 155:109 156:110 157:111 158:111\\n\",\n      \"INFO:tensorflow:token_is_max_context: 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 
37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:input_ids: 101 2073 2003 1996 4075 1997 1996 7769 1997 1996 4151 2892 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 36\\n\",\n      \"INFO:tensorflow:end_position: 36\\n\",\n      \"INFO:tensorflow:answer: rome\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000011\\n\",\n      \"INFO:tensorflow:example_index: 11\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] what is the primary seminary of the congregation of the holy cross ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . 
[SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 15:0 16:1 17:2 18:3 19:4 20:5 21:6 22:7 23:8 24:9 25:10 26:11 27:12 28:12 29:13 30:14 31:15 32:16 33:16 34:17 35:18 36:19 37:20 38:20 39:20 40:21 41:22 42:23 43:23 44:24 45:24 46:25 47:25 48:26 49:27 50:28 51:29 52:30 53:31 54:32 55:32 56:33 57:34 58:35 59:36 60:37 61:38 62:38 63:39 64:40 65:40 66:41 67:42 68:43 69:44 70:45 71:46 72:47 73:48 74:49 75:50 76:51 77:52 78:52 79:53 80:54 81:54 82:55 83:56 84:57 85:57 86:57 87:58 88:59 89:60 90:61 91:62 92:63 93:64 94:65 95:66 96:66 97:67 98:68 99:69 100:69 101:69 102:70 103:71 104:72 105:72 106:73 107:74 108:75 109:76 110:76 111:76 112:77 113:78 114:79 115:80 116:80 117:80 118:81 119:82 120:83 121:84 122:85 123:85 124:86 125:87 126:88 127:89 128:90 129:91 130:92 131:92 132:92 133:92 134:93 135:94 136:95 137:95 138:96 139:96 140:96 141:97 142:98 143:99 144:100 145:101 146:102 147:103 148:104 149:104 150:105 151:106 152:107 153:108 154:108 155:108 156:109 157:110 158:111 159:111\\n\",\n      \"INFO:tensorflow:token_is_max_context: 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 
130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2054 2003 1996 3078 8705 1997 1996 7769 1997 1996 4151 2892 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 44\\n\",\n      \"INFO:tensorflow:end_position: 46\\n\",\n      \"INFO:tensorflow:answer: more ##au seminary\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000012\\n\",\n      \"INFO:tensorflow:example_index: 12\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] what is the oldest structure at notre dame ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . 
old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 11:0 12:1 13:2 14:3 15:4 16:5 17:6 18:7 19:8 20:9 21:10 22:11 23:12 24:12 25:13 26:14 27:15 28:16 29:16 30:17 31:18 32:19 33:20 34:20 35:20 36:21 37:22 38:23 39:23 40:24 41:24 42:25 43:25 44:26 45:27 46:28 47:29 48:30 49:31 50:32 51:32 52:33 53:34 54:35 55:36 56:37 57:38 58:38 59:39 60:40 61:40 62:41 63:42 64:43 65:44 66:45 67:46 68:47 69:48 70:49 71:50 72:51 73:52 74:52 75:53 76:54 77:54 78:55 79:56 80:57 81:57 82:57 83:58 84:59 85:60 86:61 87:62 88:63 89:64 90:65 91:66 92:66 93:67 94:68 95:69 96:69 97:69 98:70 99:71 100:72 101:72 102:73 103:74 104:75 105:76 106:76 107:76 108:77 109:78 110:79 111:80 112:80 113:80 114:81 115:82 116:83 117:84 118:85 119:85 120:86 121:87 122:88 123:89 124:90 125:91 126:92 127:92 128:92 129:92 130:93 131:94 132:95 133:95 134:96 135:96 136:96 137:97 138:98 139:99 140:100 141:101 142:102 143:103 144:104 145:104 146:105 147:106 148:107 149:108 150:108 151:108 152:109 153:110 154:111 155:111\\n\",\n      \"INFO:tensorflow:token_is_max_context: 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 
66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2054 2003 1996 4587 3252 2012 10289 8214 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 59\\n\",\n      \"INFO:tensorflow:end_position: 60\\n\",\n      \"INFO:tensorflow:answer: old college\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000013\\n\",\n      \"INFO:tensorflow:example_index: 
13\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] what individuals live at fatima house at notre dame ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 12:0 13:1 14:2 15:3 16:4 17:5 18:6 19:7 20:8 21:9 22:10 23:11 24:12 25:12 26:13 27:14 28:15 29:16 30:16 31:17 32:18 33:19 34:20 35:20 36:20 37:21 38:22 39:23 40:23 41:24 42:24 43:25 44:25 45:26 46:27 47:28 48:29 49:30 50:31 51:32 52:32 53:33 54:34 55:35 56:36 57:37 58:38 59:38 60:39 61:40 62:40 63:41 64:42 65:43 66:44 67:45 68:46 69:47 70:48 71:49 72:50 73:51 74:52 75:52 76:53 77:54 78:54 79:55 80:56 81:57 82:57 83:57 84:58 85:59 86:60 87:61 88:62 89:63 90:64 91:65 92:66 93:66 94:67 95:68 96:69 97:69 98:69 99:70 100:71 101:72 102:72 103:73 104:74 105:75 106:76 107:76 108:76 109:77 110:78 111:79 112:80 113:80 114:80 115:81 116:82 117:83 118:84 119:85 120:85 121:86 122:87 123:88 124:89 125:90 126:91 127:92 128:92 129:92 130:92 131:93 132:94 133:95 134:95 135:96 136:96 137:96 138:97 139:98 140:99 141:100 142:101 143:102 144:103 145:104 146:104 147:105 148:106 149:107 150:108 151:108 152:108 153:109 154:110 155:111 156:111\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": 
\"stream\",\n     \"text\": [\n      \"INFO:tensorflow:token_is_max_context: 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2054 3633 2444 2012 27596 2160 2012 10289 8214 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 
1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 84\\n\",\n      \"INFO:tensorflow:end_position: 87\\n\",\n      \"INFO:tensorflow:answer: retired priests and brothers\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000014\\n\",\n      \"INFO:tensorflow:example_index: 14\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] which prize did frederick bu ##ech ##ner create ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . 
[SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 11:0 12:1 13:2 14:3 15:4 16:5 17:6 18:7 19:8 20:9 21:10 22:11 23:12 24:12 25:13 26:14 27:15 28:16 29:16 30:17 31:18 32:19 33:20 34:20 35:20 36:21 37:22 38:23 39:23 40:24 41:24 42:25 43:25 44:26 45:27 46:28 47:29 48:30 49:31 50:32 51:32 52:33 53:34 54:35 55:36 56:37 57:38 58:38 59:39 60:40 61:40 62:41 63:42 64:43 65:44 66:45 67:46 68:47 69:48 70:49 71:50 72:51 73:52 74:52 75:53 76:54 77:54 78:55 79:56 80:57 81:57 82:57 83:58 84:59 85:60 86:61 87:62 88:63 89:64 90:65 91:66 92:66 93:67 94:68 95:69 96:69 97:69 98:70 99:71 100:72 101:72 102:73 103:74 104:75 105:76 106:76 107:76 108:77 109:78 110:79 111:80 112:80 113:80 114:81 115:82 116:83 117:84 118:85 119:85 120:86 121:87 122:88 123:89 124:90 125:91 126:92 127:92 128:92 129:92 130:93 131:94 132:95 133:95 134:96 135:96 136:96 137:97 138:98 139:99 140:100 141:101 142:102 143:103 144:104 145:104 146:105 147:106 148:107 149:108 150:108 151:108 152:109 153:110 154:111 155:111\\n\",\n      \"INFO:tensorflow:token_is_max_context: 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 
127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2029 3396 2106 5406 20934 15937 3678 3443 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 149\\n\",\n      \"INFO:tensorflow:end_position: 154\\n\",\n      \"INFO:tensorflow:answer: bu ##ech ##ner prize for preaching\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000015\\n\",\n      \"INFO:tensorflow:example_index: 15\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] how many bs level degrees are offered in the college of engineering at notre dame ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . 
today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 18:0 19:1 20:2 21:3 22:4 23:5 24:6 25:7 26:7 27:8 28:8 29:9 30:10 31:11 32:12 33:13 34:14 35:15 36:16 37:17 38:18 39:19 40:20 41:21 42:22 43:23 44:24 45:25 46:26 47:26 48:27 49:28 50:29 51:29 52:30 53:31 54:32 55:33 56:33 57:34 58:34 59:34 60:35 61:36 62:36 63:36 64:36 65:36 66:36 67:36 68:37 69:38 70:39 71:39 72:40 73:41 74:42 75:43 76:44 77:45 78:46 79:47 80:48 81:49 82:49 83:50 84:51 85:52 86:52 87:52 88:52 89:53 90:53 91:54 92:55 93:56 94:57 95:58 96:58 97:59 98:60 99:61 100:62 101:62 102:63 103:64 104:65 105:66 106:67 107:68 108:69 109:69 110:69 111:69 112:70 113:71 114:71 115:72 116:72 117:73 118:74 119:75 120:76 121:76 122:76 123:77 124:78 125:79 126:80 127:81 128:82 129:83 130:84 131:85 132:86 133:87 134:88 135:89 136:90 137:91 138:92 139:92 140:92 141:92 142:93 143:94 144:95 145:96 146:97 147:98 148:98 149:98 150:99 151:99 152:100 153:100\\n\",\n      \"INFO:tensorflow:token_is_max_context: 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 
71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2129 2116 18667 2504 5445 2024 3253 1999 1996 2267 1997 3330 2012 10289 8214 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 107\\n\",\n      \"INFO:tensorflow:end_position: 107\\n\",\n      \"INFO:tensorflow:answer: eight\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000016\\n\",\n      \"INFO:tensorflow:example_index: 16\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      
\"INFO:tensorflow:tokens: [CLS] in what year was the college of engineering at notre dame formed ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:token_to_orig_map: 15:0 16:1 17:2 18:3 19:4 20:5 21:6 22:7 23:7 24:8 25:8 26:9 27:10 28:11 29:12 30:13 31:14 32:15 33:16 34:17 35:18 36:19 37:20 38:21 39:22 40:23 41:24 42:25 43:26 44:26 45:27 46:28 47:29 48:29 49:30 50:31 51:32 52:33 53:33 54:34 55:34 56:34 57:35 58:36 59:36 60:36 61:36 62:36 63:36 64:36 65:37 66:38 67:39 68:39 69:40 70:41 71:42 72:43 73:44 74:45 75:46 76:47 77:48 78:49 79:49 80:50 81:51 82:52 83:52 84:52 85:52 86:53 87:53 88:54 89:55 90:56 91:57 92:58 93:58 94:59 95:60 96:61 97:62 98:62 99:63 100:64 101:65 102:66 103:67 104:68 105:69 106:69 107:69 108:69 109:70 110:71 111:71 112:72 113:72 114:73 115:74 116:75 117:76 118:76 119:76 120:77 121:78 122:79 123:80 124:81 125:82 126:83 127:84 128:85 129:86 130:87 131:88 132:89 133:90 134:91 135:92 136:92 137:92 138:92 139:93 140:94 141:95 142:96 143:97 144:98 145:98 146:98 147:99 148:99 149:100 150:100\\n\",\n      \"INFO:tensorflow:token_is_max_context: 15:True 16:True 17:True 18:True 19:True 20:True 
21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 1999 2054 2095 2001 1996 2267 1997 3330 2012 10289 8214 2719 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      
\"INFO:tensorflow:start_position: 22\\n\",\n      \"INFO:tensorflow:end_position: 22\\n\",\n      \"INFO:tensorflow:answer: 1920\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000017\\n\",\n      \"INFO:tensorflow:example_index: 17\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] before the creation of the college of engineering similar studies were carried out at which notre dame college ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . 
[SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 21:0 22:1 23:2 24:3 25:4 26:5 27:6 28:7 29:7 30:8 31:8 32:9 33:10 34:11 35:12 36:13 37:14 38:15 39:16 40:17 41:18 42:19 43:20 44:21 45:22 46:23 47:24 48:25 49:26 50:26 51:27 52:28 53:29 54:29 55:30 56:31 57:32 58:33 59:33 60:34 61:34 62:34 63:35 64:36 65:36 66:36 67:36 68:36 69:36 70:36 71:37 72:38 73:39 74:39 75:40 76:41 77:42 78:43 79:44 80:45 81:46 82:47 83:48 84:49 85:49 86:50 87:51 88:52 89:52 90:52 91:52 92:53 93:53 94:54 95:55 96:56 97:57 98:58 99:58 100:59 101:60 102:61 103:62 104:62 105:63 106:64 107:65 108:66 109:67 110:68 111:69 112:69 113:69 114:69 115:70 116:71 117:71 118:72 119:72 120:73 121:74 122:75 123:76 124:76 125:76 126:77 127:78 128:79 129:80 130:81 131:82 132:83 133:84 134:85 135:86 136:87 137:88 138:89 139:90 140:91 141:92 142:92 143:92 144:92 145:93 146:94 147:95 148:96 149:97 150:98 151:98 152:98 153:99 154:99 155:100 156:100\\n\",\n      \"INFO:tensorflow:token_is_max_context: 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 
144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2077 1996 4325 1997 1996 2267 1997 3330 2714 2913 2020 3344 2041 2012 2029 10289 8214 2267 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 43\\n\",\n      \"INFO:tensorflow:end_position: 46\\n\",\n      \"INFO:tensorflow:answer: the college of science\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000018\\n\",\n      \"INFO:tensorflow:example_index: 18\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] how many departments are within the st ##ins ##on - re ##mic ##k hall of engineering ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . 
today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\\n\",\n      \"INFO:tensorflow:token_to_orig_map: 19:0 20:1 21:2 22:3 23:4 24:5 25:6 26:7 27:7 28:8 29:8 30:9 31:10 32:11 33:12 34:13 35:14 36:15 37:16 38:17 39:18 40:19 41:20 42:21 43:22 44:23 45:24 46:25 47:26 48:26 49:27 50:28 51:29 52:29 53:30 54:31 55:32 56:33 57:33 58:34 59:34 60:34 61:35 62:36 63:36 64:36 65:36 66:36 67:36 68:36 69:37 70:38 71:39 72:39 73:40 74:41 75:42 76:43 77:44 78:45 79:46 80:47 81:48 82:49 83:49 84:50 85:51 86:52 87:52 88:52 89:52 90:53 91:53 92:54 93:55 94:56 95:57 96:58 97:58 98:59 99:60 100:61 101:62 102:62 103:63 104:64 105:65 106:66 107:67 108:68 109:69 110:69 111:69 112:69 113:70 114:71 115:71 116:72 117:72 118:73 119:74 120:75 121:76 122:76 123:76 124:77 125:78 126:79 127:80 128:81 129:82 130:83 131:84 132:85 133:86 134:87 135:88 136:89 137:90 138:91 139:92 140:92 141:92 142:92 143:93 144:94 145:95 146:96 147:97 148:98 149:98 150:98 151:99 152:99 153:100 154:100\\n\",\n      \"INFO:tensorflow:token_is_max_context: 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 
71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 2129 2116 7640 2024 2306 1996 2358 7076 2239 1011 2128 7712 2243 2534 1997 3330 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 74\\n\",\n      \"INFO:tensorflow:end_position: 74\\n\",\n      \"INFO:tensorflow:answer: five\\n\",\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 1000000019\\n\",\n      \"INFO:tensorflow:example_index: 19\\n\",\n      \"INFO:tensorflow:doc_span_index: 0\\n\",\n      
\"INFO:tensorflow:tokens: [CLS] the college of science began to offer civil engineering courses beginning at what time at notre dame ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:token_to_orig_map: 20:0 21:1 22:2 23:3 24:4 25:5 26:6 27:7 28:7 29:8 30:8 31:9 32:10 33:11 34:12 35:13 36:14 37:15 38:16 39:17 40:18 41:19 42:20 43:21 44:22 45:23 46:24 47:25 48:26 49:26 50:27 51:28 52:29 53:29 54:30 55:31 56:32 57:33 58:33 59:34 60:34 61:34 62:35 63:36 64:36 65:36 66:36 67:36 68:36 69:36 70:37 71:38 72:39 73:39 74:40 75:41 76:42 77:43 78:44 79:45 80:46 81:47 82:48 83:49 84:49 85:50 86:51 87:52 88:52 89:52 90:52 91:53 92:53 93:54 94:55 95:56 96:57 97:58 98:58 99:59 100:60 101:61 102:62 103:62 104:63 105:64 106:65 107:66 108:67 109:68 110:69 111:69 112:69 113:69 114:70 115:71 116:71 117:72 118:72 119:73 120:74 121:75 122:76 123:76 124:76 125:77 126:78 127:79 128:80 129:81 130:82 131:83 132:84 133:85 134:86 135:87 136:88 137:89 138:90 139:91 140:92 141:92 142:92 143:92 144:93 145:94 146:95 147:96 148:97 149:98 150:98 151:98 152:99 153:99 154:100 155:100\\n\",\n      \"INFO:tensorflow:token_is_max_context: 20:True 
21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True\\n\",\n      \"INFO:tensorflow:input_ids: 101 1996 2267 1997 2671 2211 2000 3749 2942 3330 5352 2927 2012 2054 2051 2012 10289 8214 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:start_position: 47\\n\",\n      \"INFO:tensorflow:end_position: 48\\n\",\n      \"INFO:tensorflow:answer: the 1870s\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"bert_config = modeling_tensorflow.BertConfig.from_json_file(bert_config_file)\\n\",\n    \"tokenizer = tokenization.BertTokenizer(\\n\",\n    \"    vocab_file=vocab_file, do_lower_case=True)\\n\",\n    \"\\n\",\n    \"eval_examples = read_squad_examples(\\n\",\n    \"    input_file=input_file, is_training=True, max_num=16)\\n\",\n    \"\\n\",\n    \"eval_features = convert_examples_to_features(\\n\",\n    \"    examples=eval_examples,\\n\",\n    \"    tokenizer=tokenizer,\\n\",\n    \"    max_seq_length=max_seq_length,\\n\",\n    \"    doc_stride=doc_stride,\\n\",\n    \"    max_query_length=max_query_length,\\n\",\n    \"    is_training=True)\\n\",\n    \"\\n\",\n    \"# You can use that to test the behavior of the models when targets are outside of the model input sequence\\n\",\n    \"# for feature in eval_features:\\n\",\n    \"#     feature.start_position = outside_pos\\n\",\n    \"#     feature.end_position = outside_pos\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:37.525632Z\",\n     \"start_time\": \"2018-11-06T10:11:37.498695Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"eval_unique_id_to_feature = {}\\n\",\n    \"for eval_feature in eval_features:\\n\",\n    \"    eval_unique_id_to_feature[eval_feature.unique_id] = eval_feature\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:37.558325Z\",\n     \"start_time\": \"2018-11-06T10:11:37.527972Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"def input_fn_builder(features, seq_length, drop_remainder):\\n\",\n    \"    
\\\"\\\"\\\"Creates an `input_fn` closure to be passed to TPUEstimator.\\\"\\\"\\\"\\n\",\n    \"\\n\",\n    \"    all_unique_ids = []\\n\",\n    \"    all_input_ids = []\\n\",\n    \"    all_input_mask = []\\n\",\n    \"    all_segment_ids = []\\n\",\n    \"    all_start_positions = []\\n\",\n    \"    all_end_positions = []\\n\",\n    \"\\n\",\n    \"    for feature in features:\\n\",\n    \"        all_unique_ids.append(feature.unique_id)\\n\",\n    \"        all_input_ids.append(feature.input_ids)\\n\",\n    \"        all_input_mask.append(feature.input_mask)\\n\",\n    \"        all_segment_ids.append(feature.segment_ids)\\n\",\n    \"        all_start_positions.append(feature.start_position)\\n\",\n    \"        all_end_positions.append(feature.end_position)\\n\",\n    \"\\n\",\n    \"    def input_fn(params):\\n\",\n    \"        \\\"\\\"\\\"The actual input function.\\\"\\\"\\\"\\n\",\n    \"        batch_size = params[\\\"batch_size\\\"]\\n\",\n    \"\\n\",\n    \"        num_examples = len(features)\\n\",\n    \"\\n\",\n    \"        # This is for demo purposes and does NOT scale to large data sets. We do\\n\",\n    \"        # not use Dataset.from_generator() because that uses tf.py_func which is\\n\",\n    \"        # not TPU compatible. 
The right way to load data is with TFRecordReader.\\n\",\n    \"        feature_map = {\\n\",\n    \"            \\\"unique_ids\\\":\\n\",\n    \"                tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),\\n\",\n    \"            \\\"input_ids\\\":\\n\",\n    \"                tf.constant(\\n\",\n    \"                    all_input_ids, shape=[num_examples, seq_length],\\n\",\n    \"                    dtype=tf.int32),\\n\",\n    \"            \\\"input_mask\\\":\\n\",\n    \"                tf.constant(\\n\",\n    \"                    all_input_mask,\\n\",\n    \"                    shape=[num_examples, seq_length],\\n\",\n    \"                    dtype=tf.int32),\\n\",\n    \"            \\\"segment_ids\\\":\\n\",\n    \"                tf.constant(\\n\",\n    \"                    all_segment_ids,\\n\",\n    \"                    shape=[num_examples, seq_length],\\n\",\n    \"                    dtype=tf.int32),\\n\",\n    \"            \\\"start_positions\\\":\\n\",\n    \"                tf.constant(\\n\",\n    \"                    all_start_positions,\\n\",\n    \"                    shape=[num_examples],\\n\",\n    \"                    dtype=tf.int32),\\n\",\n    \"            \\\"end_positions\\\":\\n\",\n    \"                tf.constant(\\n\",\n    \"                    all_end_positions,\\n\",\n    \"                    shape=[num_examples],\\n\",\n    \"                    dtype=tf.int32),\\n\",\n    \"        }\\n\",\n    \"\\n\",\n    \"        d = tf.data.Dataset.from_tensor_slices(feature_map)\\n\",\n    \"        d = d.repeat()\\n\",\n    \"        d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)\\n\",\n    \"        return d\\n\",\n    \"\\n\",\n    \"    return input_fn\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:37.601666Z\",\n     \"start_time\": \"2018-11-06T10:11:37.560082Z\"\n    }\n  
 },\n   \"outputs\": [],\n   \"source\": [\n    \"def model_fn_builder(bert_config, init_checkpoint, learning_rate,\\n\",\n    \"                     num_train_steps, num_warmup_steps, use_tpu,\\n\",\n    \"                     use_one_hot_embeddings):\\n\",\n    \"    \\\"\\\"\\\"Returns `model_fn` closure for TPUEstimator.\\\"\\\"\\\"\\n\",\n    \"\\n\",\n    \"    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\\n\",\n    \"        \\\"\\\"\\\"The `model_fn` for TPUEstimator.\\\"\\\"\\\"\\n\",\n    \"\\n\",\n    \"        tf.logging.info(\\\"*** Features ***\\\")\\n\",\n    \"        for name in sorted(features.keys()):\\n\",\n    \"            tf.logging.info(\\\"  name = %s, shape = %s\\\" % (name, features[name].shape))\\n\",\n    \"\\n\",\n    \"        unique_ids = features[\\\"unique_ids\\\"]\\n\",\n    \"        input_ids = features[\\\"input_ids\\\"]\\n\",\n    \"        input_mask = features[\\\"input_mask\\\"]\\n\",\n    \"        segment_ids = features[\\\"segment_ids\\\"]\\n\",\n    \"\\n\",\n    \"        is_training = (mode == tf.estimator.ModeKeys.TRAIN)\\n\",\n    \"\\n\",\n    \"        (start_logits, end_logits) = create_model(\\n\",\n    \"            bert_config=bert_config,\\n\",\n    \"            is_training=is_training,\\n\",\n    \"            input_ids=input_ids,\\n\",\n    \"            input_mask=input_mask,\\n\",\n    \"            segment_ids=segment_ids,\\n\",\n    \"            use_one_hot_embeddings=use_one_hot_embeddings)\\n\",\n    \"\\n\",\n    \"        tvars = tf.trainable_variables()\\n\",\n    \"\\n\",\n    \"        initialized_variable_names = {}\\n\",\n    \"        scaffold_fn = None\\n\",\n    \"        if init_checkpoint:\\n\",\n    \"            (assignment_map,\\n\",\n    \"             initialized_variable_names) = modeling_tensorflow.get_assigment_map_from_checkpoint(\\n\",\n    \"                tvars, init_checkpoint)\\n\",\n    \"            if use_tpu:\\n\",\n    \"\\n\",\n    
\"                def tpu_scaffold():\\n\",\n    \"                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\\n\",\n    \"                    return tf.train.Scaffold()\\n\",\n    \"\\n\",\n    \"                scaffold_fn = tpu_scaffold\\n\",\n    \"            else:\\n\",\n    \"                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\\n\",\n    \"\\n\",\n    \"        tf.logging.info(\\\"**** Trainable Variables ****\\\")\\n\",\n    \"        for var in tvars:\\n\",\n    \"            init_string = \\\"\\\"\\n\",\n    \"            if var.name in initialized_variable_names:\\n\",\n    \"                init_string = \\\", *INIT_FROM_CKPT*\\\"\\n\",\n    \"            tf.logging.info(\\\"  name = %s, shape = %s%s\\\", var.name, var.shape,\\n\",\n    \"                            init_string)\\n\",\n    \"\\n\",\n    \"        output_spec = None\\n\",\n    \"        if mode == tf.estimator.ModeKeys.TRAIN:\\n\",\n    \"            seq_length = modeling_tensorflow.get_shape_list(input_ids)[1]\\n\",\n    \"\\n\",\n    \"            def compute_loss(logits, positions):\\n\",\n    \"                one_hot_positions = tf.one_hot(\\n\",\n    \"                    positions, depth=seq_length, dtype=tf.float32)\\n\",\n    \"                log_probs = tf.nn.log_softmax(logits, axis=-1)\\n\",\n    \"                loss = -tf.reduce_mean(\\n\",\n    \"                    tf.reduce_sum(one_hot_positions * log_probs, axis=-1))\\n\",\n    \"                return loss\\n\",\n    \"\\n\",\n    \"            start_positions = features[\\\"start_positions\\\"]\\n\",\n    \"            end_positions = features[\\\"end_positions\\\"]\\n\",\n    \"\\n\",\n    \"            start_loss = compute_loss(start_logits, start_positions)\\n\",\n    \"            end_loss = compute_loss(end_logits, end_positions)\\n\",\n    \"\\n\",\n    \"            total_loss = (start_loss + end_loss) / 2.0\\n\",\n    \"\\n\",\n    \"            train_op = 
optimization.create_optimizer(\\n\",\n    \"                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)\\n\",\n    \"\\n\",\n    \"            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\\n\",\n    \"                mode=mode,\\n\",\n    \"                loss=total_loss,\\n\",\n    \"                train_op=train_op,\\n\",\n    \"                scaffold_fn=scaffold_fn)\\n\",\n    \"        elif mode == tf.estimator.ModeKeys.PREDICT:\\n\",\n    \"            batch_size = modeling_tensorflow.get_shape_list(start_logits)[0]\\n\",\n    \"            seq_length = modeling_tensorflow.get_shape_list(input_ids)[1]\\n\",\n    \"\\n\",\n    \"            def compute_loss(logits, positions):\\n\",\n    \"                one_hot_positions = tf.one_hot(\\n\",\n    \"                    positions, depth=seq_length, dtype=tf.float32)\\n\",\n    \"                log_probs = tf.nn.log_softmax(logits, axis=-1)\\n\",\n    \"                loss = -tf.reduce_mean(\\n\",\n    \"                    tf.reduce_sum(one_hot_positions * log_probs, axis=-1))\\n\",\n    \"                return loss\\n\",\n    \"\\n\",\n    \"            start_positions = features[\\\"start_positions\\\"]\\n\",\n    \"            end_positions = features[\\\"end_positions\\\"]\\n\",\n    \"\\n\",\n    \"            start_loss = compute_loss(start_logits, start_positions)\\n\",\n    \"            end_loss = compute_loss(end_logits, end_positions)\\n\",\n    \"\\n\",\n    \"            total_loss = (start_loss + end_loss) / 2.0\\n\",\n    \"\\n\",\n    \"            predictions = {\\n\",\n    \"                \\\"unique_ids\\\": unique_ids,\\n\",\n    \"                \\\"start_logits\\\": start_logits,\\n\",\n    \"                \\\"end_logits\\\": end_logits,\\n\",\n    \"                \\\"total_loss\\\": tf.reshape(total_loss, [batch_size, 1]),\\n\",\n    \"                \\\"start_loss\\\": tf.reshape(start_loss, [batch_size, 1]),\\n\",\n    \"                
\\\"end_loss\\\": tf.reshape(end_loss, [batch_size, 1]),\\n\",\n    \"            }\\n\",\n    \"            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\\n\",\n    \"                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)\\n\",\n    \"        else:\\n\",\n    \"            raise ValueError(\\n\",\n    \"                \\\"Only TRAIN and PREDICT modes are supported: %s\\\" % (mode))\\n\",\n    \"\\n\",\n    \"        return output_spec\\n\",\n    \"\\n\",\n    \"    return model_fn\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:41.104542Z\",\n     \"start_time\": \"2018-11-06T10:11:37.603474Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x120df3f28>) includes params argument, but params are not passed to Estimator.\\n\",\n      \"INFO:tensorflow:Using config: {'_model_dir': '/tmp/squad_base/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\\n\",\n      \"graph_options {\\n\",\n      \"  rewrite_options {\\n\",\n      \"    meta_optimizer_iterations: ONE\\n\",\n      \"  }\\n\",\n      \"}\\n\",\n      \", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11fd09630>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': 
TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\\n\",\n      \"INFO:tensorflow:_TPUContext: eval_on_tpu True\\n\",\n      \"WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\\n\",\n    \"run_config = tf.contrib.tpu.RunConfig(\\n\",\n    \"    cluster=None,\\n\",\n    \"    master=None,\\n\",\n    \"    model_dir=output_dir,\\n\",\n    \"    save_checkpoints_steps=1000,\\n\",\n    \"    tpu_config=tf.contrib.tpu.TPUConfig(\\n\",\n    \"        iterations_per_loop=1000,\\n\",\n    \"        num_shards=8,\\n\",\n    \"        per_host_input_for_training=is_per_host))\\n\",\n    \"\\n\",\n    \"model_fn = model_fn_builder(\\n\",\n    \"    bert_config=bert_config,\\n\",\n    \"    init_checkpoint=init_checkpoint,\\n\",\n    \"    learning_rate=learning_rate,\\n\",\n    \"    num_train_steps=None,\\n\",\n    \"    num_warmup_steps=None,\\n\",\n    \"    use_tpu=False,\\n\",\n    \"    use_one_hot_embeddings=False)\\n\",\n    \"\\n\",\n    \"estimator = tf.contrib.tpu.TPUEstimator(\\n\",\n    \"    use_tpu=False,\\n\",\n    \"    model_fn=model_fn,\\n\",\n    \"    config=run_config,\\n\",\n    \"    train_batch_size=12,\\n\",\n    \"    predict_batch_size=1)\\n\",\n    \"\\n\",\n    \"predict_input_fn = input_fn_builder(\\n\",\n    \"    features=eval_features,\\n\",\n    \"    seq_length=max_seq_length,\\n\",\n    \"    drop_remainder=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:47.857601Z\",\n     \"start_time\": \"2018-11-06T10:11:41.106219Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      
\"INFO:tensorflow:Could not find trained model in model_dir: /tmp/squad_base/, running initialization to predict.\\n\",\n      \"INFO:tensorflow:Calling model_fn.\\n\",\n      \"INFO:tensorflow:Running infer on CPU\\n\",\n      \"INFO:tensorflow:*** Features ***\\n\",\n      \"INFO:tensorflow:  name = end_positions, shape = (1,)\\n\",\n      \"INFO:tensorflow:  name = input_ids, shape = (1, 384)\\n\",\n      \"INFO:tensorflow:  name = input_mask, shape = (1, 384)\\n\",\n      \"INFO:tensorflow:  name = segment_ids, shape = (1, 384)\\n\",\n      \"INFO:tensorflow:  name = start_positions, shape = (1,)\\n\",\n      \"INFO:tensorflow:  name = unique_ids, shape = (1,)\\n\",\n      \"INFO:tensorflow:**** Trainable Variables ****\\n\",\n      \"INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = 
bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      
\"INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\\n\",\n      \"INFO:tensorflow:  name = cls/squad/output_weights:0, shape = (2, 768)\\n\",\n      \"INFO:tensorflow:  name = cls/squad/output_bias:0, shape = (2,)\\n\",\n      
\"INFO:tensorflow:Done calling model_fn.\\n\",\n      \"INFO:tensorflow:Graph was finalized.\\n\",\n      \"INFO:tensorflow:Running local_init_op.\\n\",\n      \"INFO:tensorflow:Done running local_init_op.\\n\",\n      \"INFO:tensorflow:prediction_loop marked as finished\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"tensorflow_all_out = []\\n\",\n    \"tensorflow_all_results = []\\n\",\n    \"for result in estimator.predict(predict_input_fn, yield_single_examples=True):\\n\",\n    \"    unique_id = int(result[\\\"unique_ids\\\"])\\n\",\n    \"    eval_feature = eval_unique_id_to_feature[unique_id]\\n\",\n    \"    start_logits = result[\\\"start_logits\\\"]\\n\",\n    \"    end_logits = result[\\\"end_logits\\\"]\\n\",\n    \"    total_loss = result[\\\"total_loss\\\"]\\n\",\n    \"    start_loss = result[\\\"start_loss\\\"]\\n\",\n    \"    end_loss = result[\\\"end_loss\\\"]\\n\",\n    \"\\n\",\n    \"    output_json = collections.OrderedDict()\\n\",\n    \"    output_json[\\\"linex_index\\\"] = unique_id\\n\",\n    \"    output_json[\\\"tokens\\\"] = [token for (i, token) in enumerate(eval_feature.tokens)]\\n\",\n    \"    output_json[\\\"start_logits\\\"] = [round(float(x), 6) for x in start_logits.flat]\\n\",\n    \"    output_json[\\\"end_logits\\\"] = [round(float(x), 6) for x in end_logits.flat]\\n\",\n    \"    output_json[\\\"total_loss\\\"] = [round(float(x), 6) for x in total_loss.flat]\\n\",\n    \"    output_json[\\\"start_loss\\\"] = [round(float(x), 6) for x in start_loss.flat]\\n\",\n    \"    output_json[\\\"end_loss\\\"] = [round(float(x), 6) for x in end_loss.flat]\\n\",\n    \"    tensorflow_all_out.append(output_json)\\n\",\n    \"    tensorflow_all_results.append(RawResult(\\n\",\n    \"                                    unique_id=unique_id,\\n\",\n    \"                                    start_logits=start_logits,\\n\",\n    \"                                    end_logits=end_logits))\\n\",\n    \"    break\"\n   ]\n  },\n  {\n   
\"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:47.912836Z\",\n     \"start_time\": \"2018-11-06T10:11:47.859679Z\"\n    },\n    \"code_folding\": []\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"def _get_best_indexes(logits, n_best_size):\\n\",\n    \"    \\\"\\\"\\\"Get the n-best logits from a list.\\\"\\\"\\\"\\n\",\n    \"    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)\\n\",\n    \"\\n\",\n    \"    best_indexes = []\\n\",\n    \"    for i in range(len(index_and_score)):\\n\",\n    \"        if i >= n_best_size:\\n\",\n    \"            break\\n\",\n    \"        best_indexes.append(index_and_score[i][0])\\n\",\n    \"    return best_indexes\\n\",\n    \"\\n\",\n    \"def _compute_softmax(scores):\\n\",\n    \"    \\\"\\\"\\\"Compute softmax probability over raw logits.\\\"\\\"\\\"\\n\",\n    \"    if not scores:\\n\",\n    \"        return []\\n\",\n    \"\\n\",\n    \"    max_score = None\\n\",\n    \"    for score in scores:\\n\",\n    \"        if max_score is None or score > max_score:\\n\",\n    \"            max_score = score\\n\",\n    \"\\n\",\n    \"    exp_scores = []\\n\",\n    \"    total_sum = 0.0\\n\",\n    \"    for score in scores:\\n\",\n    \"        x = math.exp(score - max_score)\\n\",\n    \"        exp_scores.append(x)\\n\",\n    \"        total_sum += x\\n\",\n    \"\\n\",\n    \"    probs = []\\n\",\n    \"    for score in exp_scores:\\n\",\n    \"        probs.append(score / total_sum)\\n\",\n    \"    return probs\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def compute_predictions(all_examples, all_features, all_results, n_best_size,\\n\",\n    \"                      max_answer_length, do_lower_case):\\n\",\n    \"    \\\"\\\"\\\"Compute final predictions.\\\"\\\"\\\"\\n\",\n    \"    example_index_to_features = collections.defaultdict(list)\\n\",\n    \"    for feature in all_features:\\n\",\n    \"        
example_index_to_features[feature.example_index].append(feature)\\n\",\n    \"\\n\",\n    \"    unique_id_to_result = {}\\n\",\n    \"    for result in all_results:\\n\",\n    \"        unique_id_to_result[result.unique_id] = result\\n\",\n    \"\\n\",\n    \"    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name\\n\",\n    \"        \\\"PrelimPrediction\\\",\\n\",\n    \"        [\\\"feature_index\\\", \\\"start_index\\\", \\\"end_index\\\", \\\"start_logit\\\", \\\"end_logit\\\"])\\n\",\n    \"\\n\",\n    \"    all_predictions = collections.OrderedDict()\\n\",\n    \"    all_nbest_json = collections.OrderedDict()\\n\",\n    \"    for (example_index, example) in enumerate(all_examples):\\n\",\n    \"        features = example_index_to_features[example_index]\\n\",\n    \"\\n\",\n    \"        prelim_predictions = []\\n\",\n    \"        for (feature_index, feature) in enumerate(features):\\n\",\n    \"            result = unique_id_to_result[feature.unique_id]\\n\",\n    \"\\n\",\n    \"            start_indexes = _get_best_indexes(result.start_logits, n_best_size)\\n\",\n    \"            end_indexes = _get_best_indexes(result.end_logits, n_best_size)\\n\",\n    \"            for start_index in start_indexes:\\n\",\n    \"                for end_index in end_indexes:\\n\",\n    \"                    # We could hypothetically create invalid predictions, e.g., predict\\n\",\n    \"                    # that the start of the span is in the question. 
We throw out all\\n\",\n    \"                    # invalid predictions.\\n\",\n    \"                    if start_index >= len(feature.tokens):\\n\",\n    \"                        continue\\n\",\n    \"                    if end_index >= len(feature.tokens):\\n\",\n    \"                        continue\\n\",\n    \"                    if start_index not in feature.token_to_orig_map:\\n\",\n    \"                        continue\\n\",\n    \"                    if end_index not in feature.token_to_orig_map:\\n\",\n    \"                        continue\\n\",\n    \"                    if not feature.token_is_max_context.get(start_index, False):\\n\",\n    \"                        continue\\n\",\n    \"                    if end_index < start_index:\\n\",\n    \"                        continue\\n\",\n    \"                    length = end_index - start_index + 1\\n\",\n    \"                    if length > max_answer_length:\\n\",\n    \"                        continue\\n\",\n    \"                    prelim_predictions.append(\\n\",\n    \"                        _PrelimPrediction(\\n\",\n    \"                            feature_index=feature_index,\\n\",\n    \"                            start_index=start_index,\\n\",\n    \"                            end_index=end_index,\\n\",\n    \"                            start_logit=result.start_logits[start_index],\\n\",\n    \"                            end_logit=result.end_logits[end_index]))\\n\",\n    \"\\n\",\n    \"        prelim_predictions = sorted(\\n\",\n    \"            prelim_predictions,\\n\",\n    \"            key=lambda x: (x.start_logit + x.end_logit),\\n\",\n    \"            reverse=True)\\n\",\n    \"\\n\",\n    \"        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name\\n\",\n    \"            \\\"NbestPrediction\\\", [\\\"text\\\", \\\"start_logit\\\", \\\"end_logit\\\"])\\n\",\n    \"\\n\",\n    \"        seen_predictions = {}\\n\",\n    \"        nbest = 
[]\\n\",\n    \"        for pred in prelim_predictions:\\n\",\n    \"            if len(nbest) >= n_best_size:\\n\",\n    \"                break\\n\",\n    \"            feature = features[pred.feature_index]\\n\",\n    \"\\n\",\n    \"            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]\\n\",\n    \"            orig_doc_start = feature.token_to_orig_map[pred.start_index]\\n\",\n    \"            orig_doc_end = feature.token_to_orig_map[pred.end_index]\\n\",\n    \"            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]\\n\",\n    \"            tok_text = \\\" \\\".join(tok_tokens)\\n\",\n    \"\\n\",\n    \"            # De-tokenize WordPieces that have been split off.\\n\",\n    \"            tok_text = tok_text.replace(\\\" ##\\\", \\\"\\\")\\n\",\n    \"            tok_text = tok_text.replace(\\\"##\\\", \\\"\\\")\\n\",\n    \"\\n\",\n    \"            # Clean whitespace\\n\",\n    \"            tok_text = tok_text.strip()\\n\",\n    \"            tok_text = \\\" \\\".join(tok_text.split())\\n\",\n    \"            orig_text = \\\" \\\".join(orig_tokens)\\n\",\n    \"\\n\",\n    \"            final_text = get_final_text(tok_text, orig_text, do_lower_case)\\n\",\n    \"            if final_text in seen_predictions:\\n\",\n    \"                continue\\n\",\n    \"\\n\",\n    \"            seen_predictions[final_text] = True\\n\",\n    \"            nbest.append(\\n\",\n    \"                _NbestPrediction(\\n\",\n    \"                    text=final_text,\\n\",\n    \"                    start_logit=pred.start_logit,\\n\",\n    \"                    end_logit=pred.end_logit))\\n\",\n    \"\\n\",\n    \"        # In very rare edge cases we could have no valid predictions. 
So we\\n\",\n    \"        # just create a nonce prediction in this case to avoid failure.\\n\",\n    \"        if not nbest:\\n\",\n    \"            nbest.append(\\n\",\n    \"                _NbestPrediction(text=\\\"empty\\\", start_logit=0.0, end_logit=0.0))\\n\",\n    \"\\n\",\n    \"        assert len(nbest) >= 1\\n\",\n    \"\\n\",\n    \"        total_scores = []\\n\",\n    \"        for entry in nbest:\\n\",\n    \"            total_scores.append(entry.start_logit + entry.end_logit)\\n\",\n    \"\\n\",\n    \"        probs = _compute_softmax(total_scores)\\n\",\n    \"\\n\",\n    \"        nbest_json = []\\n\",\n    \"        for (i, entry) in enumerate(nbest):\\n\",\n    \"            output = collections.OrderedDict()\\n\",\n    \"            output[\\\"text\\\"] = entry.text\\n\",\n    \"            output[\\\"probability\\\"] = probs[i]\\n\",\n    \"            output[\\\"start_logit\\\"] = entry.start_logit\\n\",\n    \"            output[\\\"end_logit\\\"] = entry.end_logit\\n\",\n    \"            nbest_json.append(output)\\n\",\n    \"\\n\",\n    \"        assert len(nbest_json) >= 1\\n\",\n    \"\\n\",\n    \"        all_predictions[example.qas_id] = nbest_json[0][\\\"text\\\"]\\n\",\n    \"        all_nbest_json[example.qas_id] = nbest_json\\n\",\n    \"\\n\",\n    \"    return all_predictions, all_nbest_json\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:47.953205Z\",\n     \"start_time\": \"2018-11-06T10:11:47.914751Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"all_predictions, all_nbest_json = compute_predictions(eval_examples[:1], eval_features[:1], tensorflow_all_results, 20, max_answer_length, True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:47.994647Z\",\n     \"start_time\": 
\"2018-11-06T10:11:47.955015Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"OrderedDict([('5733be284776f41900661182',\\n\",\n       \"              [OrderedDict([('text', 'empty'),\\n\",\n       \"                            ('probability', 1.0),\\n\",\n       \"                            ('start_logit', 0.0),\\n\",\n       \"                            ('end_logit', 0.0)])])])\"\n      ]\n     },\n     \"execution_count\": 12,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"all_nbest_json\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:48.028473Z\",\n     \"start_time\": \"2018-11-06T10:11:47.996311Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"1\\n\",\n      \"7\\n\",\n      \"odict_keys(['linex_index', 'tokens', 'start_logits', 'end_logits', 'total_loss', 'start_loss', 'end_loss'])\\n\",\n      \"number of tokens 176\\n\",\n      \"number of start_logits 384\\n\",\n      \"shape of end_logits 384\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(len(tensorflow_all_out))\\n\",\n    \"print(len(tensorflow_all_out[0]))\\n\",\n    \"print(tensorflow_all_out[0].keys())\\n\",\n    \"print(\\\"number of tokens\\\", len(tensorflow_all_out[0]['tokens']))\\n\",\n    \"print(\\\"number of start_logits\\\", len(tensorflow_all_out[0]['start_logits']))\\n\",\n    \"print(\\\"shape of end_logits\\\", len(tensorflow_all_out[0]['end_logits']))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:48.060658Z\",\n     \"start_time\": \"2018-11-06T10:11:48.030289Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"tensorflow_outputs = 
[tensorflow_all_out[0]['start_logits'], tensorflow_all_out[0]['end_logits'],\\n\",\n    \"                     tensorflow_all_out[0]['total_loss'], tensorflow_all_out[0]['start_loss'],\\n\",\n    \"                     tensorflow_all_out[0]['end_loss']]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2/ PyTorch code\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:48.478814Z\",\n     \"start_time\": \"2018-11-06T10:11:48.062585Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import modeling\\n\",\n    \"from run_squad import *\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:48.512607Z\",\n     \"start_time\": \"2018-11-06T10:11:48.480729Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"init_checkpoint_pt = \\\"../google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:51.023405Z\",\n     \"start_time\": \"2018-11-06T10:11:48.514306Z\"\n    },\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"tensor([0., 0.])\"\n      ]\n     },\n     \"execution_count\": 17,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"device = torch.device(\\\"cpu\\\")\\n\",\n    \"model = modeling.BertForQuestionAnswering(bert_config)\\n\",\n    \"model.bert.load_state_dict(torch.load(init_checkpoint_pt, map_location='cpu'))\\n\",\n    \"model.to(device)\\n\",\n    \"model.qa_outputs.weight.data.fill_(1.0)\\n\",\n    \"model.qa_outputs.bias.data.zero_()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   
\"execution_count\": 18,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:51.079364Z\",\n     \"start_time\": \"2018-11-06T10:11:51.028228Z\"\n    },\n    \"code_folding\": []\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\\n\",\n    \"all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\\n\",\n    \"all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\\n\",\n    \"all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\\n\",\n    \"all_start_positions = torch.tensor([[f.start_position] for f in eval_features], dtype=torch.long)\\n\",\n    \"all_end_positions = torch.tensor([[f.end_position] for f in eval_features], dtype=torch.long)\\n\",\n    \"\\n\",\n    \"eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,\\n\",\n    \"                                   all_start_positions, all_end_positions, all_example_index)\\n\",\n    \"eval_sampler = SequentialSampler(eval_data)\\n\",\n    \"eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\\n\",\n    \"\\n\",\n    \"model.eval()\\n\",\n    \"None\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:51.114686Z\",\n     \"start_time\": \"2018-11-06T10:11:51.081474Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"[torch.Size([1, 384]), torch.Size([1, 384]), torch.Size([1, 384]), torch.Size([1, 1]), torch.Size([1, 1]), torch.Size([1])]\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"torch.Size([1, 1])\"\n      ]\n     },\n     \"execution_count\": 19,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n 
   \"batch = iter(eval_dataloader).next()\\n\",\n    \"input_ids, input_mask, segment_ids, start_positions, end_positions, example_index = batch\\n\",\n    \"print([t.shape for t in batch])\\n\",\n    \"start_positions.size()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 20,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:52.298367Z\",\n     \"start_time\": \"2018-11-06T10:11:51.116219Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Evaluating:   0%|          | 0/270 [00:00<?, ?it/s]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"pytorch_all_out = []\\n\",\n    \"for batch in tqdm(eval_dataloader, desc=\\\"Evaluating\\\"):\\n\",\n    \"    input_ids, input_mask, segment_ids, start_positions, end_positions, example_index = batch\\n\",\n    \"    input_ids = input_ids.to(device)\\n\",\n    \"    input_mask = input_mask.to(device)\\n\",\n    \"    segment_ids = segment_ids.to(device)\\n\",\n    \"    start_positions = start_positions.to(device)\\n\",\n    \"    end_positions = end_positions.to(device)\\n\",\n    \"\\n\",\n    \"    total_loss, (start_logits, end_logits) = model(input_ids, segment_ids, input_mask, start_positions, end_positions)\\n\",\n    \"    \\n\",\n    \"    eval_feature = eval_features[example_index.item()]\\n\",\n    \"\\n\",\n    \"    output_json = collections.OrderedDict()\\n\",\n    \"    output_json[\\\"linex_index\\\"] = unique_id\\n\",\n    \"    output_json[\\\"tokens\\\"] = [token for (i, token) in enumerate(eval_feature.tokens)]\\n\",\n    \"    output_json[\\\"total_loss\\\"] = total_loss.detach().cpu().numpy()\\n\",\n    \"    output_json[\\\"start_logits\\\"] = start_logits.detach().cpu().numpy()\\n\",\n    \"    output_json[\\\"end_logits\\\"] = end_logits.detach().cpu().numpy()\\n\",\n    \"    pytorch_all_out.append(output_json)\\n\",\n    \"    break\"\n   ]\n  },\n  {\n   
\"cell_type\": \"code\",\n   \"execution_count\": 21,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:52.339553Z\",\n     \"start_time\": \"2018-11-06T10:11:52.300335Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"1\\n\",\n      \"5\\n\",\n      \"odict_keys(['linex_index', 'tokens', 'total_loss', 'start_logits', 'end_logits'])\\n\",\n      \"number of tokens 176\\n\",\n      \"number of start_logits 1\\n\",\n      \"number of end_logits 1\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(len(pytorch_all_out))\\n\",\n    \"print(len(pytorch_all_out[0]))\\n\",\n    \"print(pytorch_all_out[0].keys())\\n\",\n    \"print(\\\"number of tokens\\\", len(pytorch_all_out[0]['tokens']))\\n\",\n    \"print(\\\"number of start_logits\\\", len(pytorch_all_out[0]['start_logits']))\\n\",\n    \"print(\\\"number of end_logits\\\", len(pytorch_all_out[0]['end_logits']))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 22,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:52.372827Z\",\n     \"start_time\": \"2018-11-06T10:11:52.341393Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"pytorch_outputs = [pytorch_all_out[0]['start_logits'], pytorch_all_out[0]['end_logits'], pytorch_all_out[0]['total_loss']]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3/ Comparing the standard deviation of start_logits, end_logits and loss of both models\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 23,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:52.402814Z\",\n     \"start_time\": \"2018-11-06T10:11:52.374329Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import numpy as np\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 24,\n   \"metadata\": 
{\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:11:52.434743Z\",\n     \"start_time\": \"2018-11-06T10:11:52.404345Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"shape tensorflow layer, shape pytorch layer, standard deviation\\n\",\n      \"((384,), (1, 384), 5.244962470555037e-06)\\n\",\n      \"((384,), (1, 384), 5.244962470555037e-06)\\n\",\n      \"((1,), (), 4.560241698925438e-06)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print('shape tensorflow layer, shape pytorch layer, standard deviation')\\n\",\n    \"print('\\\\n'.join(list(str((np.array(tensorflow_outputs[i]).shape,\\n\",\n    \"                          np.array(pytorch_outputs[i]).shape, \\n\",\n    \"                          np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0)))) for i in range(3))))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 27,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-06T10:12:54.200059Z\",\n     \"start_time\": \"2018-11-06T10:12:54.167355Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Total loss of the TF model 9.06024 - Total loss of the PT model 9.0602445602417\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(\\\"Total loss of the TF model {} - Total loss of the PT model {}\\\".format(tensorflow_outputs[2][0], pytorch_outputs[2]))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"hide_input\": false,\n  \"kernelspec\": {\n   \"display_name\": \"Python [default]\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": 
\".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.7\"\n  },\n  \"toc\": {\n   \"colors\": {\n    \"hover_highlight\": \"#DAA520\",\n    \"running_highlight\": \"#FF0000\",\n    \"selected_highlight\": \"#FFD700\"\n   },\n   \"moveMenuLeft\": true,\n   \"nav_menu\": {\n    \"height\": \"48px\",\n    \"width\": \"252px\"\n   },\n   \"navigate_menu\": true,\n   \"number_sections\": true,\n   \"sideBar\": true,\n   \"threshold\": 4,\n   \"toc_cell\": false,\n   \"toc_section_display\": \"block\",\n   \"toc_window_display\": false\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "notebooks/Comparing-TF-and-PT-models.ipynb",
"content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Comparing TensorFlow (original) and PyTorch models\\n\",\n    \"\\n\",\n    \"You can use this small notebook to check the conversion of the model's weights from the TensorFlow model to the PyTorch model. In the following, we compare the weights of the last layer on a simple example (in `input.txt`) but both models return all the hidden layers so you can check every stage of the model.\\n\",\n    \"\\n\",\n    \"To run this notebook, follow these instructions:\\n\",\n    \"- make sure that your Python environment has both TensorFlow and PyTorch installed,\\n\",\n    \"- download the original TensorFlow implementation,\\n\",\n    \"- download a pre-trained TensorFlow model as indicated in the TensorFlow implementation readme,\\n\",\n    \"- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.\\n\",\n    \"\\n\",\n    \"If needed change the relative paths indicated in this notebook (at the beginning of Sections 1 and 2) to point to the relevant models and code.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T14:56:48.412622Z\",\n     \"start_time\": \"2018-11-15T14:56:48.400110Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"os.chdir('../')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1/ TensorFlow code\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T14:56:49.483829Z\",\n     \"start_time\": \"2018-11-15T14:56:49.471296Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"original_tf_inplem_dir = \\\"./tensorflow_code/\\\"\\n\",\n    \"model_dir = 
\\\"../google_models/uncased_L-12_H-768_A-12/\\\"\\n\",\n    \"\\n\",\n    \"vocab_file = model_dir + \\\"vocab.txt\\\"\\n\",\n    \"bert_config_file = model_dir + \\\"bert_config.json\\\"\\n\",\n    \"init_checkpoint = model_dir + \\\"bert_model.ckpt\\\"\\n\",\n    \"\\n\",\n    \"input_file = \\\"./samples/input.txt\\\"\\n\",\n    \"max_seq_length = 128\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T14:57:51.597932Z\",\n     \"start_time\": \"2018-11-15T14:57:51.549466Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"ename\": \"DuplicateFlagError\",\n     \"evalue\": \"The flag 'input_file' is defined twice. First from *, Second from *.  Description from first occurrence: (no help available)\",\n     \"output_type\": \"error\",\n     \"traceback\": [\n      \"\\u001b[0;31m---------------------------------------------------------------------------\\u001b[0m\",\n      \"\\u001b[0;31mDuplicateFlagError\\u001b[0m                        Traceback (most recent call last)\",\n      \"\\u001b[0;32m<ipython-input-6-86ecffb49060>\\u001b[0m in \\u001b[0;36m<module>\\u001b[0;34m\\u001b[0m\\n\\u001b[1;32m      4\\u001b[0m \\u001b[0mspec\\u001b[0m \\u001b[0;34m=\\u001b[0m \\u001b[0mimportlib\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mutil\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mspec_from_file_location\\u001b[0m\\u001b[0;34m(\\u001b[0m\\u001b[0;34m'*'\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0moriginal_tf_inplem_dir\\u001b[0m \\u001b[0;34m+\\u001b[0m \\u001b[0;34m'/extract_features_tensorflow.py'\\u001b[0m\\u001b[0;34m)\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m      5\\u001b[0m \\u001b[0mmodule\\u001b[0m \\u001b[0;34m=\\u001b[0m 
\\u001b[0mimportlib\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mutil\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mmodule_from_spec\\u001b[0m\\u001b[0;34m(\\u001b[0m\\u001b[0mspec\\u001b[0m\\u001b[0;34m)\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[0;32m----> 6\\u001b[0;31m \\u001b[0mspec\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mloader\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mexec_module\\u001b[0m\\u001b[0;34m(\\u001b[0m\\u001b[0mmodule\\u001b[0m\\u001b[0;34m)\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[0m\\u001b[1;32m      7\\u001b[0m \\u001b[0msys\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mmodules\\u001b[0m\\u001b[0;34m[\\u001b[0m\\u001b[0;34m'extract_features_tensorflow'\\u001b[0m\\u001b[0;34m]\\u001b[0m \\u001b[0;34m=\\u001b[0m \\u001b[0mmodule\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m      8\\u001b[0m \\u001b[0;34m\\u001b[0m\\u001b[0m\\n\",\n      \"\\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/importlib/_bootstrap_external.py\\u001b[0m in \\u001b[0;36mexec_module\\u001b[0;34m(self, module)\\u001b[0m\\n\",\n      \"\\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/importlib/_bootstrap.py\\u001b[0m in \\u001b[0;36m_call_with_frames_removed\\u001b[0;34m(f, *args, **kwds)\\u001b[0m\\n\",\n      \"\\u001b[0;32m~/Documents/Thomas/Code/HF/BERT/pytorch-pretrained-BERT/tensorflow_code/extract_features_tensorflow.py\\u001b[0m in \\u001b[0;36m<module>\\u001b[0;34m\\u001b[0m\\n\\u001b[1;32m     32\\u001b[0m \\u001b[0mFLAGS\\u001b[0m \\u001b[0;34m=\\u001b[0m \\u001b[0mflags\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mFLAGS\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m     33\\u001b[0m \\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[0;32m---> 34\\u001b[0;31m \\u001b[0mflags\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mDEFINE_string\\u001b[0m\\u001b[0;34m(\\u001b[0m\\u001b[0;34m\\\"input_file\\\"\\u001b[0m\\u001b[0;34m,\\u001b[0m 
\\u001b[0;32mNone\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0;34m\\\"\\\"\\u001b[0m\\u001b[0;34m)\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[0m\\u001b[1;32m     35\\u001b[0m \\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m     36\\u001b[0m \\u001b[0mflags\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mDEFINE_string\\u001b[0m\\u001b[0;34m(\\u001b[0m\\u001b[0;34m\\\"output_file\\\"\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0;32mNone\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0;34m\\\"\\\"\\u001b[0m\\u001b[0;34m)\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\",\n      \"\\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/tensorflow/python/platform/flags.py\\u001b[0m in \\u001b[0;36mwrapper\\u001b[0;34m(*args, **kwargs)\\u001b[0m\\n\\u001b[1;32m     56\\u001b[0m           \\u001b[0;34m'Use of the keyword argument names (flag_name, default_value, '\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m     57\\u001b[0m           'docstring) is deprecated, please use (name, default, help) instead.')\\n\\u001b[0;32m---> 58\\u001b[0;31m     \\u001b[0;32mreturn\\u001b[0m \\u001b[0moriginal_function\\u001b[0m\\u001b[0;34m(\\u001b[0m\\u001b[0;34m*\\u001b[0m\\u001b[0margs\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0;34m**\\u001b[0m\\u001b[0mkwargs\\u001b[0m\\u001b[0;34m)\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[0m\\u001b[1;32m     59\\u001b[0m \\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m     60\\u001b[0m   \\u001b[0;32mreturn\\u001b[0m \\u001b[0mtf_decorator\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mmake_decorator\\u001b[0m\\u001b[0;34m(\\u001b[0m\\u001b[0moriginal_function\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0mwrapper\\u001b[0m\\u001b[0;34m)\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\",\n      \"\\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/absl/flags/_defines.py\\u001b[0m in 
\\u001b[0;36mDEFINE_string\\u001b[0;34m(name, default, help, flag_values, **args)\\u001b[0m\\n\\u001b[1;32m    239\\u001b[0m   \\u001b[0mparser\\u001b[0m \\u001b[0;34m=\\u001b[0m \\u001b[0m_argument_parser\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mArgumentParser\\u001b[0m\\u001b[0;34m(\\u001b[0m\\u001b[0;34m)\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m    240\\u001b[0m   \\u001b[0mserializer\\u001b[0m \\u001b[0;34m=\\u001b[0m \\u001b[0m_argument_parser\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mArgumentSerializer\\u001b[0m\\u001b[0;34m(\\u001b[0m\\u001b[0;34m)\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[0;32m--> 241\\u001b[0;31m   \\u001b[0mDEFINE\\u001b[0m\\u001b[0;34m(\\u001b[0m\\u001b[0mparser\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0mname\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0mdefault\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0mhelp\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0mflag_values\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0mserializer\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0;34m**\\u001b[0m\\u001b[0margs\\u001b[0m\\u001b[0;34m)\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[0m\\u001b[1;32m    242\\u001b[0m \\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m    243\\u001b[0m \\u001b[0;34m\\u001b[0m\\u001b[0m\\n\",\n      \"\\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/absl/flags/_defines.py\\u001b[0m in \\u001b[0;36mDEFINE\\u001b[0;34m(parser, name, default, help, flag_values, serializer, module_name, **args)\\u001b[0m\\n\\u001b[1;32m     80\\u001b[0m   \\\"\\\"\\\"\\n\\u001b[1;32m     81\\u001b[0m   DEFINE_flag(_flag.Flag(parser, serializer, name, default, help, **args),\\n\\u001b[0;32m---> 82\\u001b[0;31m               flag_values, module_name)\\n\\u001b[0m\\u001b[1;32m     83\\u001b[0m \\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m     84\\u001b[0m \\u001b[0;34m\\u001b[0m\\u001b[0m\\n\",\n      
\"\\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/absl/flags/_defines.py\\u001b[0m in \\u001b[0;36mDEFINE_flag\\u001b[0;34m(flag, flag_values, module_name)\\u001b[0m\\n\\u001b[1;32m    102\\u001b[0m   \\u001b[0;31m# Copying the reference to flag_values prevents pychecker warnings.\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m    103\\u001b[0m   \\u001b[0mfv\\u001b[0m \\u001b[0;34m=\\u001b[0m \\u001b[0mflag_values\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[0;32m--> 104\\u001b[0;31m   \\u001b[0mfv\\u001b[0m\\u001b[0;34m[\\u001b[0m\\u001b[0mflag\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mname\\u001b[0m\\u001b[0;34m]\\u001b[0m \\u001b[0;34m=\\u001b[0m \\u001b[0mflag\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[0m\\u001b[1;32m    105\\u001b[0m   \\u001b[0;31m# Tell flag_values who's defining the flag.\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m    106\\u001b[0m   \\u001b[0;32mif\\u001b[0m \\u001b[0mmodule_name\\u001b[0m\\u001b[0;34m:\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\",\n      \"\\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/absl/flags/_flagvalues.py\\u001b[0m in \\u001b[0;36m__setitem__\\u001b[0;34m(self, name, flag)\\u001b[0m\\n\\u001b[1;32m    427\\u001b[0m         \\u001b[0;31m# module is simply being imported a subsequent time.\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m    428\\u001b[0m         \\u001b[0;32mreturn\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[0;32m--> 429\\u001b[0;31m       \\u001b[0;32mraise\\u001b[0m 
\\u001b[0m_exceptions\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mDuplicateFlagError\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mfrom_flag\\u001b[0m\\u001b[0;34m(\\u001b[0m\\u001b[0mname\\u001b[0m\\u001b[0;34m,\\u001b[0m \\u001b[0mself\\u001b[0m\\u001b[0;34m)\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[0m\\u001b[1;32m    430\\u001b[0m     \\u001b[0mshort_name\\u001b[0m \\u001b[0;34m=\\u001b[0m \\u001b[0mflag\\u001b[0m\\u001b[0;34m.\\u001b[0m\\u001b[0mshort_name\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\\u001b[1;32m    431\\u001b[0m     \\u001b[0;31m# If a new flag overrides an old one, we need to cleanup the old flag's\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0;34m\\u001b[0m\\u001b[0m\\n\",\n      \"\\u001b[0;31mDuplicateFlagError\\u001b[0m: The flag 'input_file' is defined twice. First from *, Second from *.  Description from first occurrence: (no help available)\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"import importlib.util\\n\",\n    \"import sys\\n\",\n    \"\\n\",\n    \"spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/extract_features_tensorflow.py')\\n\",\n    \"module = importlib.util.module_from_spec(spec)\\n\",\n    \"spec.loader.exec_module(module)\\n\",\n    \"sys.modules['extract_features_tensorflow'] = module\\n\",\n    \"\\n\",\n    \"from extract_features_tensorflow import *\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T14:58:05.650987Z\",\n     \"start_time\": \"2018-11-15T14:58:05.541620Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:*** Example ***\\n\",\n      \"INFO:tensorflow:unique_id: 0\\n\",\n      \"INFO:tensorflow:tokens: [CLS] who was jim henson ? 
[SEP] jim henson was a puppet ##eer [SEP]\\n\",\n      \"INFO:tensorflow:input_ids: 101 2040 2001 3958 27227 1029 102 3958 27227 2001 1037 13997 11510 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\",\n      \"INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"layer_indexes = list(range(12))\\n\",\n    \"bert_config = modeling.BertConfig.from_json_file(bert_config_file)\\n\",\n    \"tokenizer = tokenization.FullTokenizer(\\n\",\n    \"    vocab_file=vocab_file, do_lower_case=True)\\n\",\n    \"examples = read_examples(input_file)\\n\",\n    \"\\n\",\n    \"features = convert_examples_to_features(\\n\",\n    \"    examples=examples, seq_length=max_seq_length, tokenizer=tokenizer)\\n\",\n    \"unique_id_to_feature = {}\\n\",\n    \"for feature in features:\\n\",\n    \"    unique_id_to_feature[feature.unique_id] = feature\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T14:58:11.562443Z\",\n     \"start_time\": \"2018-11-15T14:58:08.036485Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:tensorflow:Estimator's 
model_fn (<function model_fn_builder.<locals>.model_fn at 0x11ea7f1e0>) includes params argument, but params are not passed to Estimator.\\n\",\n      \"WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphs4_nsq9\\n\",\n      \"INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphs4_nsq9', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\\n\",\n      \"graph_options {\\n\",\n      \"  rewrite_options {\\n\",\n      \"    meta_optimizer_iterations: ONE\\n\",\n      \"  }\\n\",\n      \"}\\n\",\n      \", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x121b163c8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\\n\",\n      \"WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. 
Please fix as soon as possible (leaving num_shards as None.\\n\",\n      \"INFO:tensorflow:_TPUContext: eval_on_tpu True\\n\",\n      \"WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\\n\",\n    \"run_config = tf.contrib.tpu.RunConfig(\\n\",\n    \"    master=None,\\n\",\n    \"    tpu_config=tf.contrib.tpu.TPUConfig(\\n\",\n    \"        num_shards=1,\\n\",\n    \"        per_host_input_for_training=is_per_host))\\n\",\n    \"\\n\",\n    \"model_fn = model_fn_builder(\\n\",\n    \"    bert_config=bert_config,\\n\",\n    \"    init_checkpoint=init_checkpoint,\\n\",\n    \"    layer_indexes=layer_indexes,\\n\",\n    \"    use_tpu=False,\\n\",\n    \"    use_one_hot_embeddings=False)\\n\",\n    \"\\n\",\n    \"# If TPU is not available, this will fall back to normal Estimator on CPU\\n\",\n    \"# or GPU.\\n\",\n    \"estimator = tf.contrib.tpu.TPUEstimator(\\n\",\n    \"    use_tpu=False,\\n\",\n    \"    model_fn=model_fn,\\n\",\n    \"    config=run_config,\\n\",\n    \"    predict_batch_size=1)\\n\",\n    \"\\n\",\n    \"input_fn = input_fn_builder(\\n\",\n    \"    features=features, seq_length=max_seq_length)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T14:58:21.736543Z\",\n     \"start_time\": \"2018-11-15T14:58:16.723829Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphs4_nsq9, running initialization to predict.\\n\",\n      \"INFO:tensorflow:Calling model_fn.\\n\",\n      \"INFO:tensorflow:Running infer on CPU\\n\",\n      \"INFO:tensorflow:Done calling model_fn.\\n\",\n      \"INFO:tensorflow:Graph was finalized.\\n\",\n      
\"INFO:tensorflow:Running local_init_op.\\n\",\n      \"INFO:tensorflow:Done running local_init_op.\\n\",\n      \"extracting layer 0\\n\",\n      \"extracting layer 1\\n\",\n      \"extracting layer 2\\n\",\n      \"extracting layer 3\\n\",\n      \"extracting layer 4\\n\",\n      \"extracting layer 5\\n\",\n      \"extracting layer 6\\n\",\n      \"extracting layer 7\\n\",\n      \"extracting layer 8\\n\",\n      \"extracting layer 9\\n\",\n      \"extracting layer 10\\n\",\n      \"extracting layer 11\\n\",\n      \"INFO:tensorflow:prediction_loop marked as finished\\n\",\n      \"INFO:tensorflow:prediction_loop marked as finished\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"tensorflow_all_out = []\\n\",\n    \"for result in estimator.predict(input_fn, yield_single_examples=True):\\n\",\n    \"    unique_id = int(result[\\\"unique_id\\\"])\\n\",\n    \"    feature = unique_id_to_feature[unique_id]\\n\",\n    \"    output_json = collections.OrderedDict()\\n\",\n    \"    output_json[\\\"linex_index\\\"] = unique_id\\n\",\n    \"    tensorflow_all_out_features = []\\n\",\n    \"    # for (i, token) in enumerate(feature.tokens):\\n\",\n    \"    all_layers = []\\n\",\n    \"    for (j, layer_index) in enumerate(layer_indexes):\\n\",\n    \"        print(\\\"extracting layer {}\\\".format(j))\\n\",\n    \"        layer_output = result[\\\"layer_output_%d\\\" % j]\\n\",\n    \"        layers = collections.OrderedDict()\\n\",\n    \"        layers[\\\"index\\\"] = layer_index\\n\",\n    \"        layers[\\\"values\\\"] = layer_output\\n\",\n    \"        all_layers.append(layers)\\n\",\n    \"    tensorflow_out_features = collections.OrderedDict()\\n\",\n    \"    tensorflow_out_features[\\\"layers\\\"] = all_layers\\n\",\n    \"    tensorflow_all_out_features.append(tensorflow_out_features)\\n\",\n    \"\\n\",\n    \"    output_json[\\\"features\\\"] = tensorflow_all_out_features\\n\",\n    \"    tensorflow_all_out.append(output_json)\"\n   ]\n  },\n  {\n   
\"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T14:58:23.970714Z\",\n     \"start_time\": \"2018-11-15T14:58:23.931930Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"1\\n\",\n      \"2\\n\",\n      \"odict_keys(['linex_index', 'features'])\\n\",\n      \"number of tokens 1\\n\",\n      \"number of layers 12\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"(128, 768)\"\n      ]\n     },\n     \"execution_count\": 11,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"print(len(tensorflow_all_out))\\n\",\n    \"print(len(tensorflow_all_out[0]))\\n\",\n    \"print(tensorflow_all_out[0].keys())\\n\",\n    \"print(\\\"number of tokens\\\", len(tensorflow_all_out[0]['features']))\\n\",\n    \"print(\\\"number of layers\\\", len(tensorflow_all_out[0]['features'][0]['layers']))\\n\",\n    \"tensorflow_all_out[0]['features'][0]['layers'][0]['values'].shape\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T14:58:25.547012Z\",\n     \"start_time\": \"2018-11-15T14:58:25.516076Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"tensorflow_outputs = list(tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2/ PyTorch code\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"os.chdir('./examples')\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T15:03:49.528679Z\",\n     \"start_time\": 
\"2018-11-15T15:03:49.497697Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import extract_features\\n\",\n    \"import pytorch_pretrained_bert as ppb\\n\",\n    \"from extract_features import *\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 25,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T15:21:18.001177Z\",\n     \"start_time\": \"2018-11-15T15:21:17.970369Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"init_checkpoint_pt = \\\"../../google_models/uncased_L-12_H-768_A-12/\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 26,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T15:21:20.893669Z\",\n     \"start_time\": \"2018-11-15T15:21:18.786623Z\"\n    },\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\\n\",\n      \"11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling -   Model config {\\n\",\n      \"  \\\"attention_probs_dropout_prob\\\": 0.1,\\n\",\n      \"  \\\"hidden_act\\\": \\\"gelu\\\",\\n\",\n      \"  \\\"hidden_dropout_prob\\\": 0.1,\\n\",\n      \"  \\\"hidden_size\\\": 768,\\n\",\n      \"  \\\"initializer_range\\\": 0.02,\\n\",\n      \"  \\\"intermediate_size\\\": 3072,\\n\",\n      \"  \\\"max_position_embeddings\\\": 512,\\n\",\n      \"  \\\"num_attention_heads\\\": 12,\\n\",\n      \"  \\\"num_hidden_layers\\\": 12,\\n\",\n      \"  \\\"type_vocab_size\\\": 2,\\n\",\n      \"  \\\"vocab_size\\\": 30522\\n\",\n      \"}\\n\",\n      \"\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"BertModel(\\n\",\n       \"  (embeddings): BertEmbeddings(\\n\",\n       \"    (word_embeddings): Embedding(30522, 768)\\n\",\n       \"    
(position_embeddings): Embedding(512, 768)\\n\",\n       \"    (token_type_embeddings): Embedding(2, 768)\\n\",\n       \"    (LayerNorm): BertLayerNorm()\\n\",\n       \"    (dropout): Dropout(p=0.1)\\n\",\n       \"  )\\n\",\n       \"  (encoder): BertEncoder(\\n\",\n       \"    (layer): ModuleList(\\n\",\n       \"      (0): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (1): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"    
        (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (2): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): 
BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (3): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (4): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, 
out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (5): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (6): BertLayer(\\n\",\n       \"        (attention): 
BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (7): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n   
    \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (8): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (9): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): 
Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (10): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        
(output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (11): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"    )\\n\",\n       \"  )\\n\",\n       \"  (pooler): BertPooler(\\n\",\n       \"    (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"    (activation): Tanh()\\n\",\n       \"  )\\n\",\n       \")\"\n      ]\n     },\n     \"execution_count\": 26,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"device = torch.device(\\\"cpu\\\")\\n\",\n    \"model = 
ppb.BertModel.from_pretrained(init_checkpoint_pt)\\n\",\n    \"model.to(device)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 27,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T15:21:26.963427Z\",\n     \"start_time\": \"2018-11-15T15:21:26.922494Z\"\n    },\n    \"code_folding\": []\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"BertModel(\\n\",\n       \"  (embeddings): BertEmbeddings(\\n\",\n       \"    (word_embeddings): Embedding(30522, 768)\\n\",\n       \"    (position_embeddings): Embedding(512, 768)\\n\",\n       \"    (token_type_embeddings): Embedding(2, 768)\\n\",\n       \"    (LayerNorm): BertLayerNorm()\\n\",\n       \"    (dropout): Dropout(p=0.1)\\n\",\n       \"  )\\n\",\n       \"  (encoder): BertEncoder(\\n\",\n       \"    (layer): ModuleList(\\n\",\n       \"      (0): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"    
      (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (1): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (2): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): 
Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (3): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (4): BertLayer(\\n\",\n       
\"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (5): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n      
 \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (6): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (7): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"        
    (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (8): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"   
     (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (9): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (10): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): 
Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"          (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"      (11): BertLayer(\\n\",\n       \"        (attention): BertAttention(\\n\",\n       \"          (self): BertSelfAttention(\\n\",\n       \"            (query): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (key): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (value): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"          (output): BertSelfOutput(\\n\",\n       \"            (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"            (LayerNorm): BertLayerNorm()\\n\",\n       \"            (dropout): Dropout(p=0.1)\\n\",\n       \"          )\\n\",\n       \"        )\\n\",\n       \"        (intermediate): BertIntermediate(\\n\",\n       \"          (dense): Linear(in_features=768, out_features=3072, bias=True)\\n\",\n       \"        )\\n\",\n       \"        (output): BertOutput(\\n\",\n       \"          (dense): Linear(in_features=3072, out_features=768, bias=True)\\n\",\n       \"          (LayerNorm): BertLayerNorm()\\n\",\n       \"     
     (dropout): Dropout(p=0.1)\\n\",\n       \"        )\\n\",\n       \"      )\\n\",\n       \"    )\\n\",\n       \"  )\\n\",\n       \"  (pooler): BertPooler(\\n\",\n       \"    (dense): Linear(in_features=768, out_features=768, bias=True)\\n\",\n       \"    (activation): Tanh()\\n\",\n       \"  )\\n\",\n       \")\"\n      ]\n     },\n     \"execution_count\": 27,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\\n\",\n    \"all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\\n\",\n    \"all_input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)\\n\",\n    \"all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\\n\",\n    \"\\n\",\n    \"eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_type_ids, all_example_index)\\n\",\n    \"eval_sampler = SequentialSampler(eval_data)\\n\",\n    \"eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\\n\",\n    \"\\n\",\n    \"model.eval()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 28,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T15:21:30.718724Z\",\n     \"start_time\": \"2018-11-15T15:21:30.329205Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"tensor([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958, 27227,  2001,\\n\",\n      \"          1037, 13997, 11510,   102,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,   
  0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\\n\",\n      \"             0,     0,     0,     0,     0,     0,     0,     0]])\\n\",\n      \"tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\\n\",\n      \"         0, 0, 0, 0, 0, 0, 0, 0]])\\n\",\n      \"tensor([0])\\n\",\n      \"layer 0 0\\n\",\n      \"layer 1 1\\n\",\n      \"layer 2 2\\n\",\n      \"layer 3 3\\n\",\n      \"layer 4 4\\n\",\n      \"layer 5 5\\n\",\n      \"layer 6 6\\n\",\n      \"layer 7 7\\n\",\n      \"layer 8 8\\n\",\n      \"layer 9 9\\n\",\n      \"layer 10 10\\n\",\n      \"layer 11 11\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"layer_indexes = list(range(12))\\n\",\n    \"\\n\",\n    \"pytorch_all_out = []\\n\",\n    \"for input_ids, input_mask, input_type_ids, example_indices in eval_dataloader:\\n\",\n    \"    print(input_ids)\\n\",\n    \"    print(input_mask)\\n\",\n    \"    print(example_indices)\\n\",\n    \"    input_ids = input_ids.to(device)\\n\",\n    \"    input_mask = 
input_mask.to(device)\\n\",\n    \"\\n\",\n    \"    all_encoder_layers, _ = model(input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)\\n\",\n    \"\\n\",\n    \"    for b, example_index in enumerate(example_indices):\\n\",\n    \"        feature = features[example_index.item()]\\n\",\n    \"        unique_id = int(feature.unique_id)\\n\",\n    \"        # feature = unique_id_to_feature[unique_id]\\n\",\n    \"        output_json = collections.OrderedDict()\\n\",\n    \"        output_json[\\\"linex_index\\\"] = unique_id\\n\",\n    \"        all_out_features = []\\n\",\n    \"        # for (i, token) in enumerate(feature.tokens):\\n\",\n    \"        all_layers = []\\n\",\n    \"        for (j, layer_index) in enumerate(layer_indexes):\\n\",\n    \"            print(\\\"layer\\\", j, layer_index)\\n\",\n    \"            layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()\\n\",\n    \"            layer_output = layer_output[b]\\n\",\n    \"            layers = collections.OrderedDict()\\n\",\n    \"            layers[\\\"index\\\"] = layer_index\\n\",\n    \"            layer_output = layer_output\\n\",\n    \"            layers[\\\"values\\\"] = layer_output if not isinstance(layer_output, (int, float)) else [layer_output]\\n\",\n    \"            all_layers.append(layers)\\n\",\n    \"\\n\",\n    \"            out_features = collections.OrderedDict()\\n\",\n    \"            out_features[\\\"layers\\\"] = all_layers\\n\",\n    \"            all_out_features.append(out_features)\\n\",\n    \"        output_json[\\\"features\\\"] = all_out_features\\n\",\n    \"        pytorch_all_out.append(output_json)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 29,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T15:21:35.703615Z\",\n     \"start_time\": \"2018-11-15T15:21:35.666150Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": 
\"stream\",\n     \"text\": [\n      \"1\\n\",\n      \"2\\n\",\n      \"odict_keys(['linex_index', 'features'])\\n\",\n      \"number of tokens 1\\n\",\n      \"number of layers 12\\n\",\n      \"hidden_size 128\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"(128, 768)\"\n      ]\n     },\n     \"execution_count\": 29,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"print(len(pytorch_all_out))\\n\",\n    \"print(len(pytorch_all_out[0]))\\n\",\n    \"print(pytorch_all_out[0].keys())\\n\",\n    \"print(\\\"number of tokens\\\", len(pytorch_all_out))\\n\",\n    \"print(\\\"number of layers\\\", len(pytorch_all_out[0]['features'][0]['layers']))\\n\",\n    \"print(\\\"hidden_size\\\", len(pytorch_all_out[0]['features'][0]['layers'][0]['values']))\\n\",\n    \"pytorch_all_out[0]['features'][0]['layers'][0]['values'].shape\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 30,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T15:21:36.999073Z\",\n     \"start_time\": \"2018-11-15T15:21:36.966762Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"(128, 768)\\n\",\n      \"(128, 768)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"pytorch_outputs = list(pytorch_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)\\n\",\n    \"print(pytorch_outputs[0].shape)\\n\",\n    \"print(pytorch_outputs[1].shape)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 31,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T15:21:37.936522Z\",\n     \"start_time\": \"2018-11-15T15:21:37.905269Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"(128, 768)\\n\",\n      \"(128, 768)\\n\"\n     ]\n    }\n   ],\n   \"source\": 
[\n    \"print(tensorflow_outputs[0].shape)\\n\",\n    \"print(tensorflow_outputs[1].shape)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3/ Comparing the standard deviation on the last layer of both models\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 32,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T15:21:39.437137Z\",\n     \"start_time\": \"2018-11-15T15:21:39.406150Z\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import numpy as np\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 33,\n   \"metadata\": {\n    \"ExecuteTime\": {\n     \"end_time\": \"2018-11-15T15:21:40.181870Z\",\n     \"start_time\": \"2018-11-15T15:21:40.137023Z\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"shape tensorflow layer, shape pytorch layer, standard deviation\\n\",\n      \"((128, 768), (128, 768), 1.5258875e-07)\\n\",\n      \"((128, 768), (128, 768), 2.342731e-07)\\n\",\n      \"((128, 768), (128, 768), 2.801949e-07)\\n\",\n      \"((128, 768), (128, 768), 3.5904986e-07)\\n\",\n      \"((128, 768), (128, 768), 4.2842768e-07)\\n\",\n      \"((128, 768), (128, 768), 5.127951e-07)\\n\",\n      \"((128, 768), (128, 768), 6.14668e-07)\\n\",\n      \"((128, 768), (128, 768), 7.063922e-07)\\n\",\n      \"((128, 768), (128, 768), 7.906173e-07)\\n\",\n      \"((128, 768), (128, 768), 8.475192e-07)\\n\",\n      \"((128, 768), (128, 768), 8.975489e-07)\\n\",\n      \"((128, 768), (128, 768), 4.1671223e-07)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print('shape tensorflow layer, shape pytorch layer, standard deviation')\\n\",\n    \"print('\\\\n'.join(list(str((np.array(tensorflow_outputs[i]).shape,\\n\",\n    \"                          np.array(pytorch_outputs[i]).shape, \\n\",\n    \"                          
np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0)))) for i in range(12))))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"hide_input\": false,\n  \"kernelspec\": {\n   \"display_name\": \"Python [default]\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.7\"\n  },\n  \"toc\": {\n   \"colors\": {\n    \"hover_highlight\": \"#DAA520\",\n    \"running_highlight\": \"#FF0000\",\n    \"selected_highlight\": \"#FFD700\"\n   },\n   \"moveMenuLeft\": true,\n   \"nav_menu\": {\n    \"height\": \"48px\",\n    \"width\": \"252px\"\n   },\n   \"navigate_menu\": true,\n   \"number_sections\": true,\n   \"sideBar\": true,\n   \"threshold\": 4,\n   \"toc_cell\": false,\n   \"toc_section_display\": \"block\",\n   \"toc_window_display\": false\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "pytorch_pretrained_bert/__init__.py",
    "content": "__version__ = \"0.6.2\"\nfrom .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer\nfrom .tokenization_openai import OpenAIGPTTokenizer\nfrom .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)\nfrom .tokenization_gpt2 import GPT2Tokenizer\n\nfrom .modeling import (BertConfig, BertModel, BertForPreTraining,\n                       BertForMaskedLM, BertForNextSentencePrediction,\n                       BertForSequenceClassification, BertForMultipleChoice,\n                       BertForTokenClassification, BertForQuestionAnswering,\n                       load_tf_weights_in_bert)\nfrom .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,\n                              OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,\n                              load_tf_weights_in_openai_gpt)\nfrom .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,\n                                  load_tf_weights_in_transfo_xl)\nfrom .modeling_gpt2 import (GPT2Config, GPT2Model,\n                            GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2MultipleChoiceHead,\n                            load_tf_weights_in_gpt2)\n\nfrom .optimization import BertAdam\nfrom .optimization_openai import OpenAIAdam\n\nfrom .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME\n"
  },
  {
    "path": "pytorch_pretrained_bert/__main__.py",
    "content": "# coding: utf8\ndef main():\n    import sys\n    if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [\n        \"convert_tf_checkpoint_to_pytorch\",\n        \"convert_openai_checkpoint\",\n        \"convert_transfo_xl_checkpoint\",\n        \"convert_gpt2_checkpoint\",\n    ]:\n        print(\n        \"Should be used as one of: \\n\"\n        \">> `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \\n\"\n        \">> `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \\n\"\n        \">> `pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \\n\"\n        \">> `pytorch_pretrained_bert convert_gpt2_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]`\")\n    else:\n        if sys.argv[1] == \"convert_tf_checkpoint_to_pytorch\":\n            try:\n                from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch\n            except ImportError:\n                print(\"pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, \"\n                    \"In that case, it requires TensorFlow to be installed. 
Please see \"\n                    \"https://www.tensorflow.org/install/ for installation instructions.\")\n                raise\n\n            if len(sys.argv) != 5:\n                # pylint: disable=line-too-long\n                print(\"Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`\")\n            else:\n                PYTORCH_DUMP_OUTPUT = sys.argv.pop()\n                TF_CONFIG = sys.argv.pop()\n                TF_CHECKPOINT = sys.argv.pop()\n                convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)\n        elif sys.argv[1] == \"convert_openai_checkpoint\":\n            from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch\n            OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]\n            PYTORCH_DUMP_OUTPUT = sys.argv[3]\n            if len(sys.argv) == 5:\n                OPENAI_GPT_CONFIG = sys.argv[4]\n            else:\n                OPENAI_GPT_CONFIG = \"\"\n            convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,\n                                                 OPENAI_GPT_CONFIG,\n                                                 PYTORCH_DUMP_OUTPUT)\n        elif sys.argv[1] == \"convert_transfo_xl_checkpoint\":\n            try:\n                from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch\n            except ImportError:\n                print(\"pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, \"\n                    \"In that case, it requires TensorFlow to be installed. 
Please see \"\n                    \"https://www.tensorflow.org/install/ for installation instructions.\")\n                raise\n\n            if 'ckpt' in sys.argv[2].lower():\n                TF_CHECKPOINT = sys.argv[2]\n                TF_DATASET_FILE = \"\"\n            else:\n                TF_DATASET_FILE = sys.argv[2]\n                TF_CHECKPOINT = \"\"\n            PYTORCH_DUMP_OUTPUT = sys.argv[3]\n            if len(sys.argv) == 5:\n                TF_CONFIG = sys.argv[4]\n            else:\n                TF_CONFIG = \"\"\n            convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)\n        else:\n            try:\n                from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch\n            except ImportError:\n                print(\"pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, \"\n                    \"In that case, it requires TensorFlow to be installed. Please see \"\n                    \"https://www.tensorflow.org/install/ for installation instructions.\")\n                raise\n\n            TF_CHECKPOINT = sys.argv[2]\n            PYTORCH_DUMP_OUTPUT = sys.argv[3]\n            if len(sys.argv) == 5:\n                TF_CONFIG = sys.argv[4]\n            else:\n                TF_CONFIG = \"\"\n            convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py",
    "content": "# coding=utf-8\n# Copyright 2018 The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Convert OpenAI GPT checkpoint.\"\"\"\n\nfrom __future__ import absolute_import, division, print_function\n\nimport argparse\nfrom io import open\n\nimport torch\n\nfrom pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,\n                                                     GPT2Config,\n                                                     GPT2Model,\n                                                     load_tf_weights_in_gpt2)\n\n\ndef convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):\n    # Construct model\n    if gpt2_config_file == \"\":\n        config = GPT2Config()\n    else:\n        config = GPT2Config(gpt2_config_file)\n    model = GPT2Model(config)\n\n    # Load weights from numpy\n    load_tf_weights_in_gpt2(model, gpt2_checkpoint_path)\n\n    # Save pytorch-model\n    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME\n    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME\n    print(\"Save PyTorch model to {}\".format(pytorch_weights_dump_path))\n    torch.save(model.state_dict(), pytorch_weights_dump_path)\n    print(\"Save configuration file to {}\".format(pytorch_config_dump_path))\n    with open(pytorch_config_dump_path, \"w\", encoding=\"utf-8\") as f:\n        
f.write(config.to_json_string())\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    ## Required parameters\n    parser.add_argument(\"--gpt2_checkpoint_path\",\n                        default = None,\n                        type = str,\n                        required = True,\n                        help = \"Path the TensorFlow checkpoint path.\")\n    parser.add_argument(\"--pytorch_dump_folder_path\",\n                        default = None,\n                        type = str,\n                        required = True,\n                        help = \"Path to the output PyTorch model.\")\n    parser.add_argument(\"--gpt2_config_file\",\n                        default = \"\",\n                        type = str,\n                        help = \"An optional config json file corresponding to the pre-trained OpenAI model. \\n\"\n                            \"This specifies the model architecture.\")\n    args = parser.parse_args()\n    convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path,\n                                         args.gpt2_config_file,\n                                         args.pytorch_dump_folder_path)\n"
  },
  {
    "path": "pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py",
    "content": "# coding=utf-8\n# Copyright 2018 The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Convert OpenAI GPT checkpoint.\"\"\"\n\nfrom __future__ import absolute_import, division, print_function\n\nimport argparse\nfrom io import open\n\nimport torch\n\nfrom pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,\n                                                     OpenAIGPTConfig,\n                                                     OpenAIGPTModel,\n                                                     load_tf_weights_in_openai_gpt)\n\n\ndef convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):\n    # Construct model\n    if openai_config_file == \"\":\n        config = OpenAIGPTConfig()\n    else:\n        config = OpenAIGPTConfig(openai_config_file)\n    model = OpenAIGPTModel(config)\n\n    # Load weights from numpy\n    load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path)\n\n    # Save pytorch-model\n    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME\n    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME\n    print(\"Save PyTorch model to {}\".format(pytorch_weights_dump_path))\n    torch.save(model.state_dict(), pytorch_weights_dump_path)\n    print(\"Save configuration file to {}\".format(pytorch_config_dump_path))\n    with open(pytorch_config_dump_path, \"w\", 
encoding=\"utf-8\") as f:\n        f.write(config.to_json_string())\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    ## Required parameters\n    parser.add_argument(\"--openai_checkpoint_folder_path\",\n                        default = None,\n                        type = str,\n                        required = True,\n                        help = \"Path the TensorFlow checkpoint path.\")\n    parser.add_argument(\"--pytorch_dump_folder_path\",\n                        default = None,\n                        type = str,\n                        required = True,\n                        help = \"Path to the output PyTorch model.\")\n    parser.add_argument(\"--openai_config_file\",\n                        default = \"\",\n                        type = str,\n                        help = \"An optional config json file corresponding to the pre-trained OpenAI model. \\n\"\n                            \"This specifies the model architecture.\")\n    args = parser.parse_args()\n    convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path,\n                                         args.openai_config_file,\n                                         args.pytorch_dump_folder_path)\n"
  },
  {
    "path": "pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py",
    "content": "# coding=utf-8\n# Copyright 2018 The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\"\"\"Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.\"\"\"\n\nimport os\nimport argparse\nimport torch\nimport numpy as np\nimport tensorflow as tf\nfrom pytorch_pretrained_bert.modeling import BertModel\n\n\ndef convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):\n\n    \"\"\"\n    :param model:BertModel Pytorch model instance to be converted\n    :param ckpt_dir: Tensorflow model directory\n    :param model_name: model name\n    :return:\n\n    Currently supported HF models:\n        Y BertModel\n        N BertForMaskedLM\n        N BertForPreTraining\n        N BertForMultipleChoice\n        N BertForNextSentencePrediction\n        N BertForSequenceClassification\n        N BertForQuestionAnswering\n    \"\"\"\n\n    tensors_to_transopse = (\n        \"dense.weight\",\n        \"attention.self.query\",\n        \"attention.self.key\",\n        \"attention.self.value\"\n    )\n\n    var_map = (\n        ('layer.', 'layer_'),\n        ('word_embeddings.weight', 'word_embeddings'),\n        ('position_embeddings.weight', 'position_embeddings'),\n        ('token_type_embeddings.weight', 'token_type_embeddings'),\n        ('.', '/'),\n        ('LayerNorm/weight', 'LayerNorm/gamma'),\n        ('LayerNorm/bias', 'LayerNorm/beta'),\n        ('weight', 'kernel')\n    )\n\n    if not 
os.path.isdir(ckpt_dir):\n        os.makedirs(ckpt_dir)\n\n    session = tf.Session()\n    state_dict = model.state_dict()\n    tf_vars = []\n\n    def to_tf_var_name(name:str):\n        for patt, repl in iter(var_map):\n            name = name.replace(patt, repl)\n        return 'bert/{}'.format(name)\n\n    def assign_tf_var(tensor:np.ndarray, name:str):\n        tmp_var = tf.Variable(initial_value=tensor)\n        tf_var = tf.get_variable(dtype=tmp_var.dtype, shape=tmp_var.shape, name=name)\n        op = tf.assign(ref=tf_var, value=tmp_var)\n        session.run(tf.variables_initializer([tmp_var, tf_var]))\n        session.run(fetches=[op, tf_var])\n        return tf_var\n\n    for var_name in state_dict:\n        tf_name = to_tf_var_name(var_name)\n        torch_tensor = state_dict[var_name].numpy()\n        if any([x in var_name for x in tensors_to_transopse]):\n            torch_tensor = torch_tensor.T\n        tf_tensor = assign_tf_var(tensor=torch_tensor, name=tf_name)\n        tf_vars.append(tf_tensor)\n        print(\"{0}{1}initialized\".format(tf_name, \" \" * (60 - len(tf_name))))\n\n    saver = tf.train.Saver(tf_vars)\n    saver.save(session, os.path.join(ckpt_dir, model_name.replace(\"-\", \"_\") + \".ckpt\"))\n\n\ndef main(raw_args=None):\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--model_name\",\n                        type=str,\n                        required=True,\n                        help=\"model name e.g. 
bert-base-uncased\")\n    parser.add_argument(\"--cache_dir\",\n                        type=str,\n                        default=None,\n                        required=False,\n                        help=\"Directory containing pytorch model\")\n    parser.add_argument(\"--pytorch_model_path\",\n                        type=str,\n                        required=True,\n                        help=\"/path/to/<pytorch-model-name>.bin\")\n    parser.add_argument(\"--tf_cache_dir\",\n                        type=str,\n                        required=True,\n                        help=\"Directory in which to save tensorflow model\")\n    args = parser.parse_args(raw_args)\n    \n    model = BertModel.from_pretrained(\n        pretrained_model_name_or_path=args.model_name,\n        state_dict=torch.load(args.pytorch_model_path),\n        cache_dir=args.cache_dir\n    )\n    \n    convert_pytorch_checkpoint_to_tf(\n        model=model,\n        ckpt_dir=args.tf_cache_dir,\n        model_name=args.model_name\n    )\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py",
    "content": "# coding=utf-8\n# Copyright 2018 The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Convert BERT checkpoint.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport re\nimport argparse\nimport tensorflow as tf\nimport torch\nimport numpy as np\n\nfrom pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert\n\ndef convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):\n    # Initialise PyTorch model\n    config = BertConfig.from_json_file(bert_config_file)\n    print(\"Building PyTorch model from configuration: {}\".format(str(config)))\n    model = BertForPreTraining(config)\n\n    # Load weights from tf checkpoint\n    load_tf_weights_in_bert(model, tf_checkpoint_path)\n\n    # Save pytorch-model\n    print(\"Save PyTorch model to {}\".format(pytorch_dump_path))\n    torch.save(model.state_dict(), pytorch_dump_path)\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    ## Required parameters\n    parser.add_argument(\"--tf_checkpoint_path\",\n                        default = None,\n                        type = str,\n                        required = True,\n                        help = \"Path the TensorFlow checkpoint path.\")\n    parser.add_argument(\"--bert_config_file\",\n                        default = None,\n           
             type = str,\n                        required = True,\n                        help = \"The config json file corresponding to the pre-trained BERT model. \\n\"\n                            \"This specifies the model architecture.\")\n    parser.add_argument(\"--pytorch_dump_path\",\n                        default = None,\n                        type = str,\n                        required = True,\n                        help = \"Path to the output PyTorch model.\")\n    args = parser.parse_args()\n    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,\n                                     args.bert_config_file,\n                                     args.pytorch_dump_path)\n"
  },
  {
    "path": "pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py",
    "content": "# coding=utf-8\n# Copyright 2018 The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Convert Transformer XL checkpoint and datasets.\"\"\"\n\nfrom __future__ import absolute_import, division, print_function\n\nimport argparse\nimport os\nimport sys\nfrom io import open\n\nimport torch\n\nimport pytorch_pretrained_bert.tokenization_transfo_xl as data_utils\nfrom pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME,\n                                                         WEIGHTS_NAME,\n                                                         TransfoXLConfig,\n                                                         TransfoXLLMHeadModel,\n                                                         load_tf_weights_in_transfo_xl)\nfrom pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME,\n                                                             VOCAB_NAME)\n\nif sys.version_info[0] == 2:\n    import cPickle as pickle\nelse:\n    import pickle\n\n# We do this to be able to load python 2 datasets pickles\n# See e.g. 
https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918\ndata_utils.Vocab = data_utils.TransfoXLTokenizer\ndata_utils.Corpus = data_utils.TransfoXLCorpus\nsys.modules['data_utils'] = data_utils\nsys.modules['vocabulary'] = data_utils\n\ndef convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,\n                                             transfo_xl_config_file,\n                                             pytorch_dump_folder_path,\n                                             transfo_xl_dataset_file):\n    if transfo_xl_dataset_file:\n        # Convert a pre-processed corpus (see original TensorFlow repo)\n        with open(transfo_xl_dataset_file, \"rb\") as fp:\n            corpus = pickle.load(fp, encoding=\"latin1\")\n        # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)\n        pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME\n        print(\"Save vocabulary to {}\".format(pytorch_vocab_dump_path))\n        corpus_vocab_dict = corpus.vocab.__dict__\n        torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)\n\n        corpus_dict_no_vocab = corpus.__dict__\n        corpus_dict_no_vocab.pop('vocab', None)\n        pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME\n        print(\"Save dataset to {}\".format(pytorch_dataset_dump_path))\n        torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)\n\n    if tf_checkpoint_path:\n        # Convert a pre-trained TensorFlow model\n        config_path = os.path.abspath(transfo_xl_config_file)\n        tf_path = os.path.abspath(tf_checkpoint_path)\n\n        print(\"Converting Transformer XL checkpoint from {} with config at {}\".format(tf_path, config_path))\n        # Initialise PyTorch model\n        if transfo_xl_config_file == \"\":\n            config = TransfoXLConfig()\n        else:\n            config = 
TransfoXLConfig(transfo_xl_config_file)\n        print(\"Building PyTorch model from configuration: {}\".format(str(config)))\n        model = TransfoXLLMHeadModel(config)\n\n        model = load_tf_weights_in_transfo_xl(model, config, tf_path)\n        # Save pytorch-model\n        pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)\n        pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)\n        print(\"Save PyTorch model to {}\".format(os.path.abspath(pytorch_weights_dump_path)))\n        torch.save(model.state_dict(), pytorch_weights_dump_path)\n        print(\"Save configuration file to {}\".format(os.path.abspath(pytorch_config_dump_path)))\n        with open(pytorch_config_dump_path, \"w\", encoding=\"utf-8\") as f:\n            f.write(config.to_json_string())\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--pytorch_dump_folder_path\",\n                        default = None,\n                        type = str,\n                        required = True,\n                        help = \"Path to the folder to store the PyTorch model or dataset/vocab.\")\n    parser.add_argument(\"--tf_checkpoint_path\",\n                        default = \"\",\n                        type = str,\n                        help = \"An optional path to a TensorFlow checkpoint path to be converted.\")\n    parser.add_argument(\"--transfo_xl_config_file\",\n                        default = \"\",\n                        type = str,\n                        help = \"An optional config json file corresponding to the pre-trained Transformer XL model. 
\\n\"\n                            \"This specifies the model architecture.\")\n    parser.add_argument(\"--transfo_xl_dataset_file\",\n                        default = \"\",\n                        type = str,\n                        help = \"An optional dataset file to be converted into a vocabulary.\")\n    args = parser.parse_args()\n    convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path,\n                                     args.transfo_xl_config_file,\n                                     args.pytorch_dump_folder_path,\n                                     args.transfo_xl_dataset_file)\n"
  },
  {
    "path": "pytorch_pretrained_bert/file_utils.py",
    "content": "\"\"\"\nUtilities for working with the local dataset cache.\nThis file is adapted from the AllenNLP library at https://github.com/allenai/allennlp\nCopyright by the AllenNLP authors.\n\"\"\"\nfrom __future__ import (absolute_import, division, print_function, unicode_literals)\n\nimport sys\nimport json\nimport logging\nimport os\nimport shutil\nimport tempfile\nimport fnmatch\nfrom functools import wraps\nfrom hashlib import sha256\nimport sys\nfrom io import open\n\nimport boto3\nimport requests\nfrom botocore.exceptions import ClientError\nfrom tqdm import tqdm\n\ntry:\n    from torch.hub import _get_torch_home\n    torch_cache_home = _get_torch_home()\nexcept ImportError:\n    torch_cache_home = os.path.expanduser(\n        os.getenv('TORCH_HOME', os.path.join(\n            os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))\ndefault_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert')\n\ntry:\n    from urllib.parse import urlparse\nexcept ImportError:\n    from urlparse import urlparse\n\ntry:\n    from pathlib import Path\n    PYTORCH_PRETRAINED_BERT_CACHE = Path(\n        os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))\nexcept (AttributeError, ImportError):\n    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',\n                                              default_cache_path)\n\nCONFIG_NAME = \"config.json\"\nWEIGHTS_NAME = \"pytorch_model.bin\"\n\nlogger = logging.getLogger(__name__)  # pylint: disable=invalid-name\n\n\ndef url_to_filename(url, etag=None):\n    \"\"\"\n    Convert `url` into a hashed filename in a repeatable way.\n    If `etag` is specified, append its hash to the url's, delimited\n    by a period.\n    \"\"\"\n    url_bytes = url.encode('utf-8')\n    url_hash = sha256(url_bytes)\n    filename = url_hash.hexdigest()\n\n    if etag:\n        etag_bytes = etag.encode('utf-8')\n        etag_hash = sha256(etag_bytes)\n        filename += '.' 
+ etag_hash.hexdigest()\n\n    return filename\n\n\ndef filename_to_url(filename, cache_dir=None):\n    \"\"\"\n    Return the url and etag (which may be ``None``) stored for `filename`.\n    Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.\n    \"\"\"\n    if cache_dir is None:\n        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE\n    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):\n        cache_dir = str(cache_dir)\n\n    cache_path = os.path.join(cache_dir, filename)\n    if not os.path.exists(cache_path):\n        raise EnvironmentError(\"file {} not found\".format(cache_path))\n\n    meta_path = cache_path + '.json'\n    if not os.path.exists(meta_path):\n        raise EnvironmentError(\"file {} not found\".format(meta_path))\n\n    with open(meta_path, encoding=\"utf-8\") as meta_file:\n        metadata = json.load(meta_file)\n    url = metadata['url']\n    etag = metadata['etag']\n\n    return url, etag\n\n\ndef cached_path(url_or_filename, cache_dir=None):\n    \"\"\"\n    Given something that might be a URL (or might be a local path),\n    determine which. If it's a URL, download the file and cache it, and\n    return the path to the cached file. 
If it's already a local path,\n    make sure the file exists and then return the path.\n    \"\"\"\n    if cache_dir is None:\n        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE\n    if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):\n        url_or_filename = str(url_or_filename)\n    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):\n        cache_dir = str(cache_dir)\n\n    parsed = urlparse(url_or_filename)\n\n    if parsed.scheme in ('http', 'https', 's3'):\n        # URL, so get it from the cache (downloading if necessary)\n        return get_from_cache(url_or_filename, cache_dir)\n    elif os.path.exists(url_or_filename):\n        # File, and it exists.\n        return url_or_filename\n    elif parsed.scheme == '':\n        # File, but it doesn't exist.\n        raise EnvironmentError(\"file {} not found\".format(url_or_filename))\n    else:\n        # Something unknown\n        raise ValueError(\"unable to parse {} as a URL or as a local path\".format(url_or_filename))\n\n\ndef split_s3_path(url):\n    \"\"\"Split a full s3 path into the bucket name and path.\"\"\"\n    parsed = urlparse(url)\n    if not parsed.netloc or not parsed.path:\n        raise ValueError(\"bad s3 path {}\".format(url))\n    bucket_name = parsed.netloc\n    s3_path = parsed.path\n    # Remove '/' at beginning of path.\n    if s3_path.startswith(\"/\"):\n        s3_path = s3_path[1:]\n    return bucket_name, s3_path\n\n\ndef s3_request(func):\n    \"\"\"\n    Wrapper function for s3 requests in order to create more helpful error\n    messages.\n    \"\"\"\n\n    @wraps(func)\n    def wrapper(url, *args, **kwargs):\n        try:\n            return func(url, *args, **kwargs)\n        except ClientError as exc:\n            if int(exc.response[\"Error\"][\"Code\"]) == 404:\n                raise EnvironmentError(\"file {} not found\".format(url))\n            else:\n                raise\n\n    return wrapper\n\n\n@s3_request\ndef s3_etag(url):\n    
\"\"\"Check ETag on S3 object.\"\"\"\n    s3_resource = boto3.resource(\"s3\")\n    bucket_name, s3_path = split_s3_path(url)\n    s3_object = s3_resource.Object(bucket_name, s3_path)\n    return s3_object.e_tag\n\n\n@s3_request\ndef s3_get(url, temp_file):\n    \"\"\"Pull a file directly from S3.\"\"\"\n    s3_resource = boto3.resource(\"s3\")\n    bucket_name, s3_path = split_s3_path(url)\n    s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)\n\n\ndef http_get(url, temp_file):\n    req = requests.get(url, stream=True)\n    content_length = req.headers.get('Content-Length')\n    total = int(content_length) if content_length is not None else None\n    progress = tqdm(unit=\"B\", total=total)\n    for chunk in req.iter_content(chunk_size=1024):\n        if chunk: # filter out keep-alive new chunks\n            progress.update(len(chunk))\n            temp_file.write(chunk)\n    progress.close()\n\n\ndef get_from_cache(url, cache_dir=None):\n    \"\"\"\n    Given a URL, look for the corresponding dataset in the local cache.\n    If it's not there, download it. 
Then return the path to the cached file.\n    \"\"\"\n    if cache_dir is None:\n        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE\n    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):\n        cache_dir = str(cache_dir)\n\n    if not os.path.exists(cache_dir):\n        os.makedirs(cache_dir)\n\n    # Get eTag to add to filename, if it exists.\n    if url.startswith(\"s3://\"):\n        etag = s3_etag(url)\n    else:\n        try:\n            response = requests.head(url, allow_redirects=True)\n            if response.status_code != 200:\n                etag = None\n            else:\n                etag = response.headers.get(\"ETag\")\n        except EnvironmentError:\n            etag = None\n\n    if sys.version_info[0] == 2 and etag is not None:\n        etag = etag.decode('utf-8')\n    filename = url_to_filename(url, etag)\n\n    # get cache path to put the file\n    cache_path = os.path.join(cache_dir, filename)\n\n    # If we don't have a connection (etag is None) and can't identify the file\n    # try to get the last downloaded one\n    if not os.path.exists(cache_path) and etag is None:\n        matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')\n        matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))\n        if matching_files:\n            cache_path = os.path.join(cache_dir, matching_files[-1])\n\n    if not os.path.exists(cache_path):\n        # Download to temporary file, then copy to cache dir once finished.\n        # Otherwise you get corrupt cache entries if the download gets interrupted.\n        with tempfile.NamedTemporaryFile() as temp_file:\n            logger.info(\"%s not found in cache, downloading to %s\", url, temp_file.name)\n\n            # GET file object\n            if url.startswith(\"s3://\"):\n                s3_get(url, temp_file)\n            else:\n                http_get(url, temp_file)\n\n            # we are copying the file before closing it, so flush 
to avoid truncation\n            temp_file.flush()\n            # shutil.copyfileobj() starts at the current position, so go to the start\n            temp_file.seek(0)\n\n            logger.info(\"copying %s to cache at %s\", temp_file.name, cache_path)\n            with open(cache_path, 'wb') as cache_file:\n                shutil.copyfileobj(temp_file, cache_file)\n\n            logger.info(\"creating metadata file for %s\", cache_path)\n            meta = {'url': url, 'etag': etag}\n            meta_path = cache_path + '.json'\n            with open(meta_path, 'w') as meta_file:\n                output_string = json.dumps(meta)\n                if sys.version_info[0] == 2 and isinstance(output_string, str):\n                    output_string = unicode(output_string, 'utf-8')  # The beauty of python 2\n                meta_file.write(output_string)\n\n            logger.info(\"removing temp file %s\", temp_file.name)\n\n    return cache_path\n\n\ndef read_set_from_file(filename):\n    '''\n    Extract a de-duped collection (set) of text from a file.\n    Expected file format is one item per line.\n    '''\n    collection = set()\n    with open(filename, 'r', encoding='utf-8') as file_:\n        for line in file_:\n            collection.add(line.rstrip())\n    return collection\n\n\ndef get_file_extension(path, dot=True, lower=True):\n    ext = os.path.splitext(path)[1]\n    ext = ext if dot else ext[1:]\n    return ext.lower() if lower else ext\n"
  },
  {
    "path": "pytorch_pretrained_bert/modeling.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"PyTorch BERT model.\"\"\"\n\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport copy\nimport json\nimport logging\nimport math\nimport os\nimport shutil\nimport tarfile\nimport tempfile\nimport sys\nfrom io import open\n\nimport torch\nfrom torch import nn\nfrom torch.nn import CrossEntropyLoss\n\nfrom .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME\n\nlogger = logging.getLogger(__name__)\n\nPRETRAINED_MODEL_ARCHIVE_MAP = {\n    'bert-base-uncased': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz\",\n    'bert-large-uncased': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz\",\n    'bert-base-cased': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz\",\n    'bert-large-cased': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz\",\n    'bert-base-multilingual-uncased': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz\",\n    'bert-base-multilingual-cased': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz\",\n    'bert-base-chinese': 
\"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz\",\n}\nBERT_CONFIG_NAME = 'bert_config.json'\nTF_WEIGHTS_NAME = 'model.ckpt'\n\ndef load_tf_weights_in_bert(model, tf_checkpoint_path):\n    \"\"\" Load tf checkpoints in a pytorch model\n    \"\"\"\n    try:\n        import re\n        import numpy as np\n        import tensorflow as tf\n    except ImportError:\n        print(\"Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see \"\n            \"https://www.tensorflow.org/install/ for installation instructions.\")\n        raise\n    tf_path = os.path.abspath(tf_checkpoint_path)\n    print(\"Converting TensorFlow checkpoint from {}\".format(tf_path))\n    # Load weights from TF model\n    init_vars = tf.train.list_variables(tf_path)\n    names = []\n    arrays = []\n    for name, shape in init_vars:\n        print(\"Loading TF weight {} with shape {}\".format(name, shape))\n        array = tf.train.load_variable(tf_path, name)\n        names.append(name)\n        arrays.append(array)\n\n    for name, array in zip(names, arrays):\n        name = name.split('/')\n        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v\n        # which are not required for using pretrained model\n        if any(n in [\"adam_v\", \"adam_m\", \"global_step\"] for n in name):\n            print(\"Skipping {}\".format(\"/\".join(name)))\n            continue\n        pointer = model\n        for m_name in name:\n            if re.fullmatch(r'[A-Za-z]+_\\d+', m_name):\n                l = re.split(r'_(\\d+)', m_name)\n            else:\n                l = [m_name]\n            if l[0] == 'kernel' or l[0] == 'gamma':\n                pointer = getattr(pointer, 'weight')\n            elif l[0] == 'output_bias' or l[0] == 'beta':\n                pointer = getattr(pointer, 'bias')\n            elif l[0] == 'output_weights':\n                pointer = getattr(pointer, 'weight')\n         
   elif l[0] == 'squad':\n                pointer = getattr(pointer, 'classifier')\n            else:\n                try:\n                    pointer = getattr(pointer, l[0])\n                except AttributeError:\n                    print(\"Skipping {}\".format(\"/\".join(name)))\n                    continue\n            if len(l) >= 2:\n                num = int(l[1])\n                pointer = pointer[num]\n        if m_name[-11:] == '_embeddings':\n            pointer = getattr(pointer, 'weight')\n        elif m_name == 'kernel':\n            array = np.transpose(array)\n        try:\n            assert pointer.shape == array.shape\n        except AssertionError as e:\n            e.args += (pointer.shape, array.shape)\n            raise\n        print(\"Initialize PyTorch weight {}\".format(name))\n        pointer.data = torch.from_numpy(array)\n    return model\n\n\ndef gelu(x):\n    \"\"\"Implementation of the gelu activation function.\n        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):\n        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))\n        Also see https://arxiv.org/abs/1606.08415\n    \"\"\"\n    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))\n\n\ndef swish(x):\n    return x * torch.sigmoid(x)\n\n\nACT2FN = {\"gelu\": gelu, \"relu\": torch.nn.functional.relu, \"swish\": swish}\n\n\nclass BertConfig(object):\n    \"\"\"Configuration class to store the configuration of a `BertModel`.\n    \"\"\"\n    def __init__(self,\n                 vocab_size_or_config_json_file,\n                 hidden_size=768,\n                 num_hidden_layers=12,\n                 num_attention_heads=12,\n                 intermediate_size=3072,\n                 hidden_act=\"gelu\",\n                 hidden_dropout_prob=0.1,\n                 attention_probs_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=2,\n     
            initializer_range=0.02,\n                 layer_norm_eps=1e-12):\n        \"\"\"Constructs BertConfig.\n\n        Args:\n            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.\n            hidden_size: Size of the encoder layers and the pooler layer.\n            num_hidden_layers: Number of hidden layers in the Transformer encoder.\n            num_attention_heads: Number of attention heads for each attention layer in\n                the Transformer encoder.\n            intermediate_size: The size of the \"intermediate\" (i.e., feed-forward)\n                layer in the Transformer encoder.\n            hidden_act: The non-linear activation function (function or string) in the\n                encoder and pooler. If string, \"gelu\", \"relu\" and \"swish\" are supported.\n            hidden_dropout_prob: The dropout probabilitiy for all fully connected\n                layers in the embeddings, encoder, and pooler.\n            attention_probs_dropout_prob: The dropout ratio for the attention\n                probabilities.\n            max_position_embeddings: The maximum sequence length that this model might\n                ever be used with. 
Typically set this to something large just in case\n                (e.g., 512 or 1024 or 2048).\n            type_vocab_size: The vocabulary size of the `token_type_ids` passed into\n                `BertModel`.\n            initializer_range: The sttdev of the truncated_normal_initializer for\n                initializing all weight matrices.\n            layer_norm_eps: The epsilon used by LayerNorm.\n        \"\"\"\n        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2\n                        and isinstance(vocab_size_or_config_json_file, unicode)):\n            with open(vocab_size_or_config_json_file, \"r\", encoding='utf-8') as reader:\n                json_config = json.loads(reader.read())\n            for key, value in json_config.items():\n                self.__dict__[key] = value\n        elif isinstance(vocab_size_or_config_json_file, int):\n            self.vocab_size = vocab_size_or_config_json_file\n            self.hidden_size = hidden_size\n            self.num_hidden_layers = num_hidden_layers\n            self.num_attention_heads = num_attention_heads\n            self.hidden_act = hidden_act\n            self.intermediate_size = intermediate_size\n            self.hidden_dropout_prob = hidden_dropout_prob\n            self.attention_probs_dropout_prob = attention_probs_dropout_prob\n            self.max_position_embeddings = max_position_embeddings\n            self.type_vocab_size = type_vocab_size\n            self.initializer_range = initializer_range\n            self.layer_norm_eps = layer_norm_eps\n        else:\n            raise ValueError(\"First argument must be either a vocabulary size (int)\"\n                             \"or the path to a pretrained model config file (str)\")\n\n    @classmethod\n    def from_dict(cls, json_object):\n        \"\"\"Constructs a `BertConfig` from a Python dictionary of parameters.\"\"\"\n        config = BertConfig(vocab_size_or_config_json_file=-1)\n        for 
key, value in json_object.items():\n            config.__dict__[key] = value\n        return config\n\n    @classmethod\n    def from_json_file(cls, json_file):\n        \"\"\"Constructs a `BertConfig` from a json file of parameters.\"\"\"\n        with open(json_file, \"r\", encoding='utf-8') as reader:\n            text = reader.read()\n        return cls.from_dict(json.loads(text))\n\n    def __repr__(self):\n        return str(self.to_json_string())\n\n    def to_dict(self):\n        \"\"\"Serializes this instance to a Python dictionary.\"\"\"\n        output = copy.deepcopy(self.__dict__)\n        return output\n\n    def to_json_string(self):\n        \"\"\"Serializes this instance to a JSON string.\"\"\"\n        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + \"\\n\"\n\n    def to_json_file(self, json_file_path):\n        \"\"\" Save this instance to a json file.\"\"\"\n        with open(json_file_path, \"w\", encoding='utf-8') as writer:\n            writer.write(self.to_json_string())\n\ntry:\n    from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm\nexcept ImportError:\n    logger.info(\"Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\")\n    class BertLayerNorm(nn.Module):\n        def __init__(self, hidden_size, eps=1e-12):\n            \"\"\"Construct a layernorm module in the TF style (epsilon inside the square root).\n            \"\"\"\n            super(BertLayerNorm, self).__init__()\n            self.weight = nn.Parameter(torch.ones(hidden_size))\n            self.bias = nn.Parameter(torch.zeros(hidden_size))\n            self.variance_epsilon = eps\n\n        def forward(self, x):\n            u = x.mean(-1, keepdim=True)\n            s = (x - u).pow(2).mean(-1, keepdim=True)\n            x = (x - u) / torch.sqrt(s + self.variance_epsilon)\n            return self.weight * x + self.bias\n\nclass BertEmbeddings(nn.Module):\n    \"\"\"Construct the embeddings from 
word, position and token_type embeddings.\n    \"\"\"\n    def __init__(self, config):\n        super(BertEmbeddings, self).__init__()\n        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)\n        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)\n        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)\n\n        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load\n        # any TensorFlow checkpoint file\n        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)\n        self.dropout = nn.Dropout(config.hidden_dropout_prob)\n\n    def forward(self, input_ids, entity_pos_seg=None, entity_span1_pos=None, entity_span2_pos=None, token_type_ids=None):\n        seq_length = input_ids.size(1)\n        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)\n        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)\n        if token_type_ids is None:\n            token_type_ids = torch.zeros_like(input_ids)\n        if entity_pos_seg is None:\n            entity_pos_seg = torch.zeros_like(input_ids)\n        if entity_span1_pos is None:\n            entity_span1_pos = torch.zeros_like(input_ids)\n        if entity_span2_pos is None:\n            entity_span2_pos = torch.zeros_like(input_ids)\n        \n        words_embeddings = self.word_embeddings(input_ids)\n        position_embeddings = self.position_embeddings(position_ids)\n        token_type_embeddings = self.token_type_embeddings(token_type_ids) \n        \n        \"\"\"\n            pos embedding 0\n        \"\"\"\n        # '0','1','2' for non-entity, entity1, and entity2\n        entity_seg_pos_num = 3\n        hidden_size = words_embeddings.shape[2]\n        entity_seg_pos_embeddings_func = nn.Embedding(entity_seg_pos_num, hidden_size)\n        
entity_seg_pos_embeddings_func = entity_seg_pos_embeddings_func.cuda()\n        entity_seg_pos_embeddings = entity_seg_pos_embeddings_func(entity_pos_seg)\n        \n        \"\"\"\n            pos embedding 1\n        \"\"\"\n        num_embeddings = 2 * (seq_length - 1) + 1\n        entity_span_pos_embeddings_func = nn.Embedding(num_embeddings, hidden_size)\n        entity_span_pos_embeddings_func = entity_span_pos_embeddings_func.cuda()\n        try:\n            pass\n            #print(entity_span1_pos)\n            #entity_span1_pos_embeddings = entity_span_pos_embeddings_func(entity_span1_pos)\n            #print(entity_span2_pos)\n            #entity_span2_pos_embeddings = entity_span_pos_embeddings_func(entity_span2_pos)\n        except: \n            import pdb;pdb.set_trace()\n        \"\"\"\n            Different feature strategy\n        \"\"\"\n        # 0\n        #embeddings = words_embeddings + position_embeddings + token_type_embeddings + entity_seg_pos_embeddings\n        \n        # 0\n        #embeddings = words_embeddings + token_type_embeddings + entity_seg_pos_embeddings\n        \n        # 1\n        embeddings = words_embeddings + position_embeddings + token_type_embeddings\n        \n        # 0 \n        #embeddings = words_embeddings + position_embeddings + token_type_embeddings+entity_span1_pos_embeddings+entity_span2_pos_embeddings\n        \n        # 0\n        #embeddings = words_embeddings + token_type_embeddings+entity_span1_pos_embeddings+entity_span2_pos_embeddings\n        \n        \n        embeddings = self.LayerNorm(embeddings)\n        embeddings = self.dropout(embeddings)\n        return embeddings\n\nclass BertSelfAttention(nn.Module):\n    def __init__(self, config):\n        super(BertSelfAttention, self).__init__()\n        if config.hidden_size % config.num_attention_heads != 0:\n            raise ValueError(\n                \"The hidden size (%d) is not a multiple of the number of attention \"\n                
\"heads (%d)\" % (config.hidden_size, config.num_attention_heads))\n        self.num_attention_heads = config.num_attention_heads\n        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)\n        self.all_head_size = self.num_attention_heads * self.attention_head_size\n\n        self.query = nn.Linear(config.hidden_size, self.all_head_size)\n        self.key = nn.Linear(config.hidden_size, self.all_head_size)\n        self.value = nn.Linear(config.hidden_size, self.all_head_size)\n\n        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)\n\n    def transpose_for_scores(self, x):\n        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)\n        x = x.view(*new_x_shape)\n        return x.permute(0, 2, 1, 3)\n\n    def forward(self, hidden_states, attention_mask):\n        mixed_query_layer = self.query(hidden_states)\n        mixed_key_layer = self.key(hidden_states)\n        mixed_value_layer = self.value(hidden_states)\n\n        query_layer = self.transpose_for_scores(mixed_query_layer)\n        key_layer = self.transpose_for_scores(mixed_key_layer)\n        value_layer = self.transpose_for_scores(mixed_value_layer)\n\n        # Take the dot product between \"query\" and \"key\" to get the raw attention scores.\n        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))\n        attention_scores = attention_scores / math.sqrt(self.attention_head_size)\n        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)\n        attention_scores = attention_scores + attention_mask\n\n        # Normalize the attention scores to probabilities.\n        attention_probs = nn.Softmax(dim=-1)(attention_scores)\n\n        # This is actually dropping out entire tokens to attend to, which might\n        # seem a bit unusual, but is taken from the original Transformer paper.\n        attention_probs = self.dropout(attention_probs)\n\n        
context_layer = torch.matmul(attention_probs, value_layer)\n        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()\n        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)\n        context_layer = context_layer.view(*new_context_layer_shape)\n        return context_layer\n\n\nclass BertSelfOutput(nn.Module):\n    def __init__(self, config):\n        super(BertSelfOutput, self).__init__()\n        self.dense = nn.Linear(config.hidden_size, config.hidden_size)\n        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)\n        self.dropout = nn.Dropout(config.hidden_dropout_prob)\n\n    def forward(self, hidden_states, input_tensor):\n        hidden_states = self.dense(hidden_states)\n        hidden_states = self.dropout(hidden_states)\n        hidden_states = self.LayerNorm(hidden_states + input_tensor)\n        return hidden_states\n\n\nclass BertAttention(nn.Module):\n    def __init__(self, config):\n        super(BertAttention, self).__init__()\n        self.self = BertSelfAttention(config)\n        self.output = BertSelfOutput(config)\n\n    def forward(self, input_tensor, attention_mask):\n        self_output = self.self(input_tensor, attention_mask)\n        attention_output = self.output(self_output, input_tensor)\n        return attention_output\n\n\nclass BertIntermediate(nn.Module):\n    def __init__(self, config):\n        super(BertIntermediate, self).__init__()\n        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)\n        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):\n            self.intermediate_act_fn = ACT2FN[config.hidden_act]\n        else:\n            self.intermediate_act_fn = config.hidden_act\n\n    def forward(self, hidden_states):\n        hidden_states = self.dense(hidden_states)\n        hidden_states = self.intermediate_act_fn(hidden_states)\n        return 
hidden_states\n\n\nclass BertOutput(nn.Module):\n    def __init__(self, config):\n        super(BertOutput, self).__init__()\n        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)\n        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)\n        self.dropout = nn.Dropout(config.hidden_dropout_prob)\n\n    def forward(self, hidden_states, input_tensor):\n        hidden_states = self.dense(hidden_states)\n        hidden_states = self.dropout(hidden_states)\n        hidden_states = self.LayerNorm(hidden_states + input_tensor)\n        return hidden_states\n\n\nclass BertLayer(nn.Module):\n    def __init__(self, config):\n        super(BertLayer, self).__init__()\n        self.attention = BertAttention(config)\n        self.intermediate = BertIntermediate(config)\n        self.output = BertOutput(config)\n\n    def forward(self, hidden_states, attention_mask):\n        attention_output = self.attention(hidden_states, attention_mask)\n        intermediate_output = self.intermediate(attention_output)\n        layer_output = self.output(intermediate_output, attention_output)\n        return layer_output\n\n\nclass BertEncoder(nn.Module):\n    def __init__(self, config):\n        super(BertEncoder, self).__init__()\n        layer = BertLayer(config)\n        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])\n\n    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):\n        all_encoder_layers = []\n        for layer_module in self.layer:\n            hidden_states = layer_module(hidden_states, attention_mask)\n            if output_all_encoded_layers:\n                all_encoder_layers.append(hidden_states)\n        if not output_all_encoded_layers:\n            all_encoder_layers.append(hidden_states)\n        return all_encoder_layers\n\n\nclass BertPooler(nn.Module):\n    def __init__(self, config):\n        super(BertPooler, self).__init__()\n   
     self.dense = nn.Linear(config.hidden_size, config.hidden_size)\n        self.activation = nn.Tanh()\n\n    def forward(self, hidden_states):\n        # We \"pool\" the model by simply taking the hidden state corresponding\n        # to the first token.\n        first_token_tensor = hidden_states[:, 0]\n        pooled_output = self.dense(first_token_tensor)\n        pooled_output = self.activation(pooled_output)\n        return pooled_output\n\n\nclass BertPredictionHeadTransform(nn.Module):\n    def __init__(self, config):\n        super(BertPredictionHeadTransform, self).__init__()\n        self.dense = nn.Linear(config.hidden_size, config.hidden_size)\n        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):\n            self.transform_act_fn = ACT2FN[config.hidden_act]\n        else:\n            self.transform_act_fn = config.hidden_act\n        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)\n\n    def forward(self, hidden_states):\n        hidden_states = self.dense(hidden_states)\n        hidden_states = self.transform_act_fn(hidden_states)\n        hidden_states = self.LayerNorm(hidden_states)\n        return hidden_states\n\n\nclass BertLMPredictionHead(nn.Module):\n    def __init__(self, config, bert_model_embedding_weights):\n        super(BertLMPredictionHead, self).__init__()\n        self.transform = BertPredictionHeadTransform(config)\n\n        # The output weights are the same as the input embeddings, but there is\n        # an output-only bias for each token.\n        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),\n                                 bert_model_embedding_weights.size(0),\n                                 bias=False)\n        self.decoder.weight = bert_model_embedding_weights\n        self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))\n\n    def forward(self, hidden_states):\n        hidden_states 
= self.transform(hidden_states)\n        hidden_states = self.decoder(hidden_states) + self.bias\n        return hidden_states\n\n\nclass BertOnlyMLMHead(nn.Module):\n    def __init__(self, config, bert_model_embedding_weights):\n        super(BertOnlyMLMHead, self).__init__()\n        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)\n\n    def forward(self, sequence_output):\n        prediction_scores = self.predictions(sequence_output)\n        return prediction_scores\n\n\nclass BertOnlyNSPHead(nn.Module):\n    def __init__(self, config):\n        super(BertOnlyNSPHead, self).__init__()\n        self.seq_relationship = nn.Linear(config.hidden_size, 2)\n\n    def forward(self, pooled_output):\n        seq_relationship_score = self.seq_relationship(pooled_output)\n        return seq_relationship_score\n\n\nclass BertPreTrainingHeads(nn.Module):\n    def __init__(self, config, bert_model_embedding_weights):\n        super(BertPreTrainingHeads, self).__init__()\n        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)\n        self.seq_relationship = nn.Linear(config.hidden_size, 2)\n\n    def forward(self, sequence_output, pooled_output):\n        prediction_scores = self.predictions(sequence_output)\n        seq_relationship_score = self.seq_relationship(pooled_output)\n        return prediction_scores, seq_relationship_score\n\n\nclass BertPreTrainedModel(nn.Module):\n    \"\"\" An abstract class to handle weights initialization and\n        a simple interface for dowloading and loading pretrained models.\n    \"\"\"\n    def __init__(self, config, *inputs, **kwargs):\n        super(BertPreTrainedModel, self).__init__()\n        if not isinstance(config, BertConfig):\n            raise ValueError(\n                \"Parameter config in `{}(config)` should be an instance of class `BertConfig`. 
\"\n                \"To create a model from a Google pretrained model use \"\n                \"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`\".format(\n                    self.__class__.__name__, self.__class__.__name__\n                ))\n        self.config = config\n\n    def init_bert_weights(self, module):\n        \"\"\" Initialize the weights.\n        \"\"\"\n        if isinstance(module, (nn.Linear, nn.Embedding)):\n            # Slightly different from the TF version which uses truncated_normal for initialization\n            # cf https://github.com/pytorch/pytorch/pull/5617\n            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)\n        elif isinstance(module, BertLayerNorm):\n            module.bias.data.zero_()\n            module.weight.data.fill_(1.0)\n        if isinstance(module, nn.Linear) and module.bias is not None:\n            module.bias.data.zero_()\n\n    @classmethod\n    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):\n        \"\"\"\n        Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.\n        Download and cache the pre-trained model file if needed.\n\n        Params:\n            pretrained_model_name_or_path: either:\n                - a str with the name of a pre-trained model to load selected in the list of:\n                    . `bert-base-uncased`\n                    . `bert-large-uncased`\n                    . `bert-base-cased`\n                    . `bert-large-cased`\n                    . `bert-base-multilingual-uncased`\n                    . `bert-base-multilingual-cased`\n                    . `bert-base-chinese`\n                - a path or url to a pretrained model archive containing:\n                    . `bert_config.json` a configuration file for the model\n                    . 
`pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance\n                - a path or url to a pretrained model archive containing:\n                    . `bert_config.json` a configuration file for the model\n                    . `model.chkpt` a TensorFlow checkpoint\n            from_tf: should we load the weights from a locally saved TensorFlow checkpoint\n            cache_dir: an optional path to a folder in which the pre-trained models will be cached.\n            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models\n            *inputs, **kwargs: additional input for the specific Bert class\n                (ex: num_labels for BertForSequenceClassification)\n        \"\"\"\n        state_dict = kwargs.get('state_dict', None)\n        kwargs.pop('state_dict', None)\n        cache_dir = kwargs.get('cache_dir', None)\n        kwargs.pop('cache_dir', None)\n        from_tf = kwargs.get('from_tf', False)\n        kwargs.pop('from_tf', None)\n\n        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:\n            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]\n        else:\n            archive_file = pretrained_model_name_or_path\n        # redirect to the cache, if necessary\n        try:\n            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)\n        except EnvironmentError:\n            logger.error(\n                \"Model name '{}' was not found in model name list ({}). 
\"\n                \"We assumed '{}' was a path or url but couldn't find any file \"\n                \"associated to this path or url.\".format(\n                    pretrained_model_name_or_path,\n                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),\n                    archive_file))\n            return None\n        if resolved_archive_file == archive_file:\n            logger.info(\"loading archive file {}\".format(archive_file))\n        else:\n            logger.info(\"loading archive file {} from cache at {}\".format(\n                archive_file, resolved_archive_file))\n        tempdir = None\n        if os.path.isdir(resolved_archive_file) or from_tf:\n            serialization_dir = resolved_archive_file\n        else:\n            # Extract archive to temp dir\n            tempdir = tempfile.mkdtemp()\n            logger.info(\"extracting archive file {} to temp dir {}\".format(\n                resolved_archive_file, tempdir))\n            with tarfile.open(resolved_archive_file, 'r:gz') as archive:\n                archive.extractall(tempdir)\n            serialization_dir = tempdir\n        # Load config\n        config_file = os.path.join(serialization_dir, CONFIG_NAME)\n        if not os.path.exists(config_file):\n            # Backward compatibility with old naming format\n            config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME)\n        config = BertConfig.from_json_file(config_file)\n        logger.info(\"Model config {}\".format(config))\n        # Instantiate model.\n        model = cls(config, *inputs, **kwargs)\n        if state_dict is None and not from_tf:\n            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)\n            state_dict = torch.load(weights_path, map_location='cpu')\n        if tempdir:\n            # Clean up temp dir\n            shutil.rmtree(tempdir)\n        if from_tf:\n            # Directly load from a TensorFlow checkpoint\n            weights_path = 
os.path.join(serialization_dir, TF_WEIGHTS_NAME)\n            return load_tf_weights_in_bert(model, weights_path)\n        # Load from a PyTorch state_dict\n        old_keys = []\n        new_keys = []\n        for key in state_dict.keys():\n            new_key = None\n            if 'gamma' in key:\n                new_key = key.replace('gamma', 'weight')\n            if 'beta' in key:\n                new_key = key.replace('beta', 'bias')\n            if new_key:\n                old_keys.append(key)\n                new_keys.append(new_key)\n        for old_key, new_key in zip(old_keys, new_keys):\n            state_dict[new_key] = state_dict.pop(old_key)\n\n        missing_keys = []\n        unexpected_keys = []\n        error_msgs = []\n        # copy state_dict so _load_from_state_dict can modify it\n        metadata = getattr(state_dict, '_metadata', None)\n        state_dict = state_dict.copy()\n        if metadata is not None:\n            state_dict._metadata = metadata\n\n        def load(module, prefix=''):\n            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})\n            module._load_from_state_dict(\n                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)\n            for name, child in module._modules.items():\n                if child is not None:\n                    load(child, prefix + name + '.')\n        start_prefix = ''\n        if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()):\n            start_prefix = 'bert.'\n        load(model, prefix=start_prefix)\n        if len(missing_keys) > 0:\n            logger.info(\"Weights of {} not initialized from pretrained model: {}\".format(\n                model.__class__.__name__, missing_keys))\n        if len(unexpected_keys) > 0:\n            logger.info(\"Weights from pretrained model not used in {}: {}\".format(\n                model.__class__.__name__, unexpected_keys))\n       
 if len(error_msgs) > 0:\n            raise RuntimeError('Error(s) in loading state_dict for {}:\\n\\t{}'.format(\n                               model.__class__.__name__, \"\\n\\t\".join(error_msgs)))\n        return model\n\n\nclass BertModel(BertPreTrainedModel):\n    \"\"\"BERT model (\"Bidirectional Embedding Representations from a Transformer\").\n\n    Params:\n        config: a BertConfig class instance with the configuration to build a new model\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]\n            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts\n            `extract_features.py`, `run_classifier.py` and `run_squad.py`)\n        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token\n            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to\n            a `sentence B` token (see BERT paper for more details).\n        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices\n            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max\n            input sequence length in the current batch. It's the mask that we typically use for attention when\n            a batch has varying length sentences.\n        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.\n\n    Outputs: Tuple of (encoded_layers, pooled_output)\n        `encoded_layers`: controled by `output_all_encoded_layers` argument:\n            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end\n                of each attention block (i.e. 
12 full sequences for BERT-base, 24 for BERT-large), each\n                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],\n            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding\n                to the last attention block of shape [batch_size, sequence_length, hidden_size],\n        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a\n            classifier pretrained on top of the hidden state associated to the first character of the\n            input (`CLS`) to train on the Next-Sentence task (see BERT's paper).\n\n    Example usage:\n    ```python\n    # Already been converted into WordPiece token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])\n    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])\n\n    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,\n        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)\n\n    model = modeling.BertModel(config=config)\n    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)\n    ```\n    \"\"\"\n    def __init__(self, config):\n        super(BertModel, self).__init__(config)\n        self.embeddings = BertEmbeddings(config)\n        self.encoder = BertEncoder(config)\n        self.pooler = BertPooler(config)\n        self.apply(self.init_bert_weights)\n\n    def forward(self, input_ids, entity_seg_pos = None, entity_span1_pos=None, entity_span2_pos=None, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):\n        if attention_mask is None:\n            attention_mask = torch.ones_like(input_ids)\n        if token_type_ids is None:\n            token_type_ids = torch.zeros_like(input_ids)\n        if entity_seg_pos is None:\n            entity_seg_pos = torch.zeros_like(input_ids)\n   
     if entity_span1_pos is None:\n            entity_span1_pos = torch.zeros_like(input_ids)\n        if entity_span2_pos is None:\n            entity_span2_pos = torch.zeros_like(input_ids)\n        # We create a 3D attention mask from a 2D tensor mask.\n        # Sizes are [batch_size, 1, 1, to_seq_length]\n        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]\n        # this attention mask is more simple than the triangular masking of causal attention\n        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.\n        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)\n\n        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for\n        # masked positions, this operation will create a tensor which is 0.0 for\n        # positions we want to attend and -10000.0 for masked positions.\n        # Since we are adding it to the raw scores before the softmax, this is\n        # effectively the same as removing these entirely.\n        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility\n        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0\n        embedding_output = self.embeddings(input_ids, entity_seg_pos, entity_span1_pos, entity_span2_pos, token_type_ids)\n        encoded_layers = self.encoder(embedding_output,\n                                      extended_attention_mask,\n                                      output_all_encoded_layers=output_all_encoded_layers)\n        sequence_output = encoded_layers[-1]\n        pooled_output = self.pooler(sequence_output)\n        if not output_all_encoded_layers:\n            encoded_layers = encoded_layers[-1]\n        return encoded_layers, pooled_output\n\n\nclass BertForPreTraining(BertPreTrainedModel):\n    \"\"\"BERT model with pre-training heads.\n    This module comprises the BERT model followed by the two pre-training heads:\n        
- the masked language modeling head, and\n        - the next sentence classification head.\n\n    Params:\n        config: a BertConfig class instance with the configuration to build a new model.\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]\n            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts\n            `extract_features.py`, `run_classifier.py` and `run_squad.py`)\n        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token\n            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to\n            a `sentence B` token (see BERT paper for more details).\n        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices\n            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max\n            input sequence length in the current batch. It's the mask that we typically use for attention when\n            a batch has varying length sentences.\n        `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]\n            with indices selected in [-1, 0, ..., vocab_size]. 
All labels set to -1 are ignored (masked), the loss\n            is only computed for the labels set in [0, ..., vocab_size]\n        `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]\n            with indices selected in [0, 1].\n            0 => next sentence is the continuation, 1 => next sentence is a random sentence.\n\n    Outputs:\n        if `masked_lm_labels` and `next_sentence_label` are not `None`:\n            Outputs the total_loss which is the sum of the masked language modeling loss and the next\n            sentence classification loss.\n        if `masked_lm_labels` or `next_sentence_label` is `None`:\n            Outputs a tuple comprising\n            - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and\n            - the next sentence classification logits of shape [batch_size, 2].\n\n    Example usage:\n    ```python\n    # Already been converted into WordPiece token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])\n    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])\n\n    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,\n        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)\n\n    model = BertForPreTraining(config)\n    masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)\n    ```\n    \"\"\"\n    def __init__(self, config):\n        super(BertForPreTraining, self).__init__(config)\n        self.bert = BertModel(config)\n        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)\n        self.apply(self.init_bert_weights)\n\n    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None):\n        sequence_output, pooled_output = self.bert(input_ids, token_type_ids, 
attention_mask,\n                                                   output_all_encoded_layers=False)\n        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)\n\n        if masked_lm_labels is not None and next_sentence_label is not None:\n            loss_fct = CrossEntropyLoss(ignore_index=-1)\n            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))\n            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))\n            total_loss = masked_lm_loss + next_sentence_loss\n            return total_loss\n        else:\n            return prediction_scores, seq_relationship_score\n\n\nclass BertForMaskedLM(BertPreTrainedModel):\n    \"\"\"BERT model with the masked language modeling head.\n    This module comprises the BERT model followed by the masked language modeling head.\n\n    Params:\n        config: a BertConfig class instance with the configuration to build a new model.\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]\n            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts\n            `extract_features.py`, `run_classifier.py` and `run_squad.py`)\n        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token\n            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to\n            a `sentence B` token (see BERT paper for more details).\n        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices\n            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max\n            input sequence length in the current batch. 
It's the mask that we typically use for attention when\n            a batch has varying length sentences.\n        `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]\n            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss\n            is only computed for the labels set in [0, ..., vocab_size]\n\n    Outputs:\n        if `masked_lm_labels` is  not `None`:\n            Outputs the masked language modeling loss.\n        if `masked_lm_labels` is `None`:\n            Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].\n\n    Example usage:\n    ```python\n    # Already been converted into WordPiece token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])\n    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])\n\n    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,\n        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)\n\n    model = BertForMaskedLM(config)\n    masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)\n    ```\n    \"\"\"\n    def __init__(self, config):\n        super(BertForMaskedLM, self).__init__(config)\n        self.bert = BertModel(config)\n        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)\n        self.apply(self.init_bert_weights)\n\n    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None):\n        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask,\n                                       output_all_encoded_layers=False)\n        prediction_scores = self.cls(sequence_output)\n\n        if masked_lm_labels is not None:\n            loss_fct = CrossEntropyLoss(ignore_index=-1)\n            masked_lm_loss = loss_fct(prediction_scores.view(-1, 
self.config.vocab_size), masked_lm_labels.view(-1))\n            return masked_lm_loss\n        else:\n            return prediction_scores\n\n\nclass BertForNextSentencePrediction(BertPreTrainedModel):\n    \"\"\"BERT model with next sentence prediction head.\n    This module comprises the BERT model followed by the next sentence classification head.\n\n    Params:\n        config: a BertConfig class instance with the configuration to build a new model.\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]\n            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts\n            `extract_features.py`, `run_classifier.py` and `run_squad.py`)\n        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token\n            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to\n            a `sentence B` token (see BERT paper for more details).\n        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices\n            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max\n            input sequence length in the current batch. 
It's the mask that we typically use for attention when\n            a batch has varying length sentences.\n        `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]\n            with indices selected in [0, 1].\n            0 => next sentence is the continuation, 1 => next sentence is a random sentence.\n\n    Outputs:\n        if `next_sentence_label` is not `None`:\n            Outputs the total_loss which is the sum of the masked language modeling loss and the next\n            sentence classification loss.\n        if `next_sentence_label` is `None`:\n            Outputs the next sentence classification logits of shape [batch_size, 2].\n\n    Example usage:\n    ```python\n    # Already been converted into WordPiece token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])\n    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])\n\n    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,\n        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)\n\n    model = BertForNextSentencePrediction(config)\n    seq_relationship_logits = model(input_ids, token_type_ids, input_mask)\n    ```\n    \"\"\"\n    def __init__(self, config):\n        super(BertForNextSentencePrediction, self).__init__(config)\n        self.bert = BertModel(config)\n        self.cls = BertOnlyNSPHead(config)\n        self.apply(self.init_bert_weights)\n\n    def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None):\n        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,\n                                     output_all_encoded_layers=False)\n        seq_relationship_score = self.cls( pooled_output)\n\n        if next_sentence_label is not None:\n            loss_fct = CrossEntropyLoss(ignore_index=-1)\n            next_sentence_loss = 
loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))\n            return next_sentence_loss\n        else:\n            return seq_relationship_score\n\n\nclass BertForSequenceClassification(BertPreTrainedModel):\n    \"\"\"BERT model for classification.\n    This module is composed of the BERT model with a linear layer on top of\n    the pooled output.\n\n    Params:\n        `config`: a BertConfig class instance with the configuration to build a new model.\n        `num_labels`: the number of classes for the classifier. Default = 2.\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]\n            with the word token indices in the vocabulary. Items in the batch should begin with the special \"CLS\" token. (see the tokens preprocessing logic in the scripts\n            `extract_features.py`, `run_classifier.py` and `run_squad.py`)\n        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token\n            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to\n            a `sentence B` token (see BERT paper for more details).\n        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices\n            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max\n            input sequence length in the current batch. 
It's the mask that we typically use for attention when\n            a batch has varying length sentences.\n        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]\n            with indices selected in [0, ..., num_labels].\n\n    Outputs:\n        if `labels` is not `None`:\n            Outputs the CrossEntropy classification loss of the output with the labels.\n        if `labels` is `None`:\n            Outputs the classification logits of shape [batch_size, num_labels].\n\n    Example usage:\n    ```python\n    # Already been converted into WordPiece token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])\n    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])\n\n    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,\n        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)\n\n    num_labels = 2\n\n    model = BertForSequenceClassification(config, num_labels)\n    logits = model(input_ids, token_type_ids, input_mask)\n    ```\n    \"\"\"\n    def __init__(self, config, num_labels):\n        super(BertForSequenceClassification, self).__init__(config)\n        self.num_labels = num_labels\n        self.bert = BertModel(config)\n        self.dropout = nn.Dropout(config.hidden_dropout_prob)\n        \n        self.layernorm = nn.LayerNorm(config.hidden_size)\n        self.layernorm_concat = nn.LayerNorm(config.hidden_size * 2)\n        max_seq_length = 128\n        #self.layernorm_concat = nn.LayerNorm(config.hidden_size + max_seq_length * 2)\n        self.layernorm_span = nn.LayerNorm(max_seq_length)\n\n        self.relu = nn.ReLU()\n        \n        self.classifier = nn.Linear(config.hidden_size, num_labels)\n        self.classifier_concat = nn.Linear(config.hidden_size * 2, num_labels)\n        #self.classifier_concat = nn.Linear(config.hidden_size + max_seq_length * 2, num_labels)\n        \n       
 self.apply(self.init_bert_weights)\n\n    def forward(self, input_ids, token_type_ids=None, attention_mask=None, entity_mask=None, entity_seg_pos=None, entity_span1_pos=None, entity_span2_pos=None, labels=None):\n        encoded_layers, pooled_output = self.bert(input_ids, entity_seg_pos, entity_span1_pos, entity_span2_pos, token_type_ids, attention_mask, output_all_encoded_layers=False)\n        batch_size, max_seq_length = entity_mask.shape[0],entity_mask.shape[1] \n         \n        diag_entity_mask_ = []\n        for i in range(batch_size):\n            diag_entity_mask_.append(torch.diag(entity_mask[i]).cpu().numpy())\n        diag_entity_mask = torch.tensor(diag_entity_mask_).cuda()\n        \n        diag_entity_seg_pos_ = []\n        for i in range(batch_size):\n            diag_entity_seg_pos_.append(torch.diag(entity_seg_pos[i]).cpu().numpy())\n        diag_entity_seg_pos = torch.tensor(diag_entity_seg_pos_,dtype=torch.float).cuda()\n        \n        # Get all embedding of entity\n        #batch_entity_emb = torch.matmul(diag_entity_mask, encoded_layers)\n        \n        # Get start embedding of entity with marker\n        batch_entity_emb = torch.matmul(diag_entity_seg_pos, encoded_layers)\n        \n        \"\"\"\n            Strategy 0: concat start entity marker\n            Bug: TODO, get [1536,768,1536,1536,...]\n        \"\"\"\n        concat_tag = 0\n        if concat_tag == 1:\n            entity_marker_emb_ = []\n            for i in range(batch_size):\n                marker_index = entity_seg_pos[i]\n                per_encoded_layer = encoded_layers[i]\n                entity_marker_emb_.append(torch.index_select(per_encoded_layer, 0, torch.nonzero(marker_index).view(-1)).view(-1).detach().cpu().numpy())\n            entity_emb_output = torch.tensor(entity_marker_emb_).cuda()\n            entity_emb_output = self.dropout(entity_emb_output)\n        \n        \"\"\"\n            Strategy 1: sum all the emb of entity \n        \"\"\"\n    
    entity_emb_output = batch_entity_emb.sum(dim=1)\n        entity_emb_output = self.dropout(entity_emb_output)\n        \n        \"\"\"\n            Strategy 2: pooling the emb of entity \n                        get the max value along the embedding axis\n        \"\"\"\n        #pooling = nn.MaxPool1d(kernel_size=max_seq_length, stride=1)\n        #entity_emb_output = pooling( batch_entity_emb.permute(0,2,1) ).squeeze()\n        #entity_emb_output = self.dropout(entity_emb_output) \n        \n        \"\"\"\n            Strategy 3: mention pooling + position embedding\n        \"\"\"\n        #entity_span1_pos = self.layernorm_span(entity_span1_pos)\n        #entity_span2_pos = self.layernorm_span(entity_span2_pos)\n        #entity_span_concat = torch.cat((entity_span1_pos,entity_span2_pos),1)\n        #entity_emb_output = torch.cat((entity_span_concat, entity_emb_output),1)\n        \n        \"\"\"\n            Strategy TODO\n        \"\"\"\n        #entity_emb_output = self.layernorm(entity_emb_output)\n        #entity_emb_output = self.layernorm_concat(entity_emb_output)\n        #entity_emb_output = self.relu(entity_emb_output)\n        #import pdb;pdb.set_trace()\n        representation = entity_emb_output\n        \n        # Classifier without concat embedding[hidden_size]\n        logits = self.classifier(representation)\n        \n        # Classifier with concat embedding[hidden_size * 2]\n        #logits = self.classifier_concat(representation)\n         \n        # Classifier with [CLS]\n        #pooled_output = self.dropout(pooled_output)\n        #logits = self.classifier(pooled_output)\n        \n        if labels is not None:\n            loss_fct = CrossEntropyLoss()\n            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))\n            return loss\n        else:\n            return logits\n\n\nclass BertForMultipleChoice(BertPreTrainedModel):\n    \"\"\"BERT model for multiple choice tasks.\n    This module is composed 
of the BERT model with a linear layer on top of\n    the pooled output.\n\n    Params:\n        `config`: a BertConfig class instance with the configuration to build a new model.\n        `num_choices`: the number of classes for the classifier. Default = 2.\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]\n            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts\n            `extract_features.py`, `run_classifier.py` and `run_squad.py`)\n        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]\n            with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`\n            and type 1 corresponds to a `sentence B` token (see BERT paper for more details).\n        `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices\n            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max\n            input sequence length in the current batch. 
It's the mask that we typically use for attention when\n            a batch has varying length sentences.\n        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]\n            with indices selected in [0, ..., num_choices].\n\n    Outputs:\n        if `labels` is not `None`:\n            Outputs the CrossEntropy classification loss of the output with the labels.\n        if `labels` is `None`:\n            Outputs the classification logits of shape [batch_size, num_labels].\n\n    Example usage:\n    ```python\n    # Already been converted into WordPiece token ids\n    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])\n    input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])\n    token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])\n    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,\n        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)\n\n    num_choices = 2\n\n    model = BertForMultipleChoice(config, num_choices)\n    logits = model(input_ids, token_type_ids, input_mask)\n    ```\n    \"\"\"\n    def __init__(self, config, num_choices):\n        super(BertForMultipleChoice, self).__init__(config)\n        self.num_choices = num_choices\n        self.bert = BertModel(config)\n        self.dropout = nn.Dropout(config.hidden_dropout_prob)\n        self.classifier = nn.Linear(config.hidden_size, 1)\n        self.apply(self.init_bert_weights)\n\n    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):\n        flat_input_ids = input_ids.view(-1, input_ids.size(-1))\n        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None\n        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None\n        _, pooled_output = self.bert(flat_input_ids, 
flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False)\n        pooled_output = self.dropout(pooled_output)\n        logits = self.classifier(pooled_output)\n        reshaped_logits = logits.view(-1, self.num_choices)\n\n        if labels is not None:\n            loss_fct = CrossEntropyLoss()\n            loss = loss_fct(reshaped_logits, labels)\n            return loss\n        else:\n            return reshaped_logits\n\n\nclass BertForTokenClassification(BertPreTrainedModel):\n    \"\"\"BERT model for token-level classification.\n    This module is composed of the BERT model with a linear layer on top of\n    the full hidden state of the last layer.\n\n    Params:\n        `config`: a BertConfig class instance with the configuration to build a new model.\n        `num_labels`: the number of classes for the classifier. Default = 2.\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]\n            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts\n            `extract_features.py`, `run_classifier.py` and `run_squad.py`)\n        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token\n            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to\n            a `sentence B` token (see BERT paper for more details).\n        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices\n            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max\n            input sequence length in the current batch. 
It's the mask that we typically use for attention when\n            a batch has varying length sentences.\n        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]\n            with indices selected in [0, ..., num_labels].\n\n    Outputs:\n        if `labels` is not `None`:\n            Outputs the CrossEntropy classification loss of the output with the labels.\n        if `labels` is `None`:\n            Outputs the classification logits of shape [batch_size, sequence_length, num_labels].\n\n    Example usage:\n    ```python\n    # Already been converted into WordPiece token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])\n    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])\n\n    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,\n        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)\n\n    num_labels = 2\n\n    model = BertForTokenClassification(config, num_labels)\n    logits = model(input_ids, token_type_ids, input_mask)\n    ```\n    \"\"\"\n    def __init__(self, config, num_labels):\n        super(BertForTokenClassification, self).__init__(config)\n        self.num_labels = num_labels\n        self.bert = BertModel(config)\n        self.dropout = nn.Dropout(config.hidden_dropout_prob)\n        self.classifier = nn.Linear(config.hidden_size, num_labels)\n        self.apply(self.init_bert_weights)\n\n    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):\n        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)\n        sequence_output = self.dropout(sequence_output)\n        logits = self.classifier(sequence_output)\n\n        if labels is not None:\n            loss_fct = CrossEntropyLoss()\n            # Only keep active parts of the loss\n            if attention_mask is not 
None:\n                active_loss = attention_mask.view(-1) == 1\n                active_logits = logits.view(-1, self.num_labels)[active_loss]\n                active_labels = labels.view(-1)[active_loss]\n                loss = loss_fct(active_logits, active_labels)\n            else:\n                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))\n            return logits,loss\n        else:\n            return logits\n\n\nclass BertForQuestionAnswering(BertPreTrainedModel):\n    \"\"\"BERT model for Question Answering (span extraction).\n    This module is composed of the BERT model with a linear layer on top of\n    the sequence output that computes start_logits and end_logits\n\n    Params:\n        `config`: a BertConfig class instance with the configuration to build a new model.\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]\n            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts\n            `extract_features.py`, `run_classifier.py` and `run_squad.py`)\n        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token\n            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to\n            a `sentence B` token (see BERT paper for more details).\n        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices\n            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max\n            input sequence length in the current batch. 
It's the mask that we typically use for attention when\n            a batch has varying length sentences.\n        `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].\n            Positions are clamped to the length of the sequence and position outside of the sequence are not taken\n            into account for computing the loss.\n        `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].\n            Positions are clamped to the length of the sequence and position outside of the sequence are not taken\n            into account for computing the loss.\n\n    Outputs:\n        if `start_positions` and `end_positions` are not `None`:\n            Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.\n        if `start_positions` or `end_positions` is `None`:\n            Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end\n            position tokens of shape [batch_size, sequence_length].\n\n    Example usage:\n    ```python\n    # Already been converted into WordPiece token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])\n    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])\n\n    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,\n        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)\n\n    model = BertForQuestionAnswering(config)\n    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)\n    ```\n    \"\"\"\n    def __init__(self, config):\n        super(BertForQuestionAnswering, self).__init__(config)\n        self.bert = BertModel(config)\n        # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version\n        # self.dropout = 
nn.Dropout(config.hidden_dropout_prob)\n        self.qa_outputs = nn.Linear(config.hidden_size, 2)\n        self.apply(self.init_bert_weights)\n\n    def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None):\n        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)\n        logits = self.qa_outputs(sequence_output)\n        start_logits, end_logits = logits.split(1, dim=-1)\n        start_logits = start_logits.squeeze(-1)\n        end_logits = end_logits.squeeze(-1)\n\n        if start_positions is not None and end_positions is not None:\n            # If we are on multi-GPU, split add a dimension\n            if len(start_positions.size()) > 1:\n                start_positions = start_positions.squeeze(-1)\n            if len(end_positions.size()) > 1:\n                end_positions = end_positions.squeeze(-1)\n            # sometimes the start/end positions are outside our model inputs, we ignore these terms\n            ignored_index = start_logits.size(1)\n            start_positions.clamp_(0, ignored_index)\n            end_positions.clamp_(0, ignored_index)\n\n            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)\n            start_loss = loss_fct(start_logits, start_positions)\n            end_loss = loss_fct(end_logits, end_positions)\n            total_loss = (start_loss + end_loss) / 2\n            return total_loss\n        else:\n            return start_logits, end_logits\n"
  },
  {
    "path": "pytorch_pretrained_bert/modeling_gpt2.py",
    "content": "# coding=utf-8\n# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"PyTorch OpenAI GPT-2 model.\"\"\"\n\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport collections\nimport copy\nimport json\nimport logging\nimport math\nimport os\nimport sys\nfrom io import open\n\nimport torch\nimport torch.nn as nn\nfrom torch.nn import CrossEntropyLoss\nfrom torch.nn.parameter import Parameter\n\nfrom .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME\nfrom .modeling import BertLayerNorm as LayerNorm\n\nlogger = logging.getLogger(__name__)\n\nPRETRAINED_MODEL_ARCHIVE_MAP = {\"gpt2\": \"https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin\",\n                                \"gpt2-medium\": \"https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin\"}\nPRETRAINED_CONFIG_ARCHIVE_MAP = {\"gpt2\": \"https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json\",\n                                 \"gpt2-medium\": \"https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json\"}\n\ndef prune_conv1d_layer(layer, index, dim=1):\n    \"\"\" Prune a Conv1D layer (a model parameters) to keep only entries in index.\n        A Conv1D work as a Linear layer (see e.g. 
BERT) but the weights are transposed.\n        Return the pruned layer as a new layer with requires_grad=True.\n        Used to remove heads.\n    \"\"\"\n    index = index.to(layer.weight.device)\n    W = layer.weight.index_select(dim, index).clone().detach()\n    if dim == 0:\n        b = layer.bias.clone().detach()\n    else:\n        b = layer.bias[index].clone().detach()\n    new_size = list(layer.weight.size())\n    new_size[dim] = len(index)\n    new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)\n    new_layer.weight.requires_grad = False\n    new_layer.weight.copy_(W.contiguous())\n    new_layer.weight.requires_grad = True\n    new_layer.bias.requires_grad = False\n    new_layer.bias.copy_(b.contiguous())\n    new_layer.bias.requires_grad = True\n    return new_layer\n\n\ndef load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):\n    \"\"\" Load tf checkpoints in a pytorch model\n    \"\"\"\n    try:\n        import re\n        import numpy as np\n        import tensorflow as tf\n    except ImportError:\n        print(\"Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. 
Please see \"\n            \"https://www.tensorflow.org/install/ for installation instructions.\")\n        raise\n    tf_path = os.path.abspath(gpt2_checkpoint_path)\n    print(\"Converting TensorFlow checkpoint from {}\".format(tf_path))\n    # Load weights from TF model\n    init_vars = tf.train.list_variables(tf_path)\n    names = []\n    arrays = []\n    for name, shape in init_vars:\n        print(\"Loading TF weight {} with shape {}\".format(name, shape))\n        array = tf.train.load_variable(tf_path, name)\n        names.append(name)\n        arrays.append(array.squeeze())\n\n    for name, array in zip(names, arrays):\n        name = name[6:]  # skip \"model/\"\n        name = name.split('/')\n        pointer = model\n        for m_name in name:\n            if re.fullmatch(r'[A-Za-z]+\\d+', m_name):\n                l = re.split(r'(\\d+)', m_name)\n            else:\n                l = [m_name]\n            if l[0] == 'w' or l[0] == 'g':\n                pointer = getattr(pointer, 'weight')\n            elif l[0] == 'b':\n                pointer = getattr(pointer, 'bias')\n            elif l[0] == 'wpe' or l[0] == 'wte':\n                pointer = getattr(pointer, l[0])\n                pointer = getattr(pointer, 'weight')\n            else:\n                pointer = getattr(pointer, l[0])\n            if len(l) >= 2:\n                num = int(l[1])\n                pointer = pointer[num]\n        try:\n            assert pointer.shape == array.shape\n        except AssertionError as e:\n            e.args += (pointer.shape, array.shape)\n            raise\n        print(\"Initialize PyTorch weight {}\".format(name))\n        pointer.data = torch.from_numpy(array)\n    return model\n\n\ndef gelu(x):\n    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))\n\n\nclass GPT2Config(object):\n    \"\"\"Configuration class to store the configuration of a `GPT2Model`.\n    \"\"\"\n\n    def __init__(\n        self,\n   
     vocab_size_or_config_json_file=50257,\n        n_special=0,\n        n_positions=1024,\n        n_ctx=1024,\n        n_embd=768,\n        n_layer=12,\n        n_head=12,\n        resid_pdrop=0.1,\n        embd_pdrop=0.1,\n        attn_pdrop=0.1,\n        layer_norm_epsilon=1e-5,\n        initializer_range=0.02,\n        predict_special_tokens=True\n    ):\n        \"\"\"Constructs GPT2Config.\n\n        Args:\n            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.\n            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)\n            n_positions: Number of positional embeddings.\n            n_ctx: Size of the causal mask (usually same as n_positions).\n            n_embd: Dimensionality of the embeddings and hidden states.\n            n_layer: Number of hidden layers in the Transformer encoder.\n            n_head: Number of attention heads for each attention layer in\n                the Transformer encoder.\n            layer_norm_epsilon: epsilon to use in the layer norm layers\n            resid_pdrop: The dropout probabilitiy for all fully connected\n                layers in the embeddings, encoder, and pooler.\n            attn_pdrop: The dropout ratio for the attention\n                probabilities.\n            embd_pdrop: The dropout ratio for the embeddings.\n            initializer_range: The sttdev of the truncated_normal_initializer for\n                initializing all weight matrices.\n            predict_special_tokens: should we predict special tokens (when the model has a LM head)\n        \"\"\"\n        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2\n                        and isinstance(vocab_size_or_config_json_file, unicode)):\n            with open(vocab_size_or_config_json_file, \"r\", encoding=\"utf-8\") as reader:\n                json_config = json.loads(reader.read())\n            
for key, value in json_config.items():\n                self.__dict__[key] = value\n        elif isinstance(vocab_size_or_config_json_file, int):\n            self.vocab_size = vocab_size_or_config_json_file\n            self.n_special = n_special\n            self.n_ctx = n_ctx\n            self.n_positions = n_positions\n            self.n_embd = n_embd\n            self.n_layer = n_layer\n            self.n_head = n_head\n            self.resid_pdrop = resid_pdrop\n            self.embd_pdrop = embd_pdrop\n            self.attn_pdrop = attn_pdrop\n            self.layer_norm_epsilon = layer_norm_epsilon\n            self.initializer_range = initializer_range\n            self.predict_special_tokens = predict_special_tokens\n        else:\n            raise ValueError(\n                \"First argument must be either a vocabulary size (int)\"\n                \"or the path to a pretrained model config file (str)\"\n            )\n\n    @property\n    def total_tokens_embeddings(self):\n        return self.vocab_size + self.n_special\n\n    @classmethod\n    def from_dict(cls, json_object):\n        \"\"\"Constructs a `GPT2Config` from a Python dictionary of parameters.\"\"\"\n        config = GPT2Config(vocab_size_or_config_json_file=-1)\n        for key, value in json_object.items():\n            config.__dict__[key] = value\n        return config\n\n    @classmethod\n    def from_json_file(cls, json_file):\n        \"\"\"Constructs a `GPT2Config` from a json file of parameters.\"\"\"\n        with open(json_file, \"r\", encoding=\"utf-8\") as reader:\n            text = reader.read()\n        return cls.from_dict(json.loads(text))\n\n    def __repr__(self):\n        return str(self.to_json_string())\n\n    def to_dict(self):\n        \"\"\"Serializes this instance to a Python dictionary.\"\"\"\n        output = copy.deepcopy(self.__dict__)\n        return output\n\n    def to_json_string(self):\n        \"\"\"Serializes this instance to a JSON string.\"\"\"\n   
     return json.dumps(self.to_dict(), indent=2, sort_keys=True) + \"\\n\"\n\n    def to_json_file(self, json_file_path):\n        \"\"\" Save this instance to a json file.\"\"\"\n        with open(json_file_path, \"w\", encoding='utf-8') as writer:\n            writer.write(self.to_json_string())\n\n\nclass Conv1D(nn.Module):\n    def __init__(self, nf, nx):\n        super(Conv1D, self).__init__()\n        self.nf = nf\n        w = torch.empty(nx, nf)\n        nn.init.normal_(w, std=0.02)\n        self.weight = Parameter(w)\n        self.bias = Parameter(torch.zeros(nf))\n\n    def forward(self, x):\n        size_out = x.size()[:-1] + (self.nf,)\n        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)\n        x = x.view(*size_out)\n        return x\n\n\nclass Attention(nn.Module):\n    def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):\n        super(Attention, self).__init__()\n        n_state = nx  # in Attention: n_state=768 (nx=n_embd)\n        # [switch nx => n_state from Block to Attention to keep identical to TF implem]\n        assert n_state % config.n_head == 0\n        self.register_buffer(\"bias\", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))\n        self.n_head = config.n_head\n        self.split_size = n_state\n        self.scale = scale\n\n        self.output_attentions = output_attentions\n        self.keep_multihead_output = keep_multihead_output\n        self.multihead_output = None\n\n        self.c_attn = Conv1D(n_state * 3, nx)\n        self.c_proj = Conv1D(n_state, nx)\n        self.attn_dropout = nn.Dropout(config.attn_pdrop)\n        self.resid_dropout = nn.Dropout(config.resid_pdrop)\n\n    def prune_heads(self, heads):\n        if len(heads) == 0:\n            return\n        mask = torch.ones(self.n_head, self.split_size // self.n_head)\n        for head in heads:\n            mask[head] = 0\n        mask = mask.view(-1).contiguous().eq(1)\n        
index = torch.arange(len(mask))[mask].long()\n        index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])\n        # Prune conv1d layers\n        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)\n        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)\n        # Update hyper params\n        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))\n        self.n_head = self.n_head - len(heads)\n\n    def _attn(self, q, k, v, head_mask=None):\n        w = torch.matmul(q, k)\n        if self.scale:\n            w = w / math.sqrt(v.size(-1))\n        nd, ns = w.size(-2), w.size(-1)\n        b = self.bias[:, :, ns-nd:ns, :ns]\n        w = w * b - 1e4 * (1 - b)\n\n        w = nn.Softmax(dim=-1)(w)\n        w = self.attn_dropout(w)\n\n        # Mask heads if we want to\n        if head_mask is not None:\n            w = w * head_mask\n\n        if self.output_attentions:\n            return w, torch.matmul(w, v)\n        return torch.matmul(w, v)\n\n    def merge_heads(self, x):\n        x = x.permute(0, 2, 1, 3).contiguous()\n        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)\n        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states\n\n    def split_heads(self, x, k=False):\n        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)\n        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states\n        if k:\n            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)\n        else:\n            return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)\n\n    def forward(self, x, layer_past=None, head_mask=None):\n        x = self.c_attn(x)\n        query, key, value = x.split(self.split_size, dim=2)\n        query = self.split_heads(query)\n        key = self.split_heads(key, k=True)\n        value = self.split_heads(value)\n        if layer_past is not None:\n            
past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1]  # transpose back cf below\n            key = torch.cat((past_key, key), dim=-1)\n            value = torch.cat((past_value, value), dim=-2)\n        present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking\n\n        a = self._attn(query, key, value, head_mask)\n        if self.keep_multihead_output:\n            self.multihead_output = a\n            self.multihead_output.retain_grad()\n\n        if self.output_attentions:\n            attentions, a = a\n        a = self.merge_heads(a)\n        a = self.c_proj(a)\n        a = self.resid_dropout(a)\n        if self.output_attentions:\n            return attentions, a, present\n        return a, present\n\n\nclass MLP(nn.Module):\n    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)\n        super(MLP, self).__init__()\n        nx = config.n_embd\n        self.c_fc = Conv1D(n_state, nx)\n        self.c_proj = Conv1D(nx, n_state)\n        self.act = gelu\n        self.dropout = nn.Dropout(config.resid_pdrop)\n\n    def forward(self, x):\n        h = self.act(self.c_fc(x))\n        h2 = self.c_proj(h)\n        return self.dropout(h2)\n\n\nclass Block(nn.Module):\n    def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):\n        super(Block, self).__init__()\n        nx = config.n_embd\n        self.output_attentions = output_attentions\n        self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)\n        self.attn = Attention(nx, n_ctx, config, scale, output_attentions, keep_multihead_output)\n        self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)\n        self.mlp = MLP(4 * nx, config)\n\n    def forward(self, x, layer_past=None, head_mask=None):\n        output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask)\n        if self.output_attentions:\n            attentions, a, present = 
output_attn\n        else:\n            a, present = output_attn\n        x = x + a\n        m = self.mlp(self.ln_2(x))\n        x = x + m\n        if self.output_attentions:\n            return attentions, x, present\n        return x, present\n\n\nclass GPT2LMHead(nn.Module):\n    \"\"\" Language Model Head for the transformer \"\"\"\n\n    def __init__(self, model_embeddings_weights, config):\n        super(GPT2LMHead, self).__init__()\n        self.n_embd = config.n_embd\n        self.vocab_size = config.vocab_size\n        self.predict_special_tokens = config.predict_special_tokens\n        embed_shape = model_embeddings_weights.shape\n        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)\n        self.set_embeddings_weights(model_embeddings_weights)\n\n    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):\n        self.predict_special_tokens = predict_special_tokens\n        self.decoder.weight = model_embeddings_weights  # Tied weights\n\n    def forward(self, hidden_state):\n        lm_logits = self.decoder(hidden_state)\n        if not self.predict_special_tokens:\n            lm_logits = lm_logits[..., :self.vocab_size]\n        return lm_logits\n\n\nclass GPT2MultipleChoiceHead(nn.Module):\n    \"\"\" Classifier Head for the transformer \"\"\"\n\n    def __init__(self, config):\n        super(GPT2MultipleChoiceHead, self).__init__()\n        self.n_embd = config.n_embd\n        self.dropout = nn.Dropout2d(config.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation\n        self.linear = nn.Linear(config.n_embd, 1)\n\n        nn.init.normal_(self.linear.weight, std=0.02)\n        nn.init.normal_(self.linear.bias, 0)\n\n    def forward(self, hidden_states, mc_token_ids):\n        # Classification logits\n        # hidden_state (bsz, num_choices, seq_length, hidden_size)\n        # mc_token_ids (bsz, num_choices)\n        mc_token_ids = 
mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))\n        # (bsz, num_choices, 1, hidden_size)\n        multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)\n        # (bsz, num_choices, hidden_size)\n        multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2)\n        multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)\n        # (bsz, num_choices)\n        return multiple_choice_logits\n\n\nclass GPT2PreTrainedModel(nn.Module):\n    \"\"\" An abstract class to handle weights initialization and\n        a simple interface for dowloading and loading pretrained models.\n    \"\"\"\n\n    def __init__(self, config, *inputs, **kwargs):\n        super(GPT2PreTrainedModel, self).__init__()\n        if not isinstance(config, GPT2Config):\n            raise ValueError(\n                \"Parameter config in `{}(config)` should be an instance of class `GPT2Config`. \"\n                \"To create a model from a pretrained model use \"\n                \"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`\".format(\n                    self.__class__.__name__, self.__class__.__name__\n                )\n            )\n        self.config = config\n\n    def init_weights(self, module):\n        \"\"\" Initialize the weights.\n        \"\"\"\n        if isinstance(module, (nn.Linear, nn.Embedding)):\n            # Slightly different from the TF version which uses truncated_normal for initialization\n            # cf https://github.com/pytorch/pytorch/pull/5617\n            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)\n        elif isinstance(module, LayerNorm):\n            module.bias.data.zero_()\n            module.weight.data.fill_(1.0)\n        if isinstance(module, nn.Linear) and module.bias is not None:\n            module.bias.data.zero_()\n\n    @classmethod\n    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):\n     
   \"\"\"\n        Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict.\n        Download and cache the pre-trained model file if needed.\n\n        Params:\n            pretrained_model_name_or_path: either:\n                - a str with the name of a pre-trained model to load selected in the list of:\n                    . `gpt2`\n                - a path or url to a pretrained model archive containing:\n                    . `gpt2_config.json` a configuration file for the model\n                    . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance\n                - a path or url to a pretrained model archive containing:\n                    . `gpt2_config.json` a configuration file for the model\n                    . a TensorFlow checkpoint with trained weights\n            from_tf: should we load the weights from a locally saved TensorFlow checkpoint\n            cache_dir: an optional path to a folder in which the pre-trained models will be cached.\n            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models\n            *inputs, **kwargs: additional input for the specific GPT2 class\n        \"\"\"\n        state_dict = kwargs.get('state_dict', None)\n        kwargs.pop('state_dict', None)\n        cache_dir = kwargs.get('cache_dir', None)\n        kwargs.pop('cache_dir', None)\n        from_tf = kwargs.get('from_tf', False)\n        kwargs.pop('from_tf', None)\n        num_special_tokens = kwargs.get('num_special_tokens', None)\n        kwargs.pop('num_special_tokens', None)\n\n        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:\n            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]\n            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]\n        else:\n            archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)\n            config_file = 
os.path.join(pretrained_model_name_or_path, CONFIG_NAME)\n        # redirect to the cache, if necessary\n        try:\n            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)\n        except EnvironmentError:\n            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:\n                logger.error(\n                    \"Couldn't reach server at '{}' to download pretrained weights.\".format(\n                        archive_file))\n            else:\n                logger.error(\n                    \"Model name '{}' was not found in model name list ({}). \"\n                    \"We assumed '{}' was a path or url but couldn't find file {} \"\n                    \"at this path or url.\".format(\n                        pretrained_model_name_or_path, \", \".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,\n                        archive_file\n                    )\n                )\n            return None\n        try:\n            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)\n        except EnvironmentError:\n            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:\n                logger.error(\n                    \"Couldn't reach server at '{}' to download pretrained model configuration file.\".format(\n                        config_file))\n            else:\n                logger.error(\n                    \"Model name '{}' was not found in model name list ({}). 
\"\n                    \"We assumed '{}' was a path or url but couldn't find file {} \"\n                    \"at this path or url.\".format(\n                        pretrained_model_name_or_path, \", \".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,\n                        config_file\n                    )\n                )\n            return None\n        if resolved_archive_file == archive_file and resolved_config_file == config_file:\n            logger.info(\"loading weights file {}\".format(archive_file))\n            logger.info(\"loading configuration file {}\".format(config_file))\n        else:\n            logger.info(\"loading weights file {} from cache at {}\".format(\n                archive_file, resolved_archive_file))\n            logger.info(\"loading configuration file {} from cache at {}\".format(\n                config_file, resolved_config_file))\n        # Load config\n        config = GPT2Config.from_json_file(resolved_config_file)\n        logger.info(\"Model config {}\".format(config))\n        # Instantiate model.\n        model = cls(config, *inputs, **kwargs)\n        if state_dict is None and not from_tf:\n            state_dict = torch.load(resolved_archive_file, map_location='cpu')\n        if from_tf:\n            # Directly load from a TensorFlow checkpoint (stored as NumPy array)\n            return load_tf_weights_in_gpt2(model, resolved_archive_file)\n\n        old_keys = []\n        new_keys = []\n        for key in state_dict.keys():\n            new_key = None\n            if key.endswith(\".g\"):\n                new_key = key[:-2] + \".weight\"\n            elif key.endswith(\".b\"):\n                new_key = key[:-2] + \".bias\"\n            elif key.endswith(\".w\"):\n                new_key = key[:-2] + \".weight\"\n            if new_key:\n                old_keys.append(key)\n                new_keys.append(new_key)\n        for old_key, new_key in zip(old_keys, new_keys):\n          
  state_dict[new_key] = state_dict.pop(old_key)\n\n        missing_keys = []\n        unexpected_keys = []\n        error_msgs = []\n        # copy state_dict so _load_from_state_dict can modify it\n        metadata = getattr(state_dict, \"_metadata\", None)\n        state_dict = state_dict.copy()\n        if metadata is not None:\n            state_dict._metadata = metadata\n\n        def load(module, prefix=\"\"):\n            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})\n            module._load_from_state_dict(\n                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs\n            )\n            for name, child in module._modules.items():\n                if child is not None:\n                    load(child, prefix + name + \".\")\n\n        start_model = model\n        if hasattr(model, \"transformer\") and all(not s.startswith('transformer.') for s in state_dict.keys()):\n            start_model = model.transformer\n        load(start_model, prefix=\"\")\n\n        if len(missing_keys) > 0:\n            logger.info(\n                \"Weights of {} not initialized from pretrained model: {}\".format(model.__class__.__name__, missing_keys)\n            )\n        if len(unexpected_keys) > 0:\n            logger.info(\n                \"Weights from pretrained model not used in {}: {}\".format(model.__class__.__name__, unexpected_keys)\n            )\n        if len(error_msgs) > 0:\n            raise RuntimeError(\n                \"Error(s) in loading state_dict for {}:\\n\\t{}\".format(model.__class__.__name__, \"\\n\\t\".join(error_msgs))\n            )\n\n        # Add additional embeddings for special tokens if needed\n        # This step also make sure we are still sharing the output and input embeddings after loading weights\n        model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special)\n        return model\n\n\nclass 
GPT2Model(GPT2PreTrainedModel):\n    \"\"\"OpenAI GPT-2 model (\"Language Models are Unsupervised Multitask Learners\").\n\n    GPT-2 use a single embedding matrix to store the word and special embeddings.\n    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...\n    Special tokens need to be trained during the fine-tuning if you use them.\n    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.\n\n    The embeddings are ordered as follow in the token embeddings matrice:\n        [0,                                                         ----------------------\n         ...                                                        -> word embeddings\n         config.vocab_size - 1,                                     ______________________\n         config.vocab_size,\n         ...                                                        -> special embeddings\n         config.vocab_size + config.n_special - 1]                  ______________________\n\n    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:\n        total_tokens_embeddings = config.vocab_size + config.n_special\n    You should use the associate indices to index the embeddings.\n\n    Params:\n        `config`: a GPT2Config class instance with the configuration to build a new model\n        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False\n        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.\n            This can be used to compute head importance metrics. Default: False\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]\n            were d_1 ... 
d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[\n        `position_ids`: an optional torch.LongTensor with the same shape as input_ids\n            with the position indices (selected in the range [0, config.n_positions - 1[.\n        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids\n            You can use it to add a third type of embedding to each input token in the sequence\n            (the previous two being the word and position embeddings).\n            The input, position and token_type embeddings are summed inside the Transformer before the first\n            self-attention block.\n        `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states\n            (key and values in the attention blocks) to speed up sequential decoding\n            (this is the presents output of the model, cf. below).\n        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.\n            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.\n\n    Outputs a tuple consisting of:\n        `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)\n            as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]\n            (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)\n        `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as\n            torch.FloatTensors. 
They can be reused to speed up sequential decoding.\n\n    Example usage:\n    ```python\n    # Already been converted into BPE token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n\n    config = modeling_gpt2.GPT2Config()\n\n    model = modeling_gpt2.GPT2Model(config)\n    hidden_states, presents = model(input_ids)\n    ```\n    \"\"\"\n\n    def __init__(self, config, output_attentions=False, keep_multihead_output=False):\n        super(GPT2Model, self).__init__(config)\n        self.output_attentions = output_attentions\n        self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd)\n        self.wpe = nn.Embedding(config.n_positions, config.n_embd)\n        self.drop = nn.Dropout(config.embd_pdrop)\n        block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions,\n                                                        keep_multihead_output=keep_multihead_output)\n        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])\n        self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)\n\n        self.apply(self.init_weights)\n\n    def set_num_special_tokens(self, num_special_tokens):\n        \" Update input embeddings with new embedding matrice if needed \"\n        if self.config.n_special == num_special_tokens:\n            return\n        # Update config\n        self.config.n_special = num_special_tokens\n        # Build new embeddings and initialize all new embeddings (in particular the special tokens)\n        old_embed = self.wte\n        self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)\n        self.wte.to(old_embed.weight.device)\n        self.init_weights(self.wte)\n        # Copy word embeddings from the previous weights\n        self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]\n\n    def prune_heads(self, heads_to_prune):\n        \"\"\" Prunes heads of the 
model.\n            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}\n        \"\"\"\n        for layer, heads in heads_to_prune.items():\n            self.h[layer].attn.prune_heads(heads)\n\n    def get_multihead_outputs(self):\n        \"\"\" Gather all multi-head outputs.\n            Return: list (layers) of multihead module outputs with gradients\n        \"\"\"\n        return [h.attn.multihead_output for h in self.h]\n\n    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):\n        if past is None:\n            past_length = 0\n            past = [None] * len(self.h)\n        else:\n            past_length = past[0][0].size(-2)\n        if position_ids is None:\n            position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)\n            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)\n\n        # Prepare head mask if needed\n        # 1.0 in head_mask indicate we keep the head\n        # attention_probs has shape bsz x n_heads x N x N\n        # head_mask has shape n_layer x batch x n_heads x N x N\n        if head_mask is not None:\n            if head_mask.dim() == 1:\n                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)\n                head_mask = head_mask.expand_as(self.config.n_layer, -1, -1, -1, -1)\n            elif head_mask.dim() == 2:\n                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer\n            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility\n        else:\n            head_mask = [None] * self.config.n_layer\n\n        input_shape = input_ids.size()\n        input_ids = input_ids.view(-1, input_ids.size(-1))\n        position_ids = position_ids.view(-1, position_ids.size(-1))\n\n        inputs_embeds = self.wte(input_ids)\n 
       position_embeds = self.wpe(position_ids)\n        if token_type_ids is not None:\n            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))\n            token_type_embeds = self.wte(token_type_ids)\n        else:\n            token_type_embeds = 0\n        hidden_states = inputs_embeds + position_embeds + token_type_embeds\n        hidden_states = self.drop(hidden_states)\n\n        output_shape = input_shape + (hidden_states.size(-1),)\n\n        presents = []\n        all_attentions = []\n        all_hidden_states = []\n        for i, (block, layer_past) in enumerate(zip(self.h, past)):\n            all_hidden_states.append(hidden_states.view(*output_shape))\n            outputs = block(hidden_states, layer_past, head_mask[i])\n            if self.output_attentions:\n                attentions, hidden_states, present = outputs\n                all_attentions.append(attentions)\n            else:\n                hidden_states, present = outputs\n            presents.append(present)\n        hidden_states = self.ln_f(hidden_states)\n        all_hidden_states.append(hidden_states.view(*output_shape))\n\n        if self.output_attentions:\n            return all_attentions, all_hidden_states, presents\n        return all_hidden_states, presents\n\n\nclass GPT2LMHeadModel(GPT2PreTrainedModel):\n    \"\"\"OpenAI GPT-2 model with a Language Modeling head (\"Language Models are Unsupervised Multitask Learners\").\n\n    Params:\n        `config`: a GPT2Config class instance with the configuration to build a new model\n        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False\n        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.\n            This can be used to compute head importance metrics. 
Default: False\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]\n            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[\n        `position_ids`: an optional torch.LongTensor with the same shape as input_ids\n            with the position indices (selected in the range [0, config.n_positions - 1[.\n        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids\n            You can use it to add a third type of embedding to each input token in the sequence\n            (the previous two being the word and position embeddings).\n            The input, position and token_type embeddings are summed inside the Transformer before the first\n            self-attention block.\n        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]\n            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss\n            is only computed for the labels set in [0, ..., vocab_size]\n        `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states\n            (key and values in the attention blocks) to speed up sequential decoding\n            (this is the presents output of the model, cf. below).\n        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.\n            It's a mask to be used to nullify some heads of the transformer. 
1.0 => head is fully masked, 0.0 => head is not masked.\n\n    Outputs:\n        if `lm_labels` is not `None`:\n            Outputs the language modeling loss.\n        else a tuple:\n            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, config.vocab_size]\n                (or more generally [d_1, ..., d_n, config.vocab_size] were d_1 ... d_n are the dimension of input_ids)\n            `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as\n                torch.FloatTensors. They can be reused to speed up sequential decoding.\n\n    Example usage:\n    ```python\n    # Already been converted into BPE token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n\n    config = modeling_gpt2.GPT2Config()\n\n    model = modeling_gpt2.GPT2LMHeadModel(config)\n    lm_logits, presents = model(input_ids)\n    ```\n    \"\"\"\n\n    def __init__(self, config, output_attentions=False, keep_multihead_output=False):\n        super(GPT2LMHeadModel, self).__init__(config)\n        self.transformer = GPT2Model(config, output_attentions=output_attentions,\n                                             keep_multihead_output=keep_multihead_output)\n        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)\n        self.apply(self.init_weights)\n\n    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):\n        \"\"\" Update input and output embeddings with new embedding matrice\n            Make sure we are sharing the embeddings\n        \"\"\"\n        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens\n        self.transformer.set_num_special_tokens(num_special_tokens)\n        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)\n\n    def forward(self, input_ids, position_ids=None, token_type_ids=None, 
lm_labels=None, past=None, head_mask=None):\n        transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)\n        if self.transformer.output_attentions:\n            all_attentions, hidden_states, presents = transformer_output\n        else:\n            hidden_states, presents = transformer_output\n        hidden_states = hidden_states[-1]\n\n        lm_logits = self.lm_head(hidden_states)\n        if lm_labels is not None:\n            # Shift so that tokens < n predict n\n            shift_logits = lm_logits[..., :-1, :].contiguous()\n            shift_labels = lm_labels[..., 1:].contiguous()\n            # Flatten the tokens\n            loss_fct = CrossEntropyLoss(ignore_index=-1)\n            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),\n                            shift_labels.view(-1))\n            return loss\n        if self.transformer.output_attentions:\n            return all_attentions, lm_logits, presents\n        return lm_logits, presents\n\n\nclass GPT2DoubleHeadsModel(GPT2PreTrainedModel):\n    \"\"\"OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head (\"Language Models are Unsupervised Multitask Learners\").\n\n    Params:\n        `config`: a GPT2Config class instance with the configuration to build a new model\n        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False\n        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.\n            This can be used to compute head importance metrics. 
Default: False\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token\n            indices selected in the range [0, config.vocab_size[\n        `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from\n            which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)\n        `position_ids`: an optional torch.LongTensor with the same shape as input_ids\n            with the position indices (selected in the range [0, config.n_positions - 1[.\n        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids\n            You can use it to add a third type of embedding to each input token in the sequence\n            (the previous two being the word and position embeddings).\n            The input, position and token_type embeddings are summed inside the Transformer before the first\n            self-attention block.\n        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]\n            with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss\n            is only computed for the labels set in [0, ..., config.vocab_size]\n        `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]\n            with indices selected in [0, ..., num_choices].\n        `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states\n            (key and values in the attention blocks) to speed up sequential decoding\n            (this is the presents output of the model, cf. below).\n        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.\n            It's a mask to be used to nullify some heads of the transformer. 
1.0 => head is fully masked, 0.0 => head is not masked.\n\n    Outputs:\n        if `lm_labels` and `multiple_choice_labels` are not `None`:\n            Outputs a tuple of losses with the language modeling loss and the multiple choice loss.\n        else: a tuple with\n            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, config.vocab_size]\n            `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]\n            `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as\n                torch.FloatTensors. They can be reused to speed up sequential decoding.\n\n    Example usage:\n    ```python\n    # Already been converted into BPE token ids\n    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)\n    mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)\n\n    config = modeling_gpt2.GPT2Config()\n\n    model = modeling_gpt2.GPT2DoubleHeadsModel(config)\n    lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)\n    ```\n    \"\"\"\n\n    def __init__(self, config, output_attentions=False, keep_multihead_output=False):\n        super(GPT2DoubleHeadsModel, self).__init__(config)\n        self.transformer = GPT2Model(config, output_attentions=output_attentions,\n                                             keep_multihead_output=keep_multihead_output)\n        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)\n        self.multiple_choice_head = GPT2MultipleChoiceHead(config)\n        self.apply(self.init_weights)\n\n    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):\n        \"\"\" Update input and output embeddings with new embedding matrice\n            Make sure we are sharing the embeddings\n        \"\"\"\n        self.config.predict_special_tokens = 
self.transformer.config.predict_special_tokens = predict_special_tokens\n        self.transformer.set_num_special_tokens(num_special_tokens)\n        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)\n\n    def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None,\n                position_ids=None, past=None, head_mask=None):\n        transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)\n        if self.transformer.output_attentions:\n            all_attentions, hidden_states, presents = transformer_output\n        else:\n            hidden_states, presents = transformer_output\n        hidden_states = hidden_states[-1]\n\n        lm_logits = self.lm_head(hidden_states)\n        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)\n        losses = []\n        if lm_labels is not None:\n            shift_logits = lm_logits[..., :-1, :].contiguous()\n            shift_labels = lm_labels[..., 1:].contiguous()\n            loss_fct = CrossEntropyLoss(ignore_index=-1)\n            losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)))\n        if mc_labels is not None:\n            loss_fct = CrossEntropyLoss()\n            losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))\n        if losses:\n            return losses\n        if self.transformer.output_attentions:\n            return all_attentions, lm_logits, mc_logits, presents\n        return lm_logits, mc_logits, presents\n"
  },
  {
    "path": "pytorch_pretrained_bert/modeling_openai.py",
    "content": "# coding=utf-8\n# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"PyTorch OpenAI GPT model.\"\"\"\n\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport collections\nimport copy\nimport json\nimport logging\nimport math\nimport os\nimport sys\nfrom io import open\n\nimport torch\nimport torch.nn as nn\nfrom torch.nn import CrossEntropyLoss\nfrom torch.nn.parameter import Parameter\n\nfrom .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME\nfrom .modeling import BertLayerNorm as LayerNorm\nfrom .modeling_gpt2 import prune_conv1d_layer\n\nlogger = logging.getLogger(__name__)\n\nPRETRAINED_MODEL_ARCHIVE_MAP = {\"openai-gpt\": \"https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin\"}\nPRETRAINED_CONFIG_ARCHIVE_MAP = {\"openai-gpt\": \"https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json\"}\n\n\ndef load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):\n    \"\"\" Load tf pre-trained weights in a pytorch model (from NumPy arrays here)\n    \"\"\"\n    import re\n    import numpy as np\n    print(\"Loading weights...\")\n    names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', \"r\", encoding='utf-8'))\n    shapes = json.load(open(openai_checkpoint_folder_path + 
'/params_shapes.json', \"r\", encoding='utf-8'))\n    offsets = np.cumsum([np.prod(shape) for shape in shapes])\n    init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]\n    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]\n    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]\n\n    # This was used when we had a single embedding matrix for positions and tokens\n    # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)\n    # del init_params[1]\n    init_params = [arr.squeeze() for arr in init_params]\n\n    try:\n        assert model.tokens_embed.weight.shape == init_params[1].shape\n        assert model.positions_embed.weight.shape == init_params[0].shape\n    except AssertionError as e:\n        e.args += (model.tokens_embed.weight.shape, init_params[1].shape)\n        e.args += (model.positions_embed.weight.shape, init_params[0].shape)\n        raise\n\n    model.tokens_embed.weight.data = torch.from_numpy(init_params[1])\n    model.positions_embed.weight.data = torch.from_numpy(init_params[0])\n    names.pop(0)\n    # Pop position and token embedding arrays\n    init_params.pop(0)\n    init_params.pop(0)\n\n    for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]):\n        name = name[6:]  # skip \"model/\"\n        assert name[-2:] == \":0\"\n        name = name[:-2]\n        name = name.split('/')\n        pointer = model\n        for m_name in name:\n            if re.fullmatch(r'[A-Za-z]+\\d+', m_name):\n                l = re.split(r'(\\d+)', m_name)\n            else:\n                l = [m_name]\n            if l[0] == 'g':\n                pointer = getattr(pointer, 'weight')\n            elif l[0] == 'b':\n                pointer = getattr(pointer, 'bias')\n            elif l[0] == 'w':\n                pointer = getattr(pointer, 'weight')\n            else:\n                pointer = 
getattr(pointer, l[0])\n            if len(l) >= 2:\n                num = int(l[1])\n                pointer = pointer[num]\n        try:\n            assert pointer.shape == array.shape\n        except AssertionError as e:\n            e.args += (pointer.shape, array.shape)\n            raise\n        print(\"Initialize PyTorch weight {}\".format(name))\n        pointer.data = torch.from_numpy(array)\n    return model\n\n\ndef gelu(x):\n    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))\n\n\ndef swish(x):\n    return x * torch.sigmoid(x)\n\n\nACT_FNS = {\"relu\": nn.ReLU, \"swish\": swish, \"gelu\": gelu}\n\n\nclass OpenAIGPTConfig(object):\n    \"\"\"Configuration class to store the configuration of a `OpenAIGPTModel`.\n    \"\"\"\n\n    def __init__(\n        self,\n        vocab_size_or_config_json_file=40478,\n        n_special=0,\n        n_positions=512,\n        n_ctx=512,\n        n_embd=768,\n        n_layer=12,\n        n_head=12,\n        afn=\"gelu\",\n        resid_pdrop=0.1,\n        embd_pdrop=0.1,\n        attn_pdrop=0.1,\n        layer_norm_epsilon=1e-5,\n        initializer_range=0.02,\n        predict_special_tokens=True\n    ):\n        \"\"\"Constructs OpenAIGPTConfig.\n\n        Args:\n            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.\n            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)\n            n_positions: Number of positional embeddings.\n            n_ctx: Size of the causal mask (usually same as n_positions).\n            n_embd: Dimensionality of the embeddings and hidden states.\n            n_layer: Number of hidden layers in the Transformer encoder.\n            n_head: Number of attention 
heads for each attention layer in\n                the Transformer encoder.\n            afn: The non-linear activation function (function or string) in the\n                encoder and pooler. If string, \"gelu\", \"relu\" and \"swish\" are supported.\n            resid_pdrop: The dropout probabilitiy for all fully connected\n                layers in the embeddings, encoder, and pooler.\n            attn_pdrop: The dropout ratio for the attention\n                probabilities.\n            embd_pdrop: The dropout ratio for the embeddings.\n            layer_norm_epsilon: epsilon to use in the layer norm layers\n            initializer_range: The sttdev of the truncated_normal_initializer for\n                initializing all weight matrices.\n            predict_special_tokens: should we predict special tokens (when the model has a LM head)\n        \"\"\"\n        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2\n                        and isinstance(vocab_size_or_config_json_file, unicode)):\n            with open(vocab_size_or_config_json_file, \"r\", encoding=\"utf-8\") as reader:\n                json_config = json.loads(reader.read())\n            for key, value in json_config.items():\n                self.__dict__[key] = value\n        elif isinstance(vocab_size_or_config_json_file, int):\n            self.vocab_size = vocab_size_or_config_json_file\n            self.n_special = n_special\n            self.n_ctx = n_ctx\n            self.n_positions = n_positions\n            self.n_embd = n_embd\n            self.n_layer = n_layer\n            self.n_head = n_head\n            self.afn = afn\n            self.resid_pdrop = resid_pdrop\n            self.embd_pdrop = embd_pdrop\n            self.attn_pdrop = attn_pdrop\n            self.layer_norm_epsilon = layer_norm_epsilon\n            self.initializer_range = initializer_range\n            self.predict_special_tokens = predict_special_tokens\n        else:\n            
raise ValueError(\n                \"First argument must be either a vocabulary size (int)\"\n                \"or the path to a pretrained model config file (str)\"\n            )\n\n    @property\n    def total_tokens_embeddings(self):\n        return self.vocab_size + self.n_special\n\n    @classmethod\n    def from_dict(cls, json_object):\n        \"\"\"Constructs a `OpenAIGPTConfig` from a Python dictionary of parameters.\"\"\"\n        config = OpenAIGPTConfig(vocab_size_or_config_json_file=-1)\n        for key, value in json_object.items():\n            config.__dict__[key] = value\n        return config\n\n    @classmethod\n    def from_json_file(cls, json_file):\n        \"\"\"Constructs a `OpenAIGPTConfig` from a json file of parameters.\"\"\"\n        with open(json_file, \"r\", encoding=\"utf-8\") as reader:\n            text = reader.read()\n        return cls.from_dict(json.loads(text))\n\n    def __repr__(self):\n        return str(self.to_json_string())\n\n    def to_dict(self):\n        \"\"\"Serializes this instance to a Python dictionary.\"\"\"\n        output = copy.deepcopy(self.__dict__)\n        return output\n\n    def to_json_string(self):\n        \"\"\"Serializes this instance to a JSON string.\"\"\"\n        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + \"\\n\"\n\n    def to_json_file(self, json_file_path):\n        \"\"\" Save this instance to a json file.\"\"\"\n        with open(json_file_path, \"w\", encoding='utf-8') as writer:\n            writer.write(self.to_json_string())\n\n\nclass Conv1D(nn.Module):\n    def __init__(self, nf, rf, nx):\n        super(Conv1D, self).__init__()\n        self.rf = rf\n        self.nf = nf\n        if rf == 1:  # faster 1x1 conv\n            w = torch.empty(nx, nf)\n            nn.init.normal_(w, std=0.02)\n            self.weight = Parameter(w)\n            self.bias = Parameter(torch.zeros(nf))\n        else:  # was used to train LM\n            raise NotImplementedError\n\n    
def forward(self, x):\n        if self.rf == 1:\n            size_out = x.size()[:-1] + (self.nf,)\n            x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)\n            x = x.view(*size_out)\n        else:\n            raise NotImplementedError\n        return x\n\n\nclass Attention(nn.Module):\n    def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):\n        super(Attention, self).__init__()\n        n_state = nx  # in Attention: n_state=768 (nx=n_embd)\n        # [switch nx => n_state from Block to Attention to keep identical to TF implem]\n        assert n_state % config.n_head == 0\n        self.register_buffer(\"bias\", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))\n        self.n_head = config.n_head\n        self.split_size = n_state\n        self.scale = scale\n\n        self.output_attentions = output_attentions\n        self.keep_multihead_output = keep_multihead_output\n        self.multihead_output = None\n\n        self.c_attn = Conv1D(n_state * 3, 1, nx)\n        self.c_proj = Conv1D(n_state, 1, nx)\n        self.attn_dropout = nn.Dropout(config.attn_pdrop)\n        self.resid_dropout = nn.Dropout(config.resid_pdrop)\n\n    def prune_heads(self, heads):\n        if len(heads) == 0:\n            return\n        mask = torch.ones(self.n_head, self.split_size // self.n_head)\n        for head in heads:\n            mask[head] = 0\n        mask = mask.view(-1).contiguous().eq(1)\n        index = torch.arange(len(mask))[mask].long()\n        index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])\n        # Prune conv1d layers\n        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)\n        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)\n        # Update hyper params\n        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))\n        self.n_head = self.n_head - len(heads)\n\n   
 def _attn(self, q, k, v, head_mask=None):\n        w = torch.matmul(q, k)\n        if self.scale:\n            w = w / math.sqrt(v.size(-1))\n        # w = w * self.bias + -1e9 * (1 - self.bias)  # TF implem method: mask_attn_weights\n        # XD: self.b may be larger than w, so we need to crop it\n        b = self.bias[:, :, : w.size(-2), : w.size(-1)]\n        w = w * b + -1e9 * (1 - b)\n\n        w = nn.Softmax(dim=-1)(w)\n        w = self.attn_dropout(w)\n\n        # Mask heads if we want to\n        if head_mask is not None:\n            w = w * head_mask\n\n        if self.output_attentions:\n            return w, torch.matmul(w, v)\n        return torch.matmul(w, v)\n\n    def merge_heads(self, x):\n        x = x.permute(0, 2, 1, 3).contiguous()\n        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)\n        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states\n\n    def split_heads(self, x, k=False):\n        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)\n        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states\n        if k:\n            return x.permute(0, 2, 3, 1)\n        else:\n            return x.permute(0, 2, 1, 3)\n\n    def forward(self, x, head_mask=None):\n        x = self.c_attn(x)\n        query, key, value = x.split(self.split_size, dim=2)\n        query = self.split_heads(query)\n        key = self.split_heads(key, k=True)\n        value = self.split_heads(value)\n\n        a = self._attn(query, key, value, head_mask)\n        if self.keep_multihead_output:\n            self.multihead_output = a\n            self.multihead_output.retain_grad()\n\n        if self.output_attentions:\n            attentions, a = a\n        a = self.merge_heads(a)\n        a = self.c_proj(a)\n        a = self.resid_dropout(a)\n        if self.output_attentions:\n            return attentions, a\n        return a\n\n\nclass MLP(nn.Module):\n    def __init__(self, n_state, config):  # in MLP: 
n_state=3072 (4 * n_embd)\n        super(MLP, self).__init__()\n        nx = config.n_embd\n        self.c_fc = Conv1D(n_state, 1, nx)\n        self.c_proj = Conv1D(nx, 1, n_state)\n        self.act = ACT_FNS[config.afn]\n        self.dropout = nn.Dropout(config.resid_pdrop)\n\n    def forward(self, x):\n        h = self.act(self.c_fc(x))\n        h2 = self.c_proj(h)\n        return self.dropout(h2)\n\n\nclass Block(nn.Module):\n    def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):\n        super(Block, self).__init__()\n        nx = config.n_embd\n        self.output_attentions = output_attentions\n        self.attn = Attention(nx, n_ctx, config, scale, output_attentions, keep_multihead_output)\n        self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)\n        self.mlp = MLP(4 * nx, config)\n        self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)\n\n    def forward(self, x, head_mask=None):\n        a = self.attn(x, head_mask=head_mask)\n        if self.output_attentions:\n            attentions, a = a\n        n = self.ln_1(x + a)\n        m = self.mlp(n)\n        h = self.ln_2(n + m)\n        if self.output_attentions:\n            return attentions, h\n        return h\n\n\nclass OpenAIGPTLMHead(nn.Module):\n    \"\"\" Language Model Head for the transformer \"\"\"\n\n    def __init__(self, model_embeddings_weights, config):\n        super(OpenAIGPTLMHead, self).__init__()\n        self.n_embd = config.n_embd\n        self.vocab_size = config.vocab_size\n        self.predict_special_tokens = config.predict_special_tokens\n        embed_shape = model_embeddings_weights.shape\n        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)\n        self.set_embeddings_weights(model_embeddings_weights)\n\n    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):\n        self.predict_special_tokens = predict_special_tokens\n        embed_shape = 
model_embeddings_weights.shape\n        self.decoder.weight = model_embeddings_weights  # Tied weights\n\n    def forward(self, hidden_state):\n        lm_logits = self.decoder(hidden_state)\n        if not self.predict_special_tokens:\n            lm_logits = lm_logits[..., :self.vocab_size]\n        return lm_logits\n\n\nclass OpenAIGPTMultipleChoiceHead(nn.Module):\n    \"\"\" Classifier Head for the transformer \"\"\"\n\n    def __init__(self, config):\n        super(OpenAIGPTMultipleChoiceHead, self).__init__()\n        self.n_embd = config.n_embd\n        self.dropout = nn.Dropout2d(config.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation\n        self.linear = nn.Linear(config.n_embd, 1)\n\n        nn.init.normal_(self.linear.weight, std=0.02)\n        nn.init.normal_(self.linear.bias, 0)\n\n    def forward(self, hidden_states, mc_token_ids):\n        # Classification logits\n        # hidden_state (bsz, num_choices, seq_length, hidden_size)\n        # mc_token_ids (bsz, num_choices)\n        mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))\n        # (bsz, num_choices, 1, hidden_size)\n        multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)\n        # (bsz, num_choices, hidden_size)\n        multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2)\n        multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)\n        # (bsz, num_choices)\n        return multiple_choice_logits\n\n\nclass OpenAIGPTPreTrainedModel(nn.Module):\n    \"\"\" An abstract class to handle weights initialization and\n        a simple interface for dowloading and loading pretrained models.\n    \"\"\"\n\n    def __init__(self, config, *inputs, **kwargs):\n        super(OpenAIGPTPreTrainedModel, self).__init__()\n        if not isinstance(config, OpenAIGPTConfig):\n            raise ValueError(\n                \"Parameter config in `{}(config)` 
should be an instance of class `OpenAIGPTConfig`. \"\n                \"To create a model from a pretrained model use \"\n                \"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`\".format(\n                    self.__class__.__name__, self.__class__.__name__\n                )\n            )\n        self.config = config\n\n    def init_weights(self, module):\n        \"\"\" Initialize the weights.\n        \"\"\"\n        if isinstance(module, (nn.Linear, nn.Embedding)):\n            # Slightly different from the TF version which uses truncated_normal for initialization\n            # cf https://github.com/pytorch/pytorch/pull/5617\n            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)\n        elif isinstance(module, LayerNorm):\n            module.bias.data.zero_()\n            module.weight.data.fill_(1.0)\n        if isinstance(module, nn.Linear) and module.bias is not None:\n            module.bias.data.zero_()\n\n    @classmethod\n    def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, *inputs, **kwargs):\n        \"\"\"\n        Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.\n        Download and cache the pre-trained model file if needed.\n\n        Params:\n            pretrained_model_name_or_path: either:\n                - a str with the name of a pre-trained model to load selected in the list of:\n                    . `openai-gpt`\n                - a path or url to a pretrained model archive containing:\n                    . `openai_gpt_config.json` a configuration file for the model\n                    . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance\n                - a path or url to a pretrained model archive containing:\n                    . `openai-gpt-config.json` a configuration file for the model\n                    . 
a series of NumPy files containing OpenAI TensorFlow trained weights\n            from_tf: should we load the weights from a locally saved TensorFlow checkpoint\n            cache_dir: an optional path to a folder in which the pre-trained models will be cached.\n            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models\n            *inputs, **kwargs: additional input for the specific OpenAI-GPT class\n        \"\"\"\n        state_dict = kwargs.get('state_dict', None)\n        kwargs.pop('state_dict', None)\n        cache_dir = kwargs.get('cache_dir', None)\n        kwargs.pop('cache_dir', None)\n        from_tf = kwargs.get('from_tf', False)\n        kwargs.pop('from_tf', None)\n\n        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:\n            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]\n            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]\n        else:\n            archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)\n            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)\n        # redirect to the cache, if necessary\n        try:\n            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)\n        except EnvironmentError:\n            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:\n                logger.error(\n                    \"Couldn't reach server at '{}' to download pretrained weights.\".format(\n                        archive_file))\n            else:\n                logger.error(\n                    \"Model name '{}' was not found in model name list ({}). 
\"\n                    \"We assumed '{}' was a path or url but couldn't find file {} \"\n                    \"at this path or url.\".format(\n                        pretrained_model_name_or_path, \", \".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,\n                        archive_file\n                    )\n                )\n            return None\n        try:\n            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)\n        except EnvironmentError:\n            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:\n                logger.error(\n                    \"Couldn't reach server at '{}' to download pretrained model configuration file.\".format(\n                        config_file))\n            else:\n                logger.error(\n                    \"Model name '{}' was not found in model name list ({}). \"\n                    \"We assumed '{}' was a path or url but couldn't find file {} \"\n                    \"at this path or url.\".format(\n                        pretrained_model_name_or_path, \", \".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,\n                        config_file\n                    )\n                )\n            return None\n        if resolved_archive_file == archive_file and resolved_config_file == config_file:\n            logger.info(\"loading weights file {}\".format(archive_file))\n            logger.info(\"loading configuration file {}\".format(config_file))\n        else:\n            logger.info(\"loading weights file {} from cache at {}\".format(\n                archive_file, resolved_archive_file))\n            logger.info(\"loading configuration file {} from cache at {}\".format(\n                config_file, resolved_config_file))\n        # Load config\n        config = OpenAIGPTConfig.from_json_file(resolved_config_file)\n        logger.info(\"Model config {}\".format(config))\n        # Instantiate 
model.\n        model = cls(config, *inputs, **kwargs)\n        if state_dict is None and not from_tf:\n            state_dict = torch.load(resolved_archive_file, map_location='cpu')\n        if from_tf:\n            # Directly load from a TensorFlow checkpoint (stored as NumPy array)\n            return load_tf_weights_in_openai_gpt(model, resolved_archive_file)\n\n        old_keys = []\n        new_keys = []\n        for key in state_dict.keys():\n            new_key = None\n            if key.endswith(\".g\"):\n                new_key = key[:-2] + \".weight\"\n            elif key.endswith(\".b\"):\n                new_key = key[:-2] + \".bias\"\n            elif key.endswith(\".w\"):\n                new_key = key[:-2] + \".weight\"\n            if new_key:\n                old_keys.append(key)\n                new_keys.append(new_key)\n        for old_key, new_key in zip(old_keys, new_keys):\n            state_dict[new_key] = state_dict.pop(old_key)\n\n        missing_keys = []\n        unexpected_keys = []\n        error_msgs = []\n        # copy state_dict so _load_from_state_dict can modify it\n        metadata = getattr(state_dict, \"_metadata\", None)\n        state_dict = state_dict.copy()\n        if metadata is not None:\n            state_dict._metadata = metadata\n\n        def load(module, prefix=\"\"):\n            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})\n            module._load_from_state_dict(\n                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs\n            )\n            for name, child in module._modules.items():\n                if child is not None:\n                    load(child, prefix + name + \".\")\n\n        start_model = model\n        if hasattr(model, \"transformer\") and all(not s.startswith('transformer.') for s in state_dict.keys()):\n            start_model = model.transformer\n        load(start_model, prefix=\"\")\n\n        if 
len(missing_keys) > 0:\n            logger.info(\n                \"Weights of {} not initialized from pretrained model: {}\".format(model.__class__.__name__, missing_keys)\n            )\n        if len(unexpected_keys) > 0:\n            logger.info(\n                \"Weights from pretrained model not used in {}: {}\".format(model.__class__.__name__, unexpected_keys)\n            )\n        if len(error_msgs) > 0:\n            raise RuntimeError(\n                \"Error(s) in loading state_dict for {}:\\n\\t{}\".format(model.__class__.__name__, \"\\n\\t\".join(error_msgs))\n            )\n\n        # Add additional embeddings for special tokens if needed\n        # This step also make sure we are still sharing the output and input embeddings after loading weights\n        model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special)\n        return model\n\n\nclass OpenAIGPTModel(OpenAIGPTPreTrainedModel):\n    \"\"\"OpenAI GPT model (\"Improving Language Understanding by Generative Pre-Training\").\n\n    OpenAI GPT use a single embedding matrix to store the word and special embeddings.\n    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...\n    Special tokens need to be trained during the fine-tuning if you use them.\n    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.\n\n    The embeddings are ordered as follow in the token embeddings matrice:\n        [0,                                                         ----------------------\n         ...                                                        -> word embeddings\n         config.vocab_size - 1,                                     ______________________\n         config.vocab_size,\n         ...                                                        
-> special embeddings\n         config.vocab_size + config.n_special - 1]                  ______________________\n\n    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:\n        total_tokens_embeddings = config.vocab_size + config.n_special\n    You should use the associate indices to index the embeddings.\n\n    Params:\n        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model\n        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False\n        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.\n            This can be used to compute head importance metrics. Default: False\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]\n            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[\n        `position_ids`: an optional torch.LongTensor with the same shape as input_ids\n            with the position indices (selected in the range [0, config.n_positions - 1[.\n        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids\n            You can use it to add a third type of embedding to each input token in the sequence\n            (the previous two being the word and position embeddings).\n            The input, position and token_type embeddings are summed inside the Transformer before the first\n            self-attention block.\n        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.\n            It's a mask to be used to nullify some heads of the transformer. 
1.0 => head is not masked, 0.0 => head is masked.\n\n    Outputs:\n        `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)\n            as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]\n            (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)\n\n    Example usage:\n    ```python\n    # Already been converted into BPE token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n\n    config = modeling_openai.OpenAIGPTConfig()\n\n    model = modeling_openai.OpenAIGPTModel(config)\n    hidden_states = model(input_ids)\n    ```\n    \"\"\"\n\n    def __init__(self, config, output_attentions=False, keep_multihead_output=False):\n        super(OpenAIGPTModel, self).__init__(config)\n        self.output_attentions = output_attentions\n        self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)\n        self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)\n        self.drop = nn.Dropout(config.embd_pdrop)\n        block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions,\n                                                        keep_multihead_output=keep_multihead_output)\n        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])\n\n        self.apply(self.init_weights)\n\n    def set_num_special_tokens(self, num_special_tokens):\n        \" Update input embeddings with new embedding matrice if needed \"\n        if self.config.n_special == num_special_tokens:\n            return\n        # Update config\n        self.config.n_special = num_special_tokens\n        # Build new embeddings and initialize all new embeddings (in particular the special tokens)\n        old_embed = self.tokens_embed\n        self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, 
self.config.n_embd)\n        self.tokens_embed.to(old_embed.weight.device)\n        self.init_weights(self.tokens_embed)\n        # Copy word embeddings from the previous weights\n        self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]\n\n    def prune_heads(self, heads_to_prune):\n        \"\"\" Prunes heads of the model.\n            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}\n        \"\"\"\n        for layer, heads in heads_to_prune.items():\n            self.h[layer].attn.prune_heads(heads)\n\n    def get_multihead_outputs(self):\n        \"\"\" Gather all multi-head outputs.\n            Return: list (layers) of multihead module outputs with gradients\n        \"\"\"\n        return [h.attn.multihead_output for h in self.h]\n\n    def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):\n        if position_ids is None:\n            # This was used when we had a single embedding matrice from position and token embeddings\n            # start = self.config.vocab_size + self.config.n_special\n            # end = start + input_ids.size(-1)\n            # position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)\n            position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)\n            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)\n\n        # Prepare head mask if needed\n        # 1.0 in head_mask indicate we keep the head\n        # attention_probs has shape bsz x n_heads x N x N\n        # head_mask has shape n_layer x batch x n_heads x N x N\n        if head_mask is not None:\n            if head_mask.dim() == 1:\n                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)\n                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)\n            elif head_mask.dim() == 2:\n                head_mask = 
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer\n            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility\n        else:\n            head_mask = [None] * self.config.n_layer\n\n        input_shape = input_ids.size()\n        input_ids = input_ids.view(-1, input_ids.size(-1))\n        position_ids = position_ids.view(-1, position_ids.size(-1))\n\n        inputs_embeds = self.tokens_embed(input_ids)\n        position_embeds = self.positions_embed(position_ids)\n        if token_type_ids is not None:\n            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))\n            token_type_embeds = self.tokens_embed(token_type_ids)\n        else:\n            token_type_embeds = 0\n        hidden_states = inputs_embeds + position_embeds + token_type_embeds\n        hidden_states = self.drop(hidden_states)\n\n        output_shape = input_shape + (hidden_states.size(-1),)\n\n        all_attentions = []\n        all_hidden_states = [hidden_states.view(*output_shape)]\n        for i, block in enumerate(self.h):\n            outputs = block(hidden_states, head_mask[i])\n            if self.output_attentions:\n                attentions, hidden_states = outputs\n                all_attentions.append(attentions)\n            else:\n                hidden_states = outputs\n            all_hidden_states.append(hidden_states.view(*output_shape))\n\n        if self.output_attentions:\n            return all_attentions, all_hidden_states\n        return all_hidden_states\n\n\nclass OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):\n    \"\"\"OpenAI GPT model with a Language Modeling head (\"Improving Language Understanding by Generative Pre-Training\").\n\n    OpenAI GPT use a single embedding matrix to store the word and special embeddings.\n    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...\n    Special tokens need 
to be trained during the fine-tuning if you use them.\n    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.\n\n    The embeddings are ordered as follow in the token embeddings matrice:\n        [0,                                                         ----------------------\n         ...                                                        -> word embeddings\n         config.vocab_size - 1,                                     ______________________\n         config.vocab_size,\n         ...                                                        -> special embeddings\n         config.vocab_size + config.n_special - 1]                  ______________________\n\n    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:\n        total_tokens_embeddings = config.vocab_size + config.n_special\n    You should use the associate indices to index the embeddings.\n\n    Params:\n        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model\n        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False\n        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.\n            This can be used to compute head importance metrics. Default: False\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]\n            were d_1 ... 
d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[\n        `position_ids`: an optional torch.LongTensor with the same shape as input_ids\n            with the position indices (selected in the range [0, config.n_positions - 1[.\n        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids\n            You can use it to add a third type of embedding to each input token in the sequence\n            (the previous two being the word and position embeddings).\n            The input, position and token_type embeddings are summed inside the Transformer before the first\n            self-attention block.\n        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]\n            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss\n            is only computed for the labels set in [0, ..., vocab_size]\n        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.\n            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.\n\n    Outputs:\n        if `lm_labels` is not `None`:\n            Outputs the language modeling loss.\n        else:\n            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings]\n                (or more generally [d_1, ..., d_n, total_tokens_embeddings] were d_1 ... 
d_n are the dimension of input_ids)\n\n    Example usage:\n    ```python\n    # Already been converted into BPE token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n\n    config = modeling_openai.OpenAIGPTConfig()\n\n    model = modeling_openai.OpenAIGPTLMHeadModel(config)\n    lm_logits = model(input_ids)\n    ```\n    \"\"\"\n\n    def __init__(self, config, output_attentions=False, keep_multihead_output=False):\n        super(OpenAIGPTLMHeadModel, self).__init__(config)\n        self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions,\n                                             keep_multihead_output=keep_multihead_output)\n        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)\n        self.apply(self.init_weights)\n\n    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):\n        \"\"\" Update input and output embeddings with new embedding matrice\n            Make sure we are sharing the embeddings\n        \"\"\"\n        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens\n        self.transformer.set_num_special_tokens(num_special_tokens)\n        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)\n\n    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):\n        hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask)\n        if self.transformer.output_attentions:\n            all_attentions, hidden_states = hidden_states\n        hidden_states = hidden_states[-1]\n\n        lm_logits = self.lm_head(hidden_states)\n        if lm_labels is not None:\n            # Shift so that tokens < n predict n\n            shift_logits = lm_logits[..., :-1, :].contiguous()\n            shift_labels = lm_labels[..., 1:].contiguous()\n            # Flatten the 
tokens\n            loss_fct = CrossEntropyLoss(ignore_index=-1)\n            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),\n                            shift_labels.view(-1))\n            return loss\n        if self.transformer.output_attentions:\n            return all_attentions, lm_logits\n        return lm_logits\n\n\nclass OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):\n    \"\"\"OpenAI GPT model with a Language Modeling and a Multiple Choice head (\"Improving Language Understanding by Generative Pre-Training\").\n\n    OpenAI GPT use a single embedding matrix to store the word and special embeddings.\n    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...\n    Special tokens need to be trained during the fine-tuning if you use them.\n    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.\n\n    The embeddings are ordered as follow in the token embeddings matrice:\n        [0,                                                         ----------------------\n         ...                                                        -> word embeddings\n         config.vocab_size - 1,                                     ______________________\n         config.vocab_size,\n         ...                                                        -> special embeddings\n         config.vocab_size + config.n_special - 1]                  ______________________\n\n    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:\n        total_tokens_embeddings = config.vocab_size + config.n_special\n    You should use the associate indices to index the embeddings.\n\n    Params:\n        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model\n        `output_attentions`: If True, also output attentions weights computed by the model at each layer. 
Default: False\n        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.\n            This can be used to compute head importance metrics. Default: False\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token\n            indices selected in the range [0, total_tokens_embeddings[\n        `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from\n            which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)\n        `position_ids`: an optional torch.LongTensor with the same shape as input_ids\n            with the position indices (selected in the range [0, config.n_positions - 1[.\n        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids\n            You can use it to add a third type of embedding to each input token in the sequence\n            (the previous two being the word and position embeddings).\n            The input, position and token_type embeddings are summed inside the Transformer before the first\n            self-attention block.\n        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]\n            with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss\n            is only computed for the labels set in [0, ..., total_tokens_embeddings]\n        `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]\n            with indices selected in [0, ..., num_choices].\n        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.\n            It's a mask to be used to nullify some heads of the transformer. 
1.0 => head is fully masked, 0.0 => head is not masked.\n\n    Outputs:\n        if `lm_labels` and `multiple_choice_labels` are not `None`:\n            Outputs a tuple of losses with the language modeling loss and the multiple choice loss.\n        else: a tuple with\n            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings]\n            `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]\n\n    Example usage:\n    ```python\n    # Already been converted into BPE token ids\n    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)\n    mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)\n\n    config = modeling_openai.OpenAIGPTConfig()\n\n    model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)\n    lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)\n    ```\n    \"\"\"\n\n    def __init__(self, config, output_attentions=False, keep_multihead_output=False):\n        super(OpenAIGPTDoubleHeadsModel, self).__init__(config)\n        self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions,\n                                             keep_multihead_output=keep_multihead_output)\n        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)\n        self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config)\n        self.apply(self.init_weights)\n\n    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):\n        \"\"\" Update input and output embeddings with new embedding matrice\n            Make sure we are sharing the embeddings\n        \"\"\"\n        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens\n        self.transformer.set_num_special_tokens(num_special_tokens)\n        
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)\n\n    def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None,\n                position_ids=None, head_mask=None):\n        hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask)\n        if self.transformer.output_attentions:\n            all_attentions, hidden_states = hidden_states\n        hidden_states = hidden_states[-1]\n\n        lm_logits = self.lm_head(hidden_states)\n        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)\n        losses = []\n        if lm_labels is not None:\n            shift_logits = lm_logits[..., :-1, :].contiguous()\n            shift_labels = lm_labels[..., 1:].contiguous()\n            loss_fct = CrossEntropyLoss(ignore_index=-1)\n            losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)))\n        if mc_labels is not None:\n            loss_fct = CrossEntropyLoss()\n            losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))\n        if losses:\n            return losses\n        if self.transformer.output_attentions:\n            return all_attentions, lm_logits, mc_logits\n        return lm_logits, mc_logits\n"
  },
  {
    "path": "pytorch_pretrained_bert/modeling_transfo_xl.py",
    "content": "# coding=utf-8\n# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\" PyTorch Transformer XL model.\n    Adapted from https://github.com/kimiyoung/transformer-xl.\n    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py\n\"\"\"\n\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport os\nimport copy\nimport json\nimport math\nimport logging\nimport collections\nimport sys\nfrom io import open\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.nn import CrossEntropyLoss\nfrom torch.nn.parameter import Parameter\n\nfrom .modeling import BertLayerNorm as LayerNorm\nfrom .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits\nfrom .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME\n\nlogger = logging.getLogger(__name__)\n\nPRETRAINED_MODEL_ARCHIVE_MAP = {\n    'transfo-xl-wt103': \"https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin\",\n}\nPRETRAINED_CONFIG_ARCHIVE_MAP = {\n    'transfo-xl-wt103': \"https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json\",\n}\n\nTF_WEIGHTS_NAME = 'model.ckpt'\n\ndef build_tf_to_pytorch_map(model, config):\n    \"\"\" A map of modules 
from TF to PyTorch.\n        This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.\n    \"\"\"\n    tf_to_pt_map = {}\n\n    if hasattr(model, 'transformer'):\n        # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax\n        tf_to_pt_map.update({\n            \"transformer/adaptive_softmax/cutoff_0/cluster_W\": model.crit.cluster_weight,\n            \"transformer/adaptive_softmax/cutoff_0/cluster_b\": model.crit.cluster_bias})\n        for i, (out_l, proj_l, tie_proj) in enumerate(zip(\n                                model.crit.out_layers,\n                                model.crit.out_projs,\n                                config.tie_projs)):\n            layer_str = \"transformer/adaptive_softmax/cutoff_%d/\" % i\n            if config.tie_weight:\n                tf_to_pt_map.update({\n                    layer_str + 'b': out_l.bias})\n            else:\n                raise NotImplementedError\n                # I don't think this is implemented in the TF code\n                tf_to_pt_map.update({\n                    layer_str + 'lookup_table': out_l.weight,\n                    layer_str + 'b': out_l.bias})\n            if not tie_proj:\n                tf_to_pt_map.update({\n                    layer_str + 'proj': proj_l\n                    })\n        # Now load the rest of the transformer\n        model = model.transformer\n\n    # Embeddings\n    for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):\n        layer_str = \"transformer/adaptive_embed/cutoff_%d/\" % i\n        tf_to_pt_map.update({\n            layer_str + 'lookup_table': embed_l.weight,\n            layer_str + 'proj_W': proj_l\n            })\n\n    # Transformer blocks\n    for i, b in enumerate(model.layers):\n        layer_str = \"transformer/layer_%d/\" % i\n        tf_to_pt_map.update({\n            layer_str + \"rel_attn/LayerNorm/gamma\": 
b.dec_attn.layer_norm.weight,\n            layer_str + \"rel_attn/LayerNorm/beta\": b.dec_attn.layer_norm.bias,\n            layer_str + \"rel_attn/o/kernel\": b.dec_attn.o_net.weight,\n            layer_str + \"rel_attn/qkv/kernel\": b.dec_attn.qkv_net.weight,\n            layer_str + \"rel_attn/r/kernel\": b.dec_attn.r_net.weight,\n            layer_str + \"ff/LayerNorm/gamma\": b.pos_ff.layer_norm.weight,\n            layer_str + \"ff/LayerNorm/beta\": b.pos_ff.layer_norm.bias,\n            layer_str + \"ff/layer_1/kernel\": b.pos_ff.CoreNet[0].weight,\n            layer_str + \"ff/layer_1/bias\": b.pos_ff.CoreNet[0].bias,\n            layer_str + \"ff/layer_2/kernel\": b.pos_ff.CoreNet[3].weight,\n            layer_str + \"ff/layer_2/bias\": b.pos_ff.CoreNet[3].bias,\n        })\n\n    # Relative positioning biases\n    if config.untie_r:\n        r_r_list = []\n        r_w_list = []\n        for b in model.layers:\n            r_r_list.append(b.dec_attn.r_r_bias)\n            r_w_list.append(b.dec_attn.r_w_bias)\n    else:\n        r_r_list = [model.r_r_bias]\n        r_w_list = [model.r_w_bias]\n    tf_to_pt_map.update({\n        'transformer/r_r_bias': r_r_list,\n        'transformer/r_w_bias': r_w_list})\n    return tf_to_pt_map\n\ndef load_tf_weights_in_transfo_xl(model, config, tf_path):\n    \"\"\" Load tf checkpoints in a pytorch model\n    \"\"\"\n    try:\n        import numpy as np\n        import tensorflow as tf\n    except ImportError:\n        print(\"Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. 
Please see \"\n            \"https://www.tensorflow.org/install/ for installation instructions.\")\n        raise\n    # Build TF to PyTorch weights loading map\n    tf_to_pt_map = build_tf_to_pytorch_map(model, config)\n\n    # Load weights from TF model\n    init_vars = tf.train.list_variables(tf_path)\n    tf_weights = {}\n    for name, shape in init_vars:\n        print(\"Loading TF weight {} with shape {}\".format(name, shape))\n        array = tf.train.load_variable(tf_path, name)\n        tf_weights[name] = array\n\n    for name, pointer in tf_to_pt_map.items():\n        assert name in tf_weights\n        array = tf_weights[name]\n        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v\n        # which are not required for using pretrained model\n        if 'kernel' in name or 'proj' in name:\n            array = np.transpose(array)\n        if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:\n            # Here we will split the TF weigths\n            assert len(pointer) == array.shape[0]\n            for i, p_i in enumerate(pointer):\n                arr_i = array[i, ...]\n                try:\n                    assert p_i.shape == arr_i.shape\n                except AssertionError as e:\n                    e.args += (p_i.shape, arr_i.shape)\n                    raise\n                print(\"Initialize PyTorch weight {} for layer {}\".format(name, i))\n                p_i.data = torch.from_numpy(arr_i)\n        else:\n            try:\n                assert pointer.shape == array.shape\n            except AssertionError as e:\n                e.args += (pointer.shape, array.shape)\n                raise\n            print(\"Initialize PyTorch weight {}\".format(name))\n            pointer.data = torch.from_numpy(array)\n        tf_weights.pop(name, None)\n        tf_weights.pop(name + '/Adam', None)\n        tf_weights.pop(name + '/Adam_1', None)\n\n    print(\"Weights not copied to PyTorch 
model: {}\".format(', '.join(tf_weights.keys())))\n    return model\n\n\nclass TransfoXLConfig(object):\n    \"\"\"Configuration class to store the configuration of a `TransfoXLModel`.\n    \"\"\"\n    def __init__(self,\n                 vocab_size_or_config_json_file=267735,\n                 cutoffs=[20000, 40000, 200000],\n                 d_model=1024,\n                 d_embed=1024,\n                 n_head=16,\n                 d_head=64,\n                 d_inner=4096,\n                 div_val=4,\n                 pre_lnorm=False,\n                 n_layer=18,\n                 tgt_len=128,\n                 ext_len=0,\n                 mem_len=1600,\n                 clamp_len=1000,\n                 same_length=True,\n                 proj_share_all_but_first=True,\n                 attn_type=0,\n                 sample_softmax=-1,\n                 adaptive=True,\n                 tie_weight=True,\n                 dropout=0.1,\n                 dropatt=0.0,\n                 untie_r=True,\n                 init=\"normal\",\n                 init_range=0.01,\n                 proj_init_std=0.01,\n                 init_std=0.02):\n        \"\"\"Constructs TransfoXLConfig.\n\n        Args:\n            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.\n            cutoffs: cutoffs for the adaptive softmax\n            d_model: Dimensionality of the model's hidden states.\n            d_embed: Dimensionality of the embeddings\n            d_head: Dimensionality of the model's heads.\n            div_val: divident value for adapative input and softmax\n            pre_lnorm: apply LayerNorm to the input instead of the output\n            d_inner: Inner dimension in FF\n            n_layer: Number of hidden layers in the Transformer encoder.\n            n_head: Number of attention heads for each attention layer in\n                the Transformer encoder.\n            tgt_len: number of tokens 
to predict\n            ext_len: length of the extended context\n            mem_len: length of the retained previous heads\n            same_length: use the same attn length for all tokens\n            proj_share_all_but_first: True to share all but first projs, False not to share.\n            attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.\n            clamp_len: use the same pos embeddings after clamp_len\n            sample_softmax: number of samples in sampled softmax\n            adaptive: use adaptive softmax\n            tie_weight: tie the word embedding and softmax weights\n            dropout: The dropout probabilitiy for all fully connected\n                layers in the embeddings, encoder, and pooler.\n            dropatt: The dropout ratio for the attention probabilities.\n            untie_r: untie relative position biases           \n            embd_pdrop: The dropout ratio for the embeddings.\n            init: parameter initializer to use\n            init_range: parameters initialized by U(-init_range, init_range).\n            proj_init_std: parameters initialized by N(0, init_std)\n            init_std: parameters initialized by N(0, init_std)\n        \"\"\"\n        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2\n                        and isinstance(vocab_size_or_config_json_file, unicode)):\n            with open(vocab_size_or_config_json_file, \"r\", encoding='utf-8') as reader:\n                json_config = json.loads(reader.read())\n            for key, value in json_config.items():\n                self.__dict__[key] = value\n        elif isinstance(vocab_size_or_config_json_file, int):\n            self.n_token = vocab_size_or_config_json_file\n            self.cutoffs = []\n            self.cutoffs.extend(cutoffs)\n            self.tie_weight = tie_weight\n            if proj_share_all_but_first:\n                self.tie_projs = [False] 
+ [True] * len(self.cutoffs)\n            else:\n                self.tie_projs = [False] + [False] * len(self.cutoffs)\n            self.d_model = d_model\n            self.d_embed = d_embed\n            self.d_head = d_head\n            self.d_inner = d_inner\n            self.div_val = div_val\n            self.pre_lnorm = pre_lnorm\n            self.n_layer = n_layer\n            self.n_head = n_head\n            self.tgt_len = tgt_len\n            self.ext_len = ext_len\n            self.mem_len = mem_len\n            self.same_length = same_length\n            self.attn_type = attn_type\n            self.clamp_len = clamp_len\n            self.sample_softmax = sample_softmax\n            self.adaptive = adaptive\n            self.dropout = dropout\n            self.dropatt = dropatt\n            self.untie_r = untie_r\n            self.init = init\n            self.init_range = init_range\n            self.proj_init_std = proj_init_std\n            self.init_std = init_std\n        else:\n            raise ValueError(\"First argument must be either a vocabulary size (int)\"\n                             \"or the path to a pretrained model config file (str)\")\n\n    @classmethod\n    def from_dict(cls, json_object):\n        \"\"\"Constructs a `TransfoXLConfig` from a Python dictionary of parameters.\"\"\"\n        config = TransfoXLConfig(vocab_size_or_config_json_file=-1)\n        for key, value in json_object.items():\n            config.__dict__[key] = value\n        return config\n\n    @classmethod\n    def from_json_file(cls, json_file):\n        \"\"\"Constructs a `TransfoXLConfig` from a json file of parameters.\"\"\"\n        with open(json_file, \"r\", encoding='utf-8') as reader:\n            text = reader.read()\n        return cls.from_dict(json.loads(text))\n\n    def __repr__(self):\n        return str(self.to_json_string())\n\n    def to_dict(self):\n        \"\"\"Serializes this instance to a Python dictionary.\"\"\"\n        output = 
copy.deepcopy(self.__dict__)\n        return output\n\n    def to_json_string(self):\n        \"\"\"Serializes this instance to a JSON string.\"\"\"\n        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + \"\\n\"\n\n    def to_json_file(self, json_file_path):\n        \"\"\" Save this instance to a json file.\"\"\"\n        with open(json_file_path, \"w\", encoding='utf-8') as writer:\n            writer.write(self.to_json_string())\n\n\nclass PositionalEmbedding(nn.Module):\n    def __init__(self, demb):\n        super(PositionalEmbedding, self).__init__()\n\n        self.demb = demb\n\n        inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))\n        self.register_buffer('inv_freq', inv_freq)\n\n    def forward(self, pos_seq, bsz=None):\n        sinusoid_inp = torch.ger(pos_seq, self.inv_freq)\n        pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)\n\n        if bsz is not None:\n            return pos_emb[:,None,:].expand(-1, bsz, -1)\n        else:\n            return pos_emb[:,None,:]\n\n\nclass PositionwiseFF(nn.Module):\n    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False):\n        super(PositionwiseFF, self).__init__()\n\n        self.d_model = d_model\n        self.d_inner = d_inner\n        self.dropout = dropout\n\n        self.CoreNet = nn.Sequential(\n            nn.Linear(d_model, d_inner), nn.ReLU(inplace=True),\n            nn.Dropout(dropout),\n            nn.Linear(d_inner, d_model),\n            nn.Dropout(dropout),\n        )\n\n        self.layer_norm = LayerNorm(d_model)\n\n        self.pre_lnorm = pre_lnorm\n\n    def forward(self, inp):\n        if self.pre_lnorm:\n            ##### layer normalization + positionwise feed-forward\n            core_out = self.CoreNet(self.layer_norm(inp))\n\n            ##### residual connection\n            output = core_out + inp\n        else:\n            ##### positionwise feed-forward\n            core_out = self.CoreNet(inp)\n\n        
    ##### residual connection + layer normalization\n            output = self.layer_norm(inp + core_out)\n\n        return output\n\nclass MultiHeadAttn(nn.Module):\n    def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, \n                 pre_lnorm=False, r_r_bias=None, r_w_bias=None):\n        super(MultiHeadAttn, self).__init__()\n\n        self.n_head = n_head\n        self.d_model = d_model\n        self.d_head = d_head\n        self.dropout = dropout\n\n        self.q_net = nn.Linear(d_model, n_head * d_head, bias=False)\n        self.kv_net = nn.Linear(d_model, 2 * n_head * d_head, bias=False)\n\n        self.drop = nn.Dropout(dropout)\n        self.dropatt = nn.Dropout(dropatt)\n        self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)\n\n        self.layer_norm = LayerNorm(d_model)\n\n        self.scale = 1 / (d_head ** 0.5)\n\n        self.pre_lnorm = pre_lnorm\n\n        if r_r_bias is None or r_w_bias is None: # Biases are not shared\n            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))\n            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))\n        else:\n            self.r_r_bias = r_r_bias\n            self.r_w_bias = r_w_bias\n\n    def forward(self, h, attn_mask=None, mems=None):\n        ##### multihead attention\n        # [hlen x bsz x n_head x d_head]\n\n        if mems is not None:\n            c = torch.cat([mems, h], 0)\n        else:\n            c = h\n\n        if self.pre_lnorm:\n            ##### layer normalization\n            c = self.layer_norm(c)\n\n        head_q = self.q_net(h)\n        head_k, head_v = torch.chunk(self.kv_net(c), 2, -1)\n\n        head_q = head_q.view(h.size(0), h.size(1), self.n_head, self.d_head)\n        head_k = head_k.view(c.size(0), c.size(1), self.n_head, self.d_head)\n        head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head)\n\n        # [qlen x klen x bsz x n_head]\n        attn_score = 
torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k))\n        attn_score.mul_(self.scale)\n        if attn_mask is not None and attn_mask.any().item():\n            if attn_mask.dim() == 2:\n                attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf'))\n            elif attn_mask.dim() == 3:\n                attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf'))\n\n        # [qlen x klen x bsz x n_head]\n        attn_prob = F.softmax(attn_score, dim=1)\n        attn_prob = self.dropatt(attn_prob)\n\n        # [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head]\n        attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v))\n        attn_vec = attn_vec.contiguous().view(\n            attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)\n\n        ##### linear projection\n        attn_out = self.o_net(attn_vec)\n        attn_out = self.drop(attn_out)\n\n        if self.pre_lnorm:\n            ##### residual connection\n            output = h + attn_out\n        else:\n            ##### residual connection + layer normalization\n            output = self.layer_norm(h + attn_out)\n\n        return output\n\nclass RelMultiHeadAttn(nn.Module):\n    def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,\n                 tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,\n                 r_r_bias=None, r_w_bias=None):\n        super(RelMultiHeadAttn, self).__init__()\n\n        self.n_head = n_head\n        self.d_model = d_model\n        self.d_head = d_head\n        self.dropout = dropout\n\n        self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False)\n\n        self.drop = nn.Dropout(dropout)\n        self.dropatt = nn.Dropout(dropatt)\n        self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)\n\n        self.layer_norm = LayerNorm(d_model)\n\n        self.scale = 1 / (d_head ** 0.5)\n\n        self.pre_lnorm = pre_lnorm\n\n        if 
r_r_bias is None or r_w_bias is None: # Biases are not shared\n            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))\n            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))\n        else:\n            self.r_r_bias = r_r_bias\n            self.r_w_bias = r_w_bias\n\n    def _parallelogram_mask(self, h, w, left=False):\n        mask = torch.ones((h, w)).byte()\n        m = min(h, w)\n        mask[:m,:m] = torch.triu(mask[:m,:m])\n        mask[-m:,-m:] = torch.tril(mask[-m:,-m:])\n\n        if left:\n            return mask\n        else:\n            return mask.flip(0)\n\n    def _shift(self, x, qlen, klen, mask, left=False):\n        if qlen > 1:\n            zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)),\n                                    device=x.device, dtype=x.dtype)\n        else:\n            zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype)\n\n        if left:\n            mask = mask.flip(1)\n            x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1)\n        else:\n            x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1)\n\n        x = x_padded.masked_select(mask[:,:,None,None]) \\\n                    .view(qlen, klen, x.size(2), x.size(3))\n\n        return x\n\n    def _rel_shift(self, x, zero_triu=False):\n        zero_pad_shape = (x.size(0), 1) + x.size()[2:]\n        zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype)\n        x_padded = torch.cat([zero_pad, x], dim=1)\n\n        x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:]\n        x_padded = x_padded.view(*x_padded_shape)\n\n        x = x_padded[1:].view_as(x)\n\n        if zero_triu:\n            ones = torch.ones((x.size(0), x.size(1)))\n            x = x * torch.tril(ones, x.size(1) - x.size(0))[:,:,None,None]\n\n        return x\n\n    def forward(self, w, r, attn_mask=None, mems=None):\n        raise NotImplementedError\n\nclass 
RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):\n    def __init__(self, *args, **kwargs):\n        super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs)\n\n        self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)\n\n    def forward(self, w, r, attn_mask=None, mems=None):\n        qlen, rlen, bsz = w.size(0), r.size(0), w.size(1)\n\n        if mems is not None:\n            cat = torch.cat([mems, w], 0)\n            if self.pre_lnorm:\n                w_heads = self.qkv_net(self.layer_norm(cat))\n            else:\n                w_heads = self.qkv_net(cat)\n            r_head_k = self.r_net(r)\n\n            w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)\n            w_head_q = w_head_q[-qlen:]\n        else:\n            if self.pre_lnorm:\n                w_heads = self.qkv_net(self.layer_norm(w))\n            else:\n                w_heads = self.qkv_net(w)\n            r_head_k = self.r_net(r)\n\n            w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)\n\n        klen = w_head_k.size(0)\n\n        w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head)           # qlen x bsz x n_head x d_head\n        w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head)           # qlen x bsz x n_head x d_head\n        w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head)           # qlen x bsz x n_head x d_head\n\n        r_head_k = r_head_k.view(rlen, self.n_head, self.d_head)                # qlen x n_head x d_head\n\n        #### compute attention score\n        rw_head_q = w_head_q + self.r_w_bias                                    # qlen x bsz x n_head x d_head\n        AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k))             # qlen x klen x bsz x n_head\n\n        rr_head_q = w_head_q + self.r_r_bias\n        BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k))              # qlen x klen x bsz x n_head\n        BD = self._rel_shift(BD)\n\n       
 # [qlen x klen x bsz x n_head]\n        attn_score = AC + BD\n        attn_score.mul_(self.scale)\n\n        #### compute attention probability\n        if attn_mask is not None and attn_mask.any().item():\n            if attn_mask.dim() == 2:\n                attn_score = attn_score.float().masked_fill(\n                    attn_mask[None,:,:,None], -1e30).type_as(attn_score)\n            elif attn_mask.dim() == 3:\n                attn_score = attn_score.float().masked_fill(\n                    attn_mask[:,:,:,None], -1e30).type_as(attn_score)\n\n        # [qlen x klen x bsz x n_head]\n        attn_prob = F.softmax(attn_score, dim=1)\n        attn_prob = self.dropatt(attn_prob)\n\n        #### compute attention vector\n        attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v))\n\n        # [qlen x bsz x n_head x d_head]\n        attn_vec = attn_vec.contiguous().view(\n            attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)\n\n        ##### linear projection\n        attn_out = self.o_net(attn_vec)\n        attn_out = self.drop(attn_out)\n\n        if self.pre_lnorm:\n            ##### residual connection\n            output = w + attn_out\n        else:\n            ##### residual connection + layer normalization\n            output = self.layer_norm(w + attn_out)\n\n        return output\n\nclass RelLearnableMultiHeadAttn(RelMultiHeadAttn):\n    def __init__(self, *args, **kwargs):\n        super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs)\n\n    def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None):\n        # r_emb: [klen, n_head, d_head], used for term B\n        # r_w_bias: [n_head, d_head], used for term C\n        # r_bias: [klen, n_head], used for term D\n\n        qlen, bsz = w.size(0), w.size(1)\n\n        if mems is not None:\n            cat = torch.cat([mems, w], 0)\n            if self.pre_lnorm:\n                w_heads = self.qkv_net(self.layer_norm(cat))\n            else:\n    
            w_heads = self.qkv_net(cat)\n            w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)\n\n            w_head_q = w_head_q[-qlen:]\n        else:\n            if self.pre_lnorm:\n                w_heads = self.qkv_net(self.layer_norm(w))\n            else:\n                w_heads = self.qkv_net(w)\n            w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)\n\n        klen = w_head_k.size(0)\n\n        w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head)\n        w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head)\n        w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head)\n\n        if klen > r_emb.size(0):\n            r_emb_pad = r_emb[0:1].expand(klen-r_emb.size(0), -1, -1)\n            r_emb = torch.cat([r_emb_pad, r_emb], 0)\n            r_bias_pad = r_bias[0:1].expand(klen-r_bias.size(0), -1)\n            r_bias = torch.cat([r_bias_pad, r_bias], 0)\n        else:\n            r_emb = r_emb[-klen:]\n            r_bias = r_bias[-klen:]\n\n        #### compute attention score\n        rw_head_q = w_head_q + r_w_bias[None]                                   # qlen x bsz x n_head x d_head\n\n        AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k))             # qlen x klen x bsz x n_head\n        B_ = torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb))                  # qlen x klen x bsz x n_head\n        D_ = r_bias[None, :, None]                                              # 1    x klen x 1   x n_head\n        BD = self._rel_shift(B_ + D_)\n\n        # [qlen x klen x bsz x n_head]\n        attn_score = AC + BD\n        attn_score.mul_(self.scale)\n\n        #### compute attention probability\n        if attn_mask is not None and attn_mask.any().item():\n            if attn_mask.dim() == 2:\n                attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf'))\n            elif attn_mask.dim() == 3:\n                attn_score.masked_fill_(attn_mask[:,:,:,None], 
-float('inf'))\n\n        # [qlen x klen x bsz x n_head]\n        attn_prob = F.softmax(attn_score, dim=1)\n        attn_prob = self.dropatt(attn_prob)\n\n        #### compute attention vector\n        attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v))\n\n        # [qlen x bsz x n_head x d_head]\n        attn_vec = attn_vec.contiguous().view(\n            attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)\n\n        ##### linear projection\n        attn_out = self.o_net(attn_vec)\n        attn_out = self.drop(attn_out)\n\n        if self.pre_lnorm:\n            ##### residual connection\n            output = w + attn_out\n        else:\n            ##### residual connection + layer normalization\n            output = self.layer_norm(w + attn_out)\n\n        return output\n\nclass DecoderLayer(nn.Module):\n    def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs):\n        super(DecoderLayer, self).__init__()\n\n        self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs)\n        self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, \n                                     pre_lnorm=kwargs.get('pre_lnorm'))\n\n    def forward(self, dec_inp, dec_attn_mask=None, mems=None):\n\n        output = self.dec_attn(dec_inp, attn_mask=dec_attn_mask,\n                               mems=mems)\n        output = self.pos_ff(output)\n\n        return output\n\nclass RelLearnableDecoderLayer(nn.Module):\n    def __init__(self, n_head, d_model, d_head, d_inner, dropout,\n                 **kwargs):\n        super(RelLearnableDecoderLayer, self).__init__()\n\n        self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout,\n                                         **kwargs)\n        self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, \n                                     pre_lnorm=kwargs.get('pre_lnorm'))\n\n    def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, 
mems=None):\n\n        output = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias,\n                               attn_mask=dec_attn_mask,\n                               mems=mems)\n        output = self.pos_ff(output)\n\n        return output\n\nclass RelPartialLearnableDecoderLayer(nn.Module):\n    def __init__(self, n_head, d_model, d_head, d_inner, dropout,\n                 **kwargs):\n        super(RelPartialLearnableDecoderLayer, self).__init__()\n\n        self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model,\n                            d_head, dropout, **kwargs)\n        self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, \n                                     pre_lnorm=kwargs.get('pre_lnorm'))\n\n    def forward(self, dec_inp, r, dec_attn_mask=None, mems=None):\n\n        output = self.dec_attn(dec_inp, r,\n                               attn_mask=dec_attn_mask,\n                               mems=mems)\n        output = self.pos_ff(output)\n\n        return output\n\n\nclass AdaptiveEmbedding(nn.Module):\n    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, \n                 sample_softmax=False):\n        super(AdaptiveEmbedding, self).__init__()\n\n        self.n_token = n_token\n        self.d_embed = d_embed\n\n        self.cutoffs = cutoffs + [n_token]\n        self.div_val = div_val\n        self.d_proj = d_proj\n\n        self.emb_scale = d_proj ** 0.5\n\n        self.cutoff_ends = [0] + self.cutoffs\n\n        self.emb_layers = nn.ModuleList()\n        self.emb_projs = nn.ParameterList()\n        if div_val == 1:\n            self.emb_layers.append(\n                nn.Embedding(n_token, d_embed, sparse=sample_softmax>0)\n            )\n            if d_proj != d_embed:\n                self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed)))\n        else:\n            for i in range(len(self.cutoffs)):\n                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]\n                d_emb_i 
= d_embed // (div_val ** i)\n                self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i))\n                self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i)))\n\n    def forward(self, inp):\n        if self.div_val == 1:\n            embed = self.emb_layers[0](inp)\n            if self.d_proj != self.d_embed:\n                embed  = F.linear(embed, self.emb_projs[0])\n        else:\n            param = next(self.parameters())\n            inp_flat = inp.view(-1)\n            emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], \n                dtype=param.dtype, device=param.device)\n            for i in range(len(self.cutoffs)):\n                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]\n\n                mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)\n                indices_i = mask_i.nonzero().squeeze()\n\n                if indices_i.numel() == 0:\n                    continue\n\n                inp_i = inp_flat.index_select(0, indices_i) - l_idx\n                emb_i = self.emb_layers[i](inp_i)\n                emb_i = F.linear(emb_i, self.emb_projs[i])\n\n                emb_flat.index_copy_(0, indices_i, emb_i)\n\n            embed_shape = inp.size() + (self.d_proj,)\n            embed = emb_flat.view(embed_shape)\n\n        embed.mul_(self.emb_scale)\n\n        return embed\n\n\nclass TransfoXLPreTrainedModel(nn.Module):\n    \"\"\" An abstract class to handle weights initialization and\n        a simple interface for dowloading and loading pretrained models.\n    \"\"\"\n    def __init__(self, config, *inputs, **kwargs):\n        super(TransfoXLPreTrainedModel, self).__init__()\n        if not isinstance(config, TransfoXLConfig):\n            raise ValueError(\n                \"Parameter config in `{}(config)` should be an instance of class `TransfoXLConfig`. 
\"\n                \"To create a model from a pretrained model use \"\n                \"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`\".format(\n                    self.__class__.__name__, self.__class__.__name__\n                ))\n        self.config = config\n\n    def init_weight(self, weight):\n        if self.config.init == 'uniform':\n            nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)\n        elif self.config.init == 'normal':\n            nn.init.normal_(weight, 0.0, self.config.init_std)\n\n    def init_bias(self, bias):\n        nn.init.constant_(bias, 0.0)\n\n    def init_weights(self, m):\n        \"\"\" Initialize the weights.\n        \"\"\"\n        classname = m.__class__.__name__\n        if classname.find('Linear') != -1:\n            if hasattr(m, 'weight') and m.weight is not None:\n                self.init_weight(m.weight)\n            if hasattr(m, 'bias') and m.bias is not None:\n                self.init_bias(m.bias)\n        elif classname.find('AdaptiveEmbedding') != -1:\n            if hasattr(m, 'emb_projs'):\n                for i in range(len(m.emb_projs)):\n                    if m.emb_projs[i] is not None:\n                        nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std)\n        elif classname.find('Embedding') != -1:\n            if hasattr(m, 'weight'):\n                self.init_weight(m.weight)\n        elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:\n            if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:\n                self.init_weight(m.cluster_weight)\n            if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:\n                self.init_bias(m.cluster_bias)\n            if hasattr(m, 'out_projs'):\n                for i in range(len(m.out_projs)):\n                    if m.out_projs[i] is not None:\n                        nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std)\n        elif 
classname.find('LayerNorm') != -1:\n            if hasattr(m, 'weight'):\n                nn.init.normal_(m.weight, 1.0, self.config.init_std)\n            if hasattr(m, 'bias') and m.bias is not None:\n                self.init_bias(m.bias)\n        elif classname.find('TransformerLM') != -1:\n            if hasattr(m, 'r_emb'):\n                self.init_weight(m.r_emb)\n            if hasattr(m, 'r_w_bias'):\n                self.init_weight(m.r_w_bias)\n            if hasattr(m, 'r_r_bias'):\n                self.init_weight(m.r_r_bias)\n            if hasattr(m, 'r_bias'):\n                self.init_bias(m.r_bias)\n\n    def set_num_special_tokens(self, num_special_tokens):\n        pass\n\n    @classmethod\n    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):\n        \"\"\"\n        Instantiate a TransfoXLPreTrainedModel from a pre-trained model file or a pytorch state dict.\n        Download and cache the pre-trained model file if needed.\n\n        Params:\n            pretrained_model_name_or_path: either:\n                - a str with the name of a pre-trained model to load selected in the list of:\n                    . `transfo-xl-wt103`\n                - a path or url to a pretrained model archive containing:\n                    . `transfo_xl_config.json` a configuration file for the model\n                    . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance\n                - a path or url to a pretrained model archive containing:\n                    . `transfo_xl_config.json` a configuration file for the model\n                    . 
`model.chkpt` a TensorFlow checkpoint\n            from_tf: should we load the weights from a locally saved TensorFlow checkpoint\n            cache_dir: an optional path to a folder in which the pre-trained models will be cached.\n            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models\n            *inputs, **kwargs: additional input for the specific TransformerXL class\n        \"\"\"\n        state_dict = kwargs.get('state_dict', None)\n        kwargs.pop('state_dict', None)\n        cache_dir = kwargs.get('cache_dir', None)\n        kwargs.pop('cache_dir', None)\n        from_tf = kwargs.get('from_tf', False)\n        kwargs.pop('from_tf', None)\n\n        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:\n            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]\n            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]\n        else:\n            archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)\n            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)\n        # redirect to the cache, if necessary\n        try:\n            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)\n        except EnvironmentError:\n            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:\n                logger.error(\n                    \"Couldn't reach server at '{}' to download pretrained weights.\".format(\n                        archive_file))\n            else:\n                logger.error(\n                    \"Model name '{}' was not found in model name list ({}). 
\"\n                    \"We assumed '{}' was a path or url but couldn't find file {} \"\n                    \"at this path or url.\".format(\n                        pretrained_model_name_or_path, \", \".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,\n                        archive_file\n                    )\n                )\n            return None\n        try:\n            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)\n        except EnvironmentError:\n            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:\n                logger.error(\n                    \"Couldn't reach server at '{}' to download pretrained model configuration file.\".format(\n                        config_file))\n            else:\n                logger.error(\n                    \"Model name '{}' was not found in model name list ({}). \"\n                    \"We assumed '{}' was a path or url but couldn't find file {} \"\n                    \"at this path or url.\".format(\n                        pretrained_model_name_or_path, \", \".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,\n                        config_file\n                    )\n                )\n            return None\n        if resolved_archive_file == archive_file and resolved_config_file == config_file:\n            logger.info(\"loading weights file {}\".format(archive_file))\n            logger.info(\"loading configuration file {}\".format(config_file))\n        else:\n            logger.info(\"loading weights file {} from cache at {}\".format(\n                archive_file, resolved_archive_file))\n            logger.info(\"loading configuration file {} from cache at {}\".format(\n                config_file, resolved_config_file))\n        # Load config\n        config = TransfoXLConfig.from_json_file(resolved_config_file)\n        logger.info(\"Model config {}\".format(config))\n        # Instantiate 
model.\n        model = cls(config, *inputs, **kwargs)\n        if state_dict is None and not from_tf:\n            state_dict = torch.load(resolved_archive_file, map_location='cpu')\n        if from_tf:\n            # Directly load from a TensorFlow checkpoint\n            return load_tf_weights_in_transfo_xl(model, config, pretrained_model_name_or_path)\n\n        missing_keys = []\n        unexpected_keys = []\n        error_msgs = []\n        # copy state_dict so _load_from_state_dict can modify it\n        metadata = getattr(state_dict, '_metadata', None)\n        state_dict = state_dict.copy()\n        if metadata is not None:\n            state_dict._metadata = metadata\n\n        def load(module, prefix=''):\n            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})\n            module._load_from_state_dict(\n                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)\n            for name, child in module._modules.items():\n                if child is not None:\n                    load(child, prefix + name + '.')\n\n        start_prefix = ''\n        if not hasattr(model, 'transformer') and any(s.startswith('transformer.') for s in state_dict.keys()):\n            start_prefix = 'transformer.'\n        load(model, prefix=start_prefix)\n\n        if len(missing_keys) > 0:\n            logger.info(\"Weights of {} not initialized from pretrained model: {}\".format(\n                model.__class__.__name__, missing_keys))\n        if len(unexpected_keys) > 0:\n            logger.info(\"Weights from pretrained model not used in {}: {}\".format(\n                model.__class__.__name__, unexpected_keys))\n        if len(error_msgs) > 0:\n            raise RuntimeError('Error(s) in loading state_dict for {}:\\n\\t{}'.format(\n                               model.__class__.__name__, \"\\n\\t\".join(error_msgs)))\n        # Make sure we are still sharing the input and output embeddings\n       
 if hasattr(model, 'tie_weights'):\n            model.tie_weights()\n        return model\n\n\nclass TransfoXLModel(TransfoXLPreTrainedModel):\n    \"\"\"Transformer XL model (\"Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context\").\n\n    Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that:\n    - you don't need to specify positioning embeddings indices\n    - the tokens in the vocabulary have to be sorted to decreasing frequency.\n\n    Params:\n        config: a TransfoXLConfig class instance with the configuration to build a new model\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]\n            with the token indices selected in the range [0, self.config.n_token[\n        `mems`: optional memomry of hidden states from previous forward passes\n            as a list (num layers) of hidden states at the entry of each layer\n            each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]\n            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`\n    Outputs:\n        A tuple of (last_hidden_state, new_mems)\n        `last_hidden_state`: the encoded-hidden-states at the top of the model\n            as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model]\n        `new_mems`: list (num layers) of updated mem states at the entry of each layer\n            each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]\n            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`\n\n    Example usage:\n    ```python\n    # Already been converted into BPE token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n    input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])\n\n    config = TransfoXLConfig()\n\n    model = 
TransfoXLModel(config)\n    last_hidden_state, new_mems = model(input_ids)\n\n    # Another time on input_ids_next using the memory:\n    last_hidden_state, new_mems = model(input_ids_next, new_mems)\n    ```\n    \"\"\"\n    def __init__(self, config):\n        super(TransfoXLModel, self).__init__(config)\n        self.n_token = config.n_token\n\n        self.d_embed = config.d_embed\n        self.d_model = config.d_model\n        self.n_head = config.n_head\n        self.d_head = config.d_head\n\n        self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, \n                                          div_val=config.div_val)\n\n        self.drop = nn.Dropout(config.dropout)\n\n        self.n_layer = config.n_layer\n\n        self.tgt_len = config.tgt_len\n        self.mem_len = config.mem_len\n        self.ext_len = config.ext_len\n        self.max_klen = config.tgt_len + config.ext_len + config.mem_len\n\n        self.attn_type = config.attn_type\n\n        if not config.untie_r:\n            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))\n            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))\n\n        self.layers = nn.ModuleList()\n        if config.attn_type == 0: # the default attention\n            for i in range(config.n_layer):\n                self.layers.append(\n                    RelPartialLearnableDecoderLayer(\n                        config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,\n                        tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,\n                        dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,\n                        r_w_bias=None if config.untie_r else self.r_w_bias,\n                        r_r_bias=None if config.untie_r else self.r_r_bias)\n                )\n        elif config.attn_type == 1: # learnable embeddings\n            for i in range(config.n_layer):\n   
             self.layers.append(\n                    RelLearnableDecoderLayer(\n                        config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,\n                        tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,\n                        dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,\n                        r_w_bias=None if config.untie_r else self.r_w_bias,\n                        r_r_bias=None if config.untie_r else self.r_r_bias)\n                )\n        elif config.attn_type in [2, 3]: # absolute embeddings\n            for i in range(config.n_layer):\n                self.layers.append(\n                    DecoderLayer(\n                        config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,\n                        dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,\n                        r_w_bias=None if config.untie_r else self.r_w_bias,\n                        r_r_bias=None if config.untie_r else self.r_r_bias)\n                )\n\n        self.same_length = config.same_length\n        self.clamp_len = config.clamp_len\n\n        if self.attn_type == 0: # default attention\n            self.pos_emb = PositionalEmbedding(self.d_model)\n        elif self.attn_type == 1: # learnable\n            self.r_emb = nn.Parameter(torch.Tensor(\n                    self.n_layer, self.max_klen, self.n_head, self.d_head))\n            self.r_bias = nn.Parameter(torch.Tensor(\n                    self.n_layer, self.max_klen, self.n_head))\n        elif self.attn_type == 2: # absolute standard\n            self.pos_emb = PositionalEmbedding(self.d_model)\n        elif self.attn_type == 3: # absolute deeper SA\n            self.r_emb = nn.Parameter(torch.Tensor(\n                    self.n_layer, self.max_klen, self.n_head, self.d_head))\n        self.apply(self.init_weights)\n\n    def backward_compatible(self):\n        self.sample_softmax = -1\n\n\n    def 
reset_length(self, tgt_len, ext_len, mem_len):\n        self.tgt_len = tgt_len\n        self.mem_len = mem_len\n        self.ext_len = ext_len\n\n    def init_mems(self, data):\n        if self.mem_len > 0:\n            mems = []\n            param = next(self.parameters())\n            for i in range(self.n_layer):\n                empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model,\n                                    dtype=param.dtype, device=param.device)\n                mems.append(empty)\n\n            return mems\n        else:\n            return None\n\n    def _update_mems(self, hids, mems, qlen, mlen):\n        # does not deal with None\n        if mems is None: return None\n\n        # mems is not None\n        assert len(hids) == len(mems), 'len(hids) != len(mems)'\n\n        # There are `mlen + qlen` steps that can be cached into mems\n        # For the next step, the last `ext_len` of the `qlen` tokens\n        # will be used as the extended context. Hence, we only cache\n        # the tokens from `mlen + qlen - self.ext_len - self.mem_len`\n        # to `mlen + qlen - self.ext_len`.\n        with torch.no_grad():\n            new_mems = []\n            end_idx = mlen + max(0, qlen - 0 - self.ext_len)\n            beg_idx = max(0, end_idx - self.mem_len)\n            for i in range(len(hids)):\n\n                cat = torch.cat([mems[i], hids[i]], dim=0)\n                new_mems.append(cat[beg_idx:end_idx].detach())\n\n        return new_mems\n\n    def _forward(self, dec_inp, mems=None):\n        qlen, bsz = dec_inp.size()\n\n        word_emb = self.word_emb(dec_inp)\n\n        mlen = mems[0].size(0) if mems is not None else 0\n        klen = mlen + qlen\n        if self.same_length:\n            all_ones = word_emb.new_ones(qlen, klen)\n            mask_len = klen - self.mem_len\n            if mask_len > 0:\n                mask_shift_len = qlen - mask_len\n            else:\n                mask_shift_len = qlen\n            
dec_attn_mask = (torch.triu(all_ones, 1+mlen)\n                    + torch.tril(all_ones, -mask_shift_len)).byte()[:, :, None] # -1\n        else:\n            dec_attn_mask = torch.triu(\n                word_emb.new_ones(qlen, klen), diagonal=1+mlen).byte()[:,:,None]\n\n        hids = []\n        if self.attn_type == 0: # default\n            pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, \n                                   dtype=word_emb.dtype)\n            if self.clamp_len > 0:\n                pos_seq.clamp_(max=self.clamp_len)\n            pos_emb = self.pos_emb(pos_seq)\n\n            core_out = self.drop(word_emb)\n            pos_emb = self.drop(pos_emb)\n\n            for i, layer in enumerate(self.layers):\n                hids.append(core_out)\n                mems_i = None if mems is None else mems[i]\n                core_out = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i)\n        elif self.attn_type == 1: # learnable\n            core_out = self.drop(word_emb)\n            for i, layer in enumerate(self.layers):\n                hids.append(core_out)\n                if self.clamp_len > 0:\n                    r_emb = self.r_emb[i][-self.clamp_len :]\n                    r_bias = self.r_bias[i][-self.clamp_len :]\n                else:\n                    r_emb, r_bias = self.r_emb[i], self.r_bias[i]\n\n                mems_i = None if mems is None else mems[i]\n                core_out = layer(core_out, r_emb, self.r_w_bias[i],\n                        r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i)\n        elif self.attn_type == 2: # absolute\n            pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device,\n                                   dtype=word_emb.dtype)\n            if self.clamp_len > 0:\n                pos_seq.clamp_(max=self.clamp_len)\n            pos_emb = self.pos_emb(pos_seq)\n\n            core_out = self.drop(word_emb + pos_emb[-qlen:])\n\n            for i, layer in 
enumerate(self.layers):\n                hids.append(core_out)\n                mems_i = None if mems is None else mems[i]\n                if mems_i is not None and i == 0:\n                    mems_i += pos_emb[:mlen]\n                core_out = layer(core_out, dec_attn_mask=dec_attn_mask,\n                                 mems=mems_i)\n        elif self.attn_type == 3:\n            core_out = self.drop(word_emb)\n\n            for i, layer in enumerate(self.layers):\n                hids.append(core_out)\n                mems_i = None if mems is None else mems[i]\n                if mems_i is not None and mlen > 0:\n                    cur_emb = self.r_emb[i][:-qlen]\n                    cur_size = cur_emb.size(0)\n                    if cur_size < mlen:\n                        cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1)\n                        cur_emb = torch.cat([cur_emb_pad, cur_emb], 0)\n                    else:\n                        cur_emb = cur_emb[-mlen:]\n                    mems_i += cur_emb.view(mlen, 1, -1)\n                core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1)\n\n                core_out = layer(core_out, dec_attn_mask=dec_attn_mask,\n                                 mems=mems_i)\n\n        core_out = self.drop(core_out)\n\n        new_mems = self._update_mems(hids, mems, mlen, qlen)\n\n        return core_out, new_mems\n\n    def forward(self, input_ids, mems=None):\n        \"\"\" Params:\n                input_ids :: [bsz, len]\n                mems :: optional mems from previous forwar passes (or init_mems)\n                    list (num layers) of mem states at the entry of each layer\n                        shape :: [self.config.mem_len, bsz, self.config.d_model]\n                    Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`\n            Returns:\n                tuple (last_hidden, new_mems) where:\n                    new_mems: list (num layers) of 
mem states at the entry of each layer\n                        shape :: [self.config.mem_len, bsz, self.config.d_model]\n                    last_hidden: output of the last layer:\n                        shape :: [bsz, len, self.config.d_model]\n        \"\"\"\n        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library\n        # so we transpose here from shape [bsz, len] to shape [len, bsz]\n        input_ids = input_ids.transpose(0, 1).contiguous()\n\n        if mems is None:\n            mems = self.init_mems(input_ids)\n        last_hidden, new_mems = self._forward(input_ids, mems=mems)\n\n        # We transpose back here to shape [bsz, len, hidden_dim]\n        last_hidden = last_hidden.transpose(0, 1).contiguous()\n        return (last_hidden, new_mems)\n\n\nclass TransfoXLLMHeadModel(TransfoXLPreTrainedModel):\n    \"\"\"Transformer XL model (\"Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context\").\n\n    This model add an (adaptive) softmax head on top of the TransfoXLModel\n\n    Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that:\n    - you don't need to specify positioning embeddings indices\n    - the tokens in the vocabulary have to be sorted to decreasing frequency.\n\n    Call self.tie_weights() if you update/load the weights of the transformer to keep the weights tied.\n\n    Params:\n        config: a TransfoXLConfig class instance with the configuration to build a new model\n\n    Inputs:\n        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]\n            with the token indices selected in the range [0, self.config.n_token[\n        `target`: an optional torch.LongTensor of shape [batch_size, sequence_length]\n            with the target token indices selected in the range [0, self.config.n_token[\n        `mems`: an optional memory of hidden states from previous forward passes\n    
        as a list (num layers) of hidden states at the entry of each layer\n            each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]\n            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`\n\n    Outputs:\n        A tuple of (last_hidden_state, new_mems)\n        `softmax_output`: output of the (adaptive) softmax:\n            if target is None:\n                Negative log likelihood of shape [batch_size, sequence_length] \n            else:\n                log probabilities of tokens, shape [batch_size, sequence_length, n_tokens]\n        `new_mems`: list (num layers) of updated mem states at the entry of each layer\n            each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]\n            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`\n\n    Example usage:\n    ```python\n    # Already been converted into BPE token ids\n    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])\n    input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])\n\n    config = TransfoXLConfig()\n\n    model = TransfoXLModel(config)\n    last_hidden_state, new_mems = model(input_ids)\n\n    # Another time on input_ids_next using the memory:\n    last_hidden_state, new_mems = model(input_ids_next, mems=new_mems)\n    ```\n    \"\"\"\n    def __init__(self, config):\n        super(TransfoXLLMHeadModel, self).__init__(config)\n        self.transformer = TransfoXLModel(config)\n        self.sample_softmax = config.sample_softmax\n        # use sampled softmax\n        if config.sample_softmax > 0:\n            self.out_layer = nn.Linear(config.d_model, config.n_token)\n            self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)\n        # use adaptive softmax (including standard softmax)\n        else:\n            self.crit = 
ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model, \n                                                    config.cutoffs, div_val=config.div_val)\n        self.apply(self.init_weights)\n        self.tie_weights()\n\n    def tie_weights(self):\n        \"\"\" Run this to be sure output and input (adaptive) softmax weights are tied \"\"\"\n        # sampled softmax\n        if self.sample_softmax > 0:\n            if self.config.tie_weight:\n                self.out_layer.weight = self.transformer.word_emb.weight\n        # adaptive softmax (including standard softmax)\n        else:\n            if self.config.tie_weight:\n                for i in range(len(self.crit.out_layers)):\n                    self.crit.out_layers[i].weight = self.transformer.word_emb.emb_layers[i].weight\n            if self.config.tie_projs:\n                for i, tie_proj in enumerate(self.config.tie_projs):\n                    if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:\n                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]\n                    elif tie_proj and self.config.div_val != 1:\n                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]\n\n    def reset_length(self, tgt_len, ext_len, mem_len):\n        self.transformer.reset_length(tgt_len, ext_len, mem_len)\n\n    def init_mems(self, data):\n        return self.transformer.init_mems(data)\n\n    def forward(self, input_ids, target=None, mems=None):\n        \"\"\" Params:\n                input_ids :: [bsz, len]\n                target :: [bsz, len]\n            Returns:\n                tuple(softmax_output, new_mems) where:\n                    new_mems: list (num layers) of hidden states at the entry of each layer\n                        shape :: [mem_len, bsz, self.config.d_model] :: Warning: shapes are transposed here w. 
regards to input_ids\n                    softmax_output: output of the (adaptive) softmax:\n                        if target is None:\n                            Negative log likelihood of shape :: [bsz, len] \n                        else:\n                            log probabilities of tokens, shape :: [bsz, len, n_tokens]\n        \"\"\"\n        bsz = input_ids.size(0)\n        tgt_len = input_ids.size(1)\n\n        last_hidden, new_mems = self.transformer(input_ids, mems)\n\n        pred_hid = last_hidden[:, -tgt_len:]\n        if self.sample_softmax > 0 and self.training:\n            assert self.config.tie_weight\n            logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, target, pred_hid, self.sampler)\n            softmax_output = -F.log_softmax(logit, -1)[:, :, 0]\n        else:\n            softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target)\n            if target is None:\n                softmax_output = softmax_output.view(bsz, tgt_len, -1)\n            else:\n                softmax_output = softmax_output.view(bsz, tgt_len)\n\n        # We transpose back\n        return (softmax_output, new_mems)\n"
  },
  {
    "path": "pytorch_pretrained_bert/modeling_transfo_xl_utilities.py",
    "content": "# coding=utf-8\n# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\" Utilities for PyTorch Transformer XL model.\n    Directly adapted from https://github.com/kimiyoung/transformer-xl.\n\"\"\"\n\nfrom collections import defaultdict\n\nimport numpy as np\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n# CUDA_MAJOR = int(torch.version.cuda.split('.')[0])\n# CUDA_MINOR = int(torch.version.cuda.split('.')[1])\n\nclass ProjectedAdaptiveLogSoftmax(nn.Module):\n    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,\n                 keep_order=False):\n        super(ProjectedAdaptiveLogSoftmax, self).__init__()\n\n        self.n_token = n_token\n        self.d_embed = d_embed\n        self.d_proj = d_proj\n\n        self.cutoffs = cutoffs + [n_token]\n        self.cutoff_ends = [0] + self.cutoffs\n        self.div_val = div_val\n\n        self.shortlist_size = self.cutoffs[0]\n        self.n_clusters = len(self.cutoffs) - 1\n        self.head_size = self.shortlist_size + self.n_clusters\n\n        if self.n_clusters > 0:\n            self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))\n            self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))\n\n        self.out_layers = nn.ModuleList()\n      
  self.out_projs = nn.ParameterList()\n\n        if div_val == 1:\n            for i in range(len(self.cutoffs)):\n                if d_proj != d_embed:\n                    self.out_projs.append(\n                        nn.Parameter(torch.Tensor(d_proj, d_embed))\n                    )\n                else:\n                    self.out_projs.append(None)\n\n            self.out_layers.append(nn.Linear(d_embed, n_token))\n        else:\n            for i in range(len(self.cutoffs)):\n                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]\n                d_emb_i = d_embed // (div_val ** i)\n\n                self.out_projs.append(\n                    nn.Parameter(torch.Tensor(d_proj, d_emb_i))\n                )\n\n                self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx))\n\n        self.keep_order = keep_order\n\n    def _compute_logit(self, hidden, weight, bias, proj):\n        if proj is None:\n            logit = F.linear(hidden, weight, bias=bias)\n        else:\n            # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1:\n            proj_hid = F.linear(hidden, proj.t().contiguous())\n            logit = F.linear(proj_hid, weight, bias=bias)\n            # else:\n            #     logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))\n            #     if bias is not None:\n            #         logit = logit + bias\n\n        return logit\n\n    def forward(self, hidden, target=None, keep_order=False):\n        '''\n            Params:\n                hidden :: [len*bsz x d_proj]\n                target :: [len*bsz]\n            Return:\n                if target is None:\n                    out :: [len*bsz] Negative log likelihood\n                else:\n                    out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary\n            We could replace this implementation by the native PyTorch one\n            if their's had an option to set bias on all clusters in the native one.\n       
     here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138\n        '''\n\n        if target is not None:\n            target = target.view(-1)\n            if hidden.size(0) != target.size(0):\n                raise RuntimeError('Input and target should have the same size '\n                                'in the batch dimension.')\n\n        if self.n_clusters == 0:\n            logit = self._compute_logit(hidden, self.out_layers[0].weight,\n                                        self.out_layers[0].bias, self.out_projs[0])\n            if target is not None:\n                out = -F.log_softmax(logit, dim=-1) \\\n                        .gather(1, target.unsqueeze(1)).squeeze(1)\n            else:\n                out = F.log_softmax(logit, dim=-1)\n        else:\n            # construct weights and biases\n            weights, biases = [], []\n            for i in range(len(self.cutoffs)):\n                if self.div_val == 1:\n                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]\n                    weight_i = self.out_layers[0].weight[l_idx:r_idx]\n                    bias_i = self.out_layers[0].bias[l_idx:r_idx]\n                else:\n                    weight_i = self.out_layers[i].weight\n                    bias_i = self.out_layers[i].bias\n\n                if i == 0:\n                    weight_i = torch.cat(\n                        [weight_i, self.cluster_weight], dim=0)\n                    bias_i = torch.cat(\n                        [bias_i, self.cluster_bias], dim=0)\n\n                weights.append(weight_i)\n                biases.append(bias_i)\n\n            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]\n\n            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)\n            head_logprob = F.log_softmax(head_logit, dim=1)\n\n            if target is None:\n                out = 
hidden.new_empty((head_logit.size(0), self.n_token))\n            else:\n                out = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device)\n\n            offset = 0\n            cutoff_values = [0] + self.cutoffs\n            for i in range(len(cutoff_values) - 1):\n                l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]\n\n                if target is not None:\n                    mask_i = (target >= l_idx) & (target < r_idx)\n                    indices_i = mask_i.nonzero().squeeze()\n\n                    if indices_i.numel() == 0:\n                        continue\n\n                    target_i = target.index_select(0, indices_i) - l_idx\n                    head_logprob_i = head_logprob.index_select(0, indices_i)\n                    hidden_i = hidden.index_select(0, indices_i)\n                else:\n                    hidden_i = hidden\n\n                if i == 0:\n                    if target is not None:\n                        logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)\n                    else:\n                        out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]\n                else:\n                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]\n\n                    tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)\n                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)\n                    cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster\n                    if target is not None:\n                        logprob_i = head_logprob_i[:, cluster_prob_idx] \\\n                                + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)\n                    else:\n                        logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i\n                        out[:, l_idx:r_idx] = logprob_i\n\n                if target is not None:\n             
       if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:\n                        out.index_copy_(0, indices_i, -logprob_i)\n                    else:\n                        out[offset:offset+logprob_i.size(0)].copy_(-logprob_i)\n                    offset += logprob_i.size(0)\n\n        return out\n\n\n    def log_prob(self, hidden):\n        r\"\"\" Computes log probabilities for all :math:`n\\_classes`\n        From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py\n        Args:\n            hidden (Tensor): a minibatch of examples\n        Returns:\n            log-probabilities of for each class :math:`c`\n            in range :math:`0 <= c <= n\\_classes`, where :math:`n\\_classes` is a\n            parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor.\n        Shape:\n            - Input: :math:`(N, in\\_features)`\n            - Output: :math:`(N, n\\_classes)`\n        \"\"\"\n        if self.n_clusters == 0:\n            logit = self._compute_logit(hidden, self.out_layers[0].weight,\n                                        self.out_layers[0].bias, self.out_projs[0])\n            return F.log_softmax(logit, dim=-1)\n        else:\n            # construct weights and biases\n            weights, biases = [], []\n            for i in range(len(self.cutoffs)):\n                if self.div_val == 1:\n                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]\n                    weight_i = self.out_layers[0].weight[l_idx:r_idx]\n                    bias_i = self.out_layers[0].bias[l_idx:r_idx]\n                else:\n                    weight_i = self.out_layers[i].weight\n                    bias_i = self.out_layers[i].bias\n\n                if i == 0:\n                    weight_i = torch.cat(\n                        [weight_i, self.cluster_weight], dim=0)\n                    bias_i = torch.cat(\n                        [bias_i, self.cluster_bias], dim=0)\n\n                
weights.append(weight_i)\n                biases.append(bias_i)\n\n            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]\n            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)\n\n            out = hidden.new_empty((head_logit.size(0), self.n_token))\n            head_logprob = F.log_softmax(head_logit, dim=1)\n\n            cutoff_values = [0] + self.cutoffs\n            for i in range(len(cutoff_values) - 1):\n                start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]\n\n                if i == 0:\n                    out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]\n                else:\n                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]\n\n                    tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)\n                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)\n\n                    logprob_i = head_logprob[:, -i] + tail_logprob_i\n                    out[:, start_idx, stop_idx] = logprob_i\n\n            return out\n\n\nclass LogUniformSampler(object):\n    def __init__(self, range_max, n_sample):\n        \"\"\"\n        Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py\n            `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`\n\n        expected count can be approximated by 1 - (1 - p)^n\n        and we use a numerically stable version -expm1(num_tries * log1p(-p))\n\n        Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run\n        \"\"\"\n        with torch.no_grad():\n            self.range_max = range_max\n            log_indices = torch.arange(1., range_max+2., 1.).log_()\n            self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]\n            # print('P', self.dist.numpy().tolist()[-30:])\n\n            self.log_q = (- 
(-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()\n\n        self.n_sample = n_sample\n\n    def sample(self, labels):\n        \"\"\"\n            labels: [b1, b2]\n        Return\n            true_log_probs: [b1, b2]\n            samp_log_probs: [n_sample]\n            neg_samples: [n_sample]\n        \"\"\"\n\n        # neg_samples = torch.empty(0).long()\n        n_sample = self.n_sample\n        n_tries = 2 * n_sample\n\n        with torch.no_grad():\n            neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique()\n            device = labels.device\n            neg_samples = neg_samples.to(device)\n            true_log_probs = self.log_q[labels].to(device)\n            samp_log_probs = self.log_q[neg_samples].to(device)\n            return true_log_probs, samp_log_probs, neg_samples\n\ndef sample_logits(embedding, bias, labels, inputs, sampler):\n    \"\"\"\n        embedding: an nn.Embedding layer\n        bias: [n_vocab]\n        labels: [b1, b2]\n        inputs: [b1, b2, n_emb]\n        sampler: you may use a LogUniformSampler\n    Return\n        logits: [b1, b2, 1 + n_sample]\n    \"\"\"\n    true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels)\n    n_sample = neg_samples.size(0)\n    b1, b2 = labels.size(0), labels.size(1)\n    all_ids = torch.cat([labels.view(-1), neg_samples])\n    all_w = embedding(all_ids)\n    true_w = all_w[: -n_sample].view(b1, b2, -1)\n    sample_w = all_w[- n_sample:].view(n_sample, -1)\n\n    all_b = bias[all_ids]\n    true_b = all_b[: -n_sample].view(b1, b2)\n    sample_b = all_b[- n_sample:]\n\n    hit = (labels[:, :, None] == neg_samples).detach()\n\n    true_logits = torch.einsum('ijk,ijk->ij',\n        [true_w, inputs]) + true_b - true_log_probs\n    sample_logits = torch.einsum('lk,ijk->ijl',\n        [sample_w, inputs]) + sample_b - samp_log_probs\n    sample_logits.masked_fill_(hit, -1e30)\n    logits = torch.cat([true_logits[:, :, None], sample_logits], 
-1)\n\n    return logits\n\n\n# class LogUniformSampler(object):\n#     def __init__(self, range_max, unique=False):\n#         \"\"\"\n#         Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py\n#             `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`\n#         \"\"\"\n#         self.range_max = range_max\n#         log_indices = torch.arange(1., range_max+2., 1.).log_()\n#         self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]\n\n#         self.unique = unique\n\n#         if self.unique:\n#             self.exclude_mask = torch.ByteTensor(range_max).fill_(0)\n\n#     def sample(self, n_sample, labels):\n#         pos_sample, new_labels = labels.unique(return_inverse=True)\n#         n_pos_sample = pos_sample.size(0)\n#         n_neg_sample = n_sample - n_pos_sample\n\n#         if self.unique:\n#             self.exclude_mask.index_fill_(0, pos_sample, 1)\n#             sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0)\n#             self.exclude_mask.index_fill_(0, pos_sample, 0)\n#         else:\n#             sample_dist = self.dist\n\n#         neg_sample = torch.multinomial(sample_dist, n_neg_sample)\n\n#         sample = torch.cat([pos_sample, neg_sample])\n#         sample_prob = self.dist[sample]\n\n#         return new_labels, sample, sample_prob\n\n\nif __name__ == '__main__':\n    S, B = 3, 4\n    n_vocab = 10000\n    n_sample = 5\n    H = 32\n\n    labels = torch.LongTensor(S, B).random_(0, n_vocab)\n\n    # sampler = LogUniformSampler(n_vocab, unique=False)\n    # new_labels, sample, sample_prob = sampler.sample(n_sample, labels)\n\n    sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True)\n    # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels)\n\n    # print('true_probs', true_probs.numpy().tolist())\n    # print('samp_probs', samp_probs.numpy().tolist())\n    # print('neg_samples', 
neg_samples.numpy().tolist())\n\n    # print('sum', torch.sum(sampler.dist).item())\n\n    # assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item()\n\n    embedding = nn.Embedding(n_vocab, H)\n    bias = torch.zeros(n_vocab)\n    inputs = torch.Tensor(S, B, H).normal_()\n\n    logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample)\n    print('logits', logits.detach().numpy().tolist())\n    print('logits shape', logits.size())\n    print('out_labels', out_labels.detach().numpy().tolist())\n    print('out_labels shape', out_labels.size())\n\n"
  },
  {
    "path": "pytorch_pretrained_bert/optimization.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"PyTorch optimization for BERT model.\"\"\"\n\nimport math\nimport torch\nfrom torch.optim import Optimizer\nfrom torch.optim.optimizer import required\nfrom torch.nn.utils import clip_grad_norm_\nimport logging\nimport abc\nimport sys\n\nlogger = logging.getLogger(__name__)\n\n\nif sys.version_info >= (3, 4):\n    ABC = abc.ABC\nelse:\n    ABC = abc.ABCMeta('ABC', (), {})\n\n\nclass _LRSchedule(ABC):\n    \"\"\" Parent of all LRSchedules here. 
\"\"\"\n    warn_t_total = False        # is set to True for schedules where progressing beyond t_total steps doesn't make sense\n    def __init__(self, warmup=0.002, t_total=-1, **kw):\n        \"\"\"\n        :param warmup:  what fraction of t_total steps will be used for linear warmup\n        :param t_total: how many training steps (updates) are planned\n        :param kw:\n        \"\"\"\n        super(_LRSchedule, self).__init__(**kw)\n        if t_total < 0:\n            logger.warning(\"t_total value of {} results in schedule not being applied\".format(t_total))\n        if not 0.0 <= warmup < 1.0 and not warmup == -1:\n            raise ValueError(\"Invalid warmup: {} - should be in [0.0, 1.0[ or -1\".format(warmup))\n        warmup = max(warmup, 0.)\n        self.warmup, self.t_total = float(warmup), float(t_total)\n        self.warned_for_t_total_at_progress = -1\n\n    def get_lr(self, step, nowarn=False):\n        \"\"\"\n        :param step:    which of t_total steps we're on\n        :param nowarn:  set to True to suppress warning regarding training beyond specified 't_total' steps\n        :return:        learning rate multiplier for current update\n        \"\"\"\n        if self.t_total < 0:\n            return 1.\n        progress = float(step) / self.t_total\n        ret = self.get_lr_(progress)\n        # warning for exceeding t_total (only active with warmup_linear\n        if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:\n            logger.warning(\n                \"Training beyond specified 't_total'. Learning rate multiplier set to {}. 
Please set 't_total' of {} correctly.\"\n                    .format(ret, self.__class__.__name__))\n            self.warned_for_t_total_at_progress = progress\n        # end warning\n        return ret\n\n    @abc.abstractmethod\n    def get_lr_(self, progress):\n        \"\"\"\n        :param progress:    value between 0 and 1 (unless going beyond t_total steps) specifying training progress\n        :return:            learning rate multiplier for current update\n        \"\"\"\n        return 1.\n\n\nclass ConstantLR(_LRSchedule):\n    def get_lr_(self, progress):\n        return 1.\n\n\nclass WarmupCosineSchedule(_LRSchedule):\n    \"\"\"\n    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.\n    Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.\n    If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.\n    \"\"\"\n    warn_t_total = True\n    def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):\n        \"\"\"\n        :param warmup:      see LRSchedule\n        :param t_total:     see LRSchedule\n        :param cycles:      number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.\n        :param kw:\n        \"\"\"\n        super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)\n        self.cycles = cycles\n\n    def get_lr_(self, progress):\n        if progress < self.warmup:\n            return progress / self.warmup\n        else:\n            progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup\n            return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))\n\n\nclass WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):\n    \"\"\"\n    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.\n    If `cycles` (default=1.) 
is different from default, learning rate follows `cycles` times a cosine decaying\n    learning rate (with hard restarts).\n    \"\"\"\n    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):\n        super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)\n        assert(cycles >= 1.)\n\n    def get_lr_(self, progress):\n        if progress < self.warmup:\n            return progress / self.warmup\n        else:\n            progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup\n            ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))\n            return ret\n\n\nclass WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):\n    \"\"\"\n    All training progress is divided in `cycles` (default=1.) parts of equal length.\n    Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,\n    followed by a learning rate decreasing from 1. to 0. following a cosine curve.\n    \"\"\"\n    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):\n        assert(warmup * cycles < 1.)\n        warmup = warmup * cycles if warmup >= 0 else warmup\n        super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)\n\n    def get_lr_(self, progress):\n        progress = progress * self.cycles % 1.\n        if progress < self.warmup:\n            return progress / self.warmup\n        else:\n            progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup\n            ret = 0.5 * (1. + math.cos(math.pi * progress))\n            return ret\n\n\nclass WarmupConstantSchedule(_LRSchedule):\n    \"\"\"\n    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.\n    Keeps learning rate equal to 1. 
after warmup.\n    \"\"\"\n    def get_lr_(self, progress):\n        if progress < self.warmup:\n            return progress / self.warmup\n        return 1.\n\n\nclass WarmupLinearSchedule(_LRSchedule):\n    \"\"\"\n    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.\n    Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.\n    \"\"\"\n    warn_t_total = True\n    def get_lr_(self, progress):\n        if progress < self.warmup:\n            return progress / self.warmup\n        return max((progress - 1.) / (self.warmup - 1.), 0.)\n\n\nSCHEDULES = {\n    None:       ConstantLR,\n    \"none\":     ConstantLR,\n    \"warmup_cosine\": WarmupCosineSchedule,\n    \"warmup_constant\": WarmupConstantSchedule,\n    \"warmup_linear\": WarmupLinearSchedule\n}\n\n\nclass BertAdam(Optimizer):\n    \"\"\"Implements BERT version of Adam algorithm with weight decay fix.\n    Params:\n        lr: learning rate\n        warmup: portion of t_total for the warmup, -1  means no warmup. Default: -1\n        t_total: total number of training steps for the learning\n            rate schedule, -1  means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1\n        schedule: schedule to use for the warmup (see above).\n            Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).\n            If `None` or `'none'`, learning rate is always kept constant.\n            Default : `'warmup_linear'`\n        betas: Adams betas. Default: (0.9, 0.999)\n        e: Adams epsilon. Default: 1e-6\n        weight_decay: Weight decay. Default: 0.01\n        max_grad_norm: Maximum norm for the gradients (-1 means no clipping). 
Default: 1.0\n    \"\"\"\n    def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',\n                 betas=(0.9, 0.999), e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {} - should be >= 0.0\".format(lr))\n        if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:\n            raise ValueError(\"Invalid schedule parameter: {}\".format(schedule))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {} - should be in [0.0, 1.0[\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {} - should be in [0.0, 1.0[\".format(betas[1]))\n        if not e >= 0.0:\n            raise ValueError(\"Invalid epsilon value: {} - should be >= 0.0\".format(e))\n        # initialize schedule object\n        if not isinstance(schedule, _LRSchedule):\n            schedule_type = SCHEDULES[schedule]\n            schedule = schedule_type(warmup=warmup, t_total=t_total)\n        else:\n            if warmup != -1 or t_total != -1:\n                logger.warning(\"warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. 
\"\n                               \"Please specify custom warmup and t_total in _LRSchedule object.\")\n        defaults = dict(lr=lr, schedule=schedule,\n                        betas=betas, e=e, weight_decay=weight_decay,\n                        max_grad_norm=max_grad_norm)\n        super(BertAdam, self).__init__(params, defaults)\n\n    def get_lr(self):\n        lr = []\n        for group in self.param_groups:\n            for p in group['params']:\n                state = self.state[p]\n                if len(state) == 0:\n                    return [0]\n                lr_scheduled = group['lr']\n                lr_scheduled *= group['schedule'].get_lr(state['step'])\n                lr.append(lr_scheduled)\n        return lr\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data\n                if grad.is_sparse:\n                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                    state['next_m'] = torch.zeros_like(p.data)\n                    # Exponential moving average of squared gradient values\n                    state['next_v'] = torch.zeros_like(p.data)\n\n                next_m, next_v = state['next_m'], state['next_v']\n                beta1, beta2 = group['betas']\n\n                # Add grad 
clipping\n                if group['max_grad_norm'] > 0:\n                    clip_grad_norm_(p, group['max_grad_norm'])\n\n                # Decay the first and second moment running average coefficient\n                # In-place operations to update the averages at the same time\n                next_m.mul_(beta1).add_(1 - beta1, grad)\n                next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n                update = next_m / (next_v.sqrt() + group['e'])\n\n                # Just adding the square of the weights to the loss function is *not*\n                # the correct way of using L2 regularization/weight decay with Adam,\n                # since that will interact with the m and v parameters in strange ways.\n                #\n                # Instead we want to decay the weights in a manner that doesn't interact\n                # with the m/v parameters. This is equivalent to adding the square\n                # of the weights to the loss with plain (non-momentum) SGD.\n                if group['weight_decay'] > 0.0:\n                    update += group['weight_decay'] * p.data\n\n                lr_scheduled = group['lr']\n                lr_scheduled *= group['schedule'].get_lr(state['step'])\n\n                update_with_lr = lr_scheduled * update\n                p.data.add_(-update_with_lr)\n\n                state['step'] += 1\n\n                # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1\n                # No bias correction\n                # bias_correction1 = 1 - beta1 ** state['step']\n                # bias_correction2 = 1 - beta2 ** state['step']\n\n        return loss\n"
  },
  {
    "path": "pytorch_pretrained_bert/optimization_openai.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"PyTorch optimization for OpenAI GPT model.\"\"\"\n\nimport math\nimport torch\nfrom torch.optim import Optimizer\nfrom torch.optim.optimizer import required\nfrom torch.nn.utils import clip_grad_norm_\nimport logging\nfrom .optimization import SCHEDULES, _LRSchedule, WarmupCosineWithWarmupRestartsSchedule, \\\n    WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule, WarmupLinearSchedule, WarmupConstantSchedule\n\nlogger = logging.getLogger(__name__)\n\n\nclass OpenAIAdam(Optimizer):\n    \"\"\"Implements Open AI version of Adam algorithm with weight decay fix.\n    \"\"\"\n    def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1,\n                 betas=(0.9, 0.999), e=1e-8, weight_decay=0,\n                 vector_l2=False, max_grad_norm=-1, **kwargs):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {} - should be >= 0.0\".format(lr))\n        if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:\n            raise ValueError(\"Invalid schedule parameter: {}\".format(schedule))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {} - should be in [0.0, 1.0[\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise 
ValueError(\"Invalid beta parameter at index 1: {} - should be in [0.0, 1.0[\".format(betas[1]))\n        if not e >= 0.0:\n            raise ValueError(\"Invalid epsilon value: {} - should be >= 0.0\".format(e))\n        # initialize schedule object\n        if not isinstance(schedule, _LRSchedule):\n            schedule_type = SCHEDULES[schedule]\n            schedule = schedule_type(warmup=warmup, t_total=t_total)\n        else:\n            if warmup != -1 or t_total != -1:\n                logger.warning(\"warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. \"\n                               \"Please specify custom warmup and t_total in _LRSchedule object.\")\n        defaults = dict(lr=lr, schedule=schedule,\n                        betas=betas, e=e, weight_decay=weight_decay, vector_l2=vector_l2,\n                        max_grad_norm=max_grad_norm)\n        super(OpenAIAdam, self).__init__(params, defaults)\n\n    def get_lr(self):\n        lr = []\n        for group in self.param_groups:\n            for p in group['params']:\n                state = self.state[p]\n                if len(state) == 0:\n                    return [0]\n                lr_scheduled = group['lr']\n                lr_scheduled *= group['schedule'].get_lr(state['step'])\n                lr.append(lr_scheduled)\n        return lr\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data\n                if grad.is_sparse:\n                    raise RuntimeError('Adam does not support 
sparse gradients, please consider SparseAdam instead')\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                    state['exp_avg'] = torch.zeros_like(p.data)\n                    # Exponential moving average of squared gradient values\n                    state['exp_avg_sq'] = torch.zeros_like(p.data)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                state['step'] += 1\n\n                # Add grad clipping\n                if group['max_grad_norm'] > 0:\n                    clip_grad_norm_(p, group['max_grad_norm'])\n\n                # Decay the first and second moment running average coefficient\n                exp_avg.mul_(beta1).add_(1 - beta1, grad)\n                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n                denom = exp_avg_sq.sqrt().add_(group['e'])\n\n                bias_correction1 = 1 - beta1 ** state['step']\n                bias_correction2 = 1 - beta2 ** state['step']\n\n                lr_scheduled = group['lr']\n                lr_scheduled *= group['schedule'].get_lr(state['step'])\n\n                step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1\n\n                p.data.addcdiv_(-step_size, exp_avg, denom)\n\n                # Add weight decay at the end (fixed version)\n                if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0:\n                    p.data.add_(-lr_scheduled * group['weight_decay'], p.data)\n\n        return loss\n"
  },
  {
    "path": "pytorch_pretrained_bert/tokenization.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Tokenization classes.\"\"\"\n\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport collections\nimport logging\nimport os\nimport unicodedata\nfrom io import open\n\nfrom .file_utils import cached_path\n\nlogger = logging.getLogger(__name__)\n\nPRETRAINED_VOCAB_ARCHIVE_MAP = {\n    'bert-base-uncased': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt\",\n    'bert-large-uncased': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt\",\n    'bert-base-cased': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt\",\n    'bert-large-cased': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt\",\n    'bert-base-multilingual-uncased': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt\",\n    'bert-base-multilingual-cased': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt\",\n    'bert-base-chinese': \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt\",\n}\nPRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {\n    'bert-base-uncased': 512,\n    'bert-large-uncased': 512,\n    'bert-base-cased': 512,\n    'bert-large-cased': 
512,\n    'bert-base-multilingual-uncased': 512,\n    'bert-base-multilingual-cased': 512,\n    'bert-base-chinese': 512,\n}\nVOCAB_NAME = 'vocab.txt'\n\n\ndef load_vocab(vocab_file):\n    \"\"\"Loads a vocabulary file into a dictionary.\"\"\"\n    vocab = collections.OrderedDict()\n    index = 0\n    with open(vocab_file, \"r\", encoding=\"utf-8\") as reader:\n        while True:\n            token = reader.readline()\n            if not token:\n                break\n            token = token.strip()\n            vocab[token] = index\n            index += 1\n    return vocab\n\n\ndef whitespace_tokenize(text):\n    \"\"\"Runs basic whitespace cleaning and splitting on a piece of text.\"\"\"\n    text = text.strip()\n    if not text:\n        return []\n    tokens = text.split()\n    return tokens\n\n\nclass BertTokenizer(object):\n    \"\"\"Runs end-to-end tokenization: punctuation splitting + wordpiece\"\"\"\n\n    def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,\n                 never_split=(\"[UNK]\", \"[SEP]\", \"[PAD]\", \"[CLS]\", \"[MASK]\")):\n        \"\"\"Constructs a BertTokenizer.\n\n        Args:\n          vocab_file: Path to a one-wordpiece-per-line vocabulary file\n          do_lower_case: Whether to lower case the input\n                         Only has an effect when do_wordpiece_only=False\n          do_basic_tokenize: Whether to do basic tokenization before wordpiece.\n          max_len: An artificial maximum length to truncate tokenized sequences to;\n                         Effective maximum length is always the minimum of this\n                         value (if specified) and the underlying BERT model's\n                         sequence length.\n          never_split: List of tokens which will never be split during tokenization.\n                         Only has an effect when do_wordpiece_only=False\n        \"\"\"\n        if not os.path.isfile(vocab_file):\n            raise ValueError(\n     
           \"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained \"\n                \"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`\".format(vocab_file))\n        self.vocab = load_vocab(vocab_file)\n        self.ids_to_tokens = collections.OrderedDict(\n            [(ids, tok) for tok, ids in self.vocab.items()])\n        self.do_basic_tokenize = do_basic_tokenize\n        if do_basic_tokenize:\n          self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,\n                                                never_split=never_split)\n        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)\n        self.max_len = max_len if max_len is not None else int(1e12)\n\n    def tokenize(self, text, entity_pos=None):\n        split_tokens = []\n        if entity_pos:\n            entity0_start = entity_pos[0][0]\n            entity0_end = entity_pos[0][1]\n            entity1_start = entity_pos[1][0]\n            entity1_end = entity_pos[1][1]\n        if self.do_basic_tokenize:\n            basic_tokens = text.split()\n            basic_tokens = [word.lower() for word in basic_tokens]\n            \"\"\"\n                会将'high-order'分成三个词: ['high','-','order']\n            \"\"\"\n            #for i, token in enumerate(self.basic_tokenizer.tokenize(text)):\n            for i, token in enumerate(basic_tokens):\n                wordpiece_tokens = self.wordpiece_tokenizer.tokenize(token)\n                if entity_pos:\n                    if i == entity0_start:\n                        entity_pos[0][0] = len(split_tokens)\n                    if i == entity0_end:    \n                        entity_pos[0][1] = len(split_tokens)\n                    if i == entity1_start:\n                        entity_pos[1][0] = len(split_tokens)\n                    if i == entity1_end:    \n                        entity_pos[1][1] = len(split_tokens)\n                for sub_token in 
wordpiece_tokens:\n                    split_tokens.append(sub_token)\n        else:\n            split_tokens = self.wordpiece_tokenizer.tokenize(text)\n        if entity_pos == None:\n            return split_tokens\n        else:\n            return split_tokens, entity_pos\n\n    def convert_tokens_to_ids(self, tokens):\n        \"\"\"Converts a sequence of tokens into ids using the vocab.\"\"\"\n        ids = []\n        for token in tokens:\n            ids.append(self.vocab[token])\n        if len(ids) > self.max_len:\n            logger.warning(\n                \"Token indices sequence length is longer than the specified maximum \"\n                \" sequence length for this BERT model ({} > {}). Running this\"\n                \" sequence through BERT will result in indexing errors\".format(len(ids), self.max_len)\n            )\n        return ids\n\n    def convert_ids_to_tokens(self, ids):\n        \"\"\"Converts a sequence of ids in wordpiece tokens using the vocab.\"\"\"\n        tokens = []\n        for i in ids:\n            tokens.append(self.ids_to_tokens[i])\n        return tokens\n\n    def save_vocabulary(self, vocab_path):\n        \"\"\"Save the tokenizer vocabulary to a directory or file.\"\"\"\n        index = 0\n        if os.path.isdir(vocab_path):\n            vocab_file = os.path.join(vocab_path, VOCAB_NAME)\n        with open(vocab_file, \"w\", encoding=\"utf-8\") as writer:\n            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):\n                if index != token_index:\n                    logger.warning(\"Saving vocabulary to {}: vocabulary indices are not consecutive.\"\n                                   \" Please check that the vocabulary is not corrupted!\".format(vocab_file))\n                    index = token_index\n                writer.write(token + u'\\n')\n                index += 1\n        return vocab_file\n\n    @classmethod\n    def from_pretrained(cls, 
pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):\n        \"\"\"\n        Instantiate a PreTrainedBertModel from a pre-trained model file.\n        Download and cache the pre-trained model file if needed.\n        \"\"\"\n        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:\n            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]\n            if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):\n                logger.warning(\"The pre-trained model you are loading is a cased model but you have not set \"\n                               \"`do_lower_case` to False. We are setting `do_lower_case=False` for you but \"\n                               \"you may want to check this behavior.\")\n                kwargs['do_lower_case'] = False\n            elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):\n                logger.warning(\"The pre-trained model you are loading is an uncased model but you have set \"\n                               \"`do_lower_case` to False. We are setting `do_lower_case=True` for you \"\n                               \"but you may want to check this behavior.\")\n                kwargs['do_lower_case'] = True\n        else:\n            vocab_file = pretrained_model_name_or_path\n        if os.path.isdir(vocab_file):\n            vocab_file = os.path.join(vocab_file, VOCAB_NAME)\n        # redirect to the cache, if necessary\n        try:\n            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)\n        except EnvironmentError:\n            logger.error(\n                \"Model name '{}' was not found in model name list ({}). 
\"\n                \"We assumed '{}' was a path or url but couldn't find any file \"\n                \"associated to this path or url.\".format(\n                    pretrained_model_name_or_path,\n                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),\n                    vocab_file))\n            return None\n        if resolved_vocab_file == vocab_file:\n            logger.info(\"loading vocabulary file {}\".format(vocab_file))\n        else:\n            logger.info(\"loading vocabulary file {} from cache at {}\".format(\n                vocab_file, resolved_vocab_file))\n        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:\n            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer\n            # than the number of positional embeddings\n            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]\n            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)\n        # Instantiate tokenizer.\n        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)\n        return tokenizer\n\n\nclass BasicTokenizer(object):\n    \"\"\"Runs basic tokenization (punctuation splitting, lower casing, etc.).\"\"\"\n\n    def __init__(self,\n                 do_lower_case=True,\n                 never_split=(\"[UNK]\", \"[SEP]\", \"[PAD]\", \"[CLS]\", \"[MASK]\")):\n        \"\"\"Constructs a BasicTokenizer.\n\n        Args:\n          do_lower_case: Whether to lower case the input.\n        \"\"\"\n        self.do_lower_case = do_lower_case\n        self.never_split = never_split\n\n    def tokenize(self, text):\n        \"\"\"Tokenizes a piece of text.\"\"\"\n        text = self._clean_text(text)\n        # This was added on November 1st, 2018 for the multilingual and Chinese\n        # models. 
This is also applied to the English models now, but it doesn't\n        # matter since the English models were not trained on any Chinese data\n        # and generally don't have any Chinese data in them (there are Chinese\n        # characters in the vocabulary because Wikipedia does have some Chinese\n        # words in the English Wikipedia.).\n        text = self._tokenize_chinese_chars(text)\n        orig_tokens = whitespace_tokenize(text)\n        split_tokens = []\n        for token in orig_tokens:\n            if self.do_lower_case and token not in self.never_split:\n                token = token.lower()\n                token = self._run_strip_accents(token)\n            split_tokens.extend(self._run_split_on_punc(token))\n\n        output_tokens = whitespace_tokenize(\" \".join(split_tokens))\n        return output_tokens\n\n    def _run_strip_accents(self, text):\n        \"\"\"Strips accents from a piece of text.\"\"\"\n        text = unicodedata.normalize(\"NFD\", text)\n        output = []\n        for char in text:\n            cat = unicodedata.category(char)\n            if cat == \"Mn\":\n                continue\n            output.append(char)\n        return \"\".join(output)\n\n    def _run_split_on_punc(self, text):\n        \"\"\"Splits punctuation on a piece of text.\"\"\"\n        if text in self.never_split:\n            return [text]\n        chars = list(text)\n        i = 0\n        start_new_word = True\n        output = []\n        while i < len(chars):\n            char = chars[i]\n            if _is_punctuation(char):\n                output.append([char])\n                start_new_word = True\n            else:\n                if start_new_word:\n                    output.append([])\n                start_new_word = False\n                output[-1].append(char)\n            i += 1\n\n        return [\"\".join(x) for x in output]\n\n    def _tokenize_chinese_chars(self, text):\n        \"\"\"Adds whitespace around any CJK 
character.\"\"\"\n        output = []\n        for char in text:\n            cp = ord(char)\n            if self._is_chinese_char(cp):\n                output.append(\" \")\n                output.append(char)\n                output.append(\" \")\n            else:\n                output.append(char)\n        return \"\".join(output)\n\n    def _is_chinese_char(self, cp):\n        \"\"\"Checks whether CP is the codepoint of a CJK character.\"\"\"\n        # This defines a \"chinese character\" as anything in the CJK Unicode block:\n        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)\n        #\n        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,\n        # despite its name. The modern Korean Hangul alphabet is a different block,\n        # as is Japanese Hiragana and Katakana. Those alphabets are used to write\n        # space-separated words, so they are not treated specially and handled\n        # like the all of the other languages.\n        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #\n                (cp >= 0x3400 and cp <= 0x4DBF) or  #\n                (cp >= 0x20000 and cp <= 0x2A6DF) or  #\n                (cp >= 0x2A700 and cp <= 0x2B73F) or  #\n                (cp >= 0x2B740 and cp <= 0x2B81F) or  #\n                (cp >= 0x2B820 and cp <= 0x2CEAF) or\n                (cp >= 0xF900 and cp <= 0xFAFF) or  #\n                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #\n            return True\n\n        return False\n\n    def _clean_text(self, text):\n        \"\"\"Performs invalid character removal and whitespace cleanup on text.\"\"\"\n        output = []\n        for char in text:\n            cp = ord(char)\n            if cp == 0 or cp == 0xfffd or _is_control(char):\n                continue\n            if _is_whitespace(char):\n                output.append(\" \")\n            else:\n                output.append(char)\n        return \"\".join(output)\n\n\nclass 
WordpieceTokenizer(object):\n    \"\"\"Runs WordPiece tokenization.\"\"\"\n\n    def __init__(self, vocab, unk_token=\"[UNK]\", max_input_chars_per_word=100):\n        self.vocab = vocab\n        self.unk_token = unk_token\n        self.max_input_chars_per_word = max_input_chars_per_word\n\n    def tokenize(self, text):\n        \"\"\"Tokenizes a piece of text into its word pieces.\n\n        This uses a greedy longest-match-first algorithm to perform tokenization\n        using the given vocabulary.\n\n        For example:\n          input = \"unaffable\"\n          output = [\"un\", \"##aff\", \"##able\"]\n\n        Args:\n          text: A single token or whitespace separated tokens. This should have\n            already been passed through `BasicTokenizer`.\n\n        Returns:\n          A list of wordpiece tokens.\n        \"\"\"\n\n        output_tokens = []\n        for token in whitespace_tokenize(text):\n            chars = list(token)\n            if len(chars) > self.max_input_chars_per_word:\n                output_tokens.append(self.unk_token)\n                continue\n\n            is_bad = False\n            start = 0\n            sub_tokens = []\n            while start < len(chars):\n                end = len(chars)\n                cur_substr = None\n                while start < end:\n                    substr = \"\".join(chars[start:end])\n                    if start > 0:\n                        substr = \"##\" + substr\n                    if substr in self.vocab:\n                        cur_substr = substr\n                        break\n                    end -= 1\n                if cur_substr is None:\n                    is_bad = True\n                    break\n                sub_tokens.append(cur_substr)\n                start = end\n\n            if is_bad:\n                output_tokens.append(self.unk_token)\n            else:\n                output_tokens.extend(sub_tokens)\n        return output_tokens\n\n\ndef 
_is_whitespace(char):\n    \"\"\"Checks whether `chars` is a whitespace character.\"\"\"\n    # \\t, \\n, and \\r are technically contorl characters but we treat them\n    # as whitespace since they are generally considered as such.\n    if char == \" \" or char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n        return True\n    cat = unicodedata.category(char)\n    if cat == \"Zs\":\n        return True\n    return False\n\n\ndef _is_control(char):\n    \"\"\"Checks whether `chars` is a control character.\"\"\"\n    # These are technically control characters but we count them as whitespace\n    # characters.\n    if char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n        return False\n    cat = unicodedata.category(char)\n    if cat.startswith(\"C\"):\n        return True\n    return False\n\n\ndef _is_punctuation(char):\n    \"\"\"Checks whether `chars` is a punctuation character.\"\"\"\n    cp = ord(char)\n    # We treat all non-letter/number ASCII as punctuation.\n    # Characters such as \"^\", \"$\", and \"`\" are not in the Unicode\n    # Punctuation class but we treat them as punctuation anyways, for\n    # consistency.\n    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or\n            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):\n        return True\n    cat = unicodedata.category(char)\n    if cat.startswith(\"P\"):\n        return True\n    return False\n"
  },
  {
    "path": "pytorch_pretrained_bert/tokenization_gpt2.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Tokenization classes for OpenAI GPT.\"\"\"\nfrom __future__ import (absolute_import, division, print_function,\n                        unicode_literals)\n\nimport sys\nimport json\nimport logging\nimport os\nimport regex as re\nfrom io import open\n\ntry:\n    from functools import lru_cache\nexcept ImportError:\n    # Just a dummy decorator to get the checks to run on python2\n    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.\n    def lru_cache():\n        return lambda func: func\n\nfrom .file_utils import cached_path\n\nlogger = logging.getLogger(__name__)\n\nPRETRAINED_VOCAB_ARCHIVE_MAP = {\n    'gpt2': \"https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json\",\n    'gpt2-medium': \"https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json\",\n}\nPRETRAINED_MERGES_ARCHIVE_MAP = {\n    'gpt2': \"https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt\",\n    'gpt2-medium': \"https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt\",\n}\nPRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {\n    'gpt2': 1024,\n}\nVOCAB_NAME = 'vocab.json'\nMERGES_NAME = 'merges.txt'\nSPECIAL_TOKENS_NAME = 'special_tokens.txt'\n\n@lru_cache()\ndef bytes_to_unicode():\n    \"\"\"\n    Returns list of utf-8 
byte and a corresponding list of unicode strings.\n    The reversible bpe codes work on unicode strings.\n    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.\n    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.\n    This is a signficant percentage of your normal, say, 32K bpe vocab.\n    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.\n    And avoids mapping to whitespace/control characters the bpe code barfs on.\n    \"\"\"\n    _chr = unichr if sys.version_info[0] == 2 else chr\n    bs = list(range(ord(\"!\"), ord(\"~\")+1))+list(range(ord(\"¡\"), ord(\"¬\")+1))+list(range(ord(\"®\"), ord(\"ÿ\")+1))\n    cs = bs[:]\n    n = 0\n    for b in range(2**8):\n        if b not in bs:\n            bs.append(b)\n            cs.append(2**8+n)\n            n += 1\n    cs = [_chr(n) for n in cs]\n    return dict(zip(bs, cs))\n\ndef get_pairs(word):\n    \"\"\"Return set of symbol pairs in a word.\n\n    Word is represented as tuple of symbols (symbols being variable-length strings).\n    \"\"\"\n    pairs = set()\n    prev_char = word[0]\n    for char in word[1:]:\n        pairs.add((prev_char, char))\n        prev_char = char\n    return pairs\n\nclass GPT2Tokenizer(object):\n    \"\"\"\n    GPT-2 BPE tokenizer. 
Peculiarities:\n        - Byte-level BPE\n    \"\"\"\n    @classmethod\n    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):\n        \"\"\"\n        Instantiate a GPT2Tokenizer from a pre-trained model file.\n        Download and cache the pre-trained model file if needed.\n        \"\"\"\n        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:\n            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]\n            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]\n            special_tokens_file = None\n        else:\n            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)\n            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)\n            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)\n            if not os.path.exists(special_tokens_file):\n                special_tokens_file = None\n            else:\n                logger.info(\"loading special tokens file {}\".format(special_tokens_file))\n        # redirect to the cache, if necessary\n        try:\n            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)\n            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)\n        except EnvironmentError:\n            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:\n                logger.error(\n                    \"Couldn't reach server at '{}' to download vocabulary.\".format(\n                        vocab_file))\n            else:\n                logger.error(\n                    \"Model name '{}' was not found in model name list ({}). 
\"\n                    \"We assumed '{}' was a path or url but couldn't find files {} and {} \"\n                    \"at this path or url.\".format(\n                        pretrained_model_name_or_path,\n                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),\n                        pretrained_model_name_or_path,\n                        vocab_file, merges_file))\n            return None\n        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:\n            logger.info(\"loading vocabulary file {}\".format(vocab_file))\n            logger.info(\"loading merges file {}\".format(merges_file))\n        else:\n            logger.info(\"loading vocabulary file {} from cache at {}\".format(\n                vocab_file, resolved_vocab_file))\n            logger.info(\"loading merges file {} from cache at {}\".format(\n                merges_file, resolved_merges_file))\n        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:\n            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer\n            # than the number of positional embeddings\n            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]\n            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)\n        # Instantiate tokenizer.\n        if special_tokens_file and 'special_tokens' not in kwargs:\n            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\\n')[:-1]\n        else:\n            special_tokens = kwargs.pop('special_tokens', [])\n        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)\n        return tokenizer\n\n    def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):\n        self.max_len = max_len if max_len is not None else int(1e12)\n        self.encoder = 
json.load(open(vocab_file))\n        self.decoder = {v:k for k,v in self.encoder.items()}\n        self.errors = errors # how to handle errors in decoding\n        self.byte_encoder = bytes_to_unicode()\n        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}\n        bpe_data = open(merges_file, encoding='utf-8').read().split('\\n')[1:-1]\n        bpe_merges = [tuple(merge.split()) for merge in bpe_data]\n        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))\n        self.cache = {}\n\n        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions\n        self.pat = re.compile(r\"\"\"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+\"\"\")\n\n        self.special_tokens = {}\n        self.special_tokens_decoder = {}\n        self.set_special_tokens(special_tokens)\n\n    def __len__(self):\n        return len(self.encoder) + len(self.special_tokens)\n\n    def set_special_tokens(self, special_tokens):\n        \"\"\" Add a list of additional tokens to the encoder.\n            The additional tokens are indexed starting from the last index of the\n            current vocabulary in the order of the `special_tokens` list.\n        \"\"\"\n        if not special_tokens:\n            self.special_tokens = {}\n            self.special_tokens_decoder = {}\n            return\n        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))\n        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}\n        logger.info(\"Special tokens {}\".format(self.special_tokens))\n\n    def bpe(self, token):\n        if token in self.cache:\n            return self.cache[token]\n        word = tuple(token)\n        pairs = get_pairs(word)\n\n        if not pairs:\n            return token\n\n        while True:\n            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))\n     
       if bigram not in self.bpe_ranks:\n                break\n            first, second = bigram\n            new_word = []\n            i = 0\n            while i < len(word):\n                try:\n                    j = word.index(first, i)\n                    new_word.extend(word[i:j])\n                    i = j\n                except:\n                    new_word.extend(word[i:])\n                    break\n\n                if word[i] == first and i < len(word)-1 and word[i+1] == second:\n                    new_word.append(first+second)\n                    i += 2\n                else:\n                    new_word.append(word[i])\n                    i += 1\n            new_word = tuple(new_word)\n            word = new_word\n            if len(word) == 1:\n                break\n            else:\n                pairs = get_pairs(word)\n        word = ' '.join(word)\n        self.cache[token] = word\n        return word\n\n    def tokenize(self, text):\n        \"\"\" Tokenize a string. \"\"\"\n        bpe_tokens = []\n        for token in re.findall(self.pat, text):\n            if sys.version_info[0] == 2:\n                token = ''.join(self.byte_encoder[ord(b)] for b in token)\n            else:\n                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))\n            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))\n        return bpe_tokens\n\n    def convert_tokens_to_ids(self, tokens):\n        \"\"\" Converts a sequence of tokens into ids using the vocab. 
\"\"\"\n        ids = []\n        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):\n            if tokens in self.special_tokens:\n                return self.special_tokens[tokens]\n            else:\n                return self.encoder.get(tokens, 0)\n        for token in tokens:\n            if token in self.special_tokens:\n                ids.append(self.special_tokens[token])\n            else:\n                ids.append(self.encoder.get(token, 0))\n        if len(ids) > self.max_len:\n            logger.warning(\n                \"Token indices sequence length is longer than the specified maximum \"\n                \" sequence length for this OpenAI GPT model ({} > {}). Running this\"\n                \" sequence through the model will result in indexing errors\".format(len(ids), self.max_len)\n            )\n        return ids\n\n    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):\n        \"\"\"Converts a sequence of ids in BPE tokens using the vocab.\"\"\"\n        tokens = []\n        for i in ids:\n            if i in self.special_tokens_decoder:\n                if not skip_special_tokens:\n                    tokens.append(self.special_tokens_decoder[i])\n            else:\n                tokens.append(self.decoder[i])\n        return tokens\n\n    def encode(self, text):\n        return self.convert_tokens_to_ids(self.tokenize(text))\n\n    def decode(self, tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True):\n        text = ''.join(self.convert_ids_to_tokens(tokens, skip_special_tokens=skip_special_tokens))\n        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)\n        if clean_up_tokenization_spaces:\n            text = text.replace('<unk>', '')\n            text = text.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','\n                    ).replace(\" ' \", \"'\").replace(\" n't\", 
\"n't\").replace(\" 'm\", \"'m\").replace(\" do not\", \" don't\"\n                    ).replace(\" 's\", \"'s\").replace(\" 've\", \"'ve\").replace(\" 're\", \"'re\")\n        return text\n\n    def save_vocabulary(self, vocab_path):\n        \"\"\"Save the tokenizer vocabulary and merge files to a directory.\"\"\"\n        if not os.path.isdir(vocab_path):\n            logger.error(\"Vocabulary path ({}) should be a directory\".format(vocab_path))\n            return\n        vocab_file = os.path.join(vocab_path, VOCAB_NAME)\n        merge_file = os.path.join(vocab_path, MERGES_NAME)\n        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)\n\n        with open(vocab_file, 'w', encoding='utf-8') as f:\n            f.write(json.dumps(self.encoder, ensure_ascii=False))\n\n        index = 0\n        with open(merge_file, \"w\", encoding=\"utf-8\") as writer:\n            writer.write(u'#version: 0.2\\n')\n            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):\n                if index != token_index:\n                    logger.warning(\"Saving vocabulary to {}: BPE merge indices are not consecutive.\"\n                                   \" Please check that the tokenizer is not corrupted!\".format(merge_file))\n                    index = token_index\n                writer.write(' '.join(bpe_tokens) + u'\\n')\n                index += 1\n\n        index = len(self.encoder)\n        with open(special_tokens_file, 'w', encoding='utf-8') as writer:\n            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):\n                if index != token_index:\n                    logger.warning(\"Saving special tokens vocabulary to {}: BPE indices are not consecutive.\"\n                                   \" Please check that the tokenizer is not corrupted!\".format(special_tokens_file))\n                    index = token_index\n                writer.write(token + u'\\n')\n       
         index += 1\n\n        return vocab_file, merge_file, special_tokens_file\n"
  },
  {
    "path": "pytorch_pretrained_bert/tokenization_openai.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Tokenization classes for OpenAI GPT.\"\"\"\nfrom __future__ import (absolute_import, division, print_function,\n                        unicode_literals)\n\nimport json\nimport logging\nimport os\nimport re\nimport sys\nfrom io import open\n\nfrom tqdm import tqdm\n\nfrom .file_utils import cached_path\nfrom .tokenization import BasicTokenizer\n\nlogger = logging.getLogger(__name__)\n\nPRETRAINED_VOCAB_ARCHIVE_MAP = {\n    'openai-gpt': \"https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json\",\n}\nPRETRAINED_MERGES_ARCHIVE_MAP = {\n    'openai-gpt': \"https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt\",\n}\nPRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {\n    'openai-gpt': 512,\n}\nVOCAB_NAME = 'vocab.json'\nMERGES_NAME = 'merges.txt'\nSPECIAL_TOKENS_NAME = 'special_tokens.txt'\n\ndef get_pairs(word):\n    \"\"\"\n    Return set of symbol pairs in a word.\n    word is represented as tuple of symbols (symbols being variable-length strings)\n    \"\"\"\n    pairs = set()\n    prev_char = word[0]\n    for char in word[1:]:\n        pairs.add((prev_char, char))\n        prev_char = char\n    return pairs\n\ndef text_standardize(text):\n    \"\"\"\n    fixes some issues the spacy tokenizer had on books corpus\n    also does some whitespace standardization\n    
\"\"\"\n    text = text.replace('—', '-')\n    text = text.replace('–', '-')\n    text = text.replace('―', '-')\n    text = text.replace('…', '...')\n    text = text.replace('´', \"'\")\n    text = re.sub(r'''(-+|~+|!+|\"+|;+|\\?+|\\++|,+|\\)+|\\(+|\\\\+|\\/+|\\*+|\\[+|\\]+|}+|{+|\\|+|_+)''', r' \\1 ', text)\n    text = re.sub(r'\\s*\\n\\s*', ' \\n ', text)\n    text = re.sub(r'[^\\S\\n]+', ' ', text)\n    return text.strip()\n\nclass OpenAIGPTTokenizer(object):\n    \"\"\"\n    BPE tokenizer. Peculiarities:\n        - lower case all inputs\n        - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.\n        - argument special_tokens and function set_special_tokens:\n            can be used to add additional symbols (ex: \"__classify__\") to a vocabulary.\n    \"\"\"\n    @classmethod\n    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):\n        \"\"\"\n        Instantiate a PreTrainedBertModel from a pre-trained model file.\n        Download and cache the pre-trained model file if needed.\n        \"\"\"\n        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:\n            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]\n            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]\n            special_tokens_file = None\n        else:\n            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)\n            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)\n            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)\n            if not os.path.exists(special_tokens_file):\n                special_tokens_file = None\n            else:\n                logger.info(\"loading special tokens file {}\".format(special_tokens_file))\n        # redirect to the cache, if necessary\n        try:\n            
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)\n            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)\n        except EnvironmentError:\n            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:\n                logger.error(\n                    \"Couldn't reach server at '{}' to download vocabulary.\".format(\n                        vocab_file))\n            else:\n                logger.error(\n                    \"Model name '{}' was not found in model name list ({}). \"\n                    \"We assumed '{}' was a path or url but couldn't find files {} and {} \"\n                    \"at this path or url.\".format(\n                        pretrained_model_name_or_path,\n                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),\n                        pretrained_model_name_or_path,\n                        vocab_file, merges_file))\n            return None\n        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:\n            logger.info(\"loading vocabulary file {}\".format(vocab_file))\n            logger.info(\"loading merges file {}\".format(merges_file))\n        else:\n            logger.info(\"loading vocabulary file {} from cache at {}\".format(\n                vocab_file, resolved_vocab_file))\n            logger.info(\"loading merges file {} from cache at {}\".format(\n                merges_file, resolved_merges_file))\n        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:\n            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer\n            # than the number of positional embeddings\n            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]\n            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)\n        # Instantiate tokenizer.\n        if special_tokens_file and 'special_tokens' not in 
kwargs:\n            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\\n')[:-1]\n        else:\n            special_tokens = kwargs.pop('special_tokens', [])\n        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)\n        return tokenizer\n\n    def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):\n        try:\n            import ftfy\n            import spacy\n            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])\n            self.fix_text = ftfy.fix_text\n        except ImportError:\n            logger.warning(\"ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.\")\n            self.nlp = BasicTokenizer(do_lower_case=True,\n                                      never_split=special_tokens if special_tokens is not None else [])\n            self.fix_text = None\n\n        self.max_len = max_len if max_len is not None else int(1e12)\n        self.encoder = json.load(open(vocab_file, encoding=\"utf-8\"))\n        self.decoder = {v:k for k,v in self.encoder.items()}\n        merges = open(merges_file, encoding='utf-8').read().split('\\n')[1:-1]\n        merges = [tuple(merge.split()) for merge in merges]\n        self.bpe_ranks = dict(zip(merges, range(len(merges))))\n        self.cache = {}\n        self.special_tokens = {}\n        self.special_tokens_decoder = {}\n        self.set_special_tokens(special_tokens)\n\n    def __len__(self):\n        return len(self.encoder) + len(self.special_tokens)\n\n    def set_special_tokens(self, special_tokens):\n        \"\"\" Add a list of additional tokens to the encoder.\n            The additional tokens are indexed starting from the last index of the\n            current vocabulary in the order of the `special_tokens` list.\n        \"\"\"\n        if not special_tokens:\n            self.special_tokens = {}\n            
self.special_tokens_decoder = {}\n            return\n        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))\n        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}\n        if self.fix_text is None:\n            # Using BERT's BasicTokenizer: we can update the tokenizer\n            self.nlp.never_split = special_tokens\n        logger.info(\"Special tokens {}\".format(self.special_tokens))\n\n    def bpe(self, token):\n        word = tuple(token[:-1]) + (token[-1] + '</w>',)\n        if token in self.cache:\n            return self.cache[token]\n        pairs = get_pairs(word)\n\n        if not pairs:\n            return token+'</w>'\n\n        while True:\n            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))\n            if bigram not in self.bpe_ranks:\n                break\n            first, second = bigram\n            new_word = []\n            i = 0\n            while i < len(word):\n                try:\n                    j = word.index(first, i)\n                    new_word.extend(word[i:j])\n                    i = j\n                except:\n                    new_word.extend(word[i:])\n                    break\n\n                if word[i] == first and i < len(word)-1 and word[i+1] == second:\n                    new_word.append(first+second)\n                    i += 2\n                else:\n                    new_word.append(word[i])\n                    i += 1\n            new_word = tuple(new_word)\n            word = new_word\n            if len(word) == 1:\n                break\n            else:\n                pairs = get_pairs(word)\n        word = ' '.join(word)\n        if word == '\\n  </w>':\n            word = '\\n</w>'\n        self.cache[token] = word\n        return word\n\n    def tokenize(self, text):\n        \"\"\" Tokenize a string. 
\"\"\"\n        split_tokens = []\n        if self.fix_text is None:\n            # Using BERT's BasicTokenizer\n            text = self.nlp.tokenize(text)\n            for token in text:\n                split_tokens.extend([t for t in self.bpe(token).split(' ')])\n        else:\n            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)\n            text = self.nlp(text_standardize(self.fix_text(text)))\n            for token in text:\n                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])\n        return split_tokens\n\n    def convert_tokens_to_ids(self, tokens):\n        \"\"\" Converts a sequence of tokens into ids using the vocab. \"\"\"\n        ids = []\n        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):\n            if tokens in self.special_tokens:\n                return self.special_tokens[tokens]\n            else:\n                return self.encoder.get(tokens, 0)\n        for token in tokens:\n            if token in self.special_tokens:\n                ids.append(self.special_tokens[token])\n            else:\n                ids.append(self.encoder.get(token, 0))\n        if len(ids) > self.max_len:\n            logger.warning(\n                \"Token indices sequence length is longer than the specified maximum \"\n                \" sequence length for this OpenAI GPT model ({} > {}). 
Running this\"\n                \" sequence through the model will result in indexing errors\".format(len(ids), self.max_len)\n            )\n        return ids\n\n    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):\n        \"\"\"Converts a sequence of ids in BPE tokens using the vocab.\"\"\"\n        tokens = []\n        for i in ids:\n            if i in self.special_tokens_decoder:\n                if not skip_special_tokens:\n                    tokens.append(self.special_tokens_decoder[i])\n            else:\n                tokens.append(self.decoder[i])\n        return tokens\n\n    def encode(self, text):\n        return self.convert_tokens_to_ids(self.tokenize(text))\n\n    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):\n        \"\"\"Converts a sequence of ids in a string.\"\"\"\n        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)\n        out_string = ''.join(tokens).replace('</w>', ' ').strip()\n        if clean_up_tokenization_spaces:\n            out_string = out_string.replace('<unk>', '')\n            out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','\n                    ).replace(\" ' \", \"'\").replace(\" n't\", \"n't\").replace(\" 'm\", \"'m\").replace(\" do not\", \" don't\"\n                    ).replace(\" 's\", \"'s\").replace(\" 've\", \"'ve\").replace(\" 're\", \"'re\")\n        return out_string\n\n    def save_vocabulary(self, vocab_path):\n        \"\"\"Save the tokenizer vocabulary and merge files to a directory.\"\"\"\n        if not os.path.isdir(vocab_path):\n            logger.error(\"Vocabulary path ({}) should be a directory\".format(vocab_path))\n            return\n        vocab_file = os.path.join(vocab_path, VOCAB_NAME)\n        merge_file = os.path.join(vocab_path, MERGES_NAME)\n        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)\n\n        with open(vocab_file, 
'w', encoding='utf-8') as f:\n            f.write(json.dumps(self.encoder, ensure_ascii=False))\n\n        index = 0\n        with open(merge_file, \"w\", encoding=\"utf-8\") as writer:\n            writer.write(u'#version: 0.2\\n')\n            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):\n                if index != token_index:\n                    logger.warning(\"Saving vocabulary to {}: BPE merge indices are not consecutive.\"\n                                   \" Please check that the tokenizer is not corrupted!\".format(merge_file))\n                    index = token_index\n                writer.write(' '.join(bpe_tokens) + u'\\n')\n                index += 1\n\n        index = len(self.encoder)\n        with open(special_tokens_file, 'w', encoding='utf-8') as writer:\n            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):\n                if index != token_index:\n                    logger.warning(\"Saving special tokens vocabulary to {}: BPE indices are not consecutive.\"\n                                   \" Please check that the tokenizer is not corrupted!\".format(special_tokens_file))\n                    index = token_index\n                writer.write(token + u'\\n')\n                index += 1\n\n        return vocab_file, merge_file, special_tokens_file\n"
  },
  {
    "path": "pytorch_pretrained_bert/tokenization_transfo_xl.py",
    "content": "# coding=utf-8\n# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\" Tokenization classes for Transformer XL model.\n    Adapted from https://github.com/kimiyoung/transformer-xl.\n\"\"\"\nfrom __future__ import (absolute_import, division, print_function,\n                        unicode_literals)\n\nimport glob\nimport logging\nimport os\nimport sys\nfrom collections import Counter, OrderedDict\nfrom io import open\nimport unicodedata\n\nimport torch\nimport numpy as np\n\nfrom .file_utils import cached_path\n\nif sys.version_info[0] == 2:\n    import cPickle as pickle\nelse:\n    import pickle\n\n\nlogger = logging.getLogger(__name__)\n\nPRETRAINED_VOCAB_ARCHIVE_MAP = {\n    'transfo-xl-wt103': \"https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin\",\n}\nVOCAB_NAME = 'vocab.bin'\n\nPRETRAINED_CORPUS_ARCHIVE_MAP = {\n    'transfo-xl-wt103': \"https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin\",\n}\nCORPUS_NAME = 'corpus.bin'\n\nclass TransfoXLTokenizer(object):\n    \"\"\"\n    Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl\n    \"\"\"\n    @classmethod\n    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):\n        
\"\"\"\n        Instantiate a TransfoXLTokenizer.\n        The TransfoXLTokenizer.\n        \"\"\"\n        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:\n            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]\n        else:\n            if os.path.isdir(pretrained_model_name_or_path):\n                vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)\n            else:\n                vocab_file = pretrained_model_name_or_path\n        # redirect to the cache, if necessary\n        try:\n            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)\n        except EnvironmentError:\n            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:\n                logger.error(\n                    \"Couldn't reach server at '{}' to download vocabulary.\".format(\n                        vocab_file))\n            else:\n                logger.error(\n                    \"Model name '{}' was not found in model name list ({}). 
\"\n                    \"We assumed '{}' was a path or url but couldn't find files {} \"\n                    \"at this path or url.\".format(\n                        pretrained_model_name_or_path,\n                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),\n                        pretrained_model_name_or_path,\n                        vocab_file))\n            return None\n        if resolved_vocab_file == vocab_file:\n            logger.info(\"loading vocabulary file {}\".format(vocab_file))\n        else:\n            logger.info(\"loading vocabulary file {} from cache at {}\".format(\n                vocab_file, resolved_vocab_file))\n\n        # Instantiate tokenizer.\n        tokenizer = cls(*inputs, **kwargs)\n        vocab_dict = torch.load(resolved_vocab_file)\n        for key, value in vocab_dict.items():\n            tokenizer.__dict__[key] = value\n        return tokenizer\n\n    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=False,\n                 delimiter=None, vocab_file=None, never_split=(\"<unk>\", \"<eos>\", \"<formula>\")):\n        self.counter = Counter()\n        self.special = special\n        self.min_freq = min_freq\n        self.max_size = max_size\n        self.lower_case = lower_case\n        self.delimiter = delimiter\n        self.vocab_file = vocab_file\n        self.never_split = never_split\n\n    def count_file(self, path, verbose=False, add_eos=False):\n        if verbose: print('counting file {} ...'.format(path))\n        assert os.path.exists(path)\n\n        sents = []\n        with open(path, 'r', encoding='utf-8') as f:\n            for idx, line in enumerate(f):\n                if verbose and idx > 0 and idx % 500000 == 0:\n                    print('    line {}'.format(idx))\n                symbols = self.tokenize(line, add_eos=add_eos)\n                self.counter.update(symbols)\n                sents.append(symbols)\n\n        return sents\n\n    def count_sents(self, sents, 
verbose=False):\n        \"\"\"\n            sents : a list of sentences, each a list of tokenized symbols\n        \"\"\"\n        if verbose: print('counting {} sents ...'.format(len(sents)))\n        for idx, symbols in enumerate(sents):\n            if verbose and idx > 0 and idx % 500000 == 0:\n                print('    line {}'.format(idx))\n            self.counter.update(symbols)\n\n    def _build_from_file(self, vocab_file):\n        self.idx2sym = []\n        self.sym2idx = OrderedDict()\n\n        with open(vocab_file, 'r', encoding='utf-8') as f:\n            for line in f:\n                symb = line.strip().split()[0]\n                self.add_symbol(symb)\n        if '<UNK>' in self.sym2idx:\n            self.unk_idx = self.sym2idx['<UNK>']\n        elif '<unk>' in self.sym2idx:\n            self.unk_idx = self.sym2idx['<unk>']\n        else:\n            raise ValueError('No <unkown> token in vocabulary')\n\n    def save_vocabulary(self, vocab_path):\n        \"\"\"Save the tokenizer vocabulary to a directory or file.\"\"\"\n        index = 0\n        if os.path.isdir(vocab_path):\n            vocab_file = os.path.join(vocab_path, VOCAB_NAME)\n        torch.save(self.__dict__, vocab_file)\n        return vocab_file\n\n    def build_vocab(self):\n        if self.vocab_file:\n            print('building vocab from {}'.format(self.vocab_file))\n            self._build_from_file(self.vocab_file)\n            print('final vocab size {}'.format(len(self)))\n        else:\n            print('building vocab with min_freq={}, max_size={}'.format(\n                self.min_freq, self.max_size))\n            self.idx2sym = []\n            self.sym2idx = OrderedDict()\n\n            for sym in self.special:\n                self.add_special(sym)\n\n            for sym, cnt in self.counter.most_common(self.max_size):\n                if cnt < self.min_freq: break\n                self.add_symbol(sym)\n\n            print('final vocab size {} from {} unique 
tokens'.format(\n                len(self), len(self.counter)))\n\n    def encode_file(self, path, ordered=False, verbose=False, add_eos=True,\n            add_double_eos=False):\n        if verbose: print('encoding file {} ...'.format(path))\n        assert os.path.exists(path)\n        encoded = []\n        with open(path, 'r', encoding='utf-8') as f:\n            for idx, line in enumerate(f):\n                if verbose and idx > 0 and idx % 500000 == 0:\n                    print('    line {}'.format(idx))\n                symbols = self.tokenize(line, add_eos=add_eos,\n                    add_double_eos=add_double_eos)\n                encoded.append(self.convert_to_tensor(symbols))\n\n        if ordered:\n            encoded = torch.cat(encoded)\n\n        return encoded\n\n    def encode_sents(self, sents, ordered=False, verbose=False):\n        if verbose: print('encoding {} sents ...'.format(len(sents)))\n        encoded = []\n        for idx, symbols in enumerate(sents):\n            if verbose and idx > 0 and idx % 500000 == 0:\n                print('    line {}'.format(idx))\n            encoded.append(self.convert_to_tensor(symbols))\n\n        if ordered:\n            encoded = torch.cat(encoded)\n\n        return encoded\n\n    def add_special(self, sym):\n        if sym not in self.sym2idx:\n            self.idx2sym.append(sym)\n            self.sym2idx[sym] = len(self.idx2sym) - 1\n            setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])\n\n    def add_symbol(self, sym):\n        if sym not in self.sym2idx:\n            self.idx2sym.append(sym)\n            self.sym2idx[sym] = len(self.idx2sym) - 1\n\n    def get_sym(self, idx):\n        assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)\n        return self.idx2sym[idx]\n\n    def get_idx(self, sym):\n        if sym in self.sym2idx:\n            return self.sym2idx[sym]\n        else:\n            # print('encounter unk {}'.format(sym))\n            
# assert '<eos>' not in sym\n            if hasattr(self, 'unk_idx'):\n                return self.sym2idx.get(sym, self.unk_idx)\n            # Backward compatibility with pre-trained models\n            elif '<unk>' in self.sym2idx:\n                return self.sym2idx['<unk>']\n            elif '<UNK>' in self.sym2idx:\n                return self.sym2idx['<UNK>']\n            else:\n                raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')\n\n    def convert_ids_to_tokens(self, indices):\n        \"\"\"Converts a sequence of indices in symbols using the vocab.\"\"\"\n        return [self.get_sym(idx) for idx in indices]\n\n    def convert_tokens_to_ids(self, symbols):\n        \"\"\"Converts a sequence of symbols into ids using the vocab.\"\"\"\n        return [self.get_idx(sym) for sym in symbols]\n\n    def convert_to_tensor(self, symbols):\n        return torch.LongTensor(self.convert_tokens_to_ids(symbols))\n\n    def decode(self, indices, exclude=None):\n        \"\"\"Converts a sequence of indices in a string.\"\"\"\n        if exclude is None:\n            return ' '.join([self.get_sym(idx) for idx in indices])\n        else:\n            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])\n\n    def __len__(self):\n        return len(self.idx2sym)\n\n    def tokenize(self, line, add_eos=False, add_double_eos=False):\n        line = line.strip()\n        # convert to lower case\n        if self.lower_case:\n            line = line.lower()\n\n        # empty delimiter '' will evaluate False\n        if self.delimiter == '':\n            symbols = line\n        else:\n            symbols = line.split(self.delimiter)\n\n        if add_double_eos: # lm1b\n            return ['<S>'] + symbols + ['<S>']\n        elif add_eos:\n            return symbols + ['<eos>']\n        else:\n            return symbols\n\n\nclass LMOrderedIterator(object):\n    def __init__(self, data, bsz, bptt, 
device='cpu', ext_len=None):\n        \"\"\"\n            data -- LongTensor -- the LongTensor is strictly ordered\n        \"\"\"\n        self.bsz = bsz\n        self.bptt = bptt\n        self.ext_len = ext_len if ext_len is not None else 0\n\n        self.device = device\n\n        # Work out how cleanly we can divide the dataset into bsz parts.\n        self.n_step = data.size(0) // bsz\n\n        # Trim off any extra elements that wouldn't cleanly fit (remainders).\n        data = data.narrow(0, 0, self.n_step * bsz)\n\n        # Evenly divide the data across the bsz batches.\n        self.data = data.view(bsz, -1).t().contiguous().to(device)\n\n        # Number of mini-batches\n        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt\n\n    def get_batch(self, i, bptt=None):\n        if bptt is None: bptt = self.bptt\n        seq_len = min(bptt, self.data.size(0) - 1 - i)\n\n        end_idx = i + seq_len\n        beg_idx = max(0, i - self.ext_len)\n\n        data = self.data[beg_idx:end_idx]\n        target = self.data[i+1:i+1+seq_len]\n\n        data_out = data.transpose(0, 1).contiguous().to(self.device)\n        target_out = target.transpose(0, 1).contiguous().to(self.device)\n\n        return data_out, target_out, seq_len\n\n    def get_fixlen_iter(self, start=0):\n        for i in range(start, self.data.size(0) - 1, self.bptt):\n            yield self.get_batch(i)\n\n    def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):\n        max_len = self.bptt + max_deviation * std\n        i = start\n        while True:\n            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.\n            bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))\n            data, target, seq_len = self.get_batch(i, bptt)\n            i += seq_len\n            yield data, target, seq_len\n            if i >= self.data.size(0) - 2:\n                break\n\n    def __iter__(self):\n        return 
self.get_fixlen_iter()\n\n\nclass LMShuffledIterator(object):\n    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False):\n        \"\"\"\n            data -- list[LongTensor] -- there is no order among the LongTensors\n        \"\"\"\n        self.data = data\n\n        self.bsz = bsz\n        self.bptt = bptt\n        self.ext_len = ext_len if ext_len is not None else 0\n\n        self.device = device\n        self.shuffle = shuffle\n\n    def get_sent_stream(self):\n        # index iterator\n        epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \\\n            else np.array(range(len(self.data)))\n\n        # sentence iterator\n        for idx in epoch_indices:\n            yield self.data[idx]\n\n    def stream_iterator(self, sent_stream):\n        # streams for each data in the batch\n        streams = [None] * self.bsz\n\n        data = torch.LongTensor(self.bptt, self.bsz)\n        target = torch.LongTensor(self.bptt, self.bsz)\n\n        n_retain = 0\n\n        while True:\n            # data   : [n_retain+bptt x bsz]\n            # target : [bptt x bsz]\n            data[n_retain:].fill_(-1)\n            target.fill_(-1)\n\n            valid_batch = True\n\n            for i in range(self.bsz):\n                n_filled = 0\n                try:\n                    while n_filled < self.bptt:\n                        if streams[i] is None or len(streams[i]) <= 1:\n                            streams[i] = next(sent_stream)\n                        # number of new tokens to fill in\n                        n_new = min(len(streams[i]) - 1, self.bptt - n_filled)\n                        # first n_retain tokens are retained from last batch\n                        data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \\\n                            streams[i][:n_new]\n                        target[n_filled:n_filled+n_new, i] = \\\n                            streams[i][1:n_new+1]\n                        
streams[i] = streams[i][n_new:]\n                        n_filled += n_new\n                except StopIteration:\n                    valid_batch = False\n                    break\n\n            if not valid_batch:\n                return\n\n            data_out = data.transpose(0, 1).contiguous().to(self.device)\n            target_out = target.transpose(0, 1).contiguous().to(self.device)\n\n            yield data_out, target_out, self.bptt\n\n            n_retain = min(data.size(0), self.ext_len)\n            if n_retain > 0:\n                data[:n_retain] = data[-n_retain:]\n            data.resize_(n_retain + self.bptt, data.size(1))\n\n    def __iter__(self):\n        # sent_stream is an iterator\n        sent_stream = self.get_sent_stream()\n\n        for batch in self.stream_iterator(sent_stream):\n            yield batch\n\n\nclass LMMultiFileIterator(LMShuffledIterator):\n    def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None,\n        shuffle=False):\n\n        self.paths = paths\n        self.vocab = vocab\n\n        self.bsz = bsz\n        self.bptt = bptt\n        self.ext_len = ext_len if ext_len is not None else 0\n\n        self.device = device\n        self.shuffle = shuffle\n\n    def get_sent_stream(self, path):\n        sents = self.vocab.encode_file(path, add_double_eos=True)\n        if self.shuffle:\n            np.random.shuffle(sents)\n        sent_stream = iter(sents)\n\n        return sent_stream\n\n    def __iter__(self):\n        if self.shuffle:\n            np.random.shuffle(self.paths)\n\n        for path in self.paths:\n            # sent_stream is an iterator\n            sent_stream = self.get_sent_stream(path)\n            for batch in self.stream_iterator(sent_stream):\n                yield batch\n\n\nclass TransfoXLCorpus(object):\n    @classmethod\n    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):\n        \"\"\"\n        Instantiate a pre-processed 
corpus.\n        \"\"\"\n        vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)\n        if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP:\n            corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path]\n        else:\n            corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME)\n        # redirect to the cache, if necessary\n        try:\n            resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir)\n        except EnvironmentError:\n            logger.error(\n                \"Corpus '{}' was not found in corpus list ({}). \"\n                \"We assumed '{}' was a path or url but couldn't find files {} \"\n                \"at this path or url.\".format(\n                    pretrained_model_name_or_path,\n                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),\n                    pretrained_model_name_or_path,\n                    corpus_file))\n            return None\n        if resolved_corpus_file == corpus_file:\n            logger.info(\"loading corpus file {}\".format(corpus_file))\n        else:\n            logger.info(\"loading corpus file {} from cache at {}\".format(\n                corpus_file, resolved_corpus_file))\n\n        # Instantiate tokenizer.\n        corpus = cls(*inputs, **kwargs)\n        corpus_dict = torch.load(resolved_corpus_file)\n        for key, value in corpus_dict.items():\n            corpus.__dict__[key] = value\n        corpus.vocab = vocab\n        if corpus.train is not None:\n            corpus.train = torch.tensor(corpus.train, dtype=torch.long)\n        if corpus.valid is not None:\n            corpus.valid = torch.tensor(corpus.valid, dtype=torch.long)\n        if corpus.test is not None:\n            corpus.test = torch.tensor(corpus.test, dtype=torch.long)\n        return corpus\n\n    def __init__(self, *args, **kwargs):\n        self.vocab = TransfoXLTokenizer(*args, 
**kwargs)\n        self.dataset = None\n        self.train = None\n        self.valid = None\n        self.test = None\n\n    def build_corpus(self, path, dataset):\n        self.dataset = dataset\n\n        if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:\n            self.vocab.count_file(os.path.join(path, 'train.txt'))\n            self.vocab.count_file(os.path.join(path, 'valid.txt'))\n            self.vocab.count_file(os.path.join(path, 'test.txt'))\n        elif self.dataset == 'wt103':\n            self.vocab.count_file(os.path.join(path, 'train.txt'))\n        elif self.dataset == 'lm1b':\n            train_path_pattern = os.path.join(\n                path, '1-billion-word-language-modeling-benchmark-r13output',\n                'training-monolingual.tokenized.shuffled', 'news.en-*')\n            train_paths = glob.glob(train_path_pattern)\n            # the vocab will load from file when build_vocab() is called\n\n        self.vocab.build_vocab()\n\n        if self.dataset in ['ptb', 'wt2', 'wt103']:\n            self.train = self.vocab.encode_file(\n                os.path.join(path, 'train.txt'), ordered=True)\n            self.valid = self.vocab.encode_file(\n                os.path.join(path, 'valid.txt'), ordered=True)\n            self.test = self.vocab.encode_file(\n                os.path.join(path, 'test.txt'), ordered=True)\n        elif self.dataset in ['enwik8', 'text8']:\n            self.train = self.vocab.encode_file(\n                os.path.join(path, 'train.txt'), ordered=True, add_eos=False)\n            self.valid = self.vocab.encode_file(\n                os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)\n            self.test = self.vocab.encode_file(\n                os.path.join(path, 'test.txt'), ordered=True, add_eos=False)\n        elif self.dataset == 'lm1b':\n            self.train = train_paths\n            self.valid = self.vocab.encode_file(\n                os.path.join(path, 'valid.txt'), ordered=False, 
add_double_eos=True)\n            self.test = self.vocab.encode_file(\n                os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)\n\n    def get_iterator(self, split, *args, **kwargs):\n        if split == 'train':\n            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:\n                data_iter = LMOrderedIterator(self.train, *args, **kwargs)\n            elif self.dataset == 'lm1b':\n                kwargs['shuffle'] = True\n                data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)\n        elif split in ['valid', 'test']:\n            data = self.valid if split == 'valid' else self.test\n            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:\n                data_iter = LMOrderedIterator(data, *args, **kwargs)\n            elif self.dataset == 'lm1b':\n                data_iter = LMShuffledIterator(data, *args, **kwargs)\n\n        return data_iter\n\n\ndef get_lm_corpus(datadir, dataset):\n    fn = os.path.join(datadir, 'cache.pt')\n    fn_pickle = os.path.join(datadir, 'cache.pkl')\n    if os.path.exists(fn):\n        print('Loading cached dataset...')\n        corpus = torch.load(fn_pickle)\n    elif os.path.exists(fn):\n        print('Loading cached dataset from pickle...')\n        with open(fn, \"rb\") as fp:\n            corpus = pickle.load(fp)\n    else:\n        print('Producing dataset {}...'.format(dataset))\n        kwargs = {}\n        if dataset in ['wt103', 'wt2']:\n            kwargs['special'] = ['<eos>']\n            kwargs['lower_case'] = False\n        elif dataset == 'ptb':\n            kwargs['special'] = ['<eos>']\n            kwargs['lower_case'] = True\n        elif dataset == 'lm1b':\n            kwargs['special'] = []\n            kwargs['lower_case'] = False\n            kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')\n        elif dataset in ['enwik8', 'text8']:\n            pass\n\n        corpus = 
TransfoXLCorpus(datadir, dataset, **kwargs)\n        torch.save(corpus, fn)\n\n    return corpus\n"
  },
  {
    "path": "requirements.txt",
    "content": "# PyTorch\ntorch>=0.4.1\n# progress bars in model download and training scripts\ntqdm\n# Accessing files from S3 directly.\nboto3\n# Used for downloading models over HTTP\nrequests\n# For OpenAI GPT\nregex"
  },
  {
    "path": "samples/input.txt",
    "content": "Who was Jim Henson ? ||| Jim Henson was a puppeteer\n"
  },
  {
    "path": "samples/sample_text.txt",
    "content": "This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত\nText should be one-sentence-per-line, with empty lines between documents.\nThis sample text is public domain and was randomly selected from Project Guttenberg.\n\nThe rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.\nIndeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.\nPossibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.\n\"Cass\" Beard had risen early that morning, but not with a view to discovery.\nA leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded \"bunk\" and wet blankets.\nThe chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.\nThis was nearly opposite.\nMr. 
Cassius crossed the highway, and stopped suddenly.\nSomething glittered in the nearest red pool before him.\nGold, surely!\nBut, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.\nLooking at it more attentively, he saw that it bore the inscription, \"May to Cass.\"\nLike most of his fellow gold-seekers, Cass was superstitious.\n\nThe fountain of classic wisdom, Hypatia herself.\nAs the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.\nFrom my youth I felt in me a soul above the matter-entangled herd.\nShe revealed to me the glorious fact, that I am a spark of Divinity itself.\nA fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.\nThere is a philosophic pleasure in opening one's treasures to the modest young.\nPerhaps you will assist me by carrying this basket of fruit?' 
And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.\nPhilammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;\nbut the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.\nAlready he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.\nHis guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;\nwhile at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.\nAt last they reached the quay at the opposite end of the street;\nand there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.\nHe stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.\n"
  },
  {
    "path": "setup.py",
    "content": "\"\"\"\nSimple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py\n\nTo create the package for pypi.\n\n1. Change the version in __init__.py and setup.py.\n\n2. Commit these changes with the message: \"Release: VERSION\"\n\n3. Add a tag in git to mark the release: \"git tag VERSION -m'Adds tag VERSION for pypi' \"\n   Push the tag to git: git push --tags origin master\n\n4. Build both the sources and the wheel. Do not change anything in setup.py between\n   creating the wheel and the source distribution (obviously).\n\n   For the wheel, run: \"python setup.py bdist_wheel\" in the top level allennlp directory.\n   (this will build a wheel for the python version you use to build it - make sure you use python 3.x).\n\n   For the sources, run: \"python setup.py sdist\"\n   You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp.\n\n5. Check that everything looks correct by uploading the package to the pypi test server:\n\n   twine upload dist/* -r pypitest\n   (pypi suggest using twine as other methods upload files via plaintext.)\n\n   Check that you can install it in a virtualenv by running:\n   pip install -i https://testpypi.python.org/pypi allennlp\n\n6. Upload the final version to actual pypi:\n   twine upload dist/* -r pypi\n\n7. 
Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.\n\n\"\"\"\nfrom io import open\nfrom setuptools import find_packages, setup\n\nsetup(\n    name=\"pytorch_pretrained_bert\",\n    version=\"0.6.2\",\n    author=\"Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors\",\n    author_email=\"thomas@huggingface.co\",\n    description=\"PyTorch version of Google AI BERT model with script to load Google pre-trained models\",\n    long_description=open(\"README.md\", \"r\", encoding='utf-8').read(),\n    long_description_content_type=\"text/markdown\",\n    keywords='BERT NLP deep learning google',\n    license='Apache',\n    url=\"https://github.com/huggingface/pytorch-pretrained-BERT\",\n    packages=find_packages(exclude=[\"*.tests\", \"*.tests.*\",\n                                    \"tests.*\", \"tests\"]),\n    install_requires=['torch>=0.4.1',\n                      'numpy',\n                      'boto3',\n                      'requests',\n                      'tqdm',\n                      'regex'],\n    entry_points={\n      'console_scripts': [\n        \"pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main\",\n      ]\n    },\n    # python_requires='>=3.5.0',\n    tests_require=['pytest'],\n    classifiers=[\n          'Intended Audience :: Science/Research',\n          'License :: OSI Approved :: Apache Software License',\n          'Programming Language :: Python :: 3',\n          'Topic :: Scientific/Engineering :: Artificial Intelligence',\n    ],\n)\n"
  },
  {
    "path": "tests/conftest.py",
    "content": "# content of conftest.py\n\nimport pytest\n\n\ndef pytest_addoption(parser):\n    parser.addoption(\n        \"--runslow\", action=\"store_true\", default=False, help=\"run slow tests\"\n    )\n\n\ndef pytest_collection_modifyitems(config, items):\n    if config.getoption(\"--runslow\"):\n        # --runslow given in cli: do not skip slow tests\n        return\n    skip_slow = pytest.mark.skip(reason=\"need --runslow option to run\")\n    for item in items:\n        if \"slow\" in item.keywords:\n            item.add_marker(skip_slow)\n"
  },
  {
    "path": "tests/modeling_gpt2_test.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport unittest\nimport json\nimport random\nimport shutil\nimport pytest\n\nimport torch\n\nfrom pytorch_pretrained_bert import (GPT2Config, GPT2Model,\n                                     GPT2LMHeadModel, GPT2DoubleHeadsModel)\nfrom pytorch_pretrained_bert.modeling_gpt2 import PRETRAINED_MODEL_ARCHIVE_MAP\n\nclass GPT2ModelTest(unittest.TestCase):\n    class GPT2ModelTester(object):\n\n        def __init__(self,\n                     parent,\n                     batch_size=13,\n                     seq_length=7,\n                     is_training=True,\n                     use_position_ids=True,\n                     use_token_type_ids=True,\n                     use_labels=True,\n                     vocab_size=99,\n                     n_special=1,\n                     n_positions=33,\n                     n_embd=32,\n                     n_layer=5,\n                     n_head=4,\n                     n_choices=3,\n                     type_sequence_label_size=2,\n                     initializer_range=0.02,\n                     num_labels=3,\n                     scope=None):\n            self.parent = parent\n            self.batch_size = batch_size\n            self.seq_length = seq_length\n    
        self.is_training = is_training\n            self.use_position_ids = use_position_ids\n            self.use_token_type_ids = use_token_type_ids\n            self.use_labels = use_labels\n            self.vocab_size = vocab_size\n            self.n_special = n_special\n            self.n_positions = n_positions\n            self.n_embd = n_embd\n            self.n_layer = n_layer\n            self.n_head = n_head\n            self.n_choices = n_choices\n            self.type_sequence_label_size = type_sequence_label_size\n            self.initializer_range = initializer_range\n            self.num_labels = num_labels\n            self.scope = scope\n\n        def prepare_config_and_inputs(self):\n            total_num_tokens = self.vocab_size + self.n_special\n            input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)\n\n            position_ids = None\n            if self.use_position_ids:\n                position_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)\n\n            token_type_ids = None\n            if self.use_token_type_ids:\n                total_voc = self.vocab_size\n                token_type_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)\n\n            mc_labels = None\n            lm_labels = None\n            mc_token_ids = None\n            if self.use_labels:\n                mc_labels = GPT2ModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)\n                lm_labels = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)\n                mc_token_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)\n\n            config = GPT2Config(\n                vocab_size_or_config_json_file=self.vocab_size,\n                n_special=self.n_special,\n                n_positions=self.n_positions,\n  
              n_embd=self.n_embd,\n                n_layer=self.n_layer,\n                n_head=self.n_head,\n                initializer_range=self.initializer_range)\n\n            return (config, input_ids, token_type_ids, position_ids,\n                    mc_labels, lm_labels, mc_token_ids)\n\n        def create_gpt2_model(self, config, input_ids, token_type_ids, position_ids,\n                                mc_labels, lm_labels, mc_token_ids):\n            model = GPT2Model(config)\n            model.eval()\n            hidden_states, presents = model(input_ids, position_ids, token_type_ids)\n            outputs = {\n                \"hidden_states\": hidden_states,\n                \"presents\": presents,\n            }\n            return outputs\n\n        def check_gpt2_model_output(self, result):\n            self.parent.assertEqual(len(result[\"hidden_states\"]), self.n_layer + 1)\n            self.parent.assertListEqual(\n                list(result[\"hidden_states\"][0].size()),\n                [self.batch_size, self.n_choices, self.seq_length, self.n_embd])\n\n\n        def create_gpt2_lm_head(self, config, input_ids, token_type_ids, position_ids,\n                                       mc_labels, lm_labels, mc_token_ids):\n            model = GPT2LMHeadModel(config)\n            model.eval()\n            loss = model(input_ids, position_ids, token_type_ids, lm_labels)\n            lm_logits, presents = model(input_ids, position_ids, token_type_ids)\n            outputs = {\n                \"loss\": loss,\n                \"lm_logits\": lm_logits,\n                \"presents\": presents,\n            }\n            return outputs\n\n        def create_gpt2_lm_head_with_output_attention(self, config, input_ids, token_type_ids, position_ids,\n                                       mc_labels, lm_labels, mc_token_ids):\n            model = GPT2LMHeadModel(config, output_attentions=True)\n            model.eval()\n            loss = model(input_ids, 
position_ids, token_type_ids, lm_labels)\n            attentions, lm_logits, presents = model(input_ids, position_ids, token_type_ids)\n            outputs = {\n                \"loss\": loss,\n                \"lm_logits\": lm_logits,\n                \"presents\": presents,\n                \"attentions\": attentions,\n            }\n            return outputs\n\n        def check_gpt2_lm_head_output(self, result):\n            total_voc = self.n_special + self.vocab_size\n            self.parent.assertListEqual(\n                list(result[\"lm_logits\"].size()),\n                [self.batch_size, self.n_choices, self.seq_length, total_voc])\n            self.parent.assertEqual(self.n_layer, len(result[\"presents\"]))\n            self.parent.assertListEqual(\n                list(result[\"presents\"][0].size()),\n                [2, self.batch_size * self.n_choices, self.n_head, self.seq_length, self.n_embd // self.n_head])\n\n        def check_gpt2_lm_head_loss_output(self, result):\n            self.parent.assertListEqual(\n                list(result[\"loss\"].size()),\n                [])\n\n        def create_gpt2_double_heads(self, config, input_ids, token_type_ids, position_ids,\n                                       mc_labels, lm_labels, mc_token_ids):\n            model = GPT2DoubleHeadsModel(config)\n            model.eval()\n            loss = model(input_ids, mc_token_ids,\n                         lm_labels=lm_labels, mc_labels=mc_labels,\n                         token_type_ids=token_type_ids, position_ids=position_ids)\n            lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)\n            outputs = {\n                \"loss\": loss,\n                \"lm_logits\": lm_logits,\n                \"mc_logits\": mc_logits,\n                \"presents\": presents,\n            }\n            return outputs\n\n        def create_gpt2_double_heads_with_output_attention(self, 
config, input_ids, token_type_ids, position_ids,\n                                       mc_labels, lm_labels, mc_token_ids):\n            model = GPT2DoubleHeadsModel(config, output_attentions=True)\n            model.eval()\n            loss = model(input_ids, mc_token_ids,\n                         lm_labels=lm_labels, mc_labels=mc_labels,\n                         token_type_ids=token_type_ids, position_ids=position_ids)\n            attentions, lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)\n            outputs = {\n                \"loss\": loss,\n                \"lm_logits\": lm_logits,\n                \"mc_logits\": mc_logits,\n                \"presents\": presents,\n                \"attentions\": attentions,\n            }\n            return outputs\n\n        def check_gpt2_double_heads_output(self, result):\n            total_voc = self.n_special + self.vocab_size\n            self.parent.assertListEqual(\n                list(result[\"lm_logits\"].size()),\n                [self.batch_size, self.n_choices, self.seq_length, total_voc])\n            self.parent.assertListEqual(\n                list(result[\"mc_logits\"].size()),\n                [self.batch_size, self.n_choices])\n\n        def check_gpt2_double_heads_loss_output(self, result):\n            self.parent.assertListEqual(\n                [list(l.size()) for l in result[\"loss\"]],\n                [[], []])\n\n        def create_and_check_gpt2_for_headmasking(self, config, input_ids, token_type_ids, position_ids,\n                                                mc_labels, lm_labels, mc_token_ids):\n            for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel):\n                model = model_class(config=config, keep_multihead_output=True)\n                model.eval()\n                head_mask = torch.ones(self.n_layer, self.n_head).to(input_ids.device)\n                head_mask[0, 1:-1] = 
0.0 # Mask all but the first and last heads on the first layer\n                head_mask[-1, 1:] = 0.0  # Mask all but the first head on the last layer\n                if isinstance(model, GPT2DoubleHeadsModel):\n                    output = model(input_ids, mc_token_ids, head_mask=head_mask)\n                else:\n                    output = model(input_ids, head_mask=head_mask)\n\n                if isinstance(model, GPT2Model):\n                    output = sum(t.sum() for t in output[0])\n                elif isinstance(output, (list, tuple)):\n                    output = sum(t.sum() for t in output[:-1])\n                output = output.sum()\n                output.backward()\n                multihead_outputs = (model if isinstance(model, GPT2Model) else model.transformer).get_multihead_outputs()\n\n                self.parent.assertEqual(len(multihead_outputs), self.n_layer)\n                self.parent.assertListEqual(\n                    list(multihead_outputs[0].size()),\n                    [self.batch_size * self.n_choices, self.n_head,\n                        self.seq_length, self.n_embd // self.n_head])\n                self.parent.assertEqual(\n                    len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()),\n                    0)\n                self.parent.assertEqual(\n                    len(multihead_outputs[0][:, 0, :, :].nonzero()),\n                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)\n                self.parent.assertEqual(\n                    len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()),\n                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)\n\n                self.parent.assertListEqual(\n                    list(multihead_outputs[1].size()),\n                    [self.batch_size * self.n_choices, self.n_head,\n                     self.seq_length, self.n_embd // self.n_head])\n                
self.parent.assertEqual(\n                    len(multihead_outputs[1].nonzero()),\n                    multihead_outputs[1].numel())\n\n                self.parent.assertListEqual(\n                    list(multihead_outputs[-1].size()),\n                    [self.batch_size * self.n_choices, self.n_head,\n                     self.seq_length, self.n_embd // self.n_head])\n                self.parent.assertEqual(\n                    len(multihead_outputs[-1][:, 1:, :, :].nonzero()),\n                    0)\n                self.parent.assertEqual(\n                    len(multihead_outputs[-1][:, 0, :, :].nonzero()),\n                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)\n\n        def create_and_check_gpt2_for_head_pruning(self, config, input_ids, token_type_ids, position_ids,\n                                                   mc_labels, lm_labels, mc_token_ids):\n            for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel):\n                model = model_class(config=config, keep_multihead_output=True)\n                model.eval()\n                transformer = model if isinstance(model, GPT2Model) else model.transformer\n                heads_to_prune = {0: list(range(1, self.n_head)),\n                                  -1: [0]}\n                transformer.prune_heads(heads_to_prune)\n                if isinstance(model, GPT2DoubleHeadsModel):\n                    output = model(input_ids, mc_token_ids)\n                else:\n                    output = model(input_ids)\n\n                if isinstance(model, GPT2Model):\n                    output = sum(t.sum() for t in output[0])\n                elif isinstance(output, (list, tuple)):\n                    output = sum(t.sum() for t in output[:-1])\n                output = output.sum()\n                output.backward()\n                multihead_outputs = transformer.get_multihead_outputs()\n\n                
self.parent.assertEqual(len(multihead_outputs), self.n_layer)\n                self.parent.assertListEqual(\n                    list(multihead_outputs[0].size()),\n                    [self.batch_size * self.n_choices, 1,\n                        self.seq_length, self.n_embd // self.n_head])\n                self.parent.assertListEqual(\n                    list(multihead_outputs[1].size()),\n                    [self.batch_size * self.n_choices, self.n_head,\n                        self.seq_length, self.n_embd // self.n_head])\n                self.parent.assertListEqual(\n                    list(multihead_outputs[-1].size()),\n                    [self.batch_size * self.n_choices, self.n_head-1,\n                        self.seq_length, self.n_embd // self.n_head])\n\n\n    def test_default(self):\n        self.run_tester(GPT2ModelTest.GPT2ModelTester(self))\n\n    def test_config_to_json_string(self):\n        config = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)\n        obj = json.loads(config.to_json_string())\n        self.assertEqual(obj[\"vocab_size\"], 99)\n        self.assertEqual(obj[\"n_embd\"], 37)\n\n    def test_config_to_json_file(self):\n        config_first = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)\n        json_file_path = \"/tmp/config.json\"\n        config_first.to_json_file(json_file_path)\n        config_second = GPT2Config.from_json_file(json_file_path)\n        os.remove(json_file_path)\n        self.assertEqual(config_second.to_dict(), config_first.to_dict())\n\n    @pytest.mark.slow\n    def test_model_from_pretrained(self):\n        cache_dir = \"/tmp/pytorch_pretrained_bert_test/\"\n        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:\n            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)\n            shutil.rmtree(cache_dir)\n            self.assertIsNotNone(model)\n\n    def run_tester(self, tester):\n        config_and_inputs = 
tester.prepare_config_and_inputs()\n        output_result = tester.create_gpt2_model(*config_and_inputs)\n        tester.check_gpt2_model_output(output_result)\n\n        output_result = tester.create_gpt2_lm_head(*config_and_inputs)\n        tester.check_gpt2_lm_head_output(output_result)\n        tester.check_gpt2_lm_head_loss_output(output_result)\n\n        output_result = tester.create_gpt2_double_heads(*config_and_inputs)\n        tester.check_gpt2_double_heads_output(output_result)\n        tester.check_gpt2_double_heads_loss_output(output_result)\n\n        tester.create_and_check_gpt2_for_headmasking(*config_and_inputs)\n        tester.create_and_check_gpt2_for_head_pruning(*config_and_inputs)\n\n    @classmethod\n    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):\n        \"\"\"Creates a random int32 tensor of the shape within the vocab size.\"\"\"\n        if rng is None:\n            rng = random.Random()\n\n        total_dims = 1\n        for dim in shape:\n            total_dims *= dim\n\n        values = []\n        for _ in range(total_dims):\n            values.append(rng.randint(0, vocab_size - 1))\n\n        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "tests/modeling_openai_test.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport unittest\nimport json\nimport random\nimport shutil\nimport pytest\n\nimport torch\n\nfrom pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,\n                                     OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)\nfrom pytorch_pretrained_bert.modeling_openai import PRETRAINED_MODEL_ARCHIVE_MAP\n\nclass OpenAIGPTModelTest(unittest.TestCase):\n    class OpenAIGPTModelTester(object):\n\n        def __init__(self,\n                     parent,\n                     batch_size=13,\n                     seq_length=7,\n                     is_training=True,\n                     use_position_ids=True,\n                     use_token_type_ids=True,\n                     use_labels=True,\n                     vocab_size=99,\n                     n_special=1,\n                     n_positions=33,\n                     n_embd=32,\n                     n_layer=5,\n                     n_head=4,\n                     n_choices=3,\n                     afn=\"gelu\",\n                     resid_pdrop=0.1,\n                     attn_pdrop=0.1,\n                     embd_pdrop=0.1,\n                     type_sequence_label_size=2,\n                     initializer_range=0.02,\n          
           num_labels=3,\n                     scope=None):\n            self.parent = parent\n            self.batch_size = batch_size\n            self.seq_length = seq_length\n            self.is_training = is_training\n            self.use_position_ids = use_position_ids\n            self.use_token_type_ids = use_token_type_ids\n            self.use_labels = use_labels\n            self.vocab_size = vocab_size\n            self.n_special = n_special\n            self.n_positions = n_positions\n            self.n_embd = n_embd\n            self.n_layer = n_layer\n            self.n_head = n_head\n            self.afn = afn\n            self.n_choices = n_choices\n            self.resid_pdrop = resid_pdrop\n            self.attn_pdrop = attn_pdrop\n            self.embd_pdrop = embd_pdrop\n            self.type_sequence_label_size = type_sequence_label_size\n            self.initializer_range = initializer_range\n            self.num_labels = num_labels\n            self.scope = scope\n\n        def prepare_config_and_inputs(self):\n            input_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)\n\n            position_ids = None\n            if self.use_position_ids:\n                position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)\n\n            token_type_ids = None\n            if self.use_token_type_ids:\n                total_voc = self.vocab_size + self.n_special\n                token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)\n\n            mc_labels = None\n            lm_labels = None\n            mc_token_ids = None\n            if self.use_labels:\n                mc_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)\n                lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], 
self.num_labels)\n                mc_token_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)\n\n            config = OpenAIGPTConfig(\n                vocab_size_or_config_json_file=self.vocab_size,\n                n_positions=self.n_positions,\n                n_special=self.n_special,\n                n_embd=self.n_embd,\n                n_layer=self.n_layer,\n                n_head=self.n_head,\n                afn=self.afn,\n                resid_pdrop=self.resid_pdrop,\n                attn_pdrop=self.attn_pdrop,\n                embd_pdrop=self.embd_pdrop,\n                initializer_range=self.initializer_range)\n\n            return (config, input_ids, token_type_ids, position_ids,\n                    mc_labels, lm_labels, mc_token_ids)\n\n        def create_openai_model(self, config, input_ids, token_type_ids, position_ids,\n                                mc_labels, lm_labels, mc_token_ids):\n            model = OpenAIGPTModel(config)\n            model.eval()\n            hidden_states = model(input_ids, position_ids, token_type_ids)\n            outputs = {\n                \"hidden_states\": hidden_states,\n            }\n            return outputs\n\n        def check_openai_model_output(self, result):\n            self.parent.assertEqual(len(result[\"hidden_states\"]), self.n_layer + 1)\n            self.parent.assertListEqual(\n                list(result[\"hidden_states\"][0].size()),\n                [self.batch_size, self.n_choices, self.seq_length, self.n_embd])\n\n\n        def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids,\n                                       mc_labels, lm_labels, mc_token_ids):\n            model = OpenAIGPTLMHeadModel(config)\n            model.eval()\n            loss = model(input_ids, position_ids, token_type_ids, lm_labels)\n            lm_logits = model(input_ids, position_ids, token_type_ids)\n            outputs = {\n                \"loss\": 
loss,\n                \"lm_logits\": lm_logits,\n            }\n            return outputs\n\n        def check_openai_lm_head_output(self, result):\n            total_voc = self.n_special + self.vocab_size\n            self.parent.assertListEqual(\n                list(result[\"lm_logits\"].size()),\n                [self.batch_size, self.n_choices, self.seq_length, total_voc])\n\n        def check_openai_lm_head_loss_output(self, result):\n            self.parent.assertListEqual(\n                list(result[\"loss\"].size()),\n                [])\n\n        def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,\n                                       mc_labels, lm_labels, mc_token_ids):\n            model = OpenAIGPTDoubleHeadsModel(config)\n            model.eval()\n            loss = model(input_ids, mc_token_ids,\n                         lm_labels=lm_labels, mc_labels=mc_labels,\n                         token_type_ids=token_type_ids, position_ids=position_ids)\n            lm_logits, mc_logits = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)\n            outputs = {\n                \"loss\": loss,\n                \"lm_logits\": lm_logits,\n                \"mc_logits\": mc_logits,\n            }\n            return outputs\n\n        def check_openai_double_heads_output(self, result):\n            total_voc = self.n_special + self.vocab_size\n            self.parent.assertListEqual(\n                list(result[\"lm_logits\"].size()),\n                [self.batch_size, self.n_choices, self.seq_length, total_voc])\n            self.parent.assertListEqual(\n                list(result[\"mc_logits\"].size()),\n                [self.batch_size, self.n_choices])\n\n        def check_openai_double_heads_loss_output(self, result):\n            self.parent.assertListEqual(\n                [list(l.size()) for l in result[\"loss\"]],\n                [[], []])\n\n        def 
create_and_check_openai_for_headmasking(self, config, input_ids, token_type_ids, position_ids,\n                                                mc_labels, lm_labels, mc_token_ids):\n            for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel):\n                model = model_class(config=config, keep_multihead_output=True)\n                model.eval()\n                head_mask = torch.ones(self.n_layer, self.n_head).to(input_ids.device)\n                head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer\n                head_mask[-1, 1:] = 0.0  # Mask all but the first head on the last layer\n                if isinstance(model, OpenAIGPTDoubleHeadsModel):\n                    output = model(input_ids, mc_token_ids, head_mask=head_mask)\n                else:\n                    output = model(input_ids, head_mask=head_mask)\n\n                if isinstance(model, OpenAIGPTModel):\n                    output = sum(t.sum() for t in output[0])\n                elif isinstance(output, (list, tuple)):\n                    output = sum(t.sum() for t in output)\n                output = output.sum()\n                output.backward()\n                multihead_outputs = (model if isinstance(model, OpenAIGPTModel) else model.transformer).get_multihead_outputs()\n\n                self.parent.assertEqual(len(multihead_outputs), self.n_layer)\n                self.parent.assertListEqual(\n                    list(multihead_outputs[0].size()),\n                    [self.batch_size * self.n_choices, self.n_head,\n                        self.seq_length, self.n_embd // self.n_head])\n                self.parent.assertEqual(\n                    len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()),\n                    0)\n                self.parent.assertEqual(\n                    len(multihead_outputs[0][:, 0, :, :].nonzero()),\n                    self.batch_size * self.n_choices * 
self.seq_length * self.n_embd // self.n_head)\n                self.parent.assertEqual(\n                    len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()),\n                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)\n\n                self.parent.assertListEqual(\n                    list(multihead_outputs[1].size()),\n                    [self.batch_size * self.n_choices, self.n_head,\n                     self.seq_length, self.n_embd // self.n_head])\n                self.parent.assertEqual(\n                    len(multihead_outputs[1].nonzero()),\n                    multihead_outputs[1].numel())\n\n                self.parent.assertListEqual(\n                    list(multihead_outputs[-1].size()),\n                    [self.batch_size * self.n_choices, self.n_head,\n                     self.seq_length, self.n_embd // self.n_head])\n                self.parent.assertEqual(\n                    len(multihead_outputs[-1][:, 1:, :, :].nonzero()),\n                    0)\n                self.parent.assertEqual(\n                    len(multihead_outputs[-1][:, 0, :, :].nonzero()),\n                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)\n\n\n        def create_and_check_openai_for_head_pruning(self, config, input_ids, token_type_ids, position_ids,\n                                                     mc_labels, lm_labels, mc_token_ids):\n            for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel):\n                model = model_class(config=config, keep_multihead_output=True)\n                model.eval()\n                transformer = model if isinstance(model, OpenAIGPTModel) else model.transformer\n                heads_to_prune = {0: list(range(1, self.n_head)),\n                                  -1: [0]}\n                transformer.prune_heads(heads_to_prune)\n                if isinstance(model, 
OpenAIGPTDoubleHeadsModel):\n                    output = model(input_ids, mc_token_ids)\n                else:\n                    output = model(input_ids)\n\n                if isinstance(model, OpenAIGPTModel):\n                    output = sum(t.sum() for t in output[0])\n                elif isinstance(output, (list, tuple)):\n                    output = sum(t.sum() for t in output)\n                output = output.sum()\n                output.backward()\n                multihead_outputs = transformer.get_multihead_outputs()\n\n                self.parent.assertEqual(len(multihead_outputs), self.n_layer)\n                self.parent.assertListEqual(\n                    list(multihead_outputs[0].size()),\n                    [self.batch_size * self.n_choices, 1,\n                        self.seq_length, self.n_embd // self.n_head])\n                self.parent.assertListEqual(\n                    list(multihead_outputs[1].size()),\n                    [self.batch_size * self.n_choices, self.n_head,\n                        self.seq_length, self.n_embd // self.n_head])\n                self.parent.assertListEqual(\n                    list(multihead_outputs[-1].size()),\n                    [self.batch_size * self.n_choices, self.n_head-1,\n                        self.seq_length, self.n_embd // self.n_head])\n\n\n    def test_default(self):\n        self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self))\n\n    def test_config_to_json_string(self):\n        config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)\n        obj = json.loads(config.to_json_string())\n        self.assertEqual(obj[\"vocab_size\"], 99)\n        self.assertEqual(obj[\"n_embd\"], 37)\n\n    def test_config_to_json_file(self):\n        config_first = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)\n        json_file_path = \"/tmp/config.json\"\n        config_first.to_json_file(json_file_path)\n        config_second = 
OpenAIGPTConfig.from_json_file(json_file_path)\n        os.remove(json_file_path)\n        self.assertEqual(config_second.to_dict(), config_first.to_dict())\n\n    @pytest.mark.slow\n    def test_model_from_pretrained(self):\n        cache_dir = \"/tmp/pytorch_pretrained_bert_test/\"\n        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:\n            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)\n            shutil.rmtree(cache_dir)\n            self.assertIsNotNone(model)\n\n    def run_tester(self, tester):\n        config_and_inputs = tester.prepare_config_and_inputs()\n        output_result = tester.create_openai_model(*config_and_inputs)\n        tester.check_openai_model_output(output_result)\n\n        output_result = tester.create_openai_lm_head(*config_and_inputs)\n        tester.check_openai_lm_head_output(output_result)\n        tester.check_openai_lm_head_loss_output(output_result)\n\n        output_result = tester.create_openai_double_heads(*config_and_inputs)\n        tester.check_openai_double_heads_output(output_result)\n        tester.check_openai_double_heads_loss_output(output_result)\n\n        tester.create_and_check_openai_for_headmasking(*config_and_inputs)\n        tester.create_and_check_openai_for_head_pruning(*config_and_inputs)\n\n    @classmethod\n    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):\n        \"\"\"Creates a random int32 tensor of the shape within the vocab size.\"\"\"\n        if rng is None:\n            rng = random.Random()\n\n        total_dims = 1\n        for dim in shape:\n            total_dims *= dim\n\n        values = []\n        for _ in range(total_dims):\n            values.append(rng.randint(0, vocab_size - 1))\n\n        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "tests/modeling_test.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport unittest\nimport json\nimport random\nimport shutil\nimport pytest\n\nimport torch\n\nfrom pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,\n                                     BertForNextSentencePrediction, BertForPreTraining,\n                                     BertForQuestionAnswering, BertForSequenceClassification,\n                                     BertForTokenClassification, BertForMultipleChoice)\nfrom pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP\n\n\nclass BertModelTest(unittest.TestCase):\n    class BertModelTester(object):\n\n        def __init__(self,\n                     parent,\n                     batch_size=13,\n                     seq_length=7,\n                     is_training=True,\n                     use_input_mask=True,\n                     use_token_type_ids=True,\n                     use_labels=True,\n                     vocab_size=99,\n                     hidden_size=32,\n                     num_hidden_layers=5,\n                     num_attention_heads=4,\n                     intermediate_size=37,\n                     hidden_act=\"gelu\",\n                     hidden_dropout_prob=0.1,\n                     
attention_probs_dropout_prob=0.1,\n                     max_position_embeddings=512,\n                     type_vocab_size=16,\n                     type_sequence_label_size=2,\n                     initializer_range=0.02,\n                     num_labels=3,\n                     num_choices=4,\n                     scope=None):\n            self.parent = parent\n            self.batch_size = batch_size\n            self.seq_length = seq_length\n            self.is_training = is_training\n            self.use_input_mask = use_input_mask\n            self.use_token_type_ids = use_token_type_ids\n            self.use_labels = use_labels\n            self.vocab_size = vocab_size\n            self.hidden_size = hidden_size\n            self.num_hidden_layers = num_hidden_layers\n            self.num_attention_heads = num_attention_heads\n            self.intermediate_size = intermediate_size\n            self.hidden_act = hidden_act\n            self.hidden_dropout_prob = hidden_dropout_prob\n            self.attention_probs_dropout_prob = attention_probs_dropout_prob\n            self.max_position_embeddings = max_position_embeddings\n            self.type_vocab_size = type_vocab_size\n            self.type_sequence_label_size = type_sequence_label_size\n            self.initializer_range = initializer_range\n            self.num_labels = num_labels\n            self.num_choices = num_choices\n            self.scope = scope\n\n        def prepare_config_and_inputs(self):\n            input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)\n\n            input_mask = None\n            if self.use_input_mask:\n                input_mask = BertModelTest.ids_tensor([self.batch_size, self.seq_length], vocab_size=2)\n\n            token_type_ids = None\n            if self.use_token_type_ids:\n                token_type_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)\n\n            sequence_labels = 
None\n            token_labels = None\n            choice_labels = None\n            if self.use_labels:\n                sequence_labels = BertModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)\n                token_labels = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.num_labels)\n                choice_labels = BertModelTest.ids_tensor([self.batch_size], self.num_choices)\n\n            config = BertConfig(\n                vocab_size_or_config_json_file=self.vocab_size,\n                hidden_size=self.hidden_size,\n                num_hidden_layers=self.num_hidden_layers,\n                num_attention_heads=self.num_attention_heads,\n                intermediate_size=self.intermediate_size,\n                hidden_act=self.hidden_act,\n                hidden_dropout_prob=self.hidden_dropout_prob,\n                attention_probs_dropout_prob=self.attention_probs_dropout_prob,\n                max_position_embeddings=self.max_position_embeddings,\n                type_vocab_size=self.type_vocab_size,\n                initializer_range=self.initializer_range)\n\n            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels\n\n        def check_loss_output(self, result):\n            self.parent.assertListEqual(\n                list(result[\"loss\"].size()),\n                [])\n\n        def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):\n            model = BertModel(config=config)\n            model.eval()\n            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)\n            outputs = {\n                \"sequence_output\": all_encoder_layers[-1],\n                \"pooled_output\": pooled_output,\n                \"all_encoder_layers\": all_encoder_layers,\n            }\n            return outputs\n\n        def check_bert_model_output(self, result):\n        
    self.parent.assertListEqual(\n                [size for layer in result[\"all_encoder_layers\"] for size in layer.size()],\n                [self.batch_size, self.seq_length, self.hidden_size] * self.num_hidden_layers)\n            self.parent.assertListEqual(\n                list(result[\"sequence_output\"].size()),\n                [self.batch_size, self.seq_length, self.hidden_size])\n            self.parent.assertListEqual(list(result[\"pooled_output\"].size()), [self.batch_size, self.hidden_size])\n\n\n        def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):\n            model = BertForMaskedLM(config=config)\n            model.eval()\n            loss = model(input_ids, token_type_ids, input_mask, token_labels)\n            prediction_scores = model(input_ids, token_type_ids, input_mask)\n            outputs = {\n                \"loss\": loss,\n                \"prediction_scores\": prediction_scores,\n            }\n            return outputs\n\n        def check_bert_for_masked_lm_output(self, result):\n            self.parent.assertListEqual(\n                list(result[\"prediction_scores\"].size()),\n                [self.batch_size, self.seq_length, self.vocab_size])\n\n        def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):\n            model = BertForNextSentencePrediction(config=config)\n            model.eval()\n            loss = model(input_ids, token_type_ids, input_mask, sequence_labels)\n            seq_relationship_score = model(input_ids, token_type_ids, input_mask)\n            outputs = {\n                \"loss\": loss,\n                \"seq_relationship_score\": seq_relationship_score,\n            }\n            return outputs\n\n        def check_bert_for_next_sequence_prediction_output(self, result):\n            self.parent.assertListEqual(\n           
     list(result[\"seq_relationship_score\"].size()),\n                [self.batch_size, 2])\n\n\n        def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):\n            model = BertForPreTraining(config=config)\n            model.eval()\n            loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)\n            prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask)\n            outputs = {\n                \"loss\": loss,\n                \"prediction_scores\": prediction_scores,\n                \"seq_relationship_score\": seq_relationship_score,\n            }\n            return outputs\n\n        def check_bert_for_pretraining_output(self, result):\n            self.parent.assertListEqual(\n                list(result[\"prediction_scores\"].size()),\n                [self.batch_size, self.seq_length, self.vocab_size])\n            self.parent.assertListEqual(\n                list(result[\"seq_relationship_score\"].size()),\n                [self.batch_size, 2])\n\n\n        def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):\n            model = BertForQuestionAnswering(config=config)\n            model.eval()\n            loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)\n            start_logits, end_logits = model(input_ids, token_type_ids, input_mask)\n            outputs = {\n                \"loss\": loss,\n                \"start_logits\": start_logits,\n                \"end_logits\": end_logits,\n            }\n            return outputs\n\n        def check_bert_for_question_answering_output(self, result):\n            self.parent.assertListEqual(\n                list(result[\"start_logits\"].size()),\n                [self.batch_size, self.seq_length])\n            
self.parent.assertListEqual(\n                list(result[\"end_logits\"].size()),\n                [self.batch_size, self.seq_length])\n\n\n        def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):\n            model = BertForSequenceClassification(config=config, num_labels=self.num_labels)\n            model.eval()\n            loss = model(input_ids, token_type_ids, input_mask, sequence_labels)\n            logits = model(input_ids, token_type_ids, input_mask)\n            outputs = {\n                \"loss\": loss,\n                \"logits\": logits,\n            }\n            return outputs\n\n        def check_bert_for_sequence_classification_output(self, result):\n            self.parent.assertListEqual(\n                list(result[\"logits\"].size()),\n                [self.batch_size, self.num_labels])\n\n\n        def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):\n            model = BertForTokenClassification(config=config, num_labels=self.num_labels)\n            model.eval()\n            loss = model(input_ids, token_type_ids, input_mask, token_labels)\n            logits = model(input_ids, token_type_ids, input_mask)\n            outputs = {\n                \"loss\": loss,\n                \"logits\": logits,\n            }\n            return outputs\n\n        def check_bert_for_token_classification_output(self, result):\n            self.parent.assertListEqual(\n                list(result[\"logits\"].size()),\n                [self.batch_size, self.seq_length, self.num_labels])\n\n\n        def create_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):\n            model = BertForMultipleChoice(config=config, num_choices=self.num_choices)\n            model.eval()\n            
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()\n            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()\n            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()\n            loss = model(multiple_choice_inputs_ids,\n                         multiple_choice_token_type_ids,\n                         multiple_choice_input_mask,\n                         choice_labels)\n            logits = model(multiple_choice_inputs_ids,\n                           multiple_choice_token_type_ids,\n                           multiple_choice_input_mask)\n            outputs = {\n                \"loss\": loss,\n                \"logits\": logits,\n            }\n            return outputs\n\n        def check_bert_for_multiple_choice(self, result):\n            self.parent.assertListEqual(\n                list(result[\"logits\"].size()),\n                [self.batch_size, self.num_choices])\n\n\n        def create_and_check_bert_for_attentions(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):\n            for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction,\n                                BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,\n                                BertForTokenClassification):\n                if model_class in [BertForSequenceClassification,\n                                   BertForTokenClassification]:\n                    model = model_class(config=config, num_labels=self.num_labels, output_attentions=True)\n                else:\n                    model = model_class(config=config, output_attentions=True)\n                model.eval()\n                output = model(input_ids, token_type_ids, input_mask)\n                attentions = output[0]\n                
self.parent.assertEqual(len(attentions), self.num_hidden_layers)\n                self.parent.assertListEqual(\n                    list(attentions[0].size()),\n                    [self.batch_size, self.num_attention_heads, self.seq_length, self.seq_length])\n\n\n        def create_and_check_bert_for_headmasking(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):\n            for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction,\n                                BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,\n                                BertForTokenClassification):\n                if model_class in [BertForSequenceClassification,\n                                   BertForTokenClassification]:\n                    model = model_class(config=config,\n                                        num_labels=self.num_labels,\n                                        keep_multihead_output=True)\n                else:\n                    model = model_class(config=config, keep_multihead_output=True)\n                model.eval()\n                head_mask = torch.ones(self.num_hidden_layers, self.num_attention_heads).to(input_ids.device)\n                head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer\n                head_mask[-1, 1:] = 0.0  # Mask all but the first head on the last layer\n                output = model(input_ids, token_type_ids, input_mask, head_mask=head_mask)\n\n                if isinstance(model, BertModel):\n                    output = sum(t.sum() for t in output[0])\n                elif isinstance(output, (list, tuple)):\n                    output = sum(t.sum() for t in output)\n                output = output.sum()\n                output.backward()\n                multihead_outputs = (model if isinstance(model, BertModel) else model.bert).get_multihead_outputs()\n\n                
self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)\n                self.parent.assertListEqual(\n                    list(multihead_outputs[0].size()),\n                    [self.batch_size, self.num_attention_heads,\n                     self.seq_length, self.hidden_size // self.num_attention_heads])\n                self.parent.assertEqual(\n                    len(multihead_outputs[0][:, 1:(self.num_attention_heads-1), :, :].nonzero()),\n                    0)\n                self.parent.assertEqual(\n                    len(multihead_outputs[0][:, 0, :, :].nonzero()),\n                    self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)\n                self.parent.assertEqual(\n                    len(multihead_outputs[0][:, self.num_attention_heads-1, :, :].nonzero()),\n                    self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)\n\n                self.parent.assertListEqual(\n                    list(multihead_outputs[1].size()),\n                    [self.batch_size, self.num_attention_heads,\n                     self.seq_length, self.hidden_size // self.num_attention_heads])\n                self.parent.assertEqual(\n                    len(multihead_outputs[1].nonzero()),\n                    multihead_outputs[1].numel())\n\n                self.parent.assertListEqual(\n                    list(multihead_outputs[-1].size()),\n                    [self.batch_size, self.num_attention_heads,\n                     self.seq_length, self.hidden_size // self.num_attention_heads])\n                self.parent.assertEqual(\n                    len(multihead_outputs[-1][:, 1:, :, :].nonzero()),\n                    0)\n                self.parent.assertEqual(\n                    len(multihead_outputs[-1][:, 0, :, :].nonzero()),\n                    self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)\n\n\n        def 
create_and_check_bert_for_head_pruning(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):\n            for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction,\n                                BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,\n                                BertForTokenClassification):\n                if model_class in [BertForSequenceClassification,\n                                   BertForTokenClassification]:\n                    model = model_class(config=config,\n                                        num_labels=self.num_labels,\n                                        keep_multihead_output=True)\n                else:\n                    model = model_class(config=config, keep_multihead_output=True)\n                model.eval()\n                bert_model = model if isinstance(model, BertModel) else model.bert\n                heads_to_prune = {0: list(range(1, self.num_attention_heads)),\n                                  -1: [0]}\n                bert_model.prune_heads(heads_to_prune)\n                output = model(input_ids, token_type_ids, input_mask)\n\n                if isinstance(model, BertModel):\n                    output = sum(t.sum() for t in output[0])\n                elif isinstance(output, (list, tuple)):\n                    output = sum(t.sum() for t in output)\n                output = output.sum()\n                output.backward()\n                multihead_outputs = bert_model.get_multihead_outputs()\n\n                self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)\n                self.parent.assertListEqual(\n                    list(multihead_outputs[0].size()),\n                    [self.batch_size, 1,\n                     self.seq_length, self.hidden_size // self.num_attention_heads])\n                self.parent.assertListEqual(\n                    
list(multihead_outputs[1].size()),\n                    [self.batch_size, self.num_attention_heads,\n                     self.seq_length, self.hidden_size // self.num_attention_heads])\n                self.parent.assertListEqual(\n                    list(multihead_outputs[-1].size()),\n                    [self.batch_size, self.num_attention_heads-1,\n                     self.seq_length, self.hidden_size // self.num_attention_heads])\n\n\n    def test_default(self):\n        self.run_tester(BertModelTest.BertModelTester(self))\n\n    def test_config_to_json_string(self):\n        config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=37)\n        obj = json.loads(config.to_json_string())\n        self.assertEqual(obj[\"vocab_size\"], 99)\n        self.assertEqual(obj[\"hidden_size\"], 37)\n\n    def test_config_to_json_file(self):\n        config_first = BertConfig(vocab_size_or_config_json_file=99, hidden_size=37)\n        json_file_path = \"/tmp/config.json\"\n        config_first.to_json_file(json_file_path)\n        config_second = BertConfig.from_json_file(json_file_path)\n        os.remove(json_file_path)\n        self.assertEqual(config_second.to_dict(), config_first.to_dict())\n\n    @pytest.mark.slow\n    def test_model_from_pretrained(self):\n        cache_dir = \"/tmp/pytorch_pretrained_bert_test/\"\n        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:\n            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)\n            shutil.rmtree(cache_dir)\n            self.assertIsNotNone(model)\n\n    def run_tester(self, tester):\n        config_and_inputs = tester.prepare_config_and_inputs()\n        output_result = tester.create_bert_model(*config_and_inputs)\n        tester.check_bert_model_output(output_result)\n\n        output_result = tester.create_bert_for_masked_lm(*config_and_inputs)\n        tester.check_bert_for_masked_lm_output(output_result)\n        
tester.check_loss_output(output_result)\n\n        output_result = tester.create_bert_for_next_sequence_prediction(*config_and_inputs)\n        tester.check_bert_for_next_sequence_prediction_output(output_result)\n        tester.check_loss_output(output_result)\n\n        output_result = tester.create_bert_for_pretraining(*config_and_inputs)\n        tester.check_bert_for_pretraining_output(output_result)\n        tester.check_loss_output(output_result)\n\n        output_result = tester.create_bert_for_question_answering(*config_and_inputs)\n        tester.check_bert_for_question_answering_output(output_result)\n        tester.check_loss_output(output_result)\n\n        output_result = tester.create_bert_for_sequence_classification(*config_and_inputs)\n        tester.check_bert_for_sequence_classification_output(output_result)\n        tester.check_loss_output(output_result)\n\n        output_result = tester.create_bert_for_token_classification(*config_and_inputs)\n        tester.check_bert_for_token_classification_output(output_result)\n        tester.check_loss_output(output_result)\n\n        output_result = tester.create_bert_for_multiple_choice(*config_and_inputs)\n        tester.check_bert_for_multiple_choice(output_result)\n        tester.check_loss_output(output_result)\n\n        tester.create_and_check_bert_for_attentions(*config_and_inputs)\n        tester.create_and_check_bert_for_headmasking(*config_and_inputs)\n        tester.create_and_check_bert_for_head_pruning(*config_and_inputs)\n\n    @classmethod\n    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):\n        \"\"\"Creates a random int32 tensor of the shape within the vocab size.\"\"\"\n        if rng is None:\n            rng = random.Random()\n\n        total_dims = 1\n        for dim in shape:\n            total_dims *= dim\n\n        values = []\n        for _ in range(total_dims):\n            values.append(rng.randint(0, vocab_size - 1))\n\n        return 
torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "tests/modeling_transfo_xl_test.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport unittest\nimport json\nimport random\nimport shutil\nimport pytest\n\nimport torch\n\nfrom pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)\nfrom pytorch_pretrained_bert.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP\n\nclass TransfoXLModelTest(unittest.TestCase):\n    class TransfoXLModelTester(object):\n\n        def __init__(self,\n                     parent,\n                     batch_size=13,\n                     seq_length=7,\n                     mem_len=30,\n                     clamp_len=15,\n                     is_training=True,\n                     use_labels=True,\n                     vocab_size=99,\n                     cutoffs=[10, 50, 80],\n                     d_model=32,\n                     d_embed=32,\n                     n_head=4,\n                     d_head=8,\n                     d_inner=128,\n                     div_val=2,\n                     n_layer=5,\n                     scope=None,\n                     seed=1):\n            self.parent = parent\n            self.batch_size = batch_size\n            self.seq_length = seq_length\n            self.mem_len = mem_len\n            self.clamp_len = clamp_len\n          
  self.is_training = is_training\n            self.use_labels = use_labels\n            self.vocab_size = vocab_size\n            self.cutoffs = cutoffs\n            self.d_model = d_model\n            self.d_embed = d_embed\n            self.n_head = n_head\n            self.d_head = d_head\n            self.d_inner = d_inner\n            self.div_val = div_val\n            self.n_layer = n_layer\n            self.scope = scope\n            self.seed = seed\n\n        def prepare_config_and_inputs(self):\n            input_ids_1 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)\n            input_ids_2 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)\n\n            lm_labels = None\n            if self.use_labels:\n                lm_labels = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)\n\n            config = TransfoXLConfig(\n                vocab_size_or_config_json_file=self.vocab_size,\n                mem_len=self.mem_len,\n                clamp_len=self.clamp_len,\n                cutoffs=self.cutoffs,\n                d_model=self.d_model,\n                d_embed=self.d_embed,\n                n_head=self.n_head,\n                d_head=self.d_head,\n                d_inner=self.d_inner,\n                div_val=self.div_val,\n                n_layer=self.n_layer)\n\n            return (config, input_ids_1, input_ids_2, lm_labels)\n\n        def set_seed(self):\n            random.seed(self.seed)\n            torch.manual_seed(self.seed)\n\n        def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):\n            model = TransfoXLModel(config)\n            model.eval()\n\n            hidden_states_1, mems_1 = model(input_ids_1)\n            hidden_states_2, mems_2 = model(input_ids_2, mems_1)\n            outputs = {\n                \"hidden_states_1\": hidden_states_1,\n                \"mems_1\": mems_1,\n              
  \"hidden_states_2\": hidden_states_2,\n                \"mems_2\": mems_2,\n            }\n            return outputs\n\n        def check_transfo_xl_model_output(self, result):\n            self.parent.assertListEqual(\n                list(result[\"hidden_states_1\"].size()),\n                [self.batch_size, self.seq_length, self.d_model])\n            self.parent.assertListEqual(\n                list(result[\"hidden_states_2\"].size()),\n                [self.batch_size, self.seq_length, self.d_model])\n            self.parent.assertListEqual(\n                list(list(mem.size()) for mem in result[\"mems_1\"]),\n                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)\n            self.parent.assertListEqual(\n                list(list(mem.size()) for mem in result[\"mems_2\"]),\n                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)\n\n\n        def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):\n            model = TransfoXLLMHeadModel(config)\n            model.eval()\n\n            loss_1, mems_1a = model(input_ids_1, target=lm_labels)\n            lm_logits_1, mems_1b = model(input_ids_1)\n\n            loss_2, mems_2a = model(input_ids_2, target=lm_labels, mems=mems_1a)\n            lm_logits_2, mems_2b = model(input_ids_2, mems=mems_1b)\n\n            outputs = {\n                \"loss_1\": loss_1,\n                \"mems_1a\": mems_1a,\n                \"lm_logits_1\": lm_logits_1,\n                \"mems_1b\": mems_1b,\n                \"loss_2\": loss_2,\n                \"mems_2a\": mems_2a,\n                \"lm_logits_2\": lm_logits_2,\n                \"mems_2b\": mems_2b,\n            }\n            return outputs\n\n        def check_transfo_xl_lm_head_output(self, result):\n            self.parent.assertListEqual(\n                list(result[\"loss_1\"].size()),\n                [self.batch_size, self.seq_length])\n            self.parent.assertListEqual(\n   
             list(result[\"lm_logits_1\"].size()),\n                [self.batch_size, self.seq_length, self.vocab_size])\n            self.parent.assertListEqual(\n                list(list(mem.size()) for mem in result[\"mems_1a\"]),\n                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)\n            self.parent.assertListEqual(\n                list(list(mem.size()) for mem in result[\"mems_1b\"]),\n                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)\n            self.parent.assertListEqual(\n                list(mem[~torch.isnan(mem)].sum() for mem in result[\"mems_1a\"]),\n                list(mem[~torch.isnan(mem)].sum() for mem in result[\"mems_1b\"]))\n\n            self.parent.assertListEqual(\n                list(result[\"loss_2\"].size()),\n                [self.batch_size, self.seq_length])\n            self.parent.assertListEqual(\n                list(result[\"lm_logits_2\"].size()),\n                [self.batch_size, self.seq_length, self.vocab_size])\n            self.parent.assertListEqual(\n                list(list(mem.size()) for mem in result[\"mems_2a\"]),\n                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)\n            self.parent.assertListEqual(\n                list(list(mem.size()) for mem in result[\"mems_2b\"]),\n                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)\n            self.parent.assertListEqual(\n                list(mem[~torch.isnan(mem)].sum() for mem in result[\"mems_2a\"]),\n                list(mem[~torch.isnan(mem)].sum() for mem in result[\"mems_2b\"]))\n\n    def test_default(self):\n        self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self))\n\n    def test_config_to_json_string(self):\n        config = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37)\n        obj = json.loads(config.to_json_string())\n        self.assertEqual(obj[\"n_token\"], 96)\n        
self.assertEqual(obj[\"d_embed\"], 37)\n\n    def test_config_to_json_file(self):\n        config_first = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37)\n        json_file_path = \"/tmp/config.json\"\n        config_first.to_json_file(json_file_path)\n        config_second = TransfoXLConfig.from_json_file(json_file_path)\n        os.remove(json_file_path)\n        self.assertEqual(config_second.to_dict(), config_first.to_dict())\n\n    @pytest.mark.slow\n    def test_model_from_pretrained(self):\n        cache_dir = \"/tmp/pytorch_pretrained_bert_test/\"\n        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:\n            model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)\n            shutil.rmtree(cache_dir)\n            self.assertIsNotNone(model)\n\n    def run_tester(self, tester):\n        config_and_inputs = tester.prepare_config_and_inputs()\n\n        tester.set_seed()\n        output_result = tester.create_transfo_xl_model(*config_and_inputs)\n        tester.check_transfo_xl_model_output(output_result)\n\n        tester.set_seed()\n        output_result = tester.create_transfo_xl_lm_head(*config_and_inputs)\n        tester.check_transfo_xl_lm_head_output(output_result)\n\n    @classmethod\n    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):\n        \"\"\"Creates a random int32 tensor of the shape within the vocab size.\"\"\"\n        if rng is None:\n            rng = random.Random()\n\n        total_dims = 1\n        for dim in shape:\n            total_dims *= dim\n\n        values = []\n        for _ in range(total_dims):\n            values.append(rng.randint(0, vocab_size - 1))\n\n        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "tests/optimization_test.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport unittest\n\nimport torch\n\nfrom pytorch_pretrained_bert import BertAdam\nfrom pytorch_pretrained_bert import OpenAIAdam\nfrom pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \\\n    WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule\nimport numpy as np\n\n\nclass OptimizationTest(unittest.TestCase):\n\n    def assertListAlmostEqual(self, list1, list2, tol):\n        self.assertEqual(len(list1), len(list2))\n        for a, b in zip(list1, list2):\n            self.assertAlmostEqual(a, b, delta=tol)\n\n    def test_adam(self):\n        w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)\n        target = torch.tensor([0.4, 0.2, -0.5])\n        criterion = torch.nn.MSELoss()\n        # No warmup, constant schedule, no gradient clipping\n        optimizer = BertAdam(params=[w], lr=2e-1,\n                                          weight_decay=0.0,\n                                          max_grad_norm=-1)\n        for _ in range(100):\n            loss = criterion(w, target)\n            loss.backward()\n            optimizer.step()\n            w.grad.detach_() # No zero_grad() function on simple tensors. 
we do it ourselves.\n            w.grad.zero_()\n        self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)\n\n\nclass ScheduleInitTest(unittest.TestCase):\n    def test_bert_sched_init(self):\n        m = torch.nn.Linear(50, 50)\n        optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)\n        self.assertTrue(isinstance(optim.param_groups[0][\"schedule\"], ConstantLR))\n        optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=\"none\")\n        self.assertTrue(isinstance(optim.param_groups[0][\"schedule\"], ConstantLR))\n        optim = BertAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)\n        self.assertTrue(isinstance(optim.param_groups[0][\"schedule\"], WarmupLinearSchedule))\n        # shouldn't fail\n\n    def test_openai_sched_init(self):\n        m = torch.nn.Linear(50, 50)\n        optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)\n        self.assertTrue(isinstance(optim.param_groups[0][\"schedule\"], ConstantLR))\n        optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=\"none\")\n        self.assertTrue(isinstance(optim.param_groups[0][\"schedule\"], ConstantLR))\n        optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)\n        self.assertTrue(isinstance(optim.param_groups[0][\"schedule\"], WarmupLinearSchedule))\n        # shouldn't fail\n\n\nclass WarmupCosineWithRestartsTest(unittest.TestCase):\n    def test_it(self):\n        m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1000., cycles=5)\n        x = np.arange(0, 1000)\n        y = [m.get_lr(xe) for xe in x]\n        y = np.asarray(y)\n        expected_zeros = y[[0, 200, 400, 600, 800]]\n        print(expected_zeros)\n        expected_ones = y[[50, 250, 450, 650, 850]]\n        print(expected_ones)\n        self.assertTrue(np.allclose(expected_ones, 1))\n        self.assertTrue(np.allclose(expected_zeros, 
0))\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "tests/tokenization_gpt2_test.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport os\nimport unittest\nimport json\nimport shutil\nimport pytest\n\nfrom pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP\n\n\nclass GPT2TokenizationTest(unittest.TestCase):\n\n    def test_full_tokenizer(self):\n        \"\"\" Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt \"\"\"\n        vocab = [\"l\", \"o\", \"w\", \"e\", \"r\", \"s\", \"t\", \"i\", \"d\", \"n\",\n                 \"lo\", \"low\", \"er\",\n                 \"low\", \"lowest\", \"newer\", \"wider\"]\n        vocab_tokens = dict(zip(vocab, range(len(vocab))))\n        merges = [\"#version: 0.2\", \"l o\", \"lo w\", \"e r\", \"\"]\n        with open(\"/tmp/openai_tokenizer_vocab_test.json\", \"w\") as fp:\n            fp.write(json.dumps(vocab_tokens))\n            vocab_file = fp.name\n        with open(\"/tmp/openai_tokenizer_merges_test.txt\", \"w\") as fp:\n            fp.write(\"\\n\".join(merges))\n            merges_file = fp.name\n\n        tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=[\"<unk>\", \"<pad>\"])\n        os.remove(vocab_file)\n        os.remove(merges_file)\n\n        text = \"lower\"\n        bpe_tokens = [\"low\", \"er\"]\n        tokens = tokenizer.tokenize(text)\n        self.assertListEqual(tokens, bpe_tokens)\n\n        input_tokens = tokens + [\"<unk>\"]\n        input_bpe_tokens = [13, 12, 16]\n        self.assertListEqual(\n            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)\n\n        vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path=\"/tmp/\")\n        tokenizer_2 = GPT2Tokenizer.from_pretrained(\"/tmp/\")\n        os.remove(vocab_file)\n        os.remove(merges_file)\n        os.remove(special_tokens_file)\n\n        self.assertListEqual(\n            [tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks,\n             tokenizer.special_tokens, tokenizer.special_tokens_decoder],\n            [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,\n             tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])\n\n    # @pytest.mark.slow\n    def test_tokenizer_from_pretrained(self):\n        cache_dir = \"/tmp/pytorch_pretrained_bert_test/\"\n        for model_name in 
list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:\n            tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)\n            shutil.rmtree(cache_dir)\n            self.assertIsNotNone(tokenizer)\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "tests/tokenization_openai_test.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport os\nimport unittest\nimport json\nimport shutil\nimport pytest\n\nfrom pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP\n\n\nclass OpenAIGPTTokenizationTest(unittest.TestCase):\n\n    def test_full_tokenizer(self):\n        \"\"\" Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt \"\"\"\n        vocab = [\"l\", \"o\", \"w\", \"e\", \"r\", \"s\", \"t\", \"i\", \"d\", \"n\",\n                 \"w</w>\", \"r</w>\", \"t</w>\",\n                 \"lo\", \"low\", \"er</w>\",\n                 \"low</w>\", \"lowest</w>\", \"newer</w>\", \"wider</w>\"]\n        vocab_tokens = dict(zip(vocab, range(len(vocab))))\n        merges = [\"#version: 0.2\", \"l o\", \"lo w\", \"e r</w>\", \"\"]\n        with open(\"/tmp/openai_tokenizer_vocab_test.json\", \"w\") as fp:\n            fp.write(json.dumps(vocab_tokens))\n            vocab_file = fp.name\n        with open(\"/tmp/openai_tokenizer_merges_test.txt\", \"w\") as fp:\n            fp.write(\"\\n\".join(merges))\n            merges_file = fp.name\n\n        tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=[\"<unk>\", \"<pad>\"])\n        os.remove(vocab_file)\n        os.remove(merges_file)\n\n        text = \"lower\"\n        bpe_tokens = [\"low\", \"er</w>\"]\n        tokens = tokenizer.tokenize(text)\n        self.assertListEqual(tokens, bpe_tokens)\n\n        input_tokens = tokens + [\"<unk>\"]\n        input_bpe_tokens = [14, 15, 20]\n        self.assertListEqual(\n            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)\n\n        vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path=\"/tmp/\")\n        tokenizer_2 = OpenAIGPTTokenizer.from_pretrained(\"/tmp/\")\n        os.remove(vocab_file)\n        os.remove(merges_file)\n        os.remove(special_tokens_file)\n\n        self.assertListEqual(\n            [tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks,\n             tokenizer.special_tokens, tokenizer.special_tokens_decoder],\n            [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,\n             tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])\n\n    @pytest.mark.slow\n    def test_tokenizer_from_pretrained(self):\n        
cache_dir = \"/tmp/pytorch_pretrained_bert_test/\"\n        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:\n            tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, cache_dir=cache_dir)\n            shutil.rmtree(cache_dir)\n            self.assertIsNotNone(tokenizer)\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "tests/tokenization_test.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport os\nimport unittest\nfrom io import open\nimport shutil\nimport pytest\n\nfrom pytorch_pretrained_bert.tokenization import (BasicTokenizer,\n                                                  BertTokenizer,\n                                                  WordpieceTokenizer,\n                                                  _is_control, _is_punctuation,\n                                                  _is_whitespace, PRETRAINED_VOCAB_ARCHIVE_MAP)\n\n\nclass TokenizationTest(unittest.TestCase):\n\n    def test_full_tokenizer(self):\n        vocab_tokens = [\n            \"[UNK]\", \"[CLS]\", \"[SEP]\", \"want\", \"##want\", \"##ed\", \"wa\", \"un\", \"runn\",\n            \"##ing\", \",\"\n        ]\n        with open(\"/tmp/bert_tokenizer_test.txt\", \"w\", encoding='utf-8') as vocab_writer:\n            vocab_writer.write(\"\".join([x + \"\\n\" for x in vocab_tokens]))\n\n            vocab_file = vocab_writer.name\n\n        tokenizer = BertTokenizer(vocab_file)\n        os.remove(vocab_file)\n\n        tokens = tokenizer.tokenize(u\"UNwant\\u00E9d,running\")\n        self.assertListEqual(tokens, [\"un\", \"##want\", \"##ed\", \",\", \"runn\", \"##ing\"])\n\n        self.assertListEqual(\n            
tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])\n\n        vocab_file = tokenizer.save_vocabulary(vocab_path=\"/tmp/\")\n        tokenizer.from_pretrained(vocab_file)\n        os.remove(vocab_file)\n\n        tokens = tokenizer.tokenize(u\"UNwant\\u00E9d,running\")\n        self.assertListEqual(tokens, [\"un\", \"##want\", \"##ed\", \",\", \"runn\", \"##ing\"])\n\n        self.assertListEqual(\n            tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])\n\n    @pytest.mark.slow\n    def test_tokenizer_from_pretrained(self):\n        cache_dir = \"/tmp/pytorch_pretrained_bert_test/\"\n        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:\n            tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)\n            shutil.rmtree(cache_dir)\n            self.assertIsNotNone(tokenizer)\n\n    def test_chinese(self):\n        tokenizer = BasicTokenizer()\n\n        self.assertListEqual(\n            tokenizer.tokenize(u\"ah\\u535A\\u63A8zz\"),\n            [u\"ah\", u\"\\u535A\", u\"\\u63A8\", u\"zz\"])\n\n    def test_basic_tokenizer_lower(self):\n        tokenizer = BasicTokenizer(do_lower_case=True)\n\n        self.assertListEqual(\n            tokenizer.tokenize(u\" \\tHeLLo!how  \\n Are yoU?  \"),\n            [\"hello\", \"!\", \"how\", \"are\", \"you\", \"?\"])\n        self.assertListEqual(tokenizer.tokenize(u\"H\\u00E9llo\"), [\"hello\"])\n\n    def test_basic_tokenizer_no_lower(self):\n        tokenizer = BasicTokenizer(do_lower_case=False)\n\n        self.assertListEqual(\n            tokenizer.tokenize(u\" \\tHeLLo!how  \\n Are yoU?  
\"),\n            [\"HeLLo\", \"!\", \"how\", \"Are\", \"yoU\", \"?\"])\n\n    def test_wordpiece_tokenizer(self):\n        vocab_tokens = [\n            \"[UNK]\", \"[CLS]\", \"[SEP]\", \"want\", \"##want\", \"##ed\", \"wa\", \"un\", \"runn\",\n            \"##ing\"\n        ]\n\n        vocab = {}\n        for (i, token) in enumerate(vocab_tokens):\n            vocab[token] = i\n        tokenizer = WordpieceTokenizer(vocab=vocab)\n\n        self.assertListEqual(tokenizer.tokenize(\"\"), [])\n\n        self.assertListEqual(\n            tokenizer.tokenize(\"unwanted running\"),\n            [\"un\", \"##want\", \"##ed\", \"runn\", \"##ing\"])\n\n        self.assertListEqual(\n            tokenizer.tokenize(\"unwantedX running\"), [\"[UNK]\", \"runn\", \"##ing\"])\n\n    def test_is_whitespace(self):\n        self.assertTrue(_is_whitespace(u\" \"))\n        self.assertTrue(_is_whitespace(u\"\\t\"))\n        self.assertTrue(_is_whitespace(u\"\\r\"))\n        self.assertTrue(_is_whitespace(u\"\\n\"))\n        self.assertTrue(_is_whitespace(u\"\\u00A0\"))\n\n        self.assertFalse(_is_whitespace(u\"A\"))\n        self.assertFalse(_is_whitespace(u\"-\"))\n\n    def test_is_control(self):\n        self.assertTrue(_is_control(u\"\\u0005\"))\n\n        self.assertFalse(_is_control(u\"A\"))\n        self.assertFalse(_is_control(u\" \"))\n        self.assertFalse(_is_control(u\"\\t\"))\n        self.assertFalse(_is_control(u\"\\r\"))\n\n    def test_is_punctuation(self):\n        self.assertTrue(_is_punctuation(u\"-\"))\n        self.assertTrue(_is_punctuation(u\"$\"))\n        self.assertTrue(_is_punctuation(u\"`\"))\n        self.assertTrue(_is_punctuation(u\".\"))\n\n        self.assertFalse(_is_punctuation(u\"A\"))\n        self.assertFalse(_is_punctuation(u\" \"))\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "tests/tokenization_transfo_xl_test.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport os\nimport unittest\nfrom io import open\nimport shutil\nimport pytest\n\nfrom pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP\n\n\nclass TransfoXLTokenizationTest(unittest.TestCase):\n\n    def test_full_tokenizer(self):\n        vocab_tokens = [\n            \"<unk>\", \"[CLS]\", \"[SEP]\", \"want\", \"unwanted\", \"wa\", \"un\", \"running\", \",\"\n        ]\n        with open(\"/tmp/transfo_xl_tokenizer_test.txt\", \"w\", encoding='utf-8') as vocab_writer:\n            vocab_writer.write(\"\".join([x + \"\\n\" for x in vocab_tokens]))\n            vocab_file = vocab_writer.name\n\n        tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)\n        tokenizer.build_vocab()\n        os.remove(vocab_file)\n\n        tokens = tokenizer.tokenize(u\"<unk> UNwanted , running\")\n        self.assertListEqual(tokens, [\"<unk>\", \"unwanted\", \",\", \"running\"])\n\n        self.assertListEqual(\n            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])\n\n        vocab_file = tokenizer.save_vocabulary(vocab_path=\"/tmp/\")\n        tokenizer.from_pretrained(vocab_file)\n        os.remove(vocab_file)\n\n        tokens = tokenizer.tokenize(u\"<unk> UNwanted , 
running\")\n        self.assertListEqual(tokens, [\"<unk>\", \"unwanted\", \",\", \"running\"])\n\n        self.assertListEqual(\n            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])\n\n\n    def test_full_tokenizer_lower(self):\n        tokenizer = TransfoXLTokenizer(lower_case=True)\n\n        self.assertListEqual(\n            tokenizer.tokenize(u\" \\tHeLLo ! how  \\n Are yoU ?  \"),\n            [\"hello\", \"!\", \"how\", \"are\", \"you\", \"?\"])\n\n    def test_full_tokenizer_no_lower(self):\n        tokenizer = TransfoXLTokenizer(lower_case=False)\n\n        self.assertListEqual(\n            tokenizer.tokenize(u\" \\tHeLLo ! how  \\n Are yoU ?  \"),\n            [\"HeLLo\", \"!\", \"how\", \"Are\", \"yoU\", \"?\"])\n\n    @pytest.mark.slow\n    def test_tokenizer_from_pretrained(self):\n        cache_dir = \"/tmp/pytorch_pretrained_bert_test/\"\n        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:\n            tokenizer = TransfoXLTokenizer.from_pretrained(model_name, cache_dir=cache_dir)\n            shutil.rmtree(cache_dir)\n            self.assertIsNotNone(tokenizer)\n\nif __name__ == '__main__':\n    unittest.main()\n"
  }
]