[
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [2018] [Liyuan Liu]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "ReadMe.md",
    "content": "# LD-Net\n\n[![Documentation Status](https://readthedocs.org/projects/ld-net/badge/?version=latest)](http://ld-net.readthedocs.io/en/latest/?badge=latest)\n[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)\n\n**Check Our New NER Toolkit🚀🚀🚀**\n- **Inference**:\n  - **[LightNER](https://github.com/LiyuanLucasLiu/LightNER)**: inference w. models pre-trained / trained w. *any* following tools, *efficiently*. \n- **Training**:\n  - **[LD-Net](https://github.com/LiyuanLucasLiu/LD-Net)**: train NER models w. efficient contextualized representations.\n  - **[VanillaNER](https://github.com/LiyuanLucasLiu/Vanilla_NER)**: train vanilla NER models w. pre-trained embedding.\n- **Distant Training**:\n  - **[AutoNER](https://shangjingbo1226.github.io/AutoNER/)**: train NER models w.o. line-by-line annotations and get competitive performance.\n\n--------------------------------\n\nLD-Net provides sequence labeling models featuring:\n- **Efficiency**: constructing *efficient contextualized representations* without retraining language models. 
\n- **Portability**: *well-organized*, *easy-to-modify* and *[well-documented](http://lm-lstm-crf.readthedocs.io/en/latest/)*.\n\nRemarkably, our pre-trained NER model achieved:\n- **92.08** test F1 on the CoNLL03 NER task.\n- **160K words/sec** decoding speed (**6X** speedup compared to its original model).\n\nDetails about LD-Net can be accessed at: https://arxiv.org/abs/1804.07827.\n\n- [Model notes](#model-notes)\n- [Benchmarks](#benchmarks)\n- [Pretrained models](#pretrained-models)\n\t- [Language models](#language-models)\n\t- [Named Entity Recognition](#named-entity-recognition)\n\t- [Chunking](#chunking)\n- [Training](#training)\n\t- [Dependency](#dependency)\n\t- [Data](#data)\n\t- [Model](#model)\n\t- [Command](#command)\n- [Inference](#inference)\n- [Citation](#citation)\n\n## Model Notes\n\n![LD-Net Framework](docs/model_note.png)\n\n## Benchmarks\n\n| Model for CoNLL03 | #FLOPs| Mean(F1) | Std(F1) |\n| ------------- |-------------| -----| -----|\n| Vanilla NER w.o. LM | 3 M | 90.78 | 0.24 |\n| LD-Net (w.o. pruning) | 51 M | 91.86 | 0.15 |\n| LD-Net (origin, picked based on dev f1) | 51 M | 91.95 |  |\n| LD-Net (pruned) | **5 M** | 91.84 | 0.14 |\n\n| Model for CoNLL00 | #FLOPs| Mean(F1) | Std(F1) |\n| ------------- |-------------| -----| -----|\n| Vanilla NP w.o. LM | 3 M | 94.42 | 0.08 |\n| LD-Net (w.o. 
pruning) | 51 M | 96.01 | 0.07 |\n| LD-Net (origin, picked based on dev f1) | 51 M | 96.13 |  |\n| LD-Net (pruned) | **10 M** | 95.66 | 0.04 |\n\n## Pretrained Models\n\nHere we provide both pre-trained language models and pre-trained sequence labeling models.\n\n### Language Models\n\nOur pretrained language model contains word embedding, 10-layer densely-connected LSTM and adaptive softmax, and achieves an average PPL of 50.06 on the one billion benchmark dataset.\n\n| Forward Language Model | Backward Language Model |\n| ------------- |------------- |\n| [Download Link](http://dmserv4.cs.illinois.edu/ld0.th) | [Download Link](http://dmserv4.cs.illinois.edu/ld_0.th)|\n\n### Named Entity Recognition\n\nThe original pre-trained named entity tagger achieves 91.95 F1, and the pruned tagger achieves 92.08 F1.\n\n| Original Tagger | Pruned Tagger |\n| ------------- |------------- |\n| [Download Link](http://dmserv4.cs.illinois.edu/ner.th) | [Download Link](http://dmserv4.cs.illinois.edu/pner0.th) |\n\n### Chunking\n\nThe original pre-trained chunking tagger achieves 96.13 F1, and the pruned tagger achieves 95.79 F1.\n\n| Original Tagger | Pruned Tagger |\n| ------------- |------------- |\n| [Download Link](http://dmserv4.cs.illinois.edu/np.th) | [Download Link](http://dmserv4.cs.illinois.edu/pnp0.th) |\n\n## Training\n\n### Demo Scripts\n\nTo prune the original LD-Net for the CoNLL03 NER, please run:\n```\nbash ldnet_ner_prune.sh\n```\n\nTo prune the original LD-Net for the CoNLL00 Chunking, please run:\n```\nbash ldnet_np_prune.sh\n```\n\n### Dependency\n\nOur package is based on Python 3.6 and the following packages:\n```\nnumpy\ntqdm\ntorch-scope\ntorch==0.4.1\n```\n\n### Data\n\nPre-process scripts are available in ```pre_seq``` and ```pre_word_ada```, while pre-processed data has been stored in:\n\n| NER | Chunking |\n| ------------- |------------- |\n| [Download Link](http://dmserv4.cs.illinois.edu/ner_dataset.pk) | [Download 
Link](http://dmserv4.cs.illinois.edu/np_dataset.pk) |\n\n### Model\n\nOur implementations are available in ```model_seq``` and ```model_word_ada```, and the documentation is hosted on [Read the Docs](http://lm-lstm-crf.readthedocs.io/en/latest/)\n\n| NER | Chunking |\n| ------------- |------------- |\n| [Download Link](http://dmserv4.cs.illinois.edu/ner_dataset.pk) | [Download Link](http://dmserv4.cs.illinois.edu/np_dataset.pk) |\n\n## Inference\n\nFor model inference, please check our [LightNER package](https://github.com/LiyuanLucasLiu/LightNER) \n\n## Citation\n\nIf you find the implementation useful, please cite the following paper: [Efficient Contextualized Representation: Language Model Pruning for Sequence Labeling](https://arxiv.org/abs/1804.07827)\n\n```\n@inproceedings{liu2018efficient,\n  title = \"{Efficient Contextualized Representation: Language Model Pruning for Sequence Labeling}\", \n  author = {Liu, Liyuan and Ren, Xiang and Shang, Jingbo and Peng, Jian and Han, Jiawei}, \n  booktitle = {EMNLP}, \n  year = 2018, \n}\n```\n"
  },
  {
    "path": "docs/Makefile",
    "content": "# Minimal makefile for Sphinx documentation\r\n#\r\n\r\n# You can set these variables from the command line.\r\nSPHINXOPTS    =\r\nSPHINXBUILD   = python -msphinx\r\nSPHINXPROJ    = LD_Net\r\nSOURCEDIR     = source\r\nBUILDDIR      = build\r\n\r\n# Put it first so that \"make\" without argument is like \"make help\".\r\nhelp:\r\n\t@$(SPHINXBUILD) -M help \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\r\n\r\n.PHONY: help Makefile\r\n\r\n# Catch-all target: route all unknown targets to Sphinx using the new\r\n# \"make mode\" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).\r\n%: Makefile\r\n\t@$(SPHINXBUILD) -M $@ \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\r\n"
  },
  {
    "path": "docs/source/conf.py",
    "content": "#!/usr/bin/env python3\r\n# -*- coding: utf-8 -*-\r\n#\r\n# Wrapper documentation build configuration file, created by\r\n# sphinx-quickstart on Thu Sep 14 03:49:01 2017.\r\n#\r\n# This file is execfile()d with the current directory set to its\r\n# containing dir.\r\n#\r\n# Note that not all possible configuration values are present in this\r\n# autogenerated file.\r\n#\r\n# All configuration values have a default; values that are commented out\r\n# serve to show the default.\r\n\r\n# If extensions (or modules to document with autodoc) are in another directory,\r\n# add these directories to sys.path here. If the directory is relative to the\r\n# documentation root, use os.path.abspath to make it absolute, like shown here.\r\n\r\nimport os\r\nimport sys\r\n\r\nsys.path.insert(0, os.path.abspath('../..'))\r\n\r\n# -- General configuration ------------------------------------------------\r\n\r\n# If your documentation needs a minimal Sphinx version, state it here.\r\n#\r\n# needs_sphinx = '1.0'\r\n\r\n# Add any Sphinx extension module names here, as strings. 
They can be\r\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\r\n# ones.\r\nextensions = [\r\n    'sphinx.ext.autodoc',\r\n    'sphinx.ext.autosummary',\r\n    'sphinx.ext.doctest',\r\n    'sphinx.ext.intersphinx',\r\n    'sphinx.ext.todo',\r\n    'sphinx.ext.coverage',\r\n    'sphinx.ext.mathjax',\r\n    'sphinx.ext.napoleon',\r\n    'sphinx.ext.viewcode',\r\n    'sphinx.ext.githubpages'\r\n]\r\n\r\nnapoleon_use_ivar = True\r\n\r\n# Add any paths that contain templates here, relative to this directory.\r\ntemplates_path = ['_templates']\r\n\r\n# The suffix(es) of source filenames.\r\n# You can specify multiple suffix as a list of string:\r\n#\r\n# source_suffix = ['.rst', '.md']\r\nsource_suffix = '.rst'\r\n\r\n# The master toctree document.\r\nmaster_doc = 'index'\r\n\r\n# General information about the project.\r\nproject = 'LD-Net'\r\ncopyright = '2018, Liyuan Liu'\r\nauthor = 'Liyuan Liu'\r\n\r\n# The version info for the project you're documenting, acts as replacement for\r\n# |version| and |release|, also used in various other places throughout the\r\n# built documents.\r\n#\r\n# The short X.Y version.\r\nversion = ''\r\n# The full version, including alpha/beta/rc tags.\r\nrelease = ''\r\n\r\n# The language for content autogenerated by Sphinx. 
Refer to documentation\r\n# for a list of supported languages.\r\n#\r\n# This is also used if you do content translation via gettext catalogs.\r\n# Usually you set \"language\" from the command line for these cases.\r\nlanguage = None\r\n\r\n# List of patterns, relative to source directory, that match files and\r\n# directories to ignore when looking for source files.\r\n# This patterns also effect to html_static_path and html_extra_path\r\nexclude_patterns = []\r\n\r\n# The name of the Pygments (syntax highlighting) style to use.\r\npygments_style = 'sphinx'\r\n\r\n# If true, `todo` and `todoList` produce output, else they produce nothing.\r\ntodo_include_todos = False\r\n\r\n# -- Options for HTML output ----------------------------------------------\r\n\r\n# The theme to use for HTML and HTML Help pages.  See the documentation for\r\n# a list of builtin themes.\r\n#\r\nhtml_theme = 'sphinx_rtd_theme'\r\n\r\n# Theme options are theme-specific and customize the look and feel of a theme\r\n# further.  For a list of options available for each theme, see the\r\n# documentation.\r\n#\r\n# html_theme_options = {}\r\nhtml_theme_options = {\r\n    'collapse_navigation': False,\r\n    'display_version': True,\r\n}\r\n\r\n# Add any paths that contain custom static files (such as style sheets) here,\r\n# relative to this directory. 
They are copied after the builtin static files,\r\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\r\nhtml_static_path = ['_static']\r\n\r\n# Custom sidebar templates, must be a dictionary that maps document names\r\n# to template names.\r\n#\r\n# This is required for the alabaster theme\r\n# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars\r\nhtml_sidebars = {\r\n    '**': [\r\n        'about.html',\r\n        'navigation.html',\r\n        'relations.html',  # needs 'show_related': True theme option to display\r\n        'searchbox.html',\r\n        'donate.html',\r\n    ]\r\n}\r\n\r\n# -- Options for HTMLHelp output ------------------------------------------\r\n\r\n# Output file base name for HTML help builder.\r\nhtmlhelp_basename = 'LD_Net'\r\n\r\n# -- Options for LaTeX output ---------------------------------------------\r\n\r\nlatex_elements = {\r\n    # The paper size ('letterpaper' or 'a4paper').\r\n    #\r\n    # 'papersize': 'letterpaper',\r\n\r\n    # The font size ('10pt', '11pt' or '12pt').\r\n    #\r\n    # 'pointsize': '10pt',\r\n\r\n    # Additional stuff for the LaTeX preamble.\r\n    #\r\n    # 'preamble': '',\r\n\r\n    # Latex figure (float) alignment\r\n    #\r\n    # 'figure_align': 'htbp',\r\n}\r\n\r\n# Grouping the document tree into LaTeX files. List of tuples\r\n# (source start file, target name, title,\r\n#  author, documentclass [howto, manual, or own class]).\r\nlatex_documents = [\r\n    (master_doc, 'ldnet.tex', 'LD-Net Documentation',\r\n     'Liyuan Liu', 'manual'),\r\n]\r\n\r\n# -- Options for manual page output ---------------------------------------\r\n\r\n# One entry per manual page. 
List of tuples\r\n# (source start file, name, description, authors, manual section).\r\nman_pages = [\r\n    (master_doc, 'LD-Net', 'LD-Net Documentation',\r\n     [author], 1)\r\n]\r\n\r\n# -- Options for Texinfo output -------------------------------------------\r\n\r\n# Grouping the document tree into Texinfo files. List of tuples\r\n# (source start file, target name, title, author,\r\n#  dir menu entry, description, category)\r\ntexinfo_documents = [\r\n    (master_doc, 'LD-Net', 'LD-Net Documentation',\r\n     author, 'LD-Net', 'Efficient Contextualized Representations.',\r\n     'Miscellaneous'),\r\n]\r\n\r\nautodoc_mock_imports = ['torch', 'numpy', 'tensorboardX', 'git', 'tqdm']\r\n\r\nintersphinx_mapping = {\r\n    'git': ('https://gitpython.readthedocs.io/en/stable/', None),\r\n    'tensorboardX': ('https://tensorboardx.readthedocs.io/en/latest/', None),\r\n    'python':('https://docs.python.org/3', None),\r\n    'numpy': ('http://docs.scipy.org/doc/numpy/', None),\r\n    'torch': ('http://pytorch.org/docs/master', None)\r\n    }\r\n"
  },
  {
    "path": "docs/source/index.rst",
    "content": ".. LD-Net documentation master file.\r\n\r\n:github_url: https://github.com/LiyuanLucasLiu/LD-Net\r\n\r\nLD-Net documentation\r\n=========================\r\n\r\n**Check Our New NER Toolkit🚀🚀🚀**\r\n\r\n- **Inference**:\r\n\r\n  - `LightNER <https://github.com/LiyuanLucasLiu/LightNER>`_: inference w. models pre-trained / trained w. *any* following tools, *efficiently*. \r\n\r\n- **Training**:\r\n\r\n  - `LD-Net <https://github.com/LiyuanLucasLiu/LD-Net>`_: train NER models w. efficient contextualized representations.\r\n  - `VanillaNER <https://github.com/LiyuanLucasLiu/Vanilla_NER>`_: train vanilla NER models w. pre-trained embedding.\r\n\r\n- **Distant Training**:\r\n\r\n  - `AutoNER <https://shangjingbo1226.github.io/AutoNER/>`_: train NER models w.o. line-by-line annotations and get competitive performance.\r\n\r\n--------------------------\r\n\r\nThis project provides high-performance word-level language model, and sequence labeling with contextualized representation.\r\nThe key feature of this project is the support of langugage model pruning without retraining. \r\n\r\nDetails about LD-Net can be accessed at: https://arxiv.org/abs/1804.07827.\r\n\r\n.. toctree::\r\n   :maxdepth: 2\r\n   :caption: Language Modeling\r\n\r\n   word\r\n\r\n.. toctree::\r\n   :maxdepth: 2\r\n   :caption: Sequence Labeling\r\n\r\n   seq\r\n\r\n\r\nIndices and tables\r\n==================\r\n\r\n* :ref:`genindex`\r\n* :ref:`modindex`\r\n* :ref:`search`\r\n"
  },
  {
    "path": "docs/source/seq.rst",
    "content": "Sequence Labeling\n==========================\n\nmodel_seq\\.crf module\n----------------------\n.. automodule:: model_seq.crf\n\t:members:\n\nmodel_seq\\.dataset module\n--------------------------\n.. automodule:: model_seq.dataset\n\t:members:\n\nmodel_seq\\.elmo module\n-----------------------\n.. automodule:: model_seq.elmo\n\t:members:\n\nmodel_seq\\.evaluator module\n----------------------------\n.. automodule:: model_seq.evaluator\n\t:members:\n\nmodel_seq\\.seqlabel module\n--------------------------\n.. automodule:: model_seq.seqlabel\n\t:members:\n\nmodel_seq\\.seqlm module\n------------------------\n.. automodule:: model_seq.seqlm\n\t:members:\n\nmodel_seq\\.sparse_lm module\n----------------------------\n.. automodule:: model_seq.sparse_lm\n\t:members:\n\nmodel_seq\\.utils module\n-------------------------\n.. automodule:: model_seq.utils\n\t:members:"
  },
  {
    "path": "docs/source/word.rst",
    "content": "Language Modeling\n==========================\n\nmodel_word_ada\\.adaptive module\n-------------------------------\n.. automodule:: model_word_ada.adaptive\n\t:members:\n\nmodel_word_ada\\.basic module\n----------------------------\n.. automodule:: model_word_ada.basic\n\t:members:\n\nmodel_word_ada\\.dataset module\n-------------------------------\n.. automodule:: model_word_ada.dataset\n\t:members:\n\nmodel_word_ada\\.densenet module\n-------------------------------\n.. automodule:: model_word_ada.densenet\n\t:members:\n\nmodel_word_ada\\.ldnet module\n----------------------------\n.. automodule:: model_word_ada.ldnet\n\t:members:\n\nmodel_word_ada\\.LM module\n-------------------------\n.. automodule:: model_word_ada.LM\n\t:members:\n\nmodel_word_ada\\.utils module\n----------------------------\n.. automodule:: model_word_ada.utils\n\t:members:"
  },
  {
    "path": "ldnet_ner_prune.sh",
    "content": "FIRST_RUN=1\n\nDATA_ROOT=\"data/\"\nNER_DATASET=$DATA_ROOT/ner_dataset.pk\n\nCHECKPOINT_ROOT=\"checkpoint/\"\nNER_CHECKPOINT=$CHECKPOINT_ROOT/ner.th\n\nCHECKPOINT_NAME=\"p_ner0\"\n\ngreen=`tput setaf 2`\nreset=`tput sgr0`\n\nif [ $FIRST_RUN == 1 ] && [ ! -e $NER_DATASET ]; then\n    echo ${green}=== Downloading Dataset ===${reset}\n    mkdir -p DATA_ROOT\n    curl http://dmserv4.cs.illinois.edu/ner_dataset.pk -o $NER_DATASET\nfi\n\nif [ $FIRST_RUN == 1 ] && [ ! -e $NER_CHECKPOINT ]; then\n    echo ${green}=== Downloading Checkpoint ===${reset}\n    mkdir -p CHECKPOINT_ROOT\n    curl http://dmserv4.cs.illinois.edu/ner.th -o $NER_CHECKPOINT\nfi\n\necho ${green}=== Pruning NER Model ===${reset}\npython prune_sparse_seq.py --cp_root $CHECKPOINT_ROOT --checkpoint_name $CHECKPOINT_NAME --corpus $NER_DATASET --load_seq $NER_CHECKPOINT --seq_lambda0 0.05 --seq_lambda1 2\n"
  },
  {
    "path": "ldnet_np_prune.sh",
    "content": "FIRST_RUN=1\n\nDATA_ROOT=\"data/\"\nNP_DATASET=$DATA_ROOT/np_dataset.pk\n\nCHECKPOINT_ROOT=\"checkpoint/\"\nNP_CHECKPOINT=$CHECKPOINT_ROOT/np.th\n\nCHECKPOINT_NAME=\"p_np0\"\n\ngreen=`tput setaf 2`\nreset=`tput sgr0`\n\nif [ $FIRST_RUN == 1 ] && [ ! -e $NP_DATASET ]; then\n    echo ${green}=== Downloading Dataset ===${reset}\n    mkdir -p DATA_ROOT\n    curl http://dmserv4.cs.illinois.edu/np_dataset.pk -o $NP_DATASET\nfi\n\nif [ $FIRST_RUN == 1 ] && [ ! -e $NP_CHECKPOINT ]; then\n    echo ${green}=== Downloading Checkpoint ===${reset}\n    mkdir -p CHECKPOINT_ROOT\n    curl http://dmserv4.cs.illinois.edu/np.th -o $NP_CHECKPOINT\nfi\n\necho ${green}=== Pruning NER Model ===${reset}\npython prune_sparse_seq.py --cp_root $CHECKPOINT_ROOT --checkpoint_name $CHECKPOINT_NAME --corpus $NP_DATASET --load_seq $NP_CHECKPOINT --seq_lambda0 0.05 --seq_lambda1 2\n"
  },
  {
    "path": "model_seq/__init__.py",
    "content": ""
  },
  {
    "path": "model_seq/crf.py",
    "content": "\"\"\"\n.. module:: crf\n    :synopsis: conditional random field\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\n\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nimport torch.sparse as sparse\nimport model_seq.utils as utils\n\nclass CRF(nn.Module):\n    \"\"\"\n    Conditional Random Field Module\n\n    Parameters\n    ----------\n    hidden_dim : ``int``, required.\n        the dimension of the input features.\n    tagset_size : ``int``, required.\n        the size of the target labels.\n    if_bias: ``bool``, optional, (default=True).\n        whether the linear transformation has the bias term.\n    \"\"\"\n    def __init__(self, \n                hidden_dim: int, \n                tagset_size: int, \n                if_bias: bool = True):\n        super(CRF, self).__init__()\n        self.tagset_size = tagset_size\n        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size, bias=if_bias)\n        self.transitions = nn.Parameter(torch.Tensor(self.tagset_size, self.tagset_size))\n\n    def rand_init(self):\n        \"\"\"\n        random initialization\n        \"\"\"\n        utils.init_linear(self.hidden2tag)\n        self.transitions.data.zero_()\n\n    def forward(self, feats):\n        \"\"\"\n        calculate the potential score for the conditional random field.\n\n        Parameters\n        ----------\n        feats: ``torch.FloatTensor``, required.\n            the input features for the conditional random field, of shape (*, hidden_dim).\n\n        Returns\n        -------\n        output: ``torch.FloatTensor``.\n            A float tensor of shape (ins_num, from_tag_size, to_tag_size)\n        \"\"\"\n        scores = self.hidden2tag(feats).view(-1, 1, self.tagset_size)\n        ins_num = scores.size(0)\n        crf_scores = scores.expand(ins_num, self.tagset_size, self.tagset_size) + self.transitions.view(1, self.tagset_size, self.tagset_size).expand(ins_num, self.tagset_size, self.tagset_size)\n\n        return 
crf_scores\n\nclass CRFLoss(nn.Module):\n    \"\"\"\n    \n    The negative loss for the Conditional Random Field Module\n\n    Parameters\n    ----------\n    y_map : ``dict``, required.\n        a ``dict`` maps from tag string to tag index.\n    average_batch : ``bool``, optional, (default=True).\n        whether the return score would be averaged per batch.\n    \"\"\"\n\n    def __init__(self, \n                 y_map: dict, \n                 average_batch: bool = True):\n        super(CRFLoss, self).__init__()\n        self.tagset_size = len(y_map)\n        self.start_tag = y_map['<s>']\n        self.end_tag = y_map['<eof>']\n        self.average_batch = average_batch\n\n    def forward(self, scores, target, mask):\n        \"\"\"\n        calculate the negative log likehood for the conditional random field.\n\n        Parameters\n        ----------\n        scores: ``torch.FloatTensor``, required.\n            the potential score for the conditional random field, of shape (seq_len, batch_size, from_tag_size, to_tag_size).\n        target: ``torch.LongTensor``, required.\n            the positive path for the conditional random field, of shape (seq_len, batch_size).\n        mask: ``torch.ByteTensor``, required.\n            the mask for the unpadded sentence parts, of shape (seq_len, batch_size).\n\n        Returns\n        -------\n        loss: ``torch.FloatTensor``.\n            The NLL loss.\n        \"\"\"\n        seq_len = scores.size(0)\n        bat_size = scores.size(1)\n\n        tg_energy = torch.gather(scores.view(seq_len, bat_size, -1), 2, target.unsqueeze(2)).view(seq_len, bat_size)\n        tg_energy = tg_energy.masked_select(mask).sum()\n\n        seq_iter = enumerate(scores)\n\n        _, inivalues = seq_iter.__next__()\n        partition = inivalues[:, self.start_tag, :].squeeze(1).clone()\n\n        for idx, cur_values in seq_iter:\n            cur_values = cur_values + partition.unsqueeze(2).expand(bat_size, self.tagset_size, 
self.tagset_size)\n\n            cur_partition = utils.log_sum_exp(cur_values)\n\n            mask_idx = mask[idx, :].view(bat_size, 1).expand(bat_size, self.tagset_size)\n            partition.masked_scatter_(mask_idx, cur_partition.masked_select(mask_idx))\n\n        partition = partition[:, self.end_tag].sum()\n\n        if self.average_batch:\n            return (partition - tg_energy) / bat_size\n        else:\n            return (partition - tg_energy)\n\nclass CRFDecode():\n    \"\"\"\n    \n    The Viterbi decoder for the Conditional Random Field Module\n\n    Parameters\n    ----------\n    y_map : ``dict``, required.\n        a ``dict`` maps from tag string to tag index.\n    \"\"\"\n    def __init__(self, y_map: dict):\n        self.tagset_size = len(y_map)\n        self.start_tag = y_map['<s>']\n        self.end_tag = y_map['<eof>']\n        self.y_map = y_map\n        self.r_y_map = {v:k for k, v in self.y_map.items()}\n\n    def decode(self, scores, mask):\n        \"\"\"\n        find the best path from the potential scores by the Viterbi decoding algorithm.\n\n        Parameters\n        ----------\n        scores: ``torch.FloatTensor``, required.\n            the potential score for the conditional random field, of shape (seq_len, batch_size, from_tag_size, to_tag_size).\n        mask: ``torch.ByteTensor``, required.\n            the mask for the unpadded sentence parts, of shape (seq_len, batch_size).\n\n        Returns\n        -------\n        output: ``torch.LongTensor``.\n            A LongTensor of shape (seq_len - 1, batch_size)\n        \"\"\"\n        seq_len = scores.size(0)\n        bat_size = scores.size(1)\n\n        mask = 1 - mask.data\n        decode_idx = torch.LongTensor(seq_len-1, bat_size)\n\n        seq_iter = enumerate(scores)\n        _, inivalues = seq_iter.__next__()\n        forscores = inivalues[:, self.start_tag, :]\n        back_points = list()\n\n        for idx, cur_values in seq_iter:\n            cur_values = 
cur_values + forscores.contiguous().view(bat_size, self.tagset_size, 1).expand(bat_size, self.tagset_size, self.tagset_size)\n\n            forscores, cur_bp = torch.max(cur_values, 1)\n            cur_bp.masked_fill_(mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size), self.end_tag)\n            back_points.append(cur_bp)\n\n        pointer = back_points[-1][:, self.end_tag]\n        decode_idx[-1] = pointer\n        for idx in range(len(back_points)-2, -1, -1):\n            back_point = back_points[idx]\n            index = pointer.contiguous().view(-1, 1)\n            pointer = torch.gather(back_point, 1, index).view(-1)\n            decode_idx[idx] = pointer\n        return decode_idx\n\n    def to_spans(self, sequence):\n        \"\"\"\n        decode the best path to spans.\n\n        Parameters\n        ----------\n        sequence: list, required.\n            the list of best label indexes paths .\n\n        Returns\n        -------\n        output: ``set``.\n            A set of chunks contains the position and type of the entities.\n        \"\"\"\n        chunks = []\n        current = None\n\n        for i, y in enumerate(sequence):\n            label = self.r_y_map[y]\n\n            if label.startswith('B-'):\n\n                if current is not None:\n                    chunks.append('@'.join(current))\n                current = [label.replace('B-', ''), '%d' % i]\n\n            elif label.startswith('S-'):\n\n                if current is not None:\n                    chunks.append('@'.join(current))\n                    current = None\n                base = label.replace('S-', '')\n                chunks.append('@'.join([base, '%d' % i]))\n\n            elif label.startswith('I-'):\n\n                if current is not None:\n                    base = label.replace('I-', '')\n                    if base == current[0]:\n                        current.append('%d' % i)\n                    else:\n                        
chunks.append('@'.join(current))\n                        current = [base, '%d' % i]\n\n                else:\n                    current = [label.replace('I-', ''), '%d' % i]\n\n            elif label.startswith('E-'):\n\n                if current is not None:\n                    base = label.replace('E-', '')\n                    if base == current[0]:\n                        current.append('%d' % i)\n                        chunks.append('@'.join(current))\n                        current = None\n                    else:\n                        chunks.append('@'.join(current))\n                        current = [base, '%d' % i]\n                        chunks.append('@'.join(current))\n                        current = None\n\n                else:\n                    current = [label.replace('E-', ''), '%d' % i]\n                    chunks.append('@'.join(current))\n                    current = None\n            else:\n                if current is not None:\n                    chunks.append('@'.join(current))\n                current = None\n\n        if current is not None:\n            chunks.append('@'.join(current))\n\n        return set(chunks)"
  },
  {
    "path": "model_seq/dataset.py",
    "content": "\"\"\"\n.. module:: dataset\n    :synopsis: dataset for sequence labeling\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nimport sys\nimport pickle\nimport random\nimport functools\nimport itertools\nfrom tqdm import tqdm\n\nclass SeqDataset(object):\n    \"\"\"    \n    Dataset for Sequence Labeling\n\n    Parameters\n    ----------\n    dataset : ``list``, required.\n        The encoded dataset (outputs of preprocess scripts).\n    flm_pad : ``int``, required.\n        The pad index for the forward language model.\n    blm_pad : ``int``, required.\n        The pad index for the backward language model.\n    w_pad : ``int``, required.\n        The pad index for the word-level inputs.\n    c_con : ``int``, required.\n        The index of connect character token for character-level inputs.\n    c_pad : ``int``, required.\n        The pad index for the character-level inputs.\n    y_start : ``int``, required.\n        The index of the start label token.\n    y_pad : ``int``, required.\n        The index of the pad label token.\n    y_size : ``int``, required.\n        The size of the tag set.\n    batch_size: ``int``, required.\n        Batch size.\n    \"\"\"\n    def __init__(self, \n                dataset: list, \n                flm_pad: int, \n                blm_pad: int, \n                w_pad: int, \n                c_con: int, \n                c_pad: int, \n                y_start: int, \n                y_pad: int, \n                y_size: int, \n                batch_size: int):\n        super(SeqDataset, self).__init__()\n\n        self.flm_pad = flm_pad\n        self.blm_pad = blm_pad\n        self.w_pad = w_pad\n        self.c_con = c_con\n        self.c_pad = c_pad\n        self.y_pad = y_pad\n        self.y_size = y_size\n        self.y_start = y_start\n        self.batch_size = batch_size\n\n        self.construct_index(dataset)\n        self.shuffle()\n\n    def 
shuffle(self):\n        \"\"\"\n        shuffle dataset\n        \"\"\"\n        random.shuffle(self.shuffle_list)\n\n    def get_tqdm(self, device):\n        \"\"\"\n        construct dataset reader and the corresponding tqdm.\n\n        Parameters\n        ----------\n        device: ``torch.device``, required.\n            the target device for the dataset loader.\n\n        \"\"\"\n        return tqdm(self.reader(device), mininterval=2, total=self.index_length // self.batch_size, leave=False, file=sys.stdout, ncols=80)\n\n    def construct_index(self, dataset):\n        \"\"\"\n        construct index for the dataset.\n\n        Parameters\n        ----------\n        dataset: ``list``, required.\n            the encoded dataset (outputs of preprocess scripts).        \n        \"\"\"\n        for instance in dataset:\n            c_len = [len(tup)+1 for tup in instance[3]]\n            c_ins = [tup for ins in instance[3] for tup in (ins + [self.c_con])]\n            instance[3] = c_ins\n            instance.append(c_len)\n\n        self.dataset = dataset\n        self.index_length = len(dataset)\n        self.shuffle_list = list(range(0, self.index_length))\n    \n    def reader(self, device):\n        \"\"\"\n        construct dataset reader.\n\n        Parameters\n        ----------\n        device: ``torch.device``, required.\n            the target device for the dataset loader.\n\n        Returns\n        -------\n        reader: ``iterator``.\n            A lazy iterable object        \n        \"\"\"\n        cur_idx = 0\n        while cur_idx < self.index_length:\n            end_index = min(cur_idx + self.batch_size, self.index_length)\n            batch = [self.dataset[self.shuffle_list[index]] for index in range(cur_idx, end_index)]\n            cur_idx = end_index\n            yield self.batchify(batch, device)\n        self.shuffle()\n    \n    def batchify(self, batch, device):\n        \"\"\"\n        batchify a batch of data and move to a 
device.\n\n        Parameters\n        ----------\n        batch: ``list``, required.\n            a sample from the encoded dataset (outputs of preprocess scripts).  \n        device: ``torch.device``, required.\n            the target device for the dataset loader.\n        \"\"\"\n        \n        cur_batch_size = len(batch)\n\n        char_padded_len = max([len(tup[3]) for tup in batch])\n        word_padded_len = max([len(tup[0]) for tup in batch])\n\n        tmp_batch =  [list() for ind in range(11)]\n\n        for instance_ind in range(cur_batch_size):\n\n            instance = batch[instance_ind]\n\n            char_padded_len_ins = char_padded_len - len(instance[3])\n            word_padded_len_ins = word_padded_len - len(instance[0])\n\n            tmp_batch[0].append(instance[3] + [self.c_pad] + [self.c_pad] * char_padded_len_ins)\n            tmp_batch[2].append([self.c_pad] + instance[3][::-1] + [self.c_pad] * char_padded_len_ins)\n\n            tmp_p = list( itertools.accumulate(instance[5]+[1]+[0]* word_padded_len_ins) )\n            tmp_batch[1].append([(x - 1) * cur_batch_size + instance_ind for x in tmp_p])\n            tmp_p = list(itertools.accumulate([1]+instance[5][::-1]))[::-1] + [1]*word_padded_len_ins\n            tmp_batch[3].append([(x - 1) * cur_batch_size + instance_ind for x in tmp_p])\n\n            tmp_batch[4].append(instance[0] + [self.flm_pad] + [self.flm_pad] * word_padded_len_ins)\n            tmp_batch[5].append([self.blm_pad] + instance[1][::-1] + [self.blm_pad] * word_padded_len_ins)\n\n            tmp_p = list(range(len(instance[1]), -1, -1)) + list(range(len(instance[1])+1, word_padded_len+1))\n            tmp_batch[6].append([x * cur_batch_size + instance_ind for x in tmp_p])\n\n            tmp_batch[7].append(instance[2] + [self.w_pad] + [self.w_pad] * word_padded_len_ins)\n\n            tmp_batch[8].append([self.y_start * self.y_size + instance[4][0]] + [instance[4][ind] * self.y_size + instance[4][ind+1] for ind in 
range(len(instance[4]) - 1)] + [instance[4][-1] * self.y_size + self.y_pad] + [self.y_pad * self.y_size + self.y_pad] * word_padded_len_ins)\n\n            tmp_batch[9].append([1] * len(instance[4]) + [1] + [0] * word_padded_len_ins)\n\n            tmp_batch[10].append(instance[4])\n                \n        tbt = [torch.LongTensor(v).transpose(0, 1).contiguous() for v in tmp_batch[0:9]] + [torch.ByteTensor(tmp_batch[9]).transpose(0, 1).contiguous()]\n\n        tbt[1] = tbt[1].view(-1)\n        tbt[3] = tbt[3].view(-1)\n        tbt[6] = tbt[6].view(-1)\n\n        return [ten.to(device) for ten in tbt] + [tmp_batch[10]]"
  },
  {
    "path": "model_seq/elmo.py",
    "content": "\"\"\"\n.. module:: elmo\n    :synopsis: deep contextualized representation\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport time\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport model_seq.utils as utils\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass EBUnit(nn.Module):\n    \"\"\"\n    The basic recurrent unit for the ELMo RNNs wrapper.\n\n    Parameters\n    ----------\n    ori_unit : ``torch.nn.Module``, required.\n        The original module of rnn unit.\n    droprate : ``float``, required.\n        The dropout ratrio.\n    fix_rate: ``bool``, required.\n        Whether to fix the rqtio.\n    \"\"\"\n    def __init__(self, ori_unit, droprate, fix_rate):\n        super(EBUnit, self).__init__()\n\n        self.layer = ori_unit.layer\n\n        self.droprate = droprate\n\n        self.output_dim = ori_unit.output_dim\n\n    def forward(self, x):\n        \"\"\"\n        Calculate the output.\n\n        Parameters\n        ----------\n        x : ``torch.FloatTensor``, required.\n            The input tensor, of shape (seq_len, batch_size, input_dim).\n\n        Returns\n        ----------\n        output: ``torch.FloatTensor``.\n            The output of RNNs.\n        \"\"\"\n        out, _ = self.layer(x)\n\n        if self.droprate > 0:\n            out = F.dropout(out, p=self.droprate, training=self.training)\n\n        return out\n\nclass ERNN(nn.Module):\n    \"\"\"\n    The multi-layer recurrent networks for the ELMo RNNs wrapper.\n\n    Parameters\n    ----------\n    ori_drnn : ``torch.nn.Module``, required.\n        The original module of rnn networks.\n    droprate : ``float``, required.\n        The dropout ratrio.\n    fix_rate: ``bool``, required.\n        Whether to fix the rqtio.\n    \"\"\"\n    def __init__(self, ori_drnn, droprate, fix_rate):\n        super(ERNN, self).__init__()\n\n        self.layer_list = [EBUnit(ori_unit, droprate, fix_rate) for ori_unit in 
ori_drnn.layer._modules.values()]\n\n        self.gamma = nn.Parameter(torch.FloatTensor([1.0]))\n        self.weight_list = nn.Parameter(torch.FloatTensor([0.0] * len(self.layer_list)))\n\n        self.layer = nn.ModuleList(self.layer_list)\n\n        for param in self.layer.parameters():\n            param.requires_grad = False\n\n        if fix_rate:\n            self.gamma.requires_grad = False\n            self.weight_list.requires_grad = False\n\n        self.output_dim = self.layer_list[-1].output_dim\n\n    def regularizer(self):\n        \"\"\"\n        Calculate the regularization term.\n\n        Returns\n        ----------\n        The regularization term.\n        \"\"\"\n        srd_weight = self.weight_list - (1.0 / len(self.layer_list))\n        return (srd_weight ** 2).sum()\n\n    def forward(self, x):\n        \"\"\"\n        Calculate the output.\n\n        Parameters\n        ----------\n        x : ``torch.FloatTensor``, required.\n            the input tensor, of shape (seq_len, batch_size, input_dim).\n\n        Returns\n        ----------\n        output: ``torch.FloatTensor``.\n            The ELMo outputs.\n        \"\"\"\n        out = 0\n        nw = self.gamma * F.softmax(self.weight_list, dim=0)\n        for ind in range(len(self.layer_list)):\n            x = self.layer[ind](x)\n            out += x * nw[ind]\n        return out\n\nclass ElmoLM(nn.Module):\n    \"\"\"\n    The language model for the ELMo RNNs wrapper.\n\n    Parameters\n    ----------\n    ori_lm : ``torch.nn.Module``, required.\n        the original module of language model.\n    backward : ``bool``, required.\n        whether the language model is backward.\n    droprate : ``float``, required.\n        the dropout ratio.\n    fix_rate: ``bool``, required.\n        whether to fix the ratio.\n    \"\"\"\n\n    def __init__(self, ori_lm, backward, droprate, fix_rate):\n        super(ElmoLM, self).__init__()\n\n        self.rnn = ERNN(ori_lm.rnn, droprate, 
fix_rate)\n\n        self.w_num = ori_lm.w_num\n        self.w_dim = ori_lm.w_dim\n        self.word_embed = ori_lm.word_embed\n        self.word_embed.weight.requires_grad = False\n\n        self.output_dim = ori_lm.rnn_output\n\n        self.backward = backward\n\n    def init_hidden(self):\n        \"\"\"\n        initialize hidden states.\n        \"\"\"\n        return\n\n    def regularizer(self):\n        \"\"\"\n        Calculate the regularization term.\n\n        Returns\n        ----------\n        reg: ``list``.\n            The list of regularization terms.\n        \"\"\"\n        return self.rnn.regularizer()\n\n    def prox(self, lambda0):\n        \"\"\"\n        the proximal calculator.\n        \"\"\"\n        return 0.0\n\n    def forward(self, w_in, ind=None):\n        \"\"\"\n        Calculate the output.\n\n        Parameters\n        ----------\n        w_in : ``torch.LongTensor``, required.\n            the input tensor, of shape (seq_len, batch_size).\n        ind : ``torch.LongTensor``, optional, (default=None).\n            the index tensor for the backward language model, of shape (seq_len, batch_size).\n\n        Returns\n        ----------\n        output: ``torch.FloatTensor``.\n            The ELMo outputs.\n        \"\"\"\n        w_emb = self.word_embed(w_in)\n        \n        out = self.rnn(w_emb)\n\n        if self.backward:\n            out_size = out.size()\n            out = out.view(out_size[0] * out_size[1], out_size[2]).index_select(0, ind).contiguous().view(out_size)\n\n        return out"
  },
  {
    "path": "model_seq/evaluator.py",
    "content": "\"\"\"\n.. module:: evaluator\n    :synopsis: evaluator for sequence labeling\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport torch\nimport numpy as np\nimport itertools\n\nimport model_seq.utils as utils\nfrom torch.autograd import Variable\n\nclass eval_batch:\n    \"\"\"\n    Base class for evaluation, provide method to calculate f1 score and accuracy.\n\n    Parameters\n    ----------\n    decoder : ``torch.nn.Module``, required.\n        the decoder module, which needs to contain the ``to_span()`` method.\n    \"\"\"\n    def __init__(self, decoder):\n        self.decoder = decoder\n\n    def reset(self):\n        \"\"\"\n        reset counters.\n        \"\"\"\n        self.correct_labels = 0\n        self.total_labels = 0\n        self.gold_count = 0\n        self.guess_count = 0\n        self.overlap_count = 0\n\n    def calc_f1_batch(self, decoded_data, target_data):\n        \"\"\"\n        update statics for f1 score.\n\n        Parameters\n        ----------\n        decoded_data: ``torch.LongTensor``, required.\n            the decoded best label index pathes.\n        target_data:  ``torch.LongTensor``, required.\n            the golden label index pathes.\n        \"\"\"\n        batch_decoded = torch.unbind(decoded_data, 1)\n\n        for decoded, target in zip(batch_decoded, target_data):\n            length = len(target)\n            best_path = decoded[:length]\n\n            correct_labels_i, total_labels_i, gold_count_i, guess_count_i, overlap_count_i = self.eval_instance(best_path.numpy(), target)\n            self.correct_labels += correct_labels_i\n            self.total_labels += total_labels_i\n            self.gold_count += gold_count_i\n            self.guess_count += guess_count_i\n            self.overlap_count += overlap_count_i\n\n    def calc_acc_batch(self, decoded_data, target_data):\n        \"\"\"\n        update statics for accuracy score.\n\n        Parameters\n        ----------\n        decoded_data: 
``torch.LongTensor``, required.\n            the decoded best label index paths.\n        target_data:  ``torch.LongTensor``, required.\n            the golden label index paths.\n        \"\"\"\n        batch_decoded = torch.unbind(decoded_data, 1)\n\n        for decoded, target in zip(batch_decoded, target_data):\n            \n            # remove padding\n            length = len(target)\n            best_path = decoded[:length].numpy()\n\n            self.total_labels += length\n            self.correct_labels += np.sum(np.equal(best_path, gold))\n\n    def f1_score(self):\n        \"\"\"\n        calculate the f1 score based on the inner counter.\n        \"\"\"\n        if self.guess_count == 0:\n            return 0.0, 0.0, 0.0, 0.0\n        precision = self.overlap_count / float(self.guess_count)\n        recall = self.overlap_count / float(self.gold_count)\n        if precision == 0.0 or recall == 0.0:\n            return 0.0, 0.0, 0.0, 0.0\n        f = 2 * (precision * recall) / (precision + recall)\n        accuracy = float(self.correct_labels) / self.total_labels\n        return f, precision, recall, accuracy\n\n    def acc_score(self):\n        \"\"\"\n        calculate the accuracy score based on the inner counter.\n        \"\"\"\n        if 0 == self.total_labels:\n            return 0.0\n        accuracy = float(self.correct_labels) / self.total_labels\n        return accuracy        \n\n    def eval_instance(self, best_path, gold):\n        \"\"\"\n        Calculate statistics to update inner counters for one instance.\n\n        Parameters\n        ----------\n        best_path: required.\n            the decoded best label index path.\n        gold: required.\n            the golden label index path.\n      \n        \"\"\"\n        total_labels = len(best_path)\n        correct_labels = np.sum(np.equal(best_path, gold))\n        gold_chunks = self.decoder.to_spans(gold)\n        gold_count = len(gold_chunks)\n\n        guess_chunks = 
self.decoder.to_spans(best_path)\n        guess_count = len(guess_chunks)\n\n        overlap_chunks = gold_chunks & guess_chunks\n        overlap_count = len(overlap_chunks)\n\n        return correct_labels, total_labels, gold_count, guess_count, overlap_count\n\nclass eval_wc(eval_batch):\n    \"\"\"\n    evaluation class for LD-Net\n\n    Parameters\n    ----------\n    decoder : ``torch.nn.Module``, required.\n        the decoder module, which needs to contain the ``to_spans()`` and ``decode()`` methods.\n    score_type : ``str``, required.\n        whether the f1 score or the accuracy is needed.\n    \"\"\"\n    def __init__(self, decoder, score_type):\n        eval_batch.__init__(self, decoder)\n\n        if 'f' in score_type:\n            self.eval_b = self.calc_f1_batch\n            self.calc_s = self.f1_score\n        else:\n            self.eval_b = self.calc_acc_batch\n            self.calc_s = self.acc_score\n\n    def calc_score(self, seq_model, dataset_loader):\n        \"\"\"\n        calculate scores\n\n        Parameters\n        ----------\n        seq_model: required.\n            sequence labeling model.\n        dataset_loader: required.\n            the dataset loader.\n\n        Returns\n        -------\n        score: ``float``.\n            calculated score.\n        \"\"\"\n        seq_model.eval()\n        self.reset()\n\n        for f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w, _, f_y_m, g_y in dataset_loader:\n            scores = seq_model(f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w)\n            decoded = self.decoder.decode(scores.data, f_y_m)\n            self.eval_b(decoded, g_y)\n\n        return self.calc_s()"
  },
  {
    "path": "model_seq/seqlabel.py",
    "content": "\"\"\"\n.. module:: seqlabel\n    :synopsis: sequence labeling model\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport model_seq.utils as utils\nfrom model_seq.crf import CRF\n\nclass SeqLabel(nn.Module):\n    \"\"\"\n    Sequence Labeling model augumented with language model.\n\n    Parameters\n    ----------\n    f_lm : ``torch.nn.Module``, required.\n        The forward language modle for contextualized representations.\n    b_lm : ``torch.nn.Module``, required.\n        The backward language modle for contextualized representations.\n    c_num : ``int`` , required.\n        The number of characters.\n    c_dim : ``int`` , required.\n        The dimension of character embedding.\n    c_hidden : ``int`` , required.\n        The dimension of character hidden states.\n    c_layer : ``int`` , required.\n        The number of character lstms.\n    w_num : ``int`` , required.\n        The number of words.\n    w_dim : ``int`` , required.\n        The dimension of word embedding.\n    w_hidden : ``int`` , required.\n        The dimension of word hidden states.\n    w_layer : ``int`` , required.\n        The number of word lstms.\n    y_num : ``int`` , required.\n        The number of tags types.\n    droprate : ``float`` , required\n        The dropout ratio.\n    unit : \"str\", optional, (default = 'lstm')\n        The type of the recurrent unit.\n    \"\"\"\n    def __init__(self, f_lm, b_lm, \n            c_num: int, \n            c_dim: int, \n            c_hidden: int, \n            c_layer: int, \n            w_num: int, \n            w_dim: int, \n            w_hidden: int, \n            w_layer: int, \n            y_num: int, \n            droprate: float, \n            unit: str = 'lstm'):\n        super(SeqLabel, self).__init__()\n\n        rnnunit_map = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}\n\n        self.f_lm = f_lm\n        self.b_lm = b_lm\n        
self.unit_type = unit\n\n        self.char_embed = nn.Embedding(c_num, c_dim)\n        self.word_embed = nn.Embedding(w_num, w_dim)\n\n        self.char_seq = nn.Linear(c_hidden * 2, w_dim)\n        self.lm_seq = nn.Linear(f_lm.output_dim + b_lm.output_dim, w_dim)\n\n        self.relu = nn.ReLU()\n\n        self.c_hidden = c_hidden\n        tmp_rnn_dropout = droprate if c_layer > 1 else 0\n        self.char_fw = rnnunit_map[unit](c_dim, c_hidden, c_layer, dropout = tmp_rnn_dropout)\n        self.char_bw = rnnunit_map[unit](c_dim, c_hidden, c_layer, dropout = tmp_rnn_dropout)\n\n        tmp_rnn_dropout = droprate if w_layer > 1 else 0\n        self.word_rnn = rnnunit_map[unit](w_dim * 3, w_hidden // 2, w_layer, dropout = tmp_rnn_dropout, bidirectional = True)\n\n        self.y_num = y_num\n        self.crf = CRF(w_hidden, y_num)\n\n        self.drop = nn.Dropout(p = droprate)\n\n    def to_params(self):\n        \"\"\"\n        To parameters.\n        \"\"\"\n        return {\n            \"model_type\": \"char-lstm-crf\",\n            \"forward_lm\": self.f_lm.to_params(),\n            \"backward_lm\": self.b_lm.to_params(),\n            \"word_embed_num\": self.word_embed.num_embeddings,\n            \"word_embed_dim\": self.word_embed.embedding_dim,\n            \"char_embed_num\": self.char_embed.num_embeddings,\n            \"char_embed_dim\": self.char_embed.embedding_dim,\n            \"char_hidden\": self.c_hidden,\n            \"char_layers\": self.char_fw.num_layers,\n            \"word_hidden\": self.word_rnn.hidden_size,\n            \"word_layers\": self.word_rnn.num_layers,\n            \"droprate\": self.drop.p,\n            \"y_num\": self.y_num,\n            \"label_schema\": \"iobes\",\n            \"unit_type\": self.unit_type\n        }\n\n    def prune_dense_rnn(self):\n        \"\"\"\n        Prune dense rnn to be smaller by deleting layers.\n        \"\"\"\n        f_prune_mask = self.f_lm.prune_dense_rnn()\n        b_prune_mask = 
self.b_lm.prune_dense_rnn()\n        prune_mask = torch.cat([f_prune_mask, b_prune_mask], dim = 0)\n        mask_index = prune_mask.nonzero().squeeze(1)\n        self.lm_seq.weight = nn.Parameter(self.lm_seq.weight.data.index_select(1, mask_index).contiguous())\n        self.lm_seq.in_features = self.lm_seq.weight.size(1)\n\n    def set_batch_seq_size(self, sentence):\n        \"\"\"\n        Set the batch size and sequence length.\n        \"\"\"\n        tmp = sentence.size()\n        self.word_seq_length = tmp[0]\n        self.batch_size = tmp[1]\n\n    def load_pretrained_word_embedding(self, pre_word_embeddings):\n        \"\"\"\n        Load pre-trained word embedding.\n        \"\"\"\n        self.word_embed.weight = nn.Parameter(pre_word_embeddings)\n\n    def rand_init(self):\n        \"\"\"\n        Random initialization.\n        \"\"\"\n        utils.init_embedding(self.char_embed.weight)\n        utils.init_lstm(self.char_fw)\n        utils.init_lstm(self.char_bw)\n        utils.init_lstm(self.word_rnn)\n        utils.init_linear(self.char_seq)\n        utils.init_linear(self.lm_seq)\n        self.crf.rand_init()\n\n    def forward(self, f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w):\n        \"\"\"\n        Calculate the output (crf potentials).\n\n        Parameters\n        ----------\n        f_c : ``torch.LongTensor``, required.\n            Character-level inputs in the forward direction.\n        f_p : ``torch.LongTensor``, required.\n            Output position of character-level inputs in the forward direction.\n        b_c : ``torch.LongTensor``, required.\n            Character-level inputs in the backward direction.\n        b_p : ``torch.LongTensor``, required.\n            Output position of character-level inputs in the backward direction.\n        flm_w : ``torch.LongTensor``, required.\n            Word-level inputs for the forward language model.\n        blm_w : ``torch.LongTensor``, required.\n            Word-level inputs for the 
backward language model.\n        blm_ind : ``torch.LongTensor``, required.\n            Output position of word-level inputs for the backward language model.\n        f_w: ``torch.LongTensor``, required.\n            Word-level inputs for the sequence labeling model.\n\n        Returns\n        -------\n        output: ``torch.FloatTensor``.\n            A float tensor of shape (sequence_len, batch_size, from_tag_size, to_tag_size)\n        \"\"\"\n        self.set_batch_seq_size(f_w)\n\n        f_c_e = self.drop(self.char_embed(f_c))\n        b_c_e = self.drop(self.char_embed(b_c))\n\n        f_c_e, _ = self.char_fw(f_c_e)\n        b_c_e, _ = self.char_bw(b_c_e)\n\n        f_c_e = f_c_e.view(-1, self.c_hidden).index_select(0, f_p).view(self.word_seq_length, self.batch_size, self.c_hidden)\n\n        b_c_e = b_c_e.view(-1, self.c_hidden).index_select(0, b_p).view(self.word_seq_length, self.batch_size, self.c_hidden)\n\n        c_o = self.drop(torch.cat([f_c_e, b_c_e], dim = 2))\n        c_o = self.char_seq(c_o)\n\n        self.f_lm.init_hidden()\n        self.b_lm.init_hidden()\n        f_lm_e = self.f_lm(flm_w)\n        b_lm_e = self.b_lm(blm_w, blm_ind)\n\n        lm_o = self.drop(torch.cat([f_lm_e, b_lm_e], dim = 2))\n        lm_o = self.relu(self.lm_seq(lm_o))\n\n        w_e = self.word_embed(f_w)\n\n        rnn_in = self.drop(torch.cat([c_o, lm_o, w_e], dim = 2))\n\n        rnn_out, _ = self.word_rnn(rnn_in)\n\n        crf_out = self.crf(self.drop(rnn_out)).view(self.word_seq_length, self.batch_size, self.y_num, self.y_num)\n\n        return crf_out\n\n\nclass Vanilla_SeqLabel(nn.Module):\n    \"\"\"\n    Sequence Labeling model without language model augmentation.\n\n    Parameters\n    ----------\n    f_lm : ``torch.nn.Module``, required.\n        forward language model for contextualized representations.\n    b_lm : ``torch.nn.Module``, required.\n        backward language model for contextualized representations.\n    c_num : ``int`` , required.\n        
number of characters.\n    c_dim : ``int`` , required.\n        dimension of character embedding.\n    c_hidden : ``int`` , required.\n        dimension of character hidden states.\n    c_layer : ``int`` , required.\n        number of character lstms.\n    w_num : ``int`` , required.\n        number of words.\n    w_dim : ``int`` , required.\n        dimension of word embedding.\n    w_hidden : ``int`` , required.\n        dimension of word hidden states.\n    w_layer : ``int`` , required.\n        number of word lstms.\n    y_num : ``int`` , required.\n        number of tags types.\n    droprate : ``float`` , required\n        dropout ratio.\n    unit : \"str\", optional, (default = 'lstm')\n        type of the recurrent unit.\n    \"\"\"\n    def __init__(self, f_lm, b_lm, c_num, c_dim, c_hidden, c_layer, w_num, w_dim, w_hidden, w_layer, y_num, droprate, unit='lstm'):\n        super(Vanilla_SeqLabel, self).__init__()\n\n        rnnunit_map = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}\n\n        self.char_embed = nn.Embedding(c_num, c_dim)\n        self.word_embed = nn.Embedding(w_num, w_dim)\n\n        self.char_seq = nn.Linear(c_hidden * 2, w_dim)\n\n        self.c_hidden = c_hidden\n        self.char_fw = rnnunit_map[unit](c_dim, c_hidden, c_layer, dropout = droprate)\n        self.char_bw = rnnunit_map[unit](c_dim, c_hidden, c_layer, dropout = droprate)\n\n        self.word_rnn = rnnunit_map[unit](w_dim + w_dim, w_hidden // 2, w_layer, dropout = droprate, bidirectional = True)\n\n        self.y_num = y_num\n        self.crf = CRF(w_hidden, y_num)\n\n        self.drop = nn.Dropout(p = droprate)\n\n    def set_batch_seq_size(self, sentence):\n        \"\"\"\n        set batch size and sequence length\n        \"\"\"\n        tmp = sentence.size()\n        self.word_seq_length = tmp[0]\n        self.batch_size = tmp[1]\n\n    def load_pretrained_word_embedding(self, pre_word_embeddings):\n        \"\"\"\n        Load pre-trained word embedding.\n        
\"\"\"\n        self.word_embed.weight = nn.Parameter(pre_word_embeddings)\n\n    def rand_init(self):\n        \"\"\"\n        Random initialization.\n        \"\"\"\n        utils.init_embedding(self.char_embed.weight)\n        utils.init_lstm(self.char_fw)\n        utils.init_lstm(self.char_bw)\n        utils.init_lstm(self.word_rnn)\n        utils.init_linear(self.char_seq)\n        self.crf.rand_init()\n\n    def forward(self, f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w):\n        \"\"\"\n        Calculate the output (crf potentials).\n\n        Parameters\n        ----------\n        f_c : ``torch.LongTensor``, required.\n            Character-level inputs in the forward direction.\n        f_p : ``torch.LongTensor``, required.\n            Ouput position of character-level inputs in the forward direction.\n        b_c : ``torch.LongTensor``, required.\n            Character-level inputs in the backward direction.\n        b_p : ``torch.LongTensor``, required.\n            Ouput position of character-level inputs in the backward direction.\n        flm_w : ``torch.LongTensor``, required.\n            Word-level inputs for the forward language model.\n        blm_w : ``torch.LongTensor``, required.\n            Word-level inputs for the backward language model.\n        blm_ind : ``torch.LongTensor``, required.\n            Ouput position of word-level inputs for the backward language model.\n        f_w: ``torch.LongTensor``, required.\n            Word-level inputs for the sequence labeling model.\n\n        Returns\n        -------\n        output: ``torch.FloatTensor``.\n            A float tensor of shape (sequence_len, batch_size, from_tag_size, to_tag_size)\n        \"\"\"\n        \n        self.set_batch_seq_size(f_w)\n\n        f_c_e = self.drop(self.char_embed(f_c))\n        b_c_e = self.drop(self.char_embed(b_c))\n\n        f_c_e, _ = self.char_fw(f_c_e)\n        b_c_e, _ = self.char_bw(b_c_e)\n\n        f_c_e = f_c_e.view(-1, 
self.c_hidden).index_select(0, f_p).view(self.word_seq_length, self.batch_size, self.c_hidden)\n\n        b_c_e = b_c_e.view(-1, self.c_hidden).index_select(0, b_p).view(self.word_seq_length, self.batch_size, self.c_hidden)\n\n        c_o = self.drop(torch.cat([f_c_e, b_c_e], dim = 2))\n        c_o = self.char_seq(c_o)\n\n        w_e = self.word_embed(f_w)\n\n        rnn_in = self.drop(torch.cat([c_o, w_e], dim = 2))\n\n        rnn_out, _ = self.word_rnn(rnn_in)\n\n        crf_out = self.crf(self.drop(rnn_out)).view(self.word_seq_length, self.batch_size, self.y_num, self.y_num)\n\n        return crf_out"
  },
  {
    "path": "model_seq/seqlm.py",
    "content": "\"\"\"\n.. module:: seqlm\n    :synopsis: language model for sequence labeling\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport time\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport model_seq.utils as utils\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass BasicSeqLM(nn.Module):\n    \"\"\"\n    The language model for the dense rnns.\n\n    Parameters\n    ----------\n    ori_lm : ``torch.nn.Module``, required.\n        the original module of language model.\n    backward : ``bool``, required.\n        whether the language model is backward.\n    droprate : ``float``, required.\n        the dropout ratrio.\n    fix_rate: ``bool``, required.\n        whether to fix the rqtio.\n    \"\"\"\n    def __init__(self, ori_lm, backward, droprate, fix_rate):\n        super(BasicSeqLM, self).__init__()\n\n        self.rnn = ori_lm.rnn\n\n        for param in self.rnn.parameters():\n            param.requires_grad = False\n\n        self.w_num = ori_lm.w_num\n        self.w_dim = ori_lm.w_dim\n        self.word_embed = ori_lm.word_embed\n        self.word_embed.weight.requires_grad = False\n\n        self.output_dim = ori_lm.rnn_output\n\n        self.backward = backward\n\n    def to_params(self):\n        \"\"\"\n        To parameters.\n        \"\"\"\n        return {\n            \"rnn_params\": self.rnn.to_params(),\n            \"word_embed_num\": self.word_embed.num_embeddings,\n            \"word_embed_dim\": self.word_embed.embedding_dim\n        }\n\n    def init_hidden(self):\n        \"\"\"\n        initialize hidden states.\n        \"\"\"\n        self.rnn.init_hidden()\n    \n    def regularizer(self):\n        \"\"\"\n        Calculate the regularization term.\n\n        Returns\n        ----------\n        reg: ``list``.\n            The list of regularization terms.\n        \"\"\"\n        return self.rnn.regularizer()\n\n    def forward(self, w_in, ind=None):\n        \"\"\"\n     
   Calculate the output.\n\n        Parameters\n        ----------\n        w_in : ``torch.LongTensor``, required.\n            the input tensor, of shape (seq_len, batch_size).\n        ind : ``torch.LongTensor``, optional, (default=None).\n            the index tensor for the backward language model, of shape (seq_len, batch_size).\n\n        Returns\n        ----------\n        output: ``torch.FloatTensor``.\n            The ELMo outputs.\n        \"\"\"\n        w_emb = self.word_embed(w_in)\n        \n        out = self.rnn(w_emb)\n\n        if self.backward:\n            out_size = out.size()\n            out = out.view(out_size[0] * out_size[1], out_size[2]).index_select(0, ind).contiguous().view(out_size)\n\n        return out"
  },
  {
    "path": "model_seq/sparse_lm.py",
    "content": "\"\"\"\n.. module:: sparse_lm\n    :synopsis: sparse language model for sequence labeling\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport time\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport model_seq.utils as utils\n\nclass SBUnit(nn.Module):\n    \"\"\"\n    The basic recurrent unit for the dense-RNNs wrapper.\n\n    Parameters\n    ----------\n    ori_unit : ``torch.nn.Module``, required.\n        the original module of rnn unit.\n    droprate : ``float``, required.\n        the dropout ratrio.\n    fix_rate: ``bool``, required.\n        whether to fix the rqtio.\n    \"\"\"\n    def __init__(self, ori_unit, droprate, fix_rate):\n        super(SBUnit, self).__init__()\n\n        self.unit_type = ori_unit.unit_type\n\n        self.layer = ori_unit.layer\n\n        self.droprate = droprate\n\n        self.input_dim = ori_unit.input_dim\n        self.increase_rate = ori_unit.increase_rate\n        self.output_dim = ori_unit.input_dim + ori_unit.increase_rate\n\n    def prune_rnn(self, mask):\n        \"\"\"\n        Prune dense rnn to be smaller by delecting layers.\n\n        Parameters\n        ----------\n        mask : ``torch.ByteTensor``, required.\n            The selection tensor for the input matrix.\n        \"\"\"\n        mask_index = mask.nonzero().squeeze(1)\n        self.layer.weight_ih_l0 = nn.Parameter(self.layer.weight_ih_l0.data.index_select(1, mask_index).contiguous())\n        self.layer.input_size = self.layer.weight_ih_l0.size(1)\n\n    def forward(self, x, weight=1):\n        \"\"\"\n        Calculate the output.\n\n        Parameters\n        ----------\n        x : ``torch.FloatTensor``, required.\n            The input tensor, of shape (seq_len, batch_size, input_dim).\n        weight : ``torch.FloatTensor``, required.\n            The selection variable.\n\n        Returns\n        ----------\n        output: ``torch.FloatTensor``.\n            The output of RNNs.\n        \"\"\"\n\n       
 if self.droprate > 0:\n            new_x = F.dropout(x, p=self.droprate, training=self.training)\n        else:\n            new_x = x\n\n        out, _ = self.layer(new_x)\n\n        out = weight * out\n\n        return torch.cat([x, out], 2)\n\nclass SDRNN(nn.Module):\n    \"\"\"\n    The multi-layer recurrent networks for the dense-RNNs wrapper.\n\n    Parameters\n    ----------\n    ori_unit : ``torch.nn.Module``, required.\n        the original module of rnn unit.\n    droprate : ``float``, required.\n        the dropout ratrio.\n    fix_rate: ``bool``, required.\n        whether to fix the rqtio.\n    \"\"\"\n    def __init__(self, ori_drnn, droprate, fix_rate):\n        super(SDRNN, self).__init__()\n\n        if ori_drnn.layer:\n            self.layer_list = [SBUnit(ori_unit, droprate, fix_rate) for ori_unit in ori_drnn.layer._modules.values()]\n\n            self.weight_list = nn.Parameter(torch.FloatTensor([1.0] * len(self.layer_list)))\n            self.weight_list.requires_grad = not fix_rate\n\n            # self.layer = nn.Sequential(*self.layer_list)\n            self.layer = nn.ModuleList(self.layer_list)\n\n            for param in self.layer.parameters():\n                param.requires_grad = False\n        else:\n            self.layer_list = list()\n            self.weight_list = list()\n            self.layer = None\n\n        # self.output_dim = self.layer_list[-1].output_dim\n        self.emb_dim = ori_drnn.emb_dim\n        self.output_dim = ori_drnn.output_dim\n        self.unit_type = ori_drnn.unit_type\n\n    def to_params(self):\n        \"\"\"\n        To parameters.\n        \"\"\"\n        return {\n            \"rnn_type\": \"LDRNN\",\n            \"unit_type\": self.unit_type,\n            \"layer_num\": 0 if not self.layer else len(self.layer),\n            \"emb_dim\": self.emb_dim,\n            \"hid_dim\": -1 if not self.layer else self.layer[0].increase_rate,\n            \"droprate\": -1 if not self.layer else 
self.layer[0].droprate,\n            \"after_pruned\": True\n        }\n\n    def prune_dense_rnn(self):\n        \"\"\"\n        Prune dense rnn to be smaller by delecting layers.\n        \"\"\"\n        prune_mask = torch.ones(self.layer_list[0].input_dim)\n        increase_mask_one = torch.ones(self.layer_list[0].increase_rate)\n        increase_mask_zero = torch.zeros(self.layer_list[0].increase_rate)\n\n        new_layer_list = list()\n        new_weight_list = list()\n        for ind in range(0, len(self.layer_list)):\n            if self.weight_list.data[ind] > 0:\n                new_weight_list.append(self.weight_list.data[ind])\n\n                self.layer_list[ind].prune_rnn(prune_mask)\n                new_layer_list.append(self.layer_list[ind])\n\n                prune_mask = torch.cat([prune_mask, increase_mask_one], dim = 0)\n            else:\n                prune_mask = torch.cat([prune_mask, increase_mask_zero], dim = 0)\n\n        if not new_layer_list:\n            self.output_dim = self.layer_list[0].input_dim\n            self.layer = None\n            self.weight_list = None\n            self.layer_list = None\n        else:\n            self.layer_list = new_layer_list\n            self.layer = nn.ModuleList(self.layer_list)\n            self.weight_list = nn.Parameter(torch.FloatTensor(new_weight_list))\n            self.weight_list.requires_grad = False\n\n\n            for param in self.layer.parameters():\n                param.requires_grad = False\n\n        return prune_mask\n\n    def prox(self):\n        \"\"\"\n        the proximal calculator.\n        \"\"\"\n        self.weight_list.data.masked_fill_(self.weight_list.data < 0, 0)\n        self.weight_list.data.masked_fill_(self.weight_list.data > 1, 1)\n        none_zero_count = (self.weight_list.data > 0).sum()\n        return none_zero_count\n\n    def regularizer(self):\n        \"\"\"\n        Calculate the regularization term.\n\n        Returns\n        ----------\n      
  reg0: ``torch.FloatTensor``.\n            The value of reg0.\n        reg1: ``torch.FloatTensor``.\n            The value of reg1.\n        reg2: ``torch.FloatTensor``.\n            The value of reg2.\n        \"\"\"\n        reg3 = (self.weight_list * (1 - self.weight_list)).sum()\n        none_zero = self.weight_list.data > 0\n        none_zero_count = none_zero.sum()\n        reg0 = none_zero_count\n        reg1 = self.weight_list[none_zero].sum()\n        return reg0, reg1, reg3\n\n    def forward(self, x):\n        \"\"\"\n        Calculate the output.\n\n        Parameters\n        ----------\n        x : ``torch.FloatTensor``, required.\n            the input tensor, of shape (seq_len, batch_size, input_dim).\n\n        Returns\n        ----------\n        output: ``torch.FloatTensor``.\n            The ELMo outputs.\n        \"\"\"\n        if self.layer_list is not None:\n            for ind in range(len(self.layer_list)):\n                x = self.layer[ind](x, self.weight_list[ind])\n        return x\n        # return self.layer(x)\n\nclass SparseSeqLM(nn.Module):\n    \"\"\"\n    The language model for the dense rnns with layer-wise selection.\n\n    Parameters\n    ----------\n    ori_lm : ``torch.nn.Module``, required.\n        the original module of language model.\n    backward : ``bool``, required.\n        whether the language model is backward.\n    droprate : ``float``, required.\n        the dropout ratrio.\n    fix_rate: ``bool``, required.\n        whether to fix the rqtio.\n    \"\"\"\n\n    def __init__(self, ori_lm, backward, droprate, fix_rate):\n        super(SparseSeqLM, self).__init__()\n\n        self.rnn = SDRNN(ori_lm.rnn, droprate, fix_rate)\n\n        self.w_num = ori_lm.w_num\n        self.w_dim = ori_lm.w_dim\n        self.word_embed = ori_lm.word_embed\n        self.word_embed.weight.requires_grad = False\n\n        self.output_dim = ori_lm.rnn_output\n\n        self.backward = backward\n\n    def to_params(self):\n        
\"\"\"\n        To parameters.\n        \"\"\"\n        return {\n            \"backward\": self.backward,\n            \"rnn_params\": self.rnn.to_params(),\n            \"word_embed_num\": self.word_embed.num_embeddings,\n            \"word_embed_dim\": self.word_embed.embedding_dim\n        }\n\n    def prune_dense_rnn(self):\n        \"\"\"\n        Prune dense rnn to be smaller by delecting layers.\n        \"\"\"\n        prune_mask = self.rnn.prune_dense_rnn()\n        self.output_dim = self.rnn.output_dim\n        return prune_mask\n\n    def init_hidden(self):\n        \"\"\"\n        initialize hidden states.\n        \"\"\"\n        return\n\n    def regularizer(self):\n        \"\"\"\n        Calculate the regularization term.\n\n        Returns\n        ----------\n        reg: ``list``.\n            The list of regularization terms.\n        \"\"\"\n        return self.rnn.regularizer()\n\n    def prox(self):\n        \"\"\"\n        the proximal calculator.\n        \"\"\"\n        return self.rnn.prox()\n\n    def forward(self, w_in, ind=None):\n        \"\"\"\n        Calculate the output.\n\n        Parameters\n        ----------\n        w_in : ``torch.LongTensor``, required.\n            the input tensor, of shape (seq_len, batch_size).\n        ind : ``torch.LongTensor``, optional, (default=None).\n            the index tensor for the backward language model, of shape (seq_len, batch_size).\n\n        Returns\n        ----------\n        output: ``torch.FloatTensor``.\n            The ELMo outputs.\n        \"\"\"\n        w_emb = self.word_embed(w_in)\n        \n        out = self.rnn(w_emb)\n\n        if self.backward:\n            out_size = out.size()\n            out = out.view(out_size[0] * out_size[1], out_size[2]).index_select(0, ind).contiguous().view(out_size)\n\n        return out\n        "
  },
  {
    "path": "model_seq/utils.py",
    "content": "\"\"\"\n.. module:: utils\n    :synopsis: utils\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport numpy as np\nimport torch\nimport json\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.init\n\nfrom torch.autograd import Variable\n\ndef log_sum_exp(vec):\n    \"\"\"\n    log sum exp function.\n\n    Parameters\n    ----------\n    vec : ``torch.FloatTensor``, required.\n        input vector, of shape(ins_num, from_tag_size, to_tag_size)\n\n    Returns\n    -------\n    sum: ``torch.FloatTensor``.\n        log sum exp results, tensor of shape (ins_num, to_tag_size)    \n    \"\"\"\n    max_score, _ = torch.max(vec, 1)\n\n    return max_score + torch.log(torch.sum(torch.exp(vec - max_score.unsqueeze(1).expand_as(vec)), 1))\n\ndef repackage_hidden(h):\n    \"\"\"\n    Wraps hidden states in new Variables, to detach them from their history\n\n    Parameters\n    ----------\n    h : ``Tuple`` or ``Tensors``, required.\n        Tuple or Tensors, hidden states.\n\n    Returns\n    -------\n    hidden: ``Tuple`` or ``Tensors``.\n        detached hidden states\n    \"\"\"\n    if type(h) == torch.Tensor:\n        return h.detach()\n    else:\n        return tuple(repackage_hidden(v) for v in h)\n\ndef to_scalar(var):\n    \"\"\"\n    convert a tensor to a scalar number\n    \"\"\"\n    return var.view(-1).item()\n\ndef init_embedding(input_embedding):\n    \"\"\"\n    random initialize embedding\n    \"\"\"\n    bias = np.sqrt(3.0 / input_embedding.size(1))\n    nn.init.uniform_(input_embedding, -bias, bias)\n\ndef init_linear(input_linear):\n    \"\"\"\n    random initialize linear projection.\n    \"\"\"\n    bias = np.sqrt(6.0 / (input_linear.weight.size(0) + input_linear.weight.size(1)))\n    nn.init.uniform_(input_linear.weight, -bias, bias)\n    if input_linear.bias is not None:\n        input_linear.bias.data.zero_()\n\ndef adjust_learning_rate(optimizer, lr):\n    \"\"\"\n    adjust learning to the the new value.\n\n    Parameters\n    
----------\n    optimizer : required.\n        pytorch optimizer.\n    float :  ``float``, required.\n        the target learning rate.\n    \"\"\"\n    for param_group in optimizer.param_groups:\n        param_group['lr'] = lr\n\ndef init_lstm(input_lstm):\n    \"\"\"\n    random initialize lstms\n    \"\"\"\n    for ind in range(0, input_lstm.num_layers):\n        weight = eval('input_lstm.weight_ih_l'+str(ind))\n        bias = np.sqrt(6.0 / (weight.size(0)/4 + weight.size(1)))\n        nn.init.uniform_(weight, -bias, bias)\n        weight = eval('input_lstm.weight_hh_l'+str(ind))\n        bias = np.sqrt(6.0 / (weight.size(0)/4 + weight.size(1)))\n        nn.init.uniform_(weight, -bias, bias)\n    \n    if input_lstm.bias:\n        for ind in range(0, input_lstm.num_layers):\n            weight = eval('input_lstm.bias_ih_l'+str(ind))\n            weight.data.zero_()\n            weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1\n            weight = eval('input_lstm.bias_hh_l'+str(ind))\n            weight.data.zero_()\n            weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1"
  },
  {
    "path": "model_word_ada/LM.py",
    "content": "\"\"\"\n.. module:: LM\n    :synopsis: language modeling\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport model_word_ada.utils as utils\n\nclass LM(nn.Module):\n    \"\"\"\n    The language model model.\n    \n    Parameters\n    ----------\n    rnn : ``torch.nn.Module``, required.\n        The RNNs network.\n    soft_max : ``torch.nn.Module``, required.\n        The softmax layer.\n    w_num : ``int`` , required.\n        The number of words.\n    w_dim : ``int`` , required.\n        The dimension of word embedding.\n    droprate : ``float`` , required\n        The dropout ratio.\n    label_dim : ``int`` , required.\n        The input dimension of softmax.    \n    \"\"\"\n\n    def __init__(self, rnn, soft_max, w_num, w_dim, droprate, label_dim = -1, add_relu=False):\n        super(LM, self).__init__()\n\n        self.rnn = rnn\n        self.soft_max = soft_max\n\n        self.w_num = w_num\n        self.w_dim = w_dim\n        self.word_embed = nn.Embedding(w_num, w_dim)\n\n        self.rnn_output = self.rnn.output_dim\n\n        self.add_proj = label_dim > 0\n        if self.add_proj:\n            self.project = nn.Linear(self.rnn_output, label_dim)\n            if add_relu:\n                self.relu = nn.ReLU()\n            else:\n                self.relu = lambda x: x\n\n        self.drop = nn.Dropout(p=droprate)\n\n    def load_embed(self, origin_lm):\n        \"\"\"\n        Load embedding from another language model.\n        \"\"\"\n        self.word_embed = origin_lm.word_embed\n        self.soft_max = origin_lm.soft_max\n\n    def rand_ini(self):\n        \"\"\"\n        Random initialization.\n        \"\"\"\n        self.rnn.rand_ini()\n        # utils.init_linear(self.project)\n        self.soft_max.rand_ini()\n        # if not self.tied_weight:\n        utils.init_embedding(self.word_embed.weight)\n\n        if self.add_proj:\n            
utils.init_linear(self.project)\n\n    def init_hidden(self):\n        \"\"\"\n        Initialize hidden states.\n        \"\"\"\n        self.rnn.init_hidden()\n\n    def forward(self, w_in, target):\n        \"\"\"\n        Calculate the loss.\n\n        Parameters\n        ----------\n        w_in : ``torch.FloatTensor``, required.\n            the input tensor, of shape (word_num, input_dim).\n        target : ``torch.FloatTensor``, required.\n            the target of the language model, of shape (word_num).\n        \n        Returns\n        ----------\n        loss: ``torch.FloatTensor``.\n            The NLL loss.\n        \"\"\"\n\n        w_emb = self.word_embed(w_in)\n        \n        w_emb = self.drop(w_emb)\n\n        out = self.rnn(w_emb).contiguous().view(-1, self.rnn_output)\n\n        if self.add_proj:\n            out = self.drop(self.relu(self.project(out)))\n            # out = self.drop(self.project(out))\n\n        out = self.soft_max(out, target)\n\n        return out\n\n    def log_prob(self, w_in):\n        \"\"\"\n        Calculate log-probability for the whole dictionary.\n        \n        Parameters\n        ----------\n        w_in : ``torch.FloatTensor``, required.\n            the input tensor, of shape (word_num, input_dim).\n        \n        Returns\n        ----------\n        prob: ``torch.FloatTensor``.\n            The full log-probability.\n        \"\"\"\n\n        w_emb = self.word_embed(w_in)\n        \n        out = self.rnn(w_emb).contiguous().view(-1, self.rnn_output)\n\n        if self.add_proj:\n            out = self.relu(self.project(out))\n\n        out = self.soft_max.log_prob(out, w_emb.device)\n\n        return out"
  },
  {
    "path": "model_word_ada/__init__.py",
    "content": ""
  },
  {
    "path": "model_word_ada/adaptive.py",
    "content": "\"\"\"\n.. module:: adaptive\n    :synopsis: adaptive softmax\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport torch\nfrom torch import nn\n\nfrom math import sqrt\n\nclass AdaptiveSoftmax(nn.Module):\n    \"\"\"\n    The adaptive softmax layer.\n    Modified from: https://github.com/rosinality/adaptive-softmax-pytorch/blob/master/adasoft.py\n\n    Parameters\n    ----------\n    input_size : ``int``, required.\n        The input dimension.\n    cutoff : ``list``, required.\n        The list of cutoff values.\n    \"\"\"\n    def __init__(self, input_size, cutoff):\n        super().__init__()\n\n        self.input_size = input_size\n        self.cutoff = cutoff\n        self.output_size = cutoff[0] + len(cutoff) - 1\n\n        self.head = nn.Linear(input_size, self.output_size)\n        self.tail = nn.ModuleList()\n\n        self.cross_entropy = nn.CrossEntropyLoss(size_average=False)\n\n        for i in range(len(self.cutoff) - 1):\n            seq = nn.Sequential(\n                nn.Linear(input_size, input_size // 4 ** i, False),\n                nn.Linear(input_size // 4 ** i, cutoff[i + 1] - cutoff[i], False)\n            )\n\n            self.tail.append(seq)\n\n    def rand_ini(self):\n        \"\"\"\n        Random Initialization.\n        \"\"\"\n        nn.init.xavier_normal_(self.head.weight)\n\n        for tail in self.tail:\n            nn.init.xavier_normal_(tail[0].weight)\n            nn.init.xavier_normal_(tail[1].weight)\n\n    def log_prob(self, w_in, device):\n        \"\"\"\n        Calculate log-probability for the whole dictionary.\n        \n        Parameters\n        ----------\n        w_in : ``torch.FloatTensor``, required.\n            the input tensor, of shape (word_num, input_dim).\n        device: ``torch.device``, required.\n            the target device for calculation.\n\n        Returns\n        ----------\n        prob: ``torch.FloatTensor``.\n            The full log-probability.\n        \"\"\"\n        lsm = 
nn.LogSoftmax(dim=1).to(device)\n\n        head_out = self.head(w_in)\n\n        batch_size = head_out.size(0)\n        prob = torch.zeros(batch_size, self.cutoff[-1]).to(device)\n\n        lsm_head = lsm(head_out) \n        prob.narrow(1, 0, self.output_size).add_(lsm_head.narrow(1, 0, self.output_size).data)\n\n        for i in range(len(self.tail)):\n            pos = self.cutoff[i]\n            i_size = self.cutoff[i + 1] - pos\n            buffer = lsm_head.narrow(1, self.cutoff[0] + i, 1)\n            buffer = buffer.expand(batch_size, i_size)\n            lsm_tail = lsm(self.tail[i](w_in)) \n            prob.narrow(1, pos, i_size).copy_(buffer.data).add_(lsm_tail.data)\n\n        return prob\n\n    def forward(self, w_in, target):\n        \"\"\"\n        Calculate the log-likihood w.o. calculate the full distribution.\n\n        Parameters\n        ----------\n        w_in : ``torch.FloatTensor``, required.\n            the input tensor, of shape (word_num, input_dim).\n        target : ``torch.FloatTensor``, required.\n            the target of the language model, of shape (word_num).\n        \n        Returns\n        ----------\n        loss: ``torch.FloatTensor``.\n            The NLL loss.\n        \"\"\"\n        batch_size = w_in.size(0)\n        output = 0.0\n\n        first_target = target.clone()\n\n        for i in range(len(self.cutoff) - 1):\n            \n            mask = target.ge(self.cutoff[i]).mul(target.lt(self.cutoff[i + 1]))\n\n            if mask.sum() > 0:\n\n                first_target[mask] = self.cutoff[0] + i\n\n                second_target = target[mask].add(-self.cutoff[i])\n                second_input = w_in.index_select(0, mask.nonzero().squeeze())\n\n                second_output = self.tail[i](second_input)\n\n                output += self.cross_entropy(second_output, second_target)\n\n        output += self.cross_entropy(self.head(w_in), first_target)\n        output /= batch_size\n        return output\n"
  },
  {
    "path": "model_word_ada/basic.py",
    "content": "\"\"\"\n.. module:: basic\n    :synopsis: basic rnn\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport model_word_ada.utils as utils\n\nclass BasicUnit(nn.Module):\n    \"\"\"\n    The basic recurrent unit for the vanilla stacked RNNs.\n\n    Parameters\n    ----------\n    unit : ``str``, required.\n        The type of rnn unit.\n    input_dim : ``int``, required.\n        The input dimension fo the unit.\n    hid_dim : ``int``, required.\n        The hidden dimension fo the unit.\n    droprate : ``float``, required.\n        The dropout ratrio.\n    \"\"\"\n    def __init__(self, unit, input_dim, hid_dim, droprate):\n        super(BasicUnit, self).__init__()\n\n        self.unit_type = unit\n        rnnunit_map = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}\n\n        self.batch_norm = (unit == 'bnlstm')\n\n        self.layer = rnnunit_map[unit](input_dim, hid_dim, 1)\n        self.droprate = droprate\n\n        self.output_dim = hid_dim\n\n        self.init_hidden()\n\n    def init_hidden(self):\n        \"\"\"\n        Initialize hidden states.\n        \"\"\"\n        self.hidden_state = None\n\n    def rand_ini(self):\n        \"\"\"\n        Random Initialization.\n        \"\"\"\n        if not self.batch_norm:\n            utils.init_lstm(self.layer)\n\n    def forward(self, x):\n        \"\"\"\n        Calculate the output.\n\n        Parameters\n        ----------\n        x : ``torch.LongTensor``, required.\n            the input tensor, of shape (seq_len, batch_size, input_dim).\n\n        Returns\n        ----------\n        output: ``torch.FloatTensor``.   
\n            The output of RNNs.\n        \"\"\"\n        out, new_hidden = self.layer(x, self.hidden_state)\n\n        self.hidden_state = utils.repackage_hidden(new_hidden)\n        \n        if self.droprate > 0:\n            out = F.dropout(out, p=self.droprate, training=self.training)\n\n        return out\n\nclass BasicRNN(nn.Module):\n    \"\"\"\n    The multi-layer recurrent networks for the vanilla stacked RNNs.\n\n    Parameters\n    ----------\n    layer_num: ``int``, required.\n        The number of layers. \n    unit : ``torch.nn.Module``, required.\n        The type of rnn unit.\n    input_dim : ``int``, required.\n        The input dimension fo the unit.\n    hid_dim : ``int``, required.\n        The hidden dimension fo the unit.\n    droprate : ``float``, required.\n        The dropout ratrio.\n    \"\"\"\n    def __init__(self, layer_num, unit, emb_dim, hid_dim, droprate):\n        super(BasicRNN, self).__init__()\n\n        layer_list = [BasicUnit(unit, emb_dim, hid_dim, droprate)] + [BasicUnit(unit, hid_dim, hid_dim, droprate) for i in range(layer_num - 1)]\n        self.layer = nn.Sequential(*layer_list)\n        self.output_dim = layer_list[-1].output_dim\n        self.unit_type = unit\n        \n        self.init_hidden()\n\n    def to_params(self):\n        \"\"\"\n        To parameters.\n        \"\"\"\n        return {\n            \"rnn_type\": \"Basic\",\n            \"unit_type\": self.layer[0].unit_type,\n            \"layer_num\": len(self.layer),\n            \"emb_dim\": self.layer[0].layer.input_size,\n            \"hid_dim\": self.layer[0].layer.hidden_size,\n            \"droprate\": self.layer[0].droprate\n        }\n\n    def init_hidden(self):\n        \"\"\"\n        Initialize hidden states.\n        \"\"\"\n        for tup in self.layer.children():\n            tup.init_hidden()\n\n    def rand_ini(self):\n        \"\"\"\n        Random Initialization.\n        \"\"\"\n        for tup in self.layer.children():\n            
tup.rand_ini()\n\n    def forward(self, x):\n        \"\"\"\n        Calculate the output.\n\n        Parameters\n        ----------\n        x : ``torch.LongTensor``, required.\n            the input tensor, of shape (seq_len, batch_size, input_dim).\n\n        Returns\n        ----------\n        output: ``torch.FloatTensor``.\n            The output of RNNs.\n        \"\"\"\n        return self.layer(x)"
  },
  {
    "path": "model_word_ada/dataset.py",
    "content": "\"\"\"\n.. module:: dataset\n    :synopsis: dataset for language modeling\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nimport sys\nimport pickle\nimport random\nfrom tqdm import tqdm\n\nfrom torch.utils.data import Dataset\n\nclass EvalDataset(object):\n    \"\"\"    \n    Dataset for Language Modeling\n\n    Parameters\n    ----------\n    dataset : ``list``, required.\n        The encoded dataset (outputs of preprocess scripts).\n    sequence_length: ``int``, required.\n        Sequence Length.\n    \"\"\"\n    def __init__(self, dataset, sequence_length):\n        super(EvalDataset, self).__init__()\n        self.dataset = dataset\n\n        self.sequence_length = sequence_length\n\n        self.construct_index()\n\n    def get_tqdm(self, device):\n        \"\"\"\n        construct dataset reader and the corresponding tqdm.\n\n        Parameters\n        ----------\n        device: ``torch.device``, required.\n            the target device for the dataset loader.\n\n        \"\"\"\n        return tqdm(self.reader(device), mininterval=2, total=self.index_length, leave=False, file=sys.stdout, ncols=80)\n\n    def construct_index(self):\n        \"\"\"\n        construct index for the dataset.\n        \"\"\"\n        token_per_batch = self.sequence_length\n        tot_num = len(self.dataset) - 1\n        res_num = tot_num - tot_num % token_per_batch\n\n        self.x = list(torch.unbind(torch.LongTensor(self.dataset[0:res_num]).view(-1, self.sequence_length), 0))\n        self.y = list(torch.unbind(torch.LongTensor(self.dataset[1:res_num+1]).view(-1, self.sequence_length), 0))\n\n        self.x.append(torch.LongTensor(self.dataset[res_num:tot_num]))\n        self.y.append(torch.LongTensor(self.dataset[res_num+1:tot_num+1]))\n\n        self.index_length = len(self.x)\n        self.cur_idx = 0\n\n    def reader(self, device):\n        \"\"\"\n        construct dataset reader.\n\n    
    Parameters\n        ----------\n        device: ``torch.device``, required.\n            the target device for the dataset loader.\n\n        Returns\n        -------\n        reader: ``iterator``.\n            A lazy iterable object        \n        \"\"\"\n        if self.cur_idx == self.index_length:\n            self.cur_idx = 0\n            raise StopIteration\n\n        word_t = self.x[self.cur_idx].to(device).view(-1, 1)\n        label_t = self.y[self.cur_idx].to(device).view(-1, 1)\n\n        self.cur_idx += 1\n        \n        yield word_t, label_t\n\nclass LargeDataset(object):\n    \"\"\"    \n    Lazy Dataset for Language Modeling\n\n    Parameters\n    ----------\n    root : ``str``, required.\n        The root folder for dataset files.\n    range_idx : ``int``, required.\n        The maximum file index for the input files (train_*.pk).\n    batch_size : ``int``, required.\n        Batch size.\n    sequence_length: ``int``, required.\n        Sequence Length.\n    \"\"\"\n    def __init__(self, root, range_idx, batch_size, sequence_length):\n        super(LargeDataset, self).__init__()\n        self.root = root\n        self.range_idx = range_idx\n        self.shuffle_list = list(range(0, range_idx))\n        self.shuffle()\n\n        self.batch_size = batch_size\n        self.sequence_length = sequence_length\n        self.token_per_batch = self.batch_size * self.sequence_length\n\n        self.total_batch_num = -1\n\n    def shuffle(self):\n        \"\"\"\n        shuffle dataset\n        \"\"\"\n        random.shuffle(self.shuffle_list)\n\n    def get_tqdm(self, device):\n        \"\"\"\n        construct dataset reader and the corresponding tqdm.\n\n        Parameters\n        ----------\n        device: ``torch.device``, required.\n            the target device for the dataset loader.        
\n        \"\"\"\n        self.batch_count = 0\n        self.cur_idx = 0\n        self.file_idx = 0\n        self.index_length = 0\n\n        if self.total_batch_num <= 0:\n            return tqdm(self.reader(device), mininterval=2, leave=False, file=sys.stdout).__iter__()\n        else:\n            return tqdm(self.reader(device), mininterval=2, total=self.total_batch_num, leave=False, file=sys.stdout, ncols=80).__iter__()\n\n\n    def reader(self, device):\n        \"\"\"\n        construct dataset reader.\n\n        Parameters\n        ----------\n        device: ``torch.device``, required.\n            the target device for the dataset loader.\n\n        Returns\n        -------\n        reader: ``iterator``.\n            A lazy iterable object        \n        \"\"\"\n        while self.file_idx < self.range_idx:\n\n            self.open_next()\n            while self.cur_idx < self.index_length:\n\n                word_t = self.x[self.cur_idx].to(device)\n                # label_t = self.y[self.cur_idx].to(device)\n                label_t = self.y[self.cur_idx].to(device)\n\n                self.cur_idx += 1\n\n                yield word_t, label_t\n\n        self.total_batch_num = self.batch_count\n        self.shuffle()\n\n    def open_next(self):\n        \"\"\"\n        Open the next file.\n        \"\"\"\n        self.dataset = pickle.load(open(self.root + 'train_' + str( self.shuffle_list[self.file_idx])+'.pk', 'rb'))\n\n        res_num = len(self.dataset) - 1\n        res_num = res_num - res_num % self.token_per_batch\n\n        self.x = torch.LongTensor(self.dataset[0:res_num]).view(self.batch_size, -1, self.sequence_length).transpose_(0, 1).transpose_(1, 2).contiguous()\n        self.y = torch.LongTensor(self.dataset[1:res_num+1]).view(self.batch_size, -1, self.sequence_length).transpose_(0, 1).transpose_(1, 2).contiguous()\n\n        self.index_length = self.x.size(0)\n        self.cur_idx = 0\n\n        self.batch_count += self.index_length\n      
  self.file_idx += 1"
  },
  {
    "path": "model_word_ada/densenet.py",
    "content": "\"\"\"\n.. module:: densenet\n    :synopsis: densernn\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport model_word_ada.utils as utils\n\nclass BasicUnit(nn.Module):\n    \"\"\"\n    The basic recurrent unit for the densely connected RNNs.\n\n    Parameters\n    ----------\n    unit : ``torch.nn.Module``, required.\n        The type of rnn unit.\n    input_dim : ``float``, required.\n        The input dimension fo the unit.\n    increase_rate : ``float``, required.\n        The hidden dimension fo the unit.\n    droprate : ``float``, required.\n        The dropout ratrio.\n    \"\"\"\n    def __init__(self, unit, input_dim, increase_rate, droprate):\n        super(BasicUnit, self).__init__()\n\n        rnnunit_map = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}\n\n        self.unit_type = unit\n\n        self.layer = rnnunit_map[unit](input_dim, increase_rate, 1)\n\n        if 'lstm' == self.unit_type:\n            utils.init_lstm(self.layer)\n\n        self.droprate = droprate\n\n        self.input_dim = input_dim\n        self.increase_rate = increase_rate\n        self.output_dim = input_dim + increase_rate\n\n        self.init_hidden()\n\n    def init_hidden(self):\n        \"\"\"\n        Initialize hidden states.\n        \"\"\"\n        self.hidden_state = None\n\n    def rand_ini(self):\n        \"\"\"\n        Random Initialization.\n        \"\"\"\n        return\n\n    def forward(self, x):\n        \"\"\"\n        Calculate the output.\n\n        Parameters\n        ----------\n        x : ``torch.LongTensor``, required.\n            the input tensor, of shape (seq_len, batch_size, input_dim).\n\n        Returns\n        ----------\n        output: ``torch.FloatTensor``.   
\n            The output of RNNs.\n        \"\"\"\n        if self.droprate > 0:\n            new_x = F.dropout(x, p=self.droprate, training=self.training)\n        else:\n            new_x = x\n\n        out, new_hidden = self.layer(new_x, self.hidden_state)\n\n        self.hidden_state = utils.repackage_hidden(new_hidden)\n\n        out = out.contiguous()\n\n        return torch.cat([x, out], 2)\n\nclass DenseRNN(nn.Module):\n    \"\"\"\n    The multi-layer recurrent networks for the densely connected RNNs.\n\n    Parameters\n    ----------\n    layer_num: ``float``, required.\n        The number of layers. \n    unit : ``torch.nn.Module``, required.\n        The type of rnn unit.\n    input_dim : ``float``, required.\n        The input dimension fo the unit.\n    hid_dim : ``float``, required.\n        The hidden dimension fo the unit.\n    droprate : ``float``, required.\n        The dropout ratrio.\n    \"\"\"\n    def __init__(self, layer_num, unit, emb_dim, hid_dim, droprate):\n        super(DenseRNN, self).__init__()\n        \n        self.unit_type = unit\n        self.layer_list = [BasicUnit(unit, emb_dim + i * hid_dim, hid_dim, droprate) for i in range(layer_num)]\n        self.layer = nn.Sequential(*self.layer_list) if layer_num > 0 else None\n        self.output_dim = self.layer_list[-1].output_dim if layer_num > 0 else emb_dim\n        self.emb_dim = emb_dim\n\n        self.init_hidden()\n\n    def to_params(self):\n        \"\"\"\n        To parameters.\n        \"\"\"\n        return {\n            \"rnn_type\": \"DenseRNN\",\n            \"unit_type\": self.layer[0].unit_type,\n            \"layer_num\": len(self.layer),\n            \"emb_dim\": self.layer[0].input_dim,\n            \"hid_dim\": self.layer[0].increase_rate,\n            \"droprate\": self.layer[0].droprate\n        }\n\n    def init_hidden(self):\n        \"\"\"\n        Initialize hidden states.\n        \"\"\"\n        for tup in self.layer_list:\n            
tup.init_hidden()\n\n    def rand_ini(self):\n        \"\"\"\n        Random Initialization.\n        \"\"\"\n        for tup in self.layer_list:\n            tup.rand_ini()\n\n    def forward(self, x):\n        \"\"\"\n        Calculate the output.\n\n        Parameters\n        ----------\n        x : ``torch.LongTensor``, required.\n            the input tensor, of shape (seq_len, batch_size, input_dim).\n\n        Returns\n        ----------\n        output: ``torch.FloatTensor``.\n            The output of RNNs.\n        \"\"\"\n        return self.layer(x)"
  },
  {
    "path": "model_word_ada/ldnet.py",
    "content": "\"\"\"\n.. module:: ldnet\n    :synopsis: LD-Net\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport model_word_ada.utils as utils\nimport random\n\nclass BasicUnit(nn.Module):\n    \"\"\"\n    The basic recurrent unit for the densely connected RNNs with layer-wise dropout.\n\n    Parameters\n    ----------\n    unit : ``torch.nn.Module``, required.\n        The type of rnn unit.\n    input_dim : ``float``, required.\n        The input dimension fo the unit.\n    increase_rate : ``float``, required.\n        The hidden dimension fo the unit.\n    droprate : ``float``, required.\n        The dropout ratrio.\n    layer_dropout : ``float``, required.\n        The layer-wise dropout ratrio.\n    \"\"\"\n    def __init__(self, unit, input_dim, increase_rate, droprate, layer_drop = 0):\n        super(BasicUnit, self).__init__()\n\n        rnnunit_map = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}\n\n        self.unit_type = unit\n\n        self.layer = rnnunit_map[unit](input_dim, increase_rate, 1)\n\n        if 'lstm' == self.unit_type:\n            utils.init_lstm(self.layer)\n\n        self.layer_drop = layer_drop\n\n        self.droprate = droprate\n\n        self.input_dim = input_dim\n        self.increase_rate = increase_rate\n        self.output_dim = input_dim + increase_rate\n\n        self.init_hidden()\n\n    def init_hidden(self):\n        \"\"\"\n        Initialize hidden states.\n        \"\"\"\n        self.hidden_state = None\n\n    def rand_ini(self):\n        \"\"\"\n        Random Initialization.\n        \"\"\"\n        return\n\n    def forward(self, x, p_out):\n        \"\"\"\n        Calculate the output.\n\n        Parameters\n        ----------\n        x : ``torch.LongTensor``, required.\n            the input tensor, of shape (seq_len, batch_size, input_dim).\n        p_out : ``torch.LongTensor``, required.\n            the final output tensor for the 
softmax, of shape (seq_len, batch_size, input_dim).\n\n        Returns\n        ----------\n        d_out: ``torch.FloatTensor``.\n            The input concatenated with the (possibly layer-dropped) outputs of RNNs, to the next layer.\n        o_out: ``torch.FloatTensor``.\n            ``p_out`` concatenated with the undropped outputs of RNNs, to the softmax.\n        \"\"\"\n        if self.droprate > 0:\n            new_x = F.dropout(x, p=self.droprate, training=self.training)\n        else:\n            new_x = x\n\n        out, new_hidden = self.layer(new_x, self.hidden_state)\n\n        self.hidden_state = utils.repackage_hidden(new_hidden)\n\n        out = out.contiguous()\n\n        if self.training and random.uniform(0, 1) < self.layer_drop:\n            deep_out = torch.autograd.Variable( torch.zeros(x.size(0), x.size(1), self.increase_rate) ).cuda()\n        else:\n            deep_out = out\n\n        o_out = torch.cat([p_out, out], 2)\n        d_out = torch.cat([x, deep_out], 2)\n        return d_out, o_out\n\nclass LDRNN(nn.Module):\n    \"\"\"\n    The multi-layer recurrent networks for the densely connected RNNs with layer-wise dropout.\n\n    Parameters\n    ----------\n    layer_num: ``float``, required.\n        The number of layers. 
\n    unit : ``torch.nn.Module``, required.\n        The type of rnn unit.\n    emb_dim : ``float``, required.\n        The input dimension of the unit.\n    hid_dim : ``float``, required.\n        The hidden dimension of the unit.\n    droprate : ``float``, required.\n        The dropout ratio.\n    layer_drop : ``float``, required.\n        The layer-wise dropout ratio.\n    \"\"\"\n    def __init__(self, layer_num, unit, emb_dim, hid_dim, droprate, layer_drop):\n        super(LDRNN, self).__init__()\n\n        self.unit_type = unit\n        self.layer_list = [BasicUnit(unit, emb_dim + i * hid_dim, hid_dim, droprate, layer_drop) for i in range(layer_num)]\n\n        self.layer_num = layer_num\n        self.layer = nn.ModuleList(self.layer_list) if layer_num > 0 else None\n        self.output_dim = self.layer_list[-1].output_dim if layer_num > 0 else emb_dim\n        self.emb_dim = emb_dim\n    \n        self.init_hidden()\n\n    def to_params(self):\n        \"\"\"\n        To parameters.\n        \"\"\"\n        return {\n            \"rnn_type\": \"LDRNN\",\n            \"unit_type\": self.layer[0].unit_type,\n            \"layer_num\": len(self.layer),\n            \"emb_dim\": self.layer[0].input_dim,\n            \"hid_dim\": self.layer[0].increase_rate,\n            \"droprate\": self.layer[0].droprate,\n            \"after_pruned\": False\n        }\n\n    def init_hidden(self):\n        \"\"\"\n        Initialize hidden states.\n        \"\"\"\n        for tup in self.layer_list:\n            tup.init_hidden()\n\n    def rand_ini(self):\n        \"\"\"\n        Random Initialization.\n        \"\"\"\n        for tup in self.layer_list:\n            tup.rand_ini()\n\n    def forward(self, x):\n        \"\"\"\n        Calculate the output.\n\n        Parameters\n        ----------\n        x : ``torch.LongTensor``, required.\n            the input tensor, of shape (seq_len, batch_size, input_dim).\n\n        Returns\n        ----------\n        
output: ``torch.FloatTensor``.\n            The output of RNNs to the Softmax.\n        \"\"\"\n        output = x\n        for ind in range(self.layer_num):\n            x, output = self.layer_list[ind](x, output)\n        return output"
  },
  {
    "path": "model_word_ada/utils.py",
    "content": "\"\"\"\n.. module:: utils\n    :synopsis: utils\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport numpy as np\nimport torch\nimport json\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.init\n\nfrom torch.autograd import Variable\n\ndef repackage_hidden(h):\n    \"\"\"\n    Wraps hidden states in new Variables, to detach them from their history\n\n    Parameters\n    ----------\n    h : ``Tuple`` or ``Tensors``, required.\n        Tuple or Tensors, hidden states.\n\n    Returns\n    -------\n    hidden: ``Tuple`` or ``Tensors``.\n        detached hidden states\n    \"\"\"\n    if type(h) == torch.Tensor:\n        return h.detach()\n    else:\n        return tuple(repackage_hidden(v) for v in h)\n\ndef to_scalar(var):\n    \"\"\"\n    convert a tensor to a scalar number\n    \"\"\"\n    return var.view(-1).item()\n\ndef init_embedding(input_embedding):\n    \"\"\"\n    random initialize embedding\n    \"\"\"\n    bias = np.sqrt(3.0 / input_embedding.size(1))\n    nn.init.uniform_(input_embedding, -bias, bias)\n\ndef init_linear(input_linear):\n    \"\"\"\n    random initialize linear projection.\n    \"\"\"\n    bias = np.sqrt(6.0 / (input_linear.weight.size(0) + input_linear.weight.size(1)))\n    nn.init.uniform_(input_linear.weight, -bias, bias)\n    if input_linear.bias is not None:\n        input_linear.bias.data.zero_()\n\ndef adjust_learning_rate(optimizer, lr):\n    \"\"\"\n    adjust learning to the the new value.\n\n    Parameters\n    ----------\n    optimizer : required.\n        pytorch optimizer.\n    float :  ``float``, required.\n        the target learning rate.\n    \"\"\"\n    for param_group in optimizer.param_groups:\n        param_group['lr'] = lr\n\ndef init_lstm(input_lstm):\n    \"\"\"\n    random initialize lstms\n    \"\"\"\n    for ind in range(0, input_lstm.num_layers):\n        weight = eval('input_lstm.weight_ih_l'+str(ind))\n        bias = np.sqrt(6.0 / (weight.size(0)/4 + weight.size(1)))\n        
nn.init.uniform_(weight, -bias, bias)\n        weight = eval('input_lstm.weight_hh_l'+str(ind))\n        bias = np.sqrt(6.0 / (weight.size(0)/4 + weight.size(1)))\n        nn.init.uniform_(weight, -bias, bias)\n    \n    if input_lstm.bias:\n        for ind in range(0, input_lstm.num_layers):\n            weight = eval('input_lstm.bias_ih_l'+str(ind))\n            weight.data.zero_()\n            weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1\n            weight = eval('input_lstm.bias_hh_l'+str(ind))\n            weight.data.zero_()\n            weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1"
  },
  {
    "path": "pre_seq/encode_data.py",
    "content": "\"\"\"\n.. module:: encode_data\n    :synopsis: encode data for sequence labeling\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport pickle\nimport argparse\nimport os\nimport random\nimport numpy as np\n\nfrom tqdm import tqdm\n\nimport itertools\nimport functools\n\ndef encode_dataset(input_file, flm_map, blm_map, gw_map, c_map, y_map):\n\n    flm_unk = flm_map['<unk>']\n    blm_unk = blm_map['<unk>']\n    gw_unk = gw_map['<unk>']\n    c_con = c_map[' ']\n    c_unk = c_map['<unk>']\n\n    dataset = list()\n\n    tmpw_flm, tmpw_blm, tmpw_gw, tmpc, tmpy = list(), list(), list(), list(), list()\n\n    with open(input_file, 'r') as fin:\n        for line in fin:\n            if line.isspace() or line.startswith('-DOCSTART-'):\n                if len(tmpw_flm) > 0:\n                    dataset.append([tmpw_flm, tmpw_blm, tmpw_gw, tmpc, tmpy])\n                tmpw_flm, tmpw_blm, tmpw_gw, tmpc, tmpy = list(), list(), list(), list(), list()\n            else:\n                line = line.split()\n                tmpw_flm.append(flm_map.get(line[0], flm_unk))\n                tmpw_blm.append(blm_map.get(line[0], blm_unk))\n                tmpw_gw.append(gw_map.get(line[0].lower(), gw_unk))\n                tmpy.append(y_map[line[-1]])\n                tmpc.append([c_map.get(tup, c_unk) for tup in line[0]])\n\n    if len(tmpw_flm) > 0:\n        dataset.append([tmpw_flm, tmpw_blm, tmpw_gw, tmpc, tmpy])\n\n    return dataset\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--train_file', default=\"./data/ner/eng.train.iobes\")\n    parser.add_argument('--test_file', default=\"./data/ner/eng.testb.iobes\")\n    parser.add_argument('--dev_file', default=\"./data/ner/eng.testa.iobes\")\n    parser.add_argument('--input_map', default=\"./data/conll_map.pk\")\n    parser.add_argument('--output_file', default=\"./data/ner_dataset.pk\")\n    parser.add_argument('--threshold', type=int, default=1)\n    
parser.add_argument('--unk', default='<unk>')\n    args = parser.parse_args()\n\n    with open(args.input_map, 'rb') as f:\n        p_data = pickle.load(f)\n        name_list = ['flm_map', 'blm_map', 'gw_map', 'c_map', 'y_map', 'emb_array']\n        flm_map, blm_map, gw_map, c_map, y_map, emb_array = [p_data[tup] for tup in name_list]\n\n    train_dataset = encode_dataset(args.train_file, flm_map, blm_map, gw_map, c_map, y_map)\n    test_dataset = encode_dataset(args.test_file, flm_map, blm_map, gw_map, c_map, y_map)\n    dev_dataset = encode_dataset(args.dev_file, flm_map, blm_map, gw_map, c_map, y_map)\n\n    with open(args.output_file, 'wb') as f:\n        pickle.dump({'flm_map': flm_map, 'blm_map': blm_map, 'gw_map': gw_map, 'c_map': c_map, 'y_map': y_map, 'emb_array': emb_array, 'train_data': train_dataset, 'test_data': test_dataset, 'dev_data': dev_dataset}, f)"
  },
  {
    "path": "pre_seq/gene_map.py",
    "content": "\"\"\"\n.. module:: gene_map\n    :synopsis: generate map for sequence labeling\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport pickle\nimport argparse\nimport os\nimport random\nimport numpy as np\nfrom tqdm import tqdm\n\nimport itertools\nimport functools\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--train_corpus', default='./data/ner/eng.train.iobes')\n    parser.add_argument('--input_embedding', default=\"./embedding/glove.6B.100d.txt\")\n    parser.add_argument('--output_map', default=\"./data/conll_map.pk\")\n    parser.add_argument('--flm_map', default=\"./data/one_billion/test.pk\")\n    parser.add_argument('--blm_map', default=\"./data/one_billion_reverse/test.pk\")\n    parser.add_argument('--threshold', type=int, default=5)\n    parser.add_argument('--unk', default='unk')\n    args = parser.parse_args()\n\n    with open(args.flm_map, 'rb') as f:\n        p_data = pickle.load(f)\n        flm_map = p_data['w_map']\n\n    with open(args.blm_map, 'rb') as f:\n        p_data = pickle.load(f)\n        blm_map = p_data['w_map']\n\n    gw_map = dict()\n    embedding_array = list()\n    for line in open(args.input_embedding, 'r'):\n        line = line.split()\n        vector = list(map(lambda t: float(t), filter(lambda n: n and not n.isspace(), line[1:])))\n        if line[0] == args.unk:\n            gw_map['<unk>'] = len(gw_map)\n        else:\n            gw_map[line[0]] = len(gw_map)\n        embedding_array.append(vector)\n\n    bias = 2 * np.sqrt(3.0 / len(embedding_array[0]))\n\n    gw_map['<\\n>'] = len(gw_map)\n    embedding_array.append([random.random() * bias - bias for tup in embedding_array[0]])\n\n    w_count = dict()\n    c_count = dict()\n    y_map = dict()\n    # y_map = {'B-LST':0, 'E-LST':1}\n\n    with open(args.train_corpus, 'r') as fin:\n        for line in fin:\n            if line.isspace() or line.startswith('-DOCSTART-'):\n                c_count['\\n'] = 
c_count.get('\\n', 0) + 1\n            else:\n                line = line.split()\n                for tup in line[0]:\n                    c_count[tup] = c_count.get(tup, 0) + 1\n                c_count[' '] = c_count.get(' ', 0) + 1\n                if line[-1] not in y_map:\n                    y_map[line[-1]] = len(y_map)\n                word = line[0].lower()\n                if word not in gw_map:\n                    w_count[word] = w_count.get(word, 0) + 1\n\n    w_set = {k for k, v in w_count.items() if v > args.threshold}\n    for k in w_set:\n        gw_map[k] = len(gw_map)\n        embedding_array.append([random.random() * bias - bias for tup in embedding_array[0]])\n\n    c_set = {k for k, v in c_count.items() if v > args.threshold}\n    c_map = {v:k for k, v in enumerate(c_set)}\n    c_map['<unk>'] = len(c_map)\n\n    y_map['<s>'] = len(y_map)\n    y_map['<eof>'] = len(y_map)\n\n    with open(args.output_map, 'wb') as f:\n        pickle.dump({'flm_map': flm_map, 'blm_map': blm_map, 'gw_map': gw_map, 'c_map': c_map, 'y_map': y_map, 'emb_array': embedding_array}, f)\n"
  },
  {
    "path": "pre_word_ada/encode_data2folder.py",
    "content": "\"\"\"\n.. module:: encode_data2folder\n    :synopsis: encode data folder for language modeling\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport pickle\nimport argparse\nimport os\nimport random\nimport numpy as np\n\nfrom tqdm import tqdm\n\nimport itertools\nimport functools\n\ndef encode_dataset(input_folder, w_map, reverse):\n\n    w_eof = w_map['\\n']\n    w_unk = w_map['<unk>']\n\n    list_dirs = os.walk(input_folder)\n\n    lines = list()\n\n    for root, dirs, files in list_dirs:\n        for file in tqdm(files):\n            with open(os.path.join(root, file)) as fin:\n                lines = lines + list(filter(lambda t: t and not t.isspace(), fin.readlines()))\n\n    dataset = list()\n    for line in lines:\n        dataset += list(map(lambda t: w_map.get(t, w_unk), line.split())) + [w_eof]\n\n    if reverse:\n        dataset = dataset[::-1]\n\n    return dataset\n\ndef encode_dataset2file(input_folder, t, w_map, reverse):\n\n    w_eof = w_map['\\n']\n    w_unk = w_map['<unk>']\n\n    list_dirs = os.walk(input_folder)\n\n    range_ind = 0\n\n    for root, dirs, files in list_dirs:\n        for file in tqdm(files):\n            with open(os.path.join(root, file), 'r') as fin:\n                lines = list(filter(lambda t: t and not t.isspace(), fin.readlines()))\n            \n            dataset = list()\n            for line in lines:\n                dataset += list(map(lambda t: w_map.get(t, w_unk), line.split())) + [w_eof]\n\n            if reverse:\n                dataset = dataset[::-1]\n\n            with open(output_folder+'train_'+ str(range_ind) + '.pk', 'wb') as f:\n                pickle.dump(dataset, f)\n\n            range_ind += 1\n\n    return range_ind\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--train_folder', default=\"./data/1b_train\")\n    parser.add_argument('--test_folder', default=\"./data/1b_test\")\n    parser.add_argument('--input_map', 
default=\"./data/1b_map.pk\")\n    parser.add_argument('--output_folder', default=\"./data/one_billion/\")\n    parser.add_argument('--threshold', type=int, default=3)\n    parser.add_argument('--unk', default='<unk>')\n    parser.add_argument('--reverse', action='store_true')\n    args = parser.parse_args()\n\n    with open(args.input_map, 'rb') as f:\n        w_count = pickle.load(f)\n\n    unk_count = sum([v for k, v in w_count.items() if v <= args.threshold])\n    w_list = [(k, v) for k, v in w_count.items() if v > args.threshold]\n    w_list.append(('<unk>', unk_count))\n    w_list.sort(key=lambda t: t[1], reverse=True)\n    w_map = {kv[0]:v for v, kv in enumerate(w_list)}\n\n    range_ind = encode_dataset2file(args.train_folder, args.output_folder, w_map, args.reverse)\n\n    test_dataset = encode_dataset(args.test_folder, w_map, args.reverse)\n\n    with open(args.output_folder+'test.pk', 'wb') as f:\n        pickle.dump({'w_map': w_map, 'test_data':test_dataset, 'range' : range_ind}, f)\n"
  },
  {
    "path": "pre_word_ada/gene_map.py",
    "content": "\"\"\"\n.. module:: gene_map\n    :synopsis: gene map for language modeling\n \n.. moduleauthor:: Liyuan Liu\n\"\"\"\nimport pickle\nimport argparse\nimport os\nimport random\nimport numpy as np\nfrom tqdm import tqdm\n\nimport itertools\nimport functools\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--input_folder', default=\"./data/1b_train\")\n    parser.add_argument('--output_map', default=\"./data/1b_map.pk\")\n    args = parser.parse_args()\n\n    w_count = {'\\n':0}\n\n    list_dirs = os.walk(args.input_folder)\n    \n    for root, dirs, files in list_dirs:\n        for file in tqdm(files):\n            with open(os.path.join(root, file)) as fin:\n                for line in fin:\n                    if not line or line.isspace():\n                        continue\n                    line = line.split()\n                    for tup in line:\n                        w_count[tup] = w_count.get(tup, 0) + 1\n                    w_count['\\n'] += 1\n\n    with open(args.output_map, 'wb') as f:\n        pickle.dump(w_count, f)"
  },
  {
    "path": "prune_sparse_seq.py",
    "content": "from __future__ import print_function\nimport datetime\nimport time\nimport torch\nimport torch.autograd as autograd\nimport torch.nn as nn\nimport torch.optim as optim\nimport codecs\nimport pickle\nimport math\n\nfrom model_word_ada.LM import LM\nfrom model_word_ada.basic import BasicRNN\nfrom model_word_ada.densenet import DenseRNN\nfrom model_word_ada.ldnet import LDRNN\n\nfrom model_seq.crf import CRFLoss, CRFDecode\nfrom model_seq.dataset import SeqDataset\nfrom model_seq.evaluator import eval_wc\nfrom model_seq.seqlabel import SeqLabel, Vanilla_SeqLabel\nfrom model_seq.seqlm import BasicSeqLM\nfrom model_seq.sparse_lm import SparseSeqLM\nimport model_seq.utils as utils\n\nfrom torch_scope import wrapper\n\nimport argparse\nimport logging\nimport json\nimport os\nimport sys\nimport itertools\nimport functools\n\nlogger = logging.getLogger(__name__)\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    \n    parser.add_argument('--gpu', type=str, default=\"auto\")\n    parser.add_argument('--cp_root', default='./checkpoint')\n    parser.add_argument('--checkpoint_name', default='p_ner')\n    parser.add_argument('--git_tracking', action='store_true')\n\n    parser.add_argument('--corpus', default='./data/ner_dataset.pk')\n    parser.add_argument('--load_seq', default='./checkpoint/ner.th')\n\n    parser.add_argument('--lm_hid_dim', type=int, default=300)\n    parser.add_argument('--lm_word_dim', type=int, default=300)\n    parser.add_argument('--lm_label_dim', type=int, default=1600)\n    parser.add_argument('--lm_layer_num', type=int, default=10)\n    parser.add_argument('--lm_droprate', type=float, default=0.5)\n    parser.add_argument('--lm_rnn_layer', choices=['Basic', 'DenseNet', 'LDNet'], default='LDNet')\n    parser.add_argument('--lm_rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm')\n\n    parser.add_argument('--seq_c_dim', type=int, default=30)\n    parser.add_argument('--seq_c_hid', type=int, 
default=150)\n    parser.add_argument('--seq_c_layer', type=int, default=1)\n    parser.add_argument('--seq_w_dim', type=int, default=100)\n    parser.add_argument('--seq_w_hid', type=int, default=300)\n    parser.add_argument('--seq_w_layer', type=int, default=1)\n    parser.add_argument('--seq_droprate', type=float, default=0.5)\n    parser.add_argument('--seq_rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm')\n    parser.add_argument('--seq_model', choices=['vanilla', 'lm-aug'], default='lm-aug')\n    parser.add_argument('--seq_lambda0', type=float, default=0.05)\n    parser.add_argument('--seq_lambda1', type=float, default=2)\n\n    parser.add_argument('--batch_size', type=int, default=10)\n    parser.add_argument('--patience', type=int, default=5)\n    parser.add_argument('--epoch', type=int, default=200)\n    parser.add_argument('--least', type=int, default=50)\n    parser.add_argument('--clip', type=float, default=5)\n    parser.add_argument('--lr', type=float, default=0.015)\n    parser.add_argument('--lr_decay', type=float, default=0.05)\n    parser.add_argument('--update', choices=['Adam', 'Adagrad', 'Adadelta', 'SGD'], default='SGD')\n    args = parser.parse_args()\n\n    pw = wrapper(os.path.join(args.cp_root, args.checkpoint_name), args.checkpoint_name, enable_git_track=args.git_tracking)\n    \n    gpu_index = pw.auto_device() if 'auto' == args.gpu else int(args.gpu)\n    device = torch.device(\"cuda:\" + str(gpu_index) if gpu_index >= 0 else \"cpu\")\n    if gpu_index >= 0:\n        torch.cuda.set_device(gpu_index)\n\n    logger.info('Loading data from {}.'.format(args.corpus))\n\n    dataset = pickle.load(open(args.corpus, 'rb'))\n    name_list = ['flm_map', 'blm_map', 'gw_map', 'c_map', 'y_map', 'emb_array', 'train_data', 'test_data', 'dev_data']\n    flm_map, blm_map, gw_map, c_map, y_map, emb_array, train_data, test_data, dev_data = [dataset[tup] for tup in name_list ]\n\n    logger.info('Building language models and seuqence labeling 
models.')\n\n    rnn_map = {'Basic': BasicRNN, 'DenseNet': DenseRNN, 'LDNet': functools.partial(LDRNN, layer_drop = 0)}\n    flm_rnn_layer = rnn_map[args.lm_rnn_layer](args.lm_layer_num, args.lm_rnn_unit, args.lm_word_dim, args.lm_hid_dim, args.lm_droprate)\n    blm_rnn_layer = rnn_map[args.lm_rnn_layer](args.lm_layer_num, args.lm_rnn_unit, args.lm_word_dim, args.lm_hid_dim, args.lm_droprate)\n    flm_model = LM(flm_rnn_layer, None, len(flm_map), args.lm_word_dim, args.lm_droprate, label_dim = args.lm_label_dim)\n    blm_model = LM(blm_rnn_layer, None, len(blm_map), args.lm_word_dim, args.lm_droprate, label_dim = args.lm_label_dim)\n    flm_model_seq = SparseSeqLM(flm_model, False, args.lm_droprate, False)\n    blm_model_seq = SparseSeqLM(blm_model, True, args.lm_droprate, False)\n    SL_map = {'vanilla':Vanilla_SeqLabel, 'lm-aug': SeqLabel}\n    seq_model = SL_map[args.seq_model](flm_model_seq, blm_model_seq, len(c_map), args.seq_c_dim, args.seq_c_hid, args.seq_c_layer, len(gw_map), args.seq_w_dim, args.seq_w_hid, args.seq_w_layer, len(y_map), args.seq_droprate, unit=args.seq_rnn_unit)\n\n    logger.info('Loading pre-trained models from {}.'.format(args.load_seq))\n\n    seq_file = wrapper.restore_checkpoint(args.load_seq)['model']\n    seq_model.load_state_dict(seq_file)\n    seq_model.to(device)\n    crit = CRFLoss(y_map)\n    decoder = CRFDecode(y_map)\n    evaluator = eval_wc(decoder, 'f1')\n\n    logger.info('Constructing dataset.')\n\n    train_dataset, test_dataset, dev_dataset = [SeqDataset(tup_data, flm_map['\\n'], blm_map['\\n'], gw_map['<\\n>'], c_map[' '], c_map['\\n'], y_map['<s>'], y_map['<eof>'], len(y_map), args.batch_size) for tup_data in [train_data, test_data, dev_data]]\n\n    logger.info('Constructing optimizer.')\n\n    param_dict = filter(lambda t: t.requires_grad, seq_model.parameters())\n    optim_map = {'Adam' : optim.Adam, 'Adagrad': optim.Adagrad, 'Adadelta': optim.Adadelta, 'SGD': functools.partial(optim.SGD, momentum=0.9)}\n    if 
args.lr > 0:\n        optimizer=optim_map[args.update](param_dict, lr=args.lr)\n    else:\n        optimizer=optim_map[args.update](param_dict)\n\n    logger.info('Saving configues.')\n\n    pw.save_configue(args)\n\n    logger.info('Setting up training environ.')\n\n    best_f1 = float('-inf')\n    patience_count = 0\n    batch_index = 0\n    normalizer = 0\n    tot_loss = 0\n\n    dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(seq_model, dev_dataset.get_tqdm(device))\n    print(dev_f1)\n    \n    logger.info('Start training...')\n\n    for indexs in range(args.epoch):\n\n        logger.info('############')\n        logger.info('Epoch: {}'.format(indexs))\n        pw.nvidia_memory_map()\n\n        iterator = train_dataset.get_tqdm(device)\n\n        seq_model.train()\n        for f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w, f_y, f_y_m, _ in iterator:\n\n            seq_model.zero_grad()\n            output = seq_model(f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w)\n            loss = crit(output, f_y, f_y_m)\n\n            tot_loss += utils.to_scalar(loss)\n            normalizer += 1\n\n            if args.seq_lambda0 > 0:\n                f_reg0, f_reg1, f_reg3 = flm_model_seq.regularizer()\n                b_reg0, b_reg1, b_reg3 = blm_model_seq.regularizer()\n\n                loss += args.seq_lambda0 * (f_reg3 + b_reg3)\n\n                if (f_reg0 + b_reg0 > args.seq_lambda1):\n                    loss += args.seq_lambda0 * (f_reg1 + b_reg1)\n\n            loss.backward()\n            torch.nn.utils.clip_grad_norm_(seq_model.parameters(), args.clip)\n            optimizer.step()\n\n            flm_model_seq.prox()\n            blm_model_seq.prox()\n\n            batch_index += 1\n            if 0 == batch_index % 100:\n                pw.add_loss_vs_batch({'training_loss': tot_loss / (normalizer + 1e-9)}, batch_index, use_logger = False)\n                tot_loss = 0\n                normalizer = 0\n\n        if args.lr > 0:\n            
current_lr = args.lr / (1 + (indexs + 1) * args.lr_decay)\n            utils.adjust_learning_rate(optimizer, current_lr)\n\n        dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(seq_model, dev_dataset.get_tqdm(device))\n        nonezero_count = (flm_model_seq.rnn.weight_list.data > 0).int().cpu().sum() + (blm_model_seq.rnn.weight_list.data > 0).cpu().int().sum()\n\n        pw.add_loss_vs_batch({'dev_f1': dev_f1, 'none_zero_count': nonezero_count.item()}, indexs, use_logger = True)\n        pw.add_loss_vs_batch({'dev_pre': dev_pre, 'dev_rec': dev_rec}, indexs, use_logger = False)\n\n        logger.info('Saving model...')\n        pw.save_checkpoint(model = seq_model, is_best = (nonezero_count <= args.seq_lambda1 and dev_f1 > best_f1))\n\n        if nonezero_count <= args.seq_lambda1 and dev_f1 > best_f1:\n            nonezero_count = nonezero_count\n            test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(seq_model, test_dataset.get_tqdm(device))\n            best_f1, best_dev_pre, best_dev_rec, best_dev_acc = dev_f1, dev_pre, dev_rec, dev_acc\n            pw.add_loss_vs_batch({'tot_loss': tot_loss/(normalizer+1e-9), 'test_f1': test_f1}, indexs, use_logger = True)\n            pw.add_loss_vs_batch({'test_pre': test_pre, 'test_rec': test_rec}, indexs, use_logger = False)\n            patience_count = 0\n        elif dev_f1 > best_f1:\n            test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(seq_model, test_dataset.get_tqdm(device))\n            pw.add_loss_vs_batch({'tot_loss': tot_loss/(normalizer+1e-9), 'test_f1': test_f1}, indexs, use_logger = True)\n            pw.add_loss_vs_batch({'test_pre': test_pre, 'test_rec': test_rec}, indexs, use_logger = False)\n        else:\n            patience_count += 1\n            if patience_count >= args.patience and indexs >= args.least:\n                break\n\n    pw.add_loss_vs_batch({'best_test_f1': test_f1, 'best_test_pre': test_pre, 'best_test_rec': test_rec}, 0, use_logger = 
True, use_writer = False)\n    pw.add_loss_vs_batch({'best_dev_f1': best_f1, 'best_dev_pre': best_dev_pre, 'best_dev_rec': best_dev_rec}, 0, use_logger = True, use_writer = False)\n\n    logger.info('Loading best_performing_model.')\n    seq_param = pw.restore_best_checkpoint()['model']\n    seq_model.load_state_dict(seq_param)\n    seq_model.to(device)\n\n    logger.info('Test before deleting layers.')\n    test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(seq_model, test_dataset.get_tqdm(device))\n    dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(seq_model, dev_dataset.get_tqdm(device))\n\n    pw.add_loss_vs_batch({'best_test_f1': test_f1, 'best_dev_f1': dev_f1}, 1, use_logger = True, use_writer = False)\n\n    logger.info('Deleting layers.')\n    seq_model.cpu()\n    seq_model.prune_dense_rnn()\n    seq_model.to(device)\n\n    logger.info('Resulting models display.')\n    print(seq_model)\n\n    logger.info('Test after deleting layers.')\n    test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(seq_model, test_dataset.get_tqdm(device))\n    dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(seq_model, dev_dataset.get_tqdm(device))\n\n    pw.add_loss_vs_batch({'best_test_f1': test_f1, 'best_dev_f1': dev_f1}, 2, use_logger = True, use_writer = False)\n\n    seq_model.cpu()\n    logger.info('Saving model...')\n\n    seq_config = seq_model.to_params()\n\n    pw.save_checkpoint(model = seq_model, \n                        is_best = True,\n                        s_dict = {'config': seq_config, \n                            'flm_map': flm_map, \n                            'blm_map': blm_map, \n                            'gw_map': gw_map, \n                            'c_map': c_map, \n                            'y_map': y_map})\n\n    pw.close()\n"
  },
  {
    "path": "train_lm.py",
    "content": "from __future__ import print_function\nimport datetime\nimport time\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nimport codecs\nimport pickle\nimport math\n\nfrom model_word_ada.LM import LM\nfrom model_word_ada.basic import BasicRNN\nfrom model_word_ada.ldnet import LDRNN\nfrom model_word_ada.densenet import DenseRNN\nfrom model_word_ada.dataset import LargeDataset, EvalDataset\nfrom model_word_ada.adaptive import AdaptiveSoftmax\nimport model_word_ada.utils as utils\n\nfrom torch_scope import wrapper\n\nimport argparse\nimport logging\nimport json\nimport os\nimport sys\nimport itertools\nimport functools\n\nlogger = logging.getLogger(__name__)\n\ndef evaluate(data_loader, lm_model, limited = 76800):\n    lm_model.eval()\n    lm_model.init_hidden()\n    total_loss = 0\n    total_len = 0\n    for word_t, label_t in data_loader:\n        label_t = label_t.view(-1)\n        tmp_len = label_t.size(0)\n        total_loss += tmp_len * lm_model(word_t, label_t).item()\n        total_len += tmp_len\n\n        if limited >=0 and total_len > limited:\n            break\n\n    ppl = math.exp(total_loss / total_len)\n    return ppl\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n\n    parser.add_argument('--gpu', type=str, default=\"auto\")\n    parser.add_argument('--cp_root', default='./checkpoint')\n    parser.add_argument('--checkpoint_name', default='ld0')\n    parser.add_argument('--git_tracking', action='store_true')\n\n    parser.add_argument('--dataset_folder', default='./data/one_billion/')\n    parser.add_argument('--restore_checkpoint', default='')\n\n    parser.add_argument('--batch_size', type=int, default=128)\n    parser.add_argument('--sequence_length', type=int, default=20)\n    parser.add_argument('--hid_dim', type=int, default=300)\n    parser.add_argument('--word_dim', type=int, default=300)\n    parser.add_argument('--label_dim', type=int, default=1600)\n    
parser.add_argument('--layer_num', type=int, default=10)\n    parser.add_argument('--droprate', type=float, default=0.01)\n    parser.add_argument('--add_relu', action='store_true')\n    parser.add_argument('--layer_drop', type=float, default=0.5)\n    parser.add_argument('--epoch', type=int, default=400)\n    parser.add_argument('--clip', type=float, default=5)\n    parser.add_argument('--update', choices=['Adam', 'Adagrad', 'Adadelta'], default='Adam', help='adam is the best')\n    parser.add_argument('--rnn_layer', choices=['Basic', 'DenseNet', 'LDNet'], default='LDNet')\n    parser.add_argument('--rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm')\n    parser.add_argument('--lr', type=float, default=0.001)\n    parser.add_argument('--lr_decay', type=float, default=0.1)\n    parser.add_argument('--cut_off', nargs='+', default=[4000,40000,200000])\n    parser.add_argument('--interval', type=int, default=100)\n    parser.add_argument('--epoch_size', type=int, default=4000)\n    parser.add_argument('--patience', type=float, default=10)\n    args = parser.parse_args()\n\n    pw = wrapper(os.path.join(args.cp_root, args.checkpoint_name), args.checkpoint_name, enable_git_track=args.git_tracking)\n\n    gpu_index = pw.auto_device() if 'auto' == args.gpu else int(args.gpu)\n    device = torch.device(\"cuda:\" + str(gpu_index) if gpu_index >= 0 else \"cpu\")\n    if gpu_index >= 0:\n        torch.cuda.set_device(gpu_index)\n\n    logger.info('Loading dataset.')\n    dataset = pickle.load(open(args.dataset_folder + 'test.pk', 'rb'))\n    w_map, test_data, range_idx = dataset['w_map'], dataset['test_data'], dataset['range']\n    train_loader = LargeDataset(args.dataset_folder, range_idx, args.batch_size, args.sequence_length)\n    test_loader = EvalDataset(test_data, args.batch_size)\n\n    logger.info('Building models.')\n    rnn_map = {'Basic': BasicRNN, 'DenseNet': DenseRNN, 'LDNet': functools.partial(LDRNN, layer_drop = args.layer_drop)}\n    rnn_layer = 
rnn_map[args.rnn_layer](args.layer_num, args.rnn_unit, args.word_dim, args.hid_dim, args.droprate)\n    cut_off = args.cut_off + [len(w_map) + 1]\n    if args.label_dim > 0:\n        soft_max = AdaptiveSoftmax(args.label_dim, cut_off)\n    else:\n        soft_max = AdaptiveSoftmax(rnn_layer.output_dim, cut_off)\n    lm_model = LM(rnn_layer, soft_max, len(w_map), args.word_dim, args.droprate, label_dim = args.label_dim, add_relu=args.add_relu)\n    lm_model.rand_ini()\n\n    logger.info('Building optimizer.')\n    optim_map = {'Adam' : optim.Adam, 'Adagrad': optim.Adagrad, 'Adadelta': optim.Adadelta}\n    if args.lr > 0:\n        optimizer=optim_map[args.update](lm_model.parameters(), lr=args.lr)\n    else:\n        optimizer=optim_map[args.update](lm_model.parameters())\n\n    if args.restore_checkpoint:\n        if os.path.isfile(args.restore_checkpoint):\n            logger.info(\"loading checkpoint: '{}'\".format(args.restore_checkpoint))\n            model_file = wrapper.restore_checkpoint(args.restore_checkpoint)['model']\n            lm_model.load_state_dict(model_file, False)\n        else:\n            logger.info(\"no checkpoint found at: '{}'\".format(args.restore_checkpoint))\n    lm_model.to(device)\n\n    logger.info('Saving configues.')\n    pw.save_configue(args)\n\n    logger.info('Setting up training environ.')\n    best_train_ppl = float('inf')\n    cur_lr = args.lr\n    batch_index = 0\n    epoch_loss = 0\n    patience = 0\n\n    writer = SummaryWriter(log_dir='./runs_1b/'+args.log_dir)\n    name_list = ['batch_loss', 'train_ppl', 'test_ppl']\n    bloss, tr_ppl, te_ppl = [args.log_dir+'/'+tup for tup in name_list]\n\n    try:\n        for indexs in range(args.epoch):\n    \n            logger.info('############')\n            logger.info('Epoch: {}'.format(indexs))\n            pw.nvidia_memory_map()\n\n            lm_model.train()\n\n            for word_t, label_t in train_loader.get_tqdm(device):\n\n                if 1 == 
train_loader.cur_idx:\n                    lm_model.init_hidden()\n\n                label_t = label_t.view(-1)\n\n                lm_model.zero_grad()\n                loss = lm_model(word_t, label_t)\n                \n                loss.backward()\n                torch.nn.utils.clip_grad_norm_(lm_model.parameters(), args.clip)\n                optimizer.step()\n\n                batch_index += 1\n                if 0 == batch_index % args.interval:\n                    s_loss = utils.to_scalar(loss)\n                    pw.add_loss_vs_batch({'batch_loss': s_loss}, batch_index, use_logger = False)\n                                \n                epoch_loss += utils.to_scalar(loss)\n                if 0 == batch_index % args.epoch_size:\n                    epoch_ppl = math.exp(epoch_loss / args.epoch_size)\n                    pw.add_loss_vs_batch({'train_ppl': epoch_ppl}, batch_index, use_logger = True)\n                    if epoch_loss < best_train_ppl:\n                        best_train_ppl = epoch_loss\n                        patience = 0\n                    else:\n                        patience += 1\n                    epoch_loss = 0\n\n                if patience > args.patience and cur_lr > 0:\n                    patience = 0\n                    cur_lr *= args.lr_decay\n                    best_train_ppl = float('inf')\n                    logger.info('adjust_learning_rate...')\n                    utils.adjust_learning_rate(optimizer, cur_lr)\n\n            test_ppl = evaluate(test_loader.get_tqdm(device), lm_model)\n            pw.add_loss_vs_batch({'test_ppl': test_ppl}, indexs, use_logger = True)\n            pw.save_checkpoint(model = lm_model, optimizer = optimizer, is_best = True)\n\n    except KeyboardInterrupt:\n\n        logger.info('Exiting from training early')\n        test_ppl = evaluate(test_loader.get_tqdm(device), lm_model)\n        pw.add_loss_vs_batch({'test_ppl': test_ppl}, indexs, use_logger = True)\n        
pw.save_checkpoint(model = lm_model, optimizer = optimizer, is_best = True)\n\n    pw.close()"
  },
  {
    "path": "train_seq.py",
    "content": "from __future__ import print_function\nimport datetime\nimport time\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nimport codecs\nimport pickle\nimport math\n\nfrom model_word_ada.LM import LM\nfrom model_word_ada.basic import BasicRNN\nfrom model_word_ada.densenet import DenseRNN\nfrom model_word_ada.ldnet import LDRNN\n\nfrom model_seq.crf import CRFLoss, CRFDecode\nfrom model_seq.dataset import SeqDataset\nfrom model_seq.evaluator import eval_wc\nfrom model_seq.seqlabel import SeqLabel, Vanilla_SeqLabel\nfrom model_seq.seqlm import BasicSeqLM\nfrom model_seq.sparse_lm import SparseSeqLM\nimport model_seq.utils as utils\n\nfrom torch_scope import wrapper\n\nimport argparse\nimport logging\nimport json\nimport os\nimport sys\nimport itertools\nimport functools\n\nlogger = logging.getLogger(__name__)\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n\n    parser.add_argument('--gpu', type=str, default=\"auto\")\n    parser.add_argument('--cp_root', default='./checkpoint')\n    parser.add_argument('--checkpoint_name', default='ner')\n    parser.add_argument('--git_tracking', action='store_true')\n\n    parser.add_argument('--corpus', default='./data/ner_dataset.pk')\n    parser.add_argument('--forward_lm', default='./checkpoint/ld0.th')\n    parser.add_argument('--backward_lm', default='./checkpoint/ld_0.th')\n\n    parser.add_argument('--lm_hid_dim', type=int, default=300)\n    parser.add_argument('--lm_word_dim', type=int, default=300)\n    parser.add_argument('--lm_label_dim', type=int, default=-1)\n    parser.add_argument('--lm_layer_num', type=int, default=10)\n    parser.add_argument('--lm_droprate', type=float, default=0.5)\n    parser.add_argument('--lm_rnn_layer', choices=['Basic', 'DenseNet', 'LDNet'], default='LDNet')\n    parser.add_argument('--lm_rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm')\n\n    parser.add_argument('--seq_c_dim', type=int, default=30)\n    
parser.add_argument('--seq_c_hid', type=int, default=150)\n    parser.add_argument('--seq_c_layer', type=int, default=1)\n    parser.add_argument('--seq_w_dim', type=int, default=100)\n    parser.add_argument('--seq_w_hid', type=int, default=300)\n    parser.add_argument('--seq_w_layer', type=int, default=1)\n    parser.add_argument('--seq_droprate', type=float, default=0.5)\n    parser.add_argument('--seq_model', choices=['vanilla', 'lm-aug'], default='lm-aug')\n    parser.add_argument('--seq_rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm')\n    parser.add_argument('--seq_lm_model', choices=['vanilla', 'sparse-lm'], default='vanilla')\n\n    parser.add_argument('--batch_size', type=int, default=10)\n    parser.add_argument('--patience', type=int, default=15)\n    parser.add_argument('--epoch', type=int, default=200)\n    parser.add_argument('--clip', type=float, default=5)\n    parser.add_argument('--lr', type=float, default=0.015)\n    parser.add_argument('--lr_decay', type=float, default=0.05)\n    parser.add_argument('--update', choices=['Adam', 'Adagrad', 'Adadelta', 'SGD'], default='SGD')\n    args = parser.parse_args()\n\n    pw = wrapper(os.path.join(args.cp_root, args.checkpoint_name), args.checkpoint_name, enable_git_track=args.git_tracking)\n\n    gpu_index = pw.auto_device() if 'auto' == args.gpu else int(args.gpu)\n    device = torch.device(\"cuda:\" + str(gpu_index) if gpu_index >= 0 else \"cpu\")\n    if gpu_index >= 0:\n        torch.cuda.set_device(gpu_index)\n    \n    logger.info('Loading data')\n\n    dataset = pickle.load(open(args.corpus, 'rb'))\n    name_list = ['flm_map', 'blm_map', 'gw_map', 'c_map', 'y_map', 'emb_array', 'train_data', 'test_data', 'dev_data']\n    flm_map, blm_map, gw_map, c_map, y_map, emb_array, train_data, test_data, dev_data = [dataset[tup] for tup in name_list ]\n\n    logger.info('Loading language model')\n\n    rnn_map = {'Basic': BasicRNN, 'DenseNet': DenseRNN, 'LDNet': functools.partial(LDRNN, layer_drop 
= 0)}\n    flm_rnn_layer = rnn_map[args.lm_rnn_layer](args.lm_layer_num, args.lm_rnn_unit, args.lm_word_dim, args.lm_hid_dim, args.lm_droprate)\n    blm_rnn_layer = rnn_map[args.lm_rnn_layer](args.lm_layer_num, args.lm_rnn_unit, args.lm_word_dim, args.lm_hid_dim, args.lm_droprate)\n    flm_model = LM(flm_rnn_layer, None, len(flm_map), args.lm_word_dim, args.lm_droprate, label_dim = args.lm_label_dim)\n    blm_model = LM(blm_rnn_layer, None, len(blm_map), args.lm_word_dim, args.lm_droprate, label_dim = args.lm_label_dim)\n    flm_file = wrapper.restore_checkpoint(args.forward_lm)['model']\n    flm_model.load_state_dict(flm_file, False)\n    blm_file = wrapper.restore_checkpoint(args.backward_lm)['model']\n    blm_model.load_state_dict(blm_file, False)\n    slm_map = {'vanilla': BasicSeqLM, 'sparse-lm': SparseSeqLM}\n    flm_model_seq = slm_map[args.seq_lm_model](flm_model, False, args.lm_droprate, True)\n    blm_model_seq = slm_map[args.seq_lm_model](blm_model, True, args.lm_droprate, True)\n\n    logger.info('Building models')\n\n    SL_map = {'vanilla':Vanilla_SeqLabel, 'lm-aug': SeqLabel}\n    seq_model = SL_map[args.seq_model](flm_model_seq, blm_model_seq, len(c_map), args.seq_c_dim, args.seq_c_hid, args.seq_c_layer, len(gw_map), args.seq_w_dim, args.seq_w_hid, args.seq_w_layer, len(y_map), args.seq_droprate, unit=args.seq_rnn_unit)\n    seq_model.rand_init()\n    seq_model.load_pretrained_word_embedding(torch.FloatTensor(emb_array))\n    seq_model.to(device)\n    crit = CRFLoss(y_map)\n    decoder = CRFDecode(y_map)\n    evaluator = eval_wc(decoder, 'f1')\n\n    logger.info('Constructing dataset')\n\n    train_dataset, test_dataset, dev_dataset = [SeqDataset(tup_data, flm_map['\\n'], blm_map['\\n'], gw_map['<\\n>'], c_map[' '], c_map['\\n'], y_map['<s>'], y_map['<eof>'], len(y_map), args.batch_size) for tup_data in [train_data, test_data, dev_data]]\n\n    logger.info('Constructing optimizer')\n\n    param_dict = filter(lambda t: t.requires_grad, 
seq_model.parameters())\n    optim_map = {'Adam' : optim.Adam, 'Adagrad': optim.Adagrad, 'Adadelta': optim.Adadelta, 'SGD': functools.partial(optim.SGD, momentum=0.9)}\n    if args.lr > 0:\n        optimizer=optim_map[args.update](param_dict, lr=args.lr)\n    else:\n        optimizer=optim_map[args.update](param_dict)\n\n    logger.info('Saving configues.')\n    pw.save_configue(args)\n\n    logger.info('Setting up training environ.')\n    best_f1 = float('-inf')\n    patience_count = 0\n    batch_index = 0\n    normalizer=0\n    tot_loss = 0\n\n    for indexs in range(args.epoch):\n\n        logger.info('############')\n        logger.info('Epoch: {}'.format(indexs))\n        pw.nvidia_memory_map()\n\n        seq_model.train()\n        for f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w, f_y, f_y_m, _ in train_dataset.get_tqdm(device):\n\n            seq_model.zero_grad()\n            output = seq_model(f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w)\n            loss = crit(output, f_y, f_y_m)\n\n            tot_loss += utils.to_scalar(loss)\n            normalizer += 1\n\n            loss.backward()\n            torch.nn.utils.clip_grad_norm_(seq_model.parameters(), args.clip)\n            optimizer.step()\n\n            batch_index += 1\n            if 0 == batch_index % 100:\n                pw.add_loss_vs_batch({'training_loss': tot_loss / (normalizer + 1e-9)}, batch_index, use_logger = False)\n                tot_loss = 0\n                normalizer = 0\n\n        if args.lr > 0:\n            current_lr = args.lr / (1 + (indexs + 1) * args.lr_decay)\n            utils.adjust_learning_rate(optimizer, current_lr)\n\n        dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(seq_model, dev_dataset.get_tqdm(device))\n\n        pw.add_loss_vs_batch({'dev_f1': dev_f1}, indexs, use_logger = True)\n        pw.add_loss_vs_batch({'dev_pre': dev_pre, 'dev_rec': dev_rec}, indexs, use_logger = False)\n        \n        logger.info('Saving model...')\n        
pw.save_checkpoint(model = seq_model, is_best = (dev_f1 > best_f1))\n\n        if dev_f1 > best_f1:\n            test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(seq_model, test_dataset.get_tqdm(device))\n            best_f1, best_dev_pre, best_dev_rec, best_dev_acc = dev_f1, dev_pre, dev_rec, dev_acc\n            pw.add_loss_vs_batch({'test_f1': test_f1}, indexs, use_logger = True)\n            pw.add_loss_vs_batch({'test_pre': test_pre, 'test_rec': test_rec}, indexs, use_logger = False)\n            patience_count = 0\n        else:\n            patience_count += 1\n            if patience_count >= args.patience:\n                break\n\n    pw.close()\n"
  },
  {
    "path": "train_seq_elmo.py",
    "content": "from __future__ import print_function\nimport datetime\nimport time\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nimport codecs\nimport pickle\nimport math\nimport numpy as np\n\nfrom model_word_ada.LM import LM\nfrom model_word_ada.basic import BasicRNN\nfrom model_word_ada.densenet import DenseRNN\nfrom model_word_ada.ldnet import LDRNN\n\nfrom model_seq.crf import CRFLoss, CRFDecode\nfrom model_seq.dataset import SeqDataset\nfrom model_seq.evaluator import eval_wc\nfrom model_seq.seqlabel import SeqLabel, Vanilla_SeqLabel\nfrom model_seq.seqlm import BasicSeqLM\nfrom model_seq.elmo import ElmoLM\nimport model_seq.utils as utils\n\nfrom torch_scope import wrapper\n\nimport argparse\nimport logging\nimport json\nimport os\nimport sys\nimport itertools\nimport functools\n\nlogger = logging.getLogger(__name__)\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n\n    parser.add_argument('--gpu', type=str, default=\"auto\")\n    parser.add_argument('--cp_root', default='./checkpoint')\n    parser.add_argument('--checkpoint_name', default='elmo_ner')\n    parser.add_argument('--git_tracking', action='store_true')\n\n    parser.add_argument('--corpus', default='./data/ner_dataset.pk')\n    parser.add_argument('--forward_lm', default='./checkpoint/basic0.th')\n    parser.add_argument('--backward_lm', default='./checkpoint/basic_0.th')\n\n    parser.add_argument('--lm_hid_dim', type=int, default=2048)\n    parser.add_argument('--lm_word_dim', type=int, default=300)\n    parser.add_argument('--lm_label_dim', type=int, default=-1)\n    parser.add_argument('--lm_layer_num', type=int, default=2)\n    parser.add_argument('--lm_droprate', type=float, default=0.5)\n    parser.add_argument('--lm_rnn_layer', choices=['Basic'], default='Basic')\n    parser.add_argument('--lm_rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm')\n\n    parser.add_argument('--seq_c_dim', type=int, default=30)\n    
parser.add_argument('--seq_c_hid', type=int, default=150)\n    parser.add_argument('--seq_c_layer', type=int, default=1)\n    parser.add_argument('--seq_w_dim', type=int, default=100)\n    parser.add_argument('--seq_w_hid', type=int, default=300)\n    parser.add_argument('--seq_w_layer', type=int, default=1)\n    parser.add_argument('--seq_droprate', type=float, default=0.5)\n    parser.add_argument('--seq_model', choices=['vanilla', 'lm-aug'], default='lm-aug')\n    parser.add_argument('--seq_rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm')\n    parser.add_argument('--seq_lambda0', type=float, default=0.01)\n\n    parser.add_argument('--batch_size', type=int, default=10)\n    parser.add_argument('--patience', type=int, default=15)\n    parser.add_argument('--epoch', type=int, default=200)\n    parser.add_argument('--clip', type=float, default=5)\n    parser.add_argument('--lr', type=float, default=0.015)\n    parser.add_argument('--lr_decay', type=float, default=0.05)\n    parser.add_argument('--update', choices=['Adam', 'Adagrad', 'Adadelta', 'SGD'], default='SGD')\n    args = parser.parse_args()\n\n    pw = wrapper(os.path.join(args.cp_root, args.checkpoint_name), args.checkpoint_name, enable_git_track=args.git_tracking)\n\n    gpu_index = pw.auto_device() if 'auto' == args.gpu else int(args.gpu)\n    device = torch.device(\"cuda:\" + str(gpu_index) if gpu_index >= 0 else \"cpu\")\n    if gpu_index >= 0:\n        torch.cuda.set_device(gpu_index)\n\n    logger.info('Loading data')\n\n    dataset = pickle.load(open(args.corpus, 'rb'))\n    name_list = ['flm_map', 'blm_map', 'gw_map', 'c_map', 'y_map', 'emb_array', 'train_data', 'test_data', 'dev_data']\n    flm_map, blm_map, gw_map, c_map, y_map, emb_array, train_data, test_data, dev_data = [dataset[tup] for tup in name_list ]\n\n    logger.info('Loading language model')\n\n    rnn_map = {'Basic': BasicRNN}\n    flm_rnn_layer = rnn_map[args.lm_rnn_layer](args.lm_layer_num, args.lm_rnn_unit, 
args.lm_word_dim, args.lm_hid_dim, args.lm_droprate)\n    blm_rnn_layer = rnn_map[args.lm_rnn_layer](args.lm_layer_num, args.lm_rnn_unit, args.lm_word_dim, args.lm_hid_dim, args.lm_droprate)\n    flm_model = LM(flm_rnn_layer, None, len(flm_map), args.lm_word_dim, args.lm_droprate, label_dim = args.lm_label_dim)\n    blm_model = LM(blm_rnn_layer, None, len(blm_map), args.lm_word_dim, args.lm_droprate, label_dim = args.lm_label_dim)\n    flm_file = wrapper.restore_checkpoint(args.forward_lm)['model']\n    flm_model.load_state_dict(flm_file, False)\n    blm_file = wrapper.restore_checkpoint(args.backward_lm)['model']\n    blm_model.load_state_dict(blm_file, False)\n    flm_model_seq = ElmoLM(flm_model, False, args.lm_droprate, True)\n    blm_model_seq = ElmoLM(blm_model, True, args.lm_droprate, True)\n\n    logger.info('Building model')\n\n    SL_map = {'vanilla':Vanilla_SeqLabel, 'lm-aug': SeqLabel}\n    seq_model = SL_map[args.seq_model](flm_model_seq, blm_model_seq, len(c_map), args.seq_c_dim, args.seq_c_hid, args.seq_c_layer, len(gw_map), args.seq_w_dim, args.seq_w_hid, args.seq_w_layer, len(y_map), args.seq_droprate, unit=args.seq_rnn_unit)\n    seq_model.rand_init()\n    seq_model.load_pretrained_word_embedding(torch.FloatTensor(emb_array))\n    seq_model.to(device)\n    crit = CRFLoss(y_map)\n    decoder = CRFDecode(y_map)\n    evaluator = eval_wc(decoder, 'f1')\n\n    print('constructing dataset')\n\n    train_dataset, test_dataset, dev_dataset = [SeqDataset(tup_data, flm_map['\\n'], blm_map['\\n'], gw_map['<\\n>'], c_map[' '], c_map['\\n'], y_map['<s>'], y_map['<eof>'], len(y_map), args.batch_size) for tup_data in [train_data, test_data, dev_data]]\n\n    print('constructing optimizer')\n\n    param_dict = filter(lambda t: t.requires_grad, seq_model.parameters())\n    optim_map = {'Adam' : optim.Adam, 'Adagrad': optim.Adagrad, 'Adadelta': optim.Adadelta, 'SGD': functools.partial(optim.SGD, momentum=0.9)}\n    if args.lr > 0:\n        
optimizer=optim_map[args.update](param_dict, lr=args.lr)\n    else:\n        optimizer=optim_map[args.update](param_dict)\n    \n    logger.info('Saving configues.')\n\n    pw.save_configue(args)\n\n    logger.info('Setting up training environ.')\n\n    best_f1 = float('-inf')\n    patience_count = 0\n    batch_index = 0\n    normalizer = 0\n    tot_loss = 0\n\n    for indexs in range(args.epoch):\n\n        logger.info('############')\n        logger.info('Epoch: {}'.format(indexs))\n        pw.nvidia_memory_map()\n\n        seq_model.train()\n        for f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w, f_y, f_y_m, _ in train_dataset.get_tqdm(device):\n\n            seq_model.zero_grad()\n            output = seq_model(f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w)\n            loss = crit(output, f_y, f_y_m)\n\n            tot_loss += utils.to_scalar(loss)\n            normalizer += 1\n\n            if args.seq_lambda0 > 0:\n                loss += args.seq_lambda0 * (flm_model_seq.regularizer(args.seq_lambda1) + blm_model_seq.regularizer(args.seq_lambda1))\n\n            loss.backward()\n            torch.nn.utils.clip_grad_norm_(seq_model.parameters(), args.clip)\n            optimizer.step()\n\n            batch_index += 1\n\n            if 0 == batch_index % 100:\n                pw.add_loss_vs_batch({'training_loss': tot_loss / (normalizer + 1e-9)}, batch_index, use_logger = False)\n                tot_loss = 0\n                normalizer = 0\n\n        if args.lr > 0:\n            current_lr = args.lr / (1 + (indexs + 1) * args.lr_decay)\n            utils.adjust_learning_rate(optimizer, current_lr)\n\n        dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(seq_model, dev_dataset.get_tqdm(device))\n\n        pw.add_loss_vs_batch({'dev_f1': dev_f1}, indexs, use_logger = True)\n        pw.add_loss_vs_batch({'dev_pre': dev_pre, 'dev_rec': dev_rec}, indexs, use_logger = False)\n        \n        logger.info('Saving model...')\n        
pw.save_checkpoint(model = seq_model, is_best = (dev_f1 > best_f1))\n\n        if dev_f1 > best_f1:\n            test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(seq_model, test_dataset.get_tqdm(device))\n            best_f1, best_dev_pre, best_dev_rec, best_dev_acc = dev_f1, dev_pre, dev_rec, dev_acc\n            pw.add_loss_vs_batch({'test_f1': test_f1}, indexs, use_logger = True)\n            pw.add_loss_vs_batch({'test_pre': test_pre, 'test_rec': test_rec}, indexs, use_logger = False)\n            patience_count = 0\n        else:\n            patience_count += 1\n            if patience_count >= args.patience:\n                break\n\n    pw.close()\n"
  }
]