Repository: LiyuanLucasLiu/LD-Net Branch: master Commit: f9489b6e7d43 Files: 34 Total size: 151.5 KB Directory structure: gitextract_4ncvupf5/ ├── LICENSE ├── ReadMe.md ├── docs/ │ ├── Makefile │ └── source/ │ ├── conf.py │ ├── index.rst │ ├── seq.rst │ └── word.rst ├── ldnet_ner_prune.sh ├── ldnet_np_prune.sh ├── model_seq/ │ ├── __init__.py │ ├── crf.py │ ├── dataset.py │ ├── elmo.py │ ├── evaluator.py │ ├── seqlabel.py │ ├── seqlm.py │ ├── sparse_lm.py │ └── utils.py ├── model_word_ada/ │ ├── LM.py │ ├── __init__.py │ ├── adaptive.py │ ├── basic.py │ ├── dataset.py │ ├── densenet.py │ ├── ldnet.py │ └── utils.py ├── pre_seq/ │ ├── encode_data.py │ └── gene_map.py ├── pre_word_ada/ │ ├── encode_data2folder.py │ └── gene_map.py ├── prune_sparse_seq.py ├── train_lm.py ├── train_seq.py └── train_seq_elmo.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [2018] [Liyuan Liu] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: ReadMe.md ================================================ # LD-Net [![Documentation Status](https://readthedocs.org/projects/ld-net/badge/?version=latest)](http://ld-net.readthedocs.io/en/latest/?badge=latest) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) **Check Our New NER Toolkit🚀🚀🚀** - **Inference**: - **[LightNER](https://github.com/LiyuanLucasLiu/LightNER)**: inference w. models pre-trained / trained w. *any* of the following tools, *efficiently*. - **Training**: - **[LD-Net](https://github.com/LiyuanLucasLiu/LD-Net)**: train NER models w. efficient contextualized representations. - **[VanillaNER](https://github.com/LiyuanLucasLiu/Vanilla_NER)**: train vanilla NER models w. pre-trained embedding. - **Distant Training**: - **[AutoNER](https://shangjingbo1226.github.io/AutoNER/)**: train NER models w.o. line-by-line annotations and get competitive performance. -------------------------------- LD-Net provides sequence labeling models featuring: - **Efficiency**: constructing *efficient contextualized representations* without retraining language models. - **Portability**: *well-organized*, *easy-to-modify* and *[well-documented](http://lm-lstm-crf.readthedocs.io/en/latest/)*. Remarkably, our pre-trained NER model achieved: - **92.08** test F1 on the CoNLL03 NER task. 
- **160K words/sec** decoding speed (**6X** speedup compared to its original model). Details about LD-Net can be accessed at: https://arxiv.org/abs/1804.07827. - [Model notes](#model-notes) - [Benchmarks](#benchmarks) - [Pretrained models](#pretrained-models) - [Language models](#language-models) - [Named Entity Recognition](#named-entity-recognition) - [Chunking](#chunking) - [Training](#training) - [Dependency](#dependency) - [Data](#data) - [Model](#model) - [Command](#command) - [Inference](#inference) - [Citation](#citation) ## Model Notes ![LD-Net Framework](docs/model_note.png) ## Benchmarks | Model for CoNLL03 | #FLOPs| Mean(F1) | Std(F1) | | ------------- |-------------| -----| -----| | Vanilla NER w.o. LM | 3 M | 90.78 | 0.24 | | LD-Net (w.o. pruning) | 51 M | 91.86 | 0.15 | | LD-Net (origin, picked based on dev f1) | 51 M | 91.95 | | | LD-Net (pruned) | **5 M** | 91.84 | 0.14 | | Model for CoNLL00 | #FLOPs| Mean(F1) | Std(F1) | | ------------- |-------------| -----| -----| | Vanilla NP w.o. LM | 3 M | 94.42 | 0.08 | | LD-Net (w.o. pruning) | 51 M | 96.01 | 0.07 | | LD-Net (origin, picked based on dev f1) | 51 M | 96.13 | | | LD-Net (pruned) | **10 M** | 95.66 | 0.04 | ## Pretrained Models Here we provide both pre-trained language models and pre-trained sequence labeling models. ### Language Models Our pretrained language model contains a word embedding, a 10-layer densely-connected LSTM and an adaptive softmax, and achieves an average PPL of 50.06 on the one billion benchmark dataset. | Forward Language Model | Backward Language Model | | ------------- |------------- | | [Download Link](http://dmserv4.cs.illinois.edu/ld0.th) | [Download Link](http://dmserv4.cs.illinois.edu/ld_0.th)| ### Named Entity Recognition The original pre-trained named entity tagger achieves 91.95 F1, while the pruned tagger achieves 92.08 F1. 
| Original Tagger | Pruned Tagger | | ------------- |------------- | | [Download Link](http://dmserv4.cs.illinois.edu/ner.th) | [Download Link](http://dmserv4.cs.illinois.edu/pner0.th) | ### Chunking The original pre-trained chunking tagger achieves 96.13 F1, while the pruned tagger achieves 95.79 F1. | Original Tagger | Pruned Tagger | | ------------- |------------- | | [Download Link](http://dmserv4.cs.illinois.edu/np.th) | [Download Link](http://dmserv4.cs.illinois.edu/pnp0.th) | ## Training ### Demo Scripts To prune the original LD-Net for the CoNLL03 NER task, please run: ``` bash ldnet_ner_prune.sh ``` To prune the original LD-Net for the CoNLL00 Chunking task, please run: ``` bash ldnet_np_prune.sh ``` ### Dependency Our package is based on Python 3.6 and the following packages: ``` numpy tqdm torch-scope torch==0.4.1 ``` ### Data Pre-process scripts are available in ```pre_seq``` and ```pre_word_ada```, while pre-processed data has been stored in: | NER | Chunking | | ------------- |------------- | | [Download Link](http://dmserv4.cs.illinois.edu/ner_dataset.pk) | [Download Link](http://dmserv4.cs.illinois.edu/np_dataset.pk) | ### Model Our implementations are available in ```model_seq``` and ```model_word_ada```, and the documentation is hosted on [Read the Docs](http://lm-lstm-crf.readthedocs.io/en/latest/). | NER | Chunking | | ------------- |------------- | | [Download Link](http://dmserv4.cs.illinois.edu/ner_dataset.pk) | [Download Link](http://dmserv4.cs.illinois.edu/np_dataset.pk) | ## Inference For model inference, please check our [LightNER package](https://github.com/LiyuanLucasLiu/LightNER). ## Citation If you find the implementation useful, please cite the following paper: [Efficient Contextualized Representation: Language Model Pruning for Sequence Labeling](https://arxiv.org/abs/1804.07827) ``` @inproceedings{liu2018efficient, title = "{Efficient Contextualized Representation: Language Model Pruning for Sequence Labeling}", author = {Liu, Liyuan and Ren, 
Xiang and Shang, Jingbo and Peng, Jian and Han, Jiawei}, booktitle = {EMNLP}, year = 2018, } ``` ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = python -msphinx SPHINXPROJ = LD_Net SOURCEDIR = source BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/source/conf.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # # Wrapper documentation build configuration file, created by # sphinx-quickstart on Thu Sep 14 03:49:01 2017. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. import os import sys sys.path.insert(0, os.path.abspath('../..')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.mathjax', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode', 'sphinx.ext.githubpages' ] napoleon_use_ivar = True # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The master toctree document. master_doc = 'index' # General information about the project. project = 'LD-Net' copyright = '2018, Liyuan Liu' author = 'Liyuan Liu' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = '' # The full version, including alpha/beta/rc tags. release = '' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path exclude_patterns = [] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. 
For a list of options available for each theme, see the # documentation. # # html_theme_options = {} html_theme_options = { 'collapse_navigation': False, 'display_version': True, } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # This is required for the alabaster theme # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars html_sidebars = { '**': [ 'about.html', 'navigation.html', 'relations.html', # needs 'show_related': True theme option to display 'searchbox.html', 'donate.html', ] } # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. htmlhelp_basename = 'LD_Net' # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, 'ldnet.tex', 'LD-Net Documentation', 'Liyuan Liu', 'manual'), ] # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ (master_doc, 'LD-Net', 'LD-Net Documentation', [author], 1) ] # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. 
List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'LD-Net', 'LD-Net Documentation', author, 'LD-Net', 'Efficient Contextualized Representations.', 'Miscellaneous'), ] autodoc_mock_imports = ['torch', 'numpy', 'tensorboardX', 'git', 'tqdm'] intersphinx_mapping = { 'git': ('https://gitpython.readthedocs.io/en/stable/', None), 'tensorboardX': ('https://tensorboardx.readthedocs.io/en/latest/', None), 'python':('https://docs.python.org/3', None), 'numpy': ('http://docs.scipy.org/doc/numpy/', None), 'torch': ('http://pytorch.org/docs/master', None) } ================================================ FILE: docs/source/index.rst ================================================ .. LD-Net documentation master file. :github_url: https://github.com/LiyuanLucasLiu/LD-Net LD-Net documentation ========================= **Check Our New NER Toolkit🚀🚀🚀** - **Inference**: - `LightNER <https://github.com/LiyuanLucasLiu/LightNER>`_: inference w. models pre-trained / trained w. *any* of the following tools, *efficiently*. - **Training**: - `LD-Net <https://github.com/LiyuanLucasLiu/LD-Net>`_: train NER models w. efficient contextualized representations. - `VanillaNER <https://github.com/LiyuanLucasLiu/Vanilla_NER>`_: train vanilla NER models w. pre-trained embedding. - **Distant Training**: - `AutoNER <https://shangjingbo1226.github.io/AutoNER/>`_: train NER models w.o. line-by-line annotations and get competitive performance. -------------------------- This project provides a high-performance word-level language model and sequence labeling with contextualized representations. The key feature of this project is the support of language model pruning without retraining. Details about LD-Net can be accessed at: https://arxiv.org/abs/1804.07827. .. 
toctree:: :maxdepth: 2 :caption: Sequence Labeling seq Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` ================================================ FILE: docs/source/seq.rst ================================================ Sequence Labeling ========================== model_seq\.crf module ---------------------- .. automodule:: model_seq.crf :members: model_seq\.dataset module -------------------------- .. automodule:: model_seq.dataset :members: model_seq\.elmo module ----------------------- .. automodule:: model_seq.elmo :members: model_seq\.evaluator module ---------------------------- .. automodule:: model_seq.evaluator :members: model_seq\.seqlabel module -------------------------- .. automodule:: model_seq.seqlabel :members: model_seq\.seqlm module ------------------------ .. automodule:: model_seq.seqlm :members: model_seq\.sparse_lm module ---------------------------- .. automodule:: model_seq.sparse_lm :members: model_seq\.utils module ------------------------- .. automodule:: model_seq.utils :members: ================================================ FILE: docs/source/word.rst ================================================ Language Modeling ========================== model_word_ada\.adaptive module ------------------------------- .. automodule:: model_word_ada.adaptive :members: model_word_ada\.basic module ---------------------------- .. automodule:: model_word_ada.basic :members: model_word_ada\.dataset module ------------------------------- .. automodule:: model_word_ada.dataset :members: model_word_ada\.densenet module ------------------------------- .. automodule:: model_word_ada.densenet :members: model_word_ada\.ldnet module ---------------------------- .. automodule:: model_word_ada.ldnet :members: model_word_ada\.LM module ------------------------- .. automodule:: model_word_ada.LM :members: model_word_ada\.utils module ---------------------------- .. 
# Demo script: prune the pre-trained LD-Net NER tagger (CoNLL03).
#
# On the first run the pre-processed dataset and the original checkpoint are
# downloaded if they are not present yet; afterwards prune_sparse_seq.py is
# launched on them. Intended to be run as `bash ldnet_ner_prune.sh`.

FIRST_RUN=1

DATA_ROOT="data/"
NER_DATASET=$DATA_ROOT/ner_dataset.pk

CHECKPOINT_ROOT="checkpoint/"
NER_CHECKPOINT=$CHECKPOINT_ROOT/ner.th
CHECKPOINT_NAME="p_ner0"

# Terminal colour codes used for the progress banners.
green=$(tput setaf 2)
reset=$(tput sgr0)

if [ "$FIRST_RUN" = 1 ] && [ ! -e "$NER_DATASET" ]; then
    echo "${green}=== Downloading Dataset ===${reset}"
    # Create the directory the variable points to; a bare `DATA_ROOT` would
    # create a directory with that literal name and leave curl without a
    # target directory.
    mkdir -p "$DATA_ROOT"
    curl http://dmserv4.cs.illinois.edu/ner_dataset.pk -o "$NER_DATASET"
fi

if [ "$FIRST_RUN" = 1 ] && [ ! -e "$NER_CHECKPOINT" ]; then
    echo "${green}=== Downloading Checkpoint ===${reset}"
    mkdir -p "$CHECKPOINT_ROOT"
    curl http://dmserv4.cs.illinois.edu/ner.th -o "$NER_CHECKPOINT"
fi

echo "${green}=== Pruning NER Model ===${reset}"
python prune_sparse_seq.py \
    --cp_root "$CHECKPOINT_ROOT" \
    --checkpoint_name "$CHECKPOINT_NAME" \
    --corpus "$NER_DATASET" \
    --load_seq "$NER_CHECKPOINT" \
    --seq_lambda0 0.05 \
    --seq_lambda1 2
-e $NP_CHECKPOINT ]; then echo ${green}=== Downloading Checkpoint ===${reset} mkdir -p CHECKPOINT_ROOT curl http://dmserv4.cs.illinois.edu/np.th -o $NP_CHECKPOINT fi echo ${green}=== Pruning NER Model ===${reset} python prune_sparse_seq.py --cp_root $CHECKPOINT_ROOT --checkpoint_name $CHECKPOINT_NAME --corpus $NP_DATASET --load_seq $NP_CHECKPOINT --seq_lambda0 0.05 --seq_lambda1 2 ================================================ FILE: model_seq/__init__.py ================================================ ================================================ FILE: model_seq/crf.py ================================================ """ .. module:: crf :synopsis: conditional random field .. moduleauthor:: Liyuan Liu """ import torch import torch.nn as nn import torch.optim as optim import torch.sparse as sparse import model_seq.utils as utils class CRF(nn.Module): """ Conditional Random Field Module Parameters ---------- hidden_dim : ``int``, required. the dimension of the input features. tagset_size : ``int``, required. the size of the target labels. if_bias: ``bool``, optional, (default=True). whether the linear transformation has the bias term. """ def __init__(self, hidden_dim: int, tagset_size: int, if_bias: bool = True): super(CRF, self).__init__() self.tagset_size = tagset_size self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size, bias=if_bias) self.transitions = nn.Parameter(torch.Tensor(self.tagset_size, self.tagset_size)) def rand_init(self): """ random initialization """ utils.init_linear(self.hidden2tag) self.transitions.data.zero_() def forward(self, feats): """ calculate the potential score for the conditional random field. Parameters ---------- feats: ``torch.FloatTensor``, required. the input features for the conditional random field, of shape (*, hidden_dim). Returns ------- output: ``torch.FloatTensor``. 
A float tensor of shape (ins_num, from_tag_size, to_tag_size) """ scores = self.hidden2tag(feats).view(-1, 1, self.tagset_size) ins_num = scores.size(0) crf_scores = scores.expand(ins_num, self.tagset_size, self.tagset_size) + self.transitions.view(1, self.tagset_size, self.tagset_size).expand(ins_num, self.tagset_size, self.tagset_size) return crf_scores class CRFLoss(nn.Module): """ The negative loss for the Conditional Random Field Module Parameters ---------- y_map : ``dict``, required. a ``dict`` maps from tag string to tag index. average_batch : ``bool``, optional, (default=True). whether the return score would be averaged per batch. """ def __init__(self, y_map: dict, average_batch: bool = True): super(CRFLoss, self).__init__() self.tagset_size = len(y_map) self.start_tag = y_map[''] self.end_tag = y_map[''] self.average_batch = average_batch def forward(self, scores, target, mask): """ calculate the negative log likehood for the conditional random field. Parameters ---------- scores: ``torch.FloatTensor``, required. the potential score for the conditional random field, of shape (seq_len, batch_size, from_tag_size, to_tag_size). target: ``torch.LongTensor``, required. the positive path for the conditional random field, of shape (seq_len, batch_size). mask: ``torch.ByteTensor``, required. the mask for the unpadded sentence parts, of shape (seq_len, batch_size). Returns ------- loss: ``torch.FloatTensor``. The NLL loss. 
""" seq_len = scores.size(0) bat_size = scores.size(1) tg_energy = torch.gather(scores.view(seq_len, bat_size, -1), 2, target.unsqueeze(2)).view(seq_len, bat_size) tg_energy = tg_energy.masked_select(mask).sum() seq_iter = enumerate(scores) _, inivalues = seq_iter.__next__() partition = inivalues[:, self.start_tag, :].squeeze(1).clone() for idx, cur_values in seq_iter: cur_values = cur_values + partition.unsqueeze(2).expand(bat_size, self.tagset_size, self.tagset_size) cur_partition = utils.log_sum_exp(cur_values) mask_idx = mask[idx, :].view(bat_size, 1).expand(bat_size, self.tagset_size) partition.masked_scatter_(mask_idx, cur_partition.masked_select(mask_idx)) partition = partition[:, self.end_tag].sum() if self.average_batch: return (partition - tg_energy) / bat_size else: return (partition - tg_energy) class CRFDecode(): """ The negative loss for the Conditional Random Field Module Parameters ---------- y_map : ``dict``, required. a ``dict`` maps from tag string to tag index. """ def __init__(self, y_map: dict): self.tagset_size = len(y_map) self.start_tag = y_map[''] self.end_tag = y_map[''] self.y_map = y_map self.r_y_map = {v:k for k, v in self.y_map.items()} def decode(self, scores, mask): """ find the best path from the potential scores by the viterbi decoding algorithm. Parameters ---------- scores: ``torch.FloatTensor``, required. the potential score for the conditional random field, of shape (seq_len, batch_size, from_tag_size, to_tag_size). mask: ``torch.ByteTensor``, required. the mask for the unpadded sentence parts, of shape (seq_len, batch_size). Returns ------- output: ``torch.LongTensor``. 
def to_spans(self, sequence):
    """
    Decode a label-index path into a set of entity chunks (IOBES schema).

    Parameters
    ----------
    sequence: list, required.
        the list of label indexes of one path; each index is looked up in
        ``self.r_y_map`` to get its label string.

    Returns
    -------
    output: ``set``.
        A set of chunks; each chunk is the string
        ``"<type>@<pos>@<pos>..."`` joining the entity type with every
        position it covers.
    """
    chunks = []
    current = None

    for i, y in enumerate(sequence):
        label = self.r_y_map[y]
        # Strip only the two-character schema prefix. ``str.replace`` would
        # also remove later occurrences inside the type name (e.g.
        # 'I-ANTI-VIRUS' -> 'ANTVIRUS'), so B- and I-/E- tags of the same
        # entity would no longer agree on the type.
        prefix, base = label[:2], label[2:]
        position = '%d' % i

        if prefix in ('I-', 'E-') and current is not None and current[0] == base:
            # Continuation of the chunk that is currently open.
            current.append(position)
        else:
            # Anything else closes the open chunk (if any) ...
            if current is not None:
                chunks.append('@'.join(current))
            # ... and every schema tag opens a new one; other labels do not.
            if prefix in ('B-', 'S-', 'I-', 'E-'):
                current = [base, position]
            else:
                current = None

        # S- and E- tags terminate their chunk immediately.
        if prefix in ('S-', 'E-') and current is not None:
            chunks.append('@'.join(current))
            current = None

    if current is not None:
        chunks.append('@'.join(current))

    return set(chunks)
def get_tqdm(self, device):
    """
    construct dataset reader and the corresponding tqdm.

    Parameters
    ----------
    device: ``torch.device``, required.
        the target device for the dataset loader.

    Returns
    -------
    A ``tqdm`` progress iterator wrapping ``self.reader(device)``.
    """
    # ``reader`` also yields the final, possibly smaller batch, so the number
    # of iterations is the ceiling (not the floor) of index_length / batch_size.
    batch_count = (self.index_length + self.batch_size - 1) // self.batch_size
    return tqdm(self.reader(device), mininterval=2, total=batch_count, leave=False, file=sys.stdout, ncols=80)
def batchify(self, batch, device):
    """
    Pad a list of instances to a common length, turn them into tensors and
    move the tensors to ``device``.

    Parameters
    ----------
    batch: ``list``, required.
        a list of instances from the encoded dataset (outputs of preprocess
        scripts, after ``construct_index``).
    device: ``torch.device``, required.
        the target device for the dataset loader.

    Returns
    -------
    A list of eleven items: ten tensors (slots 0-9 below, moved to
    ``device``) followed by the plain python list of ``instance[4]`` label
    sequences (slot 10). Slots 0, 2, 4, 5, 7, 8 and 9 are transposed so that
    the first dimension runs over positions and the second over instances;
    slots 1, 3 and 6 are additionally flattened to one dimension.
    """
    cur_batch_size = len(batch)

    # Longest character sequence / word sequence in this batch.
    char_padded_len = max([len(tup[3]) for tup in batch])
    word_padded_len = max([len(tup[0]) for tup in batch])

    # One accumulator list per output slot.
    tmp_batch = [list() for ind in range(11)]

    for instance_ind in range(cur_batch_size):

        instance = batch[instance_ind]

        # Number of pad symbols this instance needs on top of the one extra
        # pad position every row receives.
        char_padded_len_ins = char_padded_len - len(instance[3])
        word_padded_len_ins = word_padded_len - len(instance[0])

        # Slot 0: characters in reading order, followed by padding.
        # Slot 2: characters reversed, preceded by one pad, followed by padding.
        tmp_batch[0].append(instance[3] + [self.c_pad] + [self.c_pad] * char_padded_len_ins)
        tmp_batch[2].append([self.c_pad] + instance[3][::-1] + [self.c_pad] * char_padded_len_ins)

        # Slot 1: running sum of instance[5] (presumably the per-word
        # character lengths appended by construct_index -- TODO confirm),
        # converted from (position x-1, instance) to a row index of the
        # flattened (position, instance) layout.
        tmp_p = list( itertools.accumulate(instance[5]+[1]+[0]* word_padded_len_ins) )
        tmp_batch[1].append([(x - 1) * cur_batch_size + instance_ind for x in tmp_p])
        # Slot 3: the same kind of index computed over the reversed lengths,
        # i.e. for the reversed character row of slot 2.
        tmp_p = list(itertools.accumulate([1]+instance[5][::-1]))[::-1] + [1]*word_padded_len_ins
        tmp_batch[3].append([(x - 1) * cur_batch_size + instance_ind for x in tmp_p])

        # Slot 4: instance[0] padded with flm_pad.
        # Slot 5: instance[1] reversed, preceded by one blm_pad, then padded.
        tmp_batch[4].append(instance[0] + [self.flm_pad] + [self.flm_pad] * word_padded_len_ins)
        tmp_batch[5].append([self.blm_pad] + instance[1][::-1] + [self.blm_pad] * word_padded_len_ins)

        # Slot 6: flattened row indices that undo the reversal of slot 5;
        # positions past the sentence keep their own index.
        tmp_p = list(range(len(instance[1]), -1, -1)) + list(range(len(instance[1])+1, word_padded_len+1))
        tmp_batch[6].append([x * cur_batch_size + instance_ind for x in tmp_p])

        # Slot 7: instance[2] padded with w_pad.
        tmp_batch[7].append(instance[2] + [self.w_pad] + [self.w_pad] * word_padded_len_ins)

        # Slot 8: label transitions encoded as ``previous * y_size + current``:
        # y_start -> first label, each label -> next label, last label ->
        # y_pad, then y_pad -> y_pad for the padded positions.
        tmp_batch[8].append([self.y_start * self.y_size + instance[4][0]] + [instance[4][ind] * self.y_size + instance[4][ind+1] for ind in range(len(instance[4]) - 1)] + [instance[4][-1] * self.y_size + self.y_pad] + [self.y_pad * self.y_size + self.y_pad] * word_padded_len_ins)

        # Slot 9: mask, 1 for every label position plus the one that follows
        # the last label, 0 for the padding.
        tmp_batch[9].append([1] * len(instance[4]) + [1] + [0] * word_padded_len_ins)

        # Slot 10: the untouched label sequence (kept as a python list).
        tmp_batch[10].append(instance[4])

    # Slots 0-8 become LongTensors, slot 9 a ByteTensor; all are transposed
    # to (position, instance).
    tbt = [torch.LongTensor(v).transpose(0, 1).contiguous() for v in tmp_batch[0:9]] + [torch.ByteTensor(tmp_batch[9]).transpose(0, 1).contiguous()]

    # The three index slots are consumed as flat index vectors.
    tbt[1] = tbt[1].view(-1)
    tbt[3] = tbt[3].view(-1)
    tbt[6] = tbt[6].view(-1)

    return [ten.to(device) for ten in tbt] + [tmp_batch[10]]
class ElmoLM(nn.Module):
    """
    The language model for the ELMo RNNs wrapper.

    Parameters
    ----------
    ori_lm : ``torch.nn.Module``, required.
        the original module of language model.
    backward : ``bool``, required.
        whether the language model is backward.
    droprate : ``float``, required.
        the dropout ratio.
    fix_rate: ``bool``, required.
        whether to fix the ratio.
    """
    def __init__(self, ori_lm, backward, droprate, fix_rate):
        super(ElmoLM, self).__init__()

        self.rnn = ERNN(ori_lm.rnn, droprate, fix_rate)

        self.w_num = ori_lm.w_num
        self.w_dim = ori_lm.w_dim
        # The word embedding of the wrapped language model is reused and frozen.
        self.word_embed = ori_lm.word_embed
        self.word_embed.weight.requires_grad = False

        self.output_dim = ori_lm.rnn_output

        self.backward = backward

    def init_hidden(self):
        """
        initialize hidden states (nothing to do for this wrapper).
        """
        return

    def regularizer(self):
        """
        Calculate the regularization term.

        Returns
        ----------
        reg:
            whatever ``self.rnn.regularizer()`` returns.
        """
        return self.rnn.regularizer()

    def prox(self, lambda0=None):
        """
        the proximal calculator.

        Parameters
        ----------
        lambda0 : optional, (default=None).
            ignored; accepted so that this method can be called both with
            and without an argument.

        Returns
        ----------
        Always ``0.0``.
        """
        return 0.0

    def forward(self, w_in, ind=None):
        """
        Calculate the output.

        Parameters
        ----------
        w_in : ``torch.LongTensor``, required.
            the input tensor, of shape (seq_len, batch_size).
        ind : ``torch.LongTensor``, optional, (default=None).
            the flat index tensor used to re-order the outputs of the
            backward language model; required when ``backward`` is set.

        Returns
        ----------
        output: ``torch.FloatTensor``.
            The ELMo outputs.

        Raises
        ----------
        ValueError
            if the model is backward and ``ind`` is not given.
        """
        w_emb = self.word_embed(w_in)

        out = self.rnn(w_emb)

        if self.backward:
            if ind is None:
                raise ValueError("`ind` is required for the backward language model.")
            # Flatten (seq_len, batch) into rows, pick the rows in the order
            # given by ``ind`` and restore the original shape.
            out_size = out.size()
            out = out.view(out_size[0] * out_size[1], out_size[2]).index_select(0, ind).contiguous().view(out_size)

        return out
""" self.correct_labels = 0 self.total_labels = 0 self.gold_count = 0 self.guess_count = 0 self.overlap_count = 0 def calc_f1_batch(self, decoded_data, target_data): """ update statics for f1 score. Parameters ---------- decoded_data: ``torch.LongTensor``, required. the decoded best label index pathes. target_data: ``torch.LongTensor``, required. the golden label index pathes. """ batch_decoded = torch.unbind(decoded_data, 1) for decoded, target in zip(batch_decoded, target_data): length = len(target) best_path = decoded[:length] correct_labels_i, total_labels_i, gold_count_i, guess_count_i, overlap_count_i = self.eval_instance(best_path.numpy(), target) self.correct_labels += correct_labels_i self.total_labels += total_labels_i self.gold_count += gold_count_i self.guess_count += guess_count_i self.overlap_count += overlap_count_i def calc_acc_batch(self, decoded_data, target_data): """ update statics for accuracy score. Parameters ---------- decoded_data: ``torch.LongTensor``, required. the decoded best label index pathes. target_data: ``torch.LongTensor``, required. the golden label index pathes. """ batch_decoded = torch.unbind(decoded_data, 1) for decoded, target in zip(batch_decoded, target_data): # remove padding length = len(target) best_path = decoded[:length].numpy() self.total_labels += length self.correct_labels += np.sum(np.equal(best_path, gold)) def f1_score(self): """ calculate the f1 score based on the inner counter. """ if self.guess_count == 0: return 0.0, 0.0, 0.0, 0.0 precision = self.overlap_count / float(self.guess_count) recall = self.overlap_count / float(self.gold_count) if precision == 0.0 or recall == 0.0: return 0.0, 0.0, 0.0, 0.0 f = 2 * (precision * recall) / (precision + recall) accuracy = float(self.correct_labels) / self.total_labels return f, precision, recall, accuracy def acc_score(self): """ calculate the accuracy score based on the inner counter. 
""" if 0 == self.total_labels: return 0.0 accuracy = float(self.correct_labels) / self.total_labels return accuracy def eval_instance(self, best_path, gold): """ Calculate statics to update inner counters for one instance. Parameters ---------- best_path: required. the decoded best label index pathe. gold: required. the golden label index pathes. """ total_labels = len(best_path) correct_labels = np.sum(np.equal(best_path, gold)) gold_chunks = self.decoder.to_spans(gold) gold_count = len(gold_chunks) guess_chunks = self.decoder.to_spans(best_path) guess_count = len(guess_chunks) overlap_chunks = gold_chunks & guess_chunks overlap_count = len(overlap_chunks) return correct_labels, total_labels, gold_count, guess_count, overlap_count class eval_wc(eval_batch): """ evaluation class for LD-Net Parameters ---------- decoder : ``torch.nn.Module``, required. the decoder module, which needs to contain the ``to_span()`` and ``decode()`` method. score_type : ``str``, required. whether the f1 score or the accuracy is needed. """ def __init__(self, decoder, score_type): eval_batch.__init__(self, decoder) if 'f' in score_type: self.eval_b = self.calc_f1_batch self.calc_s = self.f1_score else: self.eval_b = self.calc_acc_batch self.calc_s = self.acc_score def calc_score(self, seq_model, dataset_loader): """ calculate scores Parameters ---------- seq_model: required. sequence labeling model. dataset_loader: required. the dataset loader. Returns ------- score: ``float``. calculated score. """ seq_model.eval() self.reset() for f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w, _, f_y_m, g_y in dataset_loader: scores = seq_model(f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w) decoded = self.decoder.decode(scores.data, f_y_m) self.eval_b(decoded, g_y) return self.calc_s() ================================================ FILE: model_seq/seqlabel.py ================================================ """ .. module:: seqlabel :synopsis: sequence labeling model .. 
class SeqLabel(nn.Module):
    """
    Sequence Labeling model augmented with language models.

    Parameters
    ----------
    f_lm : ``torch.nn.Module``, required.
        The forward language model for contextualized representations.
    b_lm : ``torch.nn.Module``, required.
        The backward language model for contextualized representations.
    c_num : ``int`` , required.
        The number of characters.
    c_dim : ``int`` , required.
        The dimension of character embedding.
    c_hidden : ``int`` , required.
        The dimension of character hidden states.
    c_layer : ``int`` , required.
        The number of character lstms.
    w_num : ``int`` , required.
        The number of words.
    w_dim : ``int`` , required.
        The dimension of word embedding.
    w_hidden : ``int`` , required.
        The dimension of word hidden states.
    w_layer : ``int`` , required.
        The number of word lstms.
    y_num : ``int`` , required.
        The number of tags types.
    droprate : ``float`` , required
        The dropout ratio.
    unit : "str", optional, (default = 'lstm')
        The type of the recurrent unit, one of 'rnn', 'lstm' or 'gru'.
    """
    def __init__(self, f_lm, b_lm, c_num: int, c_dim: int, c_hidden: int, c_layer: int, w_num: int, w_dim: int, w_hidden: int, w_layer: int, y_num: int, droprate: float, unit: str = 'lstm'):
        super(SeqLabel, self).__init__()

        rnnunit_map = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}

        self.f_lm = f_lm
        self.b_lm = b_lm

        self.unit_type = unit

        self.char_embed = nn.Embedding(c_num, c_dim)
        self.word_embed = nn.Embedding(w_num, w_dim)

        # Both projections map onto the word-embedding dimension so that the
        # three feature groups concatenated in forward() have equal width.
        self.char_seq = nn.Linear(c_hidden * 2, w_dim)
        self.lm_seq = nn.Linear(f_lm.output_dim + b_lm.output_dim, w_dim)
        self.relu = nn.ReLU()

        self.c_hidden = c_hidden

        # The recurrent dropout is only passed on when there is more than one layer.
        tmp_rnn_dropout = droprate if c_layer > 1 else 0
        self.char_fw = rnnunit_map[unit](c_dim, c_hidden, c_layer, dropout = tmp_rnn_dropout)
        self.char_bw = rnnunit_map[unit](c_dim, c_hidden, c_layer, dropout = tmp_rnn_dropout)

        tmp_rnn_dropout = droprate if w_layer > 1 else 0
        # Input is [char features, lm features, word embedding]; each
        # direction gets half of w_hidden.
        self.word_rnn = rnnunit_map[unit](w_dim * 3, w_hidden // 2, w_layer, dropout = tmp_rnn_dropout, bidirectional = True)

        self.y_num = y_num
        self.crf = CRF(w_hidden, y_num)

        self.drop = nn.Dropout(p = droprate)

    def to_params(self):
        """
        Collect the hyper-parameters of the model into a ``dict``.
        """
        return {
            "model_type": "char-lstm-crf",
            "forward_lm": self.f_lm.to_params(),
            "backward_lm": self.b_lm.to_params(),
            "word_embed_num": self.word_embed.num_embeddings,
            "word_embed_dim": self.word_embed.embedding_dim,
            "char_embed_num": self.char_embed.num_embeddings,
            "char_embed_dim": self.char_embed.embedding_dim,
            "char_hidden": self.c_hidden,
            "char_layers": self.char_fw.num_layers,
            "word_hidden": self.word_rnn.hidden_size,
            "word_layers": self.word_rnn.num_layers,
            "droprate": self.drop.p,
            "y_num": self.y_num,
            "label_schema": "iobes",
            "unit_type": self.unit_type
        }

    def prune_dense_rnn(self):
        """
        Prune the dense rnns of both language models by deleting layers, and
        shrink the input of ``lm_seq`` to the features that are kept.
        """
        f_prune_mask = self.f_lm.prune_dense_rnn()
        b_prune_mask = self.b_lm.prune_dense_rnn()
        # The masks are concatenated in the same order as the language model
        # outputs in forward(): forward first, backward second.
        prune_mask = torch.cat([f_prune_mask, b_prune_mask], dim = 0)
        mask_index = prune_mask.nonzero().squeeze(1)
        # Keep only the input columns of the kept features.
        self.lm_seq.weight = nn.Parameter(self.lm_seq.weight.data.index_select(1, mask_index).contiguous())
        self.lm_seq.in_features = self.lm_seq.weight.size(1)

    def set_batch_seq_size(self, sentence):
        """
        Set the batch size and sequence length from a (seq_len, batch_size) tensor.
        """
        tmp = sentence.size()
        self.word_seq_length = tmp[0]
        self.batch_size = tmp[1]

    def load_pretrained_word_embedding(self, pre_word_embeddings):
        """
        Load pre-trained word embedding.
        """
        self.word_embed.weight = nn.Parameter(pre_word_embeddings)

    def rand_init(self):
        """
        Random initialization of the character embedding, the recurrent
        layers, the two projections and the crf layer.
        """
        utils.init_embedding(self.char_embed.weight)
        utils.init_lstm(self.char_fw)
        utils.init_lstm(self.char_bw)
        utils.init_lstm(self.word_rnn)
        utils.init_linear(self.char_seq)
        utils.init_linear(self.lm_seq)
        self.crf.rand_init()

    def forward(self, f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w):
        """
        Calculate the output (crf potentials).

        Parameters
        ----------
        f_c : ``torch.LongTensor``, required.
            Character-level inputs in the forward direction.
        f_p : ``torch.LongTensor``, required.
            Output position of character-level inputs in the forward direction.
        b_c : ``torch.LongTensor``, required.
            Character-level inputs in the backward direction.
        b_p : ``torch.LongTensor``, required.
            Output position of character-level inputs in the backward direction.
        flm_w : ``torch.LongTensor``, required.
            Word-level inputs for the forward language model.
        blm_w : ``torch.LongTensor``, required.
            Word-level inputs for the backward language model.
        blm_ind : ``torch.LongTensor``, required.
            Output position of word-level inputs for the backward language model.
        f_w: ``torch.LongTensor``, required.
            Word-level inputs for the sequence labeling model.

        Returns
        -------
        output: ``torch.FloatTensor``.
            A float tensor of shape (sequence_len, batch_size, from_tag_size, to_tag_size)
        """

        self.set_batch_seq_size(f_w)

        # Character-level encoders, one per direction.
        f_c_e = self.drop(self.char_embed(f_c))
        b_c_e = self.drop(self.char_embed(b_c))

        f_c_e, _ = self.char_fw(f_c_e)
        b_c_e, _ = self.char_bw(b_c_e)

        # Pick one character hidden state per word position (f_p / b_p are
        # flat row indices) and reshape to (word_seq_length, batch_size, c_hidden).
        f_c_e = f_c_e.view(-1, self.c_hidden).index_select(0, f_p).view(self.word_seq_length, self.batch_size, self.c_hidden)
        b_c_e = b_c_e.view(-1, self.c_hidden).index_select(0, b_p).view(self.word_seq_length, self.batch_size, self.c_hidden)

        c_o = self.drop(torch.cat([f_c_e, b_c_e], dim = 2))
        c_o = self.char_seq(c_o)

        # Contextualized representations from the two language models.
        self.f_lm.init_hidden()
        self.b_lm.init_hidden()

        f_lm_e = self.f_lm(flm_w)
        b_lm_e = self.b_lm(blm_w, blm_ind)

        lm_o = self.drop(torch.cat([f_lm_e, b_lm_e], dim = 2))
        lm_o = self.relu(self.lm_seq(lm_o))

        w_e = self.word_embed(f_w)

        # Word-level encoder over the concatenated features, then the crf layer.
        rnn_in = self.drop(torch.cat([c_o, lm_o, w_e], dim = 2))
        rnn_out, _ = self.word_rnn(rnn_in)

        crf_out = self.crf(self.drop(rnn_out)).view(self.word_seq_length, self.batch_size, self.y_num, self.y_num)

        return crf_out
""" def __init__(self, f_lm, b_lm, c_num, c_dim, c_hidden, c_layer, w_num, w_dim, w_hidden, w_layer, y_num, droprate, unit='lstm'): super(Vanilla_SeqLabel, self).__init__() rnnunit_map = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU} self.char_embed = nn.Embedding(c_num, c_dim) self.word_embed = nn.Embedding(w_num, w_dim) self.char_seq = nn.Linear(c_hidden * 2, w_dim) self.c_hidden = c_hidden self.char_fw = rnnunit_map[unit](c_dim, c_hidden, c_layer, dropout = droprate) self.char_bw = rnnunit_map[unit](c_dim, c_hidden, c_layer, dropout = droprate) self.word_rnn = rnnunit_map[unit](w_dim + w_dim, w_hidden // 2, w_layer, dropout = droprate, bidirectional = True) self.y_num = y_num self.crf = CRF(w_hidden, y_num) self.drop = nn.Dropout(p = droprate) def set_batch_seq_size(self, sentence): """ set batch size and sequence length """ tmp = sentence.size() self.word_seq_length = tmp[0] self.batch_size = tmp[1] def load_pretrained_word_embedding(self, pre_word_embeddings): """ Load pre-trained word embedding. """ self.word_embed.weight = nn.Parameter(pre_word_embeddings) def rand_init(self): """ Random initialization. """ utils.init_embedding(self.char_embed.weight) utils.init_lstm(self.char_fw) utils.init_lstm(self.char_bw) utils.init_lstm(self.word_rnn) utils.init_linear(self.char_seq) self.crf.rand_init() def forward(self, f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w): """ Calculate the output (crf potentials). Parameters ---------- f_c : ``torch.LongTensor``, required. Character-level inputs in the forward direction. f_p : ``torch.LongTensor``, required. Ouput position of character-level inputs in the forward direction. b_c : ``torch.LongTensor``, required. Character-level inputs in the backward direction. b_p : ``torch.LongTensor``, required. Ouput position of character-level inputs in the backward direction. flm_w : ``torch.LongTensor``, required. Word-level inputs for the forward language model. blm_w : ``torch.LongTensor``, required. 
class BasicSeqLM(nn.Module):
    """
    Language-model wrapper for the dense rnns; the wrapped rnn and word
    embedding are reused with their gradients switched off.

    Parameters
    ----------
    ori_lm : ``torch.nn.Module``, required.
        the original module of language model.
    backward : ``bool``, required.
        whether the language model is backward.
    droprate : ``float``, required.
        the dropout ratio (not used by this wrapper).
    fix_rate: ``bool``, required.
        whether to fix the ratio (not used by this wrapper).
    """
    def __init__(self, ori_lm, backward, droprate, fix_rate):
        super(BasicSeqLM, self).__init__()

        # Reuse the recurrent network and freeze all of its parameters.
        self.rnn = ori_lm.rnn
        for frozen in self.rnn.parameters():
            frozen.requires_grad = False

        self.w_num = ori_lm.w_num
        self.w_dim = ori_lm.w_dim

        # Reuse the word embedding and freeze it as well.
        self.word_embed = ori_lm.word_embed
        self.word_embed.weight.requires_grad = False

        self.output_dim = ori_lm.rnn_output
        self.backward = backward

    def to_params(self):
        """
        Collect the hyper-parameters of the model into a ``dict``.
        """
        params = {
            "rnn_params": self.rnn.to_params(),
            "word_embed_num": self.word_embed.num_embeddings,
            "word_embed_dim": self.word_embed.embedding_dim
        }
        return params

    def init_hidden(self):
        """
        Initialize the hidden states of the wrapped rnn.
        """
        self.rnn.init_hidden()

    def regularizer(self):
        """
        Calculate the regularization term.

        Returns
        ----------
        reg:
            whatever ``self.rnn.regularizer()`` returns.
        """
        return self.rnn.regularizer()

    def forward(self, w_in, ind=None):
        """
        Calculate the output.

        Parameters
        ----------
        w_in : ``torch.LongTensor``, required.
            the input tensor, of shape (seq_len, batch_size).
        ind : ``torch.LongTensor``, optional, (default=None).
            the flat index tensor used to re-order the outputs of the
            backward language model.

        Returns
        ----------
        output: ``torch.FloatTensor``.
            The language model outputs.
        """
        hidden = self.rnn(self.word_embed(w_in))

        if not self.backward:
            return hidden

        # Flatten (seq_len, batch) into rows, pick the rows in the order
        # given by ``ind`` and restore the original shape.
        full_shape = hidden.size()
        rows = hidden.view(full_shape[0] * full_shape[1], full_shape[2])
        reordered = rows.index_select(0, ind).contiguous()
        return reordered.view(full_shape)
def prune_rnn(self, mask):
    """
    Shrink the input side of the wrapped rnn layer to the selected features.

    Parameters
    ----------
    mask : ``torch.ByteTensor``, required.
        The selection tensor for the input matrix; non-zero entries mark the
        input columns that are kept.
    """
    kept_columns = mask.nonzero().squeeze(1)
    rnn = self.layer
    # Drop the input-to-hidden weight columns of the removed features ...
    pruned_weight = rnn.weight_ih_l0.data.index_select(1, kept_columns).contiguous()
    rnn.weight_ih_l0 = nn.Parameter(pruned_weight)
    # ... and keep the recorded input size in sync with the new weight.
    rnn.input_size = rnn.weight_ih_l0.size(1)
""" def __init__(self, ori_drnn, droprate, fix_rate): super(SDRNN, self).__init__() if ori_drnn.layer: self.layer_list = [SBUnit(ori_unit, droprate, fix_rate) for ori_unit in ori_drnn.layer._modules.values()] self.weight_list = nn.Parameter(torch.FloatTensor([1.0] * len(self.layer_list))) self.weight_list.requires_grad = not fix_rate # self.layer = nn.Sequential(*self.layer_list) self.layer = nn.ModuleList(self.layer_list) for param in self.layer.parameters(): param.requires_grad = False else: self.layer_list = list() self.weight_list = list() self.layer = None # self.output_dim = self.layer_list[-1].output_dim self.emb_dim = ori_drnn.emb_dim self.output_dim = ori_drnn.output_dim self.unit_type = ori_drnn.unit_type def to_params(self): """ To parameters. """ return { "rnn_type": "LDRNN", "unit_type": self.unit_type, "layer_num": 0 if not self.layer else len(self.layer), "emb_dim": self.emb_dim, "hid_dim": -1 if not self.layer else self.layer[0].increase_rate, "droprate": -1 if not self.layer else self.layer[0].droprate, "after_pruned": True } def prune_dense_rnn(self): """ Prune dense rnn to be smaller by delecting layers. 
""" prune_mask = torch.ones(self.layer_list[0].input_dim) increase_mask_one = torch.ones(self.layer_list[0].increase_rate) increase_mask_zero = torch.zeros(self.layer_list[0].increase_rate) new_layer_list = list() new_weight_list = list() for ind in range(0, len(self.layer_list)): if self.weight_list.data[ind] > 0: new_weight_list.append(self.weight_list.data[ind]) self.layer_list[ind].prune_rnn(prune_mask) new_layer_list.append(self.layer_list[ind]) prune_mask = torch.cat([prune_mask, increase_mask_one], dim = 0) else: prune_mask = torch.cat([prune_mask, increase_mask_zero], dim = 0) if not new_layer_list: self.output_dim = self.layer_list[0].input_dim self.layer = None self.weight_list = None self.layer_list = None else: self.layer_list = new_layer_list self.layer = nn.ModuleList(self.layer_list) self.weight_list = nn.Parameter(torch.FloatTensor(new_weight_list)) self.weight_list.requires_grad = False for param in self.layer.parameters(): param.requires_grad = False return prune_mask def prox(self): """ the proximal calculator. """ self.weight_list.data.masked_fill_(self.weight_list.data < 0, 0) self.weight_list.data.masked_fill_(self.weight_list.data > 1, 1) none_zero_count = (self.weight_list.data > 0).sum() return none_zero_count def regularizer(self): """ Calculate the regularization term. Returns ---------- reg0: ``torch.FloatTensor``. The value of reg0. reg1: ``torch.FloatTensor``. The value of reg1. reg2: ``torch.FloatTensor``. The value of reg2. """ reg3 = (self.weight_list * (1 - self.weight_list)).sum() none_zero = self.weight_list.data > 0 none_zero_count = none_zero.sum() reg0 = none_zero_count reg1 = self.weight_list[none_zero].sum() return reg0, reg1, reg3 def forward(self, x): """ Calculate the output. Parameters ---------- x : ``torch.FloatTensor``, required. the input tensor, of shape (seq_len, batch_size, input_dim). Returns ---------- output: ``torch.FloatTensor``. The ELMo outputs. 
""" if self.layer_list is not None: for ind in range(len(self.layer_list)): x = self.layer[ind](x, self.weight_list[ind]) return x # return self.layer(x) class SparseSeqLM(nn.Module): """ The language model for the dense rnns with layer-wise selection. Parameters ---------- ori_lm : ``torch.nn.Module``, required. the original module of language model. backward : ``bool``, required. whether the language model is backward. droprate : ``float``, required. the dropout ratrio. fix_rate: ``bool``, required. whether to fix the rqtio. """ def __init__(self, ori_lm, backward, droprate, fix_rate): super(SparseSeqLM, self).__init__() self.rnn = SDRNN(ori_lm.rnn, droprate, fix_rate) self.w_num = ori_lm.w_num self.w_dim = ori_lm.w_dim self.word_embed = ori_lm.word_embed self.word_embed.weight.requires_grad = False self.output_dim = ori_lm.rnn_output self.backward = backward def to_params(self): """ To parameters. """ return { "backward": self.backward, "rnn_params": self.rnn.to_params(), "word_embed_num": self.word_embed.num_embeddings, "word_embed_dim": self.word_embed.embedding_dim } def prune_dense_rnn(self): """ Prune dense rnn to be smaller by delecting layers. """ prune_mask = self.rnn.prune_dense_rnn() self.output_dim = self.rnn.output_dim return prune_mask def init_hidden(self): """ initialize hidden states. """ return def regularizer(self): """ Calculate the regularization term. Returns ---------- reg: ``list``. The list of regularization terms. """ return self.rnn.regularizer() def prox(self): """ the proximal calculator. """ return self.rnn.prox() def forward(self, w_in, ind=None): """ Calculate the output. Parameters ---------- w_in : ``torch.LongTensor``, required. the input tensor, of shape (seq_len, batch_size). ind : ``torch.LongTensor``, optional, (default=None). the index tensor for the backward language model, of shape (seq_len, batch_size). Returns ---------- output: ``torch.FloatTensor``. The ELMo outputs. 
""" w_emb = self.word_embed(w_in) out = self.rnn(w_emb) if self.backward: out_size = out.size() out = out.view(out_size[0] * out_size[1], out_size[2]).index_select(0, ind).contiguous().view(out_size) return out ================================================ FILE: model_seq/utils.py ================================================ """ .. module:: utils :synopsis: utils .. moduleauthor:: Liyuan Liu """ import numpy as np import torch import json import torch import torch.nn as nn import torch.nn.init from torch.autograd import Variable def log_sum_exp(vec): """ log sum exp function. Parameters ---------- vec : ``torch.FloatTensor``, required. input vector, of shape(ins_num, from_tag_size, to_tag_size) Returns ------- sum: ``torch.FloatTensor``. log sum exp results, tensor of shape (ins_num, to_tag_size) """ max_score, _ = torch.max(vec, 1) return max_score + torch.log(torch.sum(torch.exp(vec - max_score.unsqueeze(1).expand_as(vec)), 1)) def repackage_hidden(h): """ Wraps hidden states in new Variables, to detach them from their history Parameters ---------- h : ``Tuple`` or ``Tensors``, required. Tuple or Tensors, hidden states. Returns ------- hidden: ``Tuple`` or ``Tensors``. detached hidden states """ if type(h) == torch.Tensor: return h.detach() else: return tuple(repackage_hidden(v) for v in h) def to_scalar(var): """ convert a tensor to a scalar number """ return var.view(-1).item() def init_embedding(input_embedding): """ random initialize embedding """ bias = np.sqrt(3.0 / input_embedding.size(1)) nn.init.uniform_(input_embedding, -bias, bias) def init_linear(input_linear): """ random initialize linear projection. """ bias = np.sqrt(6.0 / (input_linear.weight.size(0) + input_linear.weight.size(1))) nn.init.uniform_(input_linear.weight, -bias, bias) if input_linear.bias is not None: input_linear.bias.data.zero_() def adjust_learning_rate(optimizer, lr): """ adjust learning to the the new value. Parameters ---------- optimizer : required. pytorch optimizer. 
float : ``float``, required. the target learning rate. """ for param_group in optimizer.param_groups: param_group['lr'] = lr def init_lstm(input_lstm): """ random initialize lstms """ for ind in range(0, input_lstm.num_layers): weight = eval('input_lstm.weight_ih_l'+str(ind)) bias = np.sqrt(6.0 / (weight.size(0)/4 + weight.size(1))) nn.init.uniform_(weight, -bias, bias) weight = eval('input_lstm.weight_hh_l'+str(ind)) bias = np.sqrt(6.0 / (weight.size(0)/4 + weight.size(1))) nn.init.uniform_(weight, -bias, bias) if input_lstm.bias: for ind in range(0, input_lstm.num_layers): weight = eval('input_lstm.bias_ih_l'+str(ind)) weight.data.zero_() weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1 weight = eval('input_lstm.bias_hh_l'+str(ind)) weight.data.zero_() weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1 ================================================ FILE: model_word_ada/LM.py ================================================ """ .. module:: LM :synopsis: language modeling .. moduleauthor:: Liyuan Liu """ import torch import torch.nn as nn import torch.nn.functional as F import model_word_ada.utils as utils class LM(nn.Module): """ The language model model. Parameters ---------- rnn : ``torch.nn.Module``, required. The RNNs network. soft_max : ``torch.nn.Module``, required. The softmax layer. w_num : ``int`` , required. The number of words. w_dim : ``int`` , required. The dimension of word embedding. droprate : ``float`` , required The dropout ratio. label_dim : ``int`` , required. The input dimension of softmax. 
""" def __init__(self, rnn, soft_max, w_num, w_dim, droprate, label_dim = -1, add_relu=False): super(LM, self).__init__() self.rnn = rnn self.soft_max = soft_max self.w_num = w_num self.w_dim = w_dim self.word_embed = nn.Embedding(w_num, w_dim) self.rnn_output = self.rnn.output_dim self.add_proj = label_dim > 0 if self.add_proj: self.project = nn.Linear(self.rnn_output, label_dim) if add_relu: self.relu = nn.ReLU() else: self.relu = lambda x: x self.drop = nn.Dropout(p=droprate) def load_embed(self, origin_lm): """ Load embedding from another language model. """ self.word_embed = origin_lm.word_embed self.soft_max = origin_lm.soft_max def rand_ini(self): """ Random initialization. """ self.rnn.rand_ini() # utils.init_linear(self.project) self.soft_max.rand_ini() # if not self.tied_weight: utils.init_embedding(self.word_embed.weight) if self.add_proj: utils.init_linear(self.project) def init_hidden(self): """ Initialize hidden states. """ self.rnn.init_hidden() def forward(self, w_in, target): """ Calculate the loss. Parameters ---------- w_in : ``torch.FloatTensor``, required. the input tensor, of shape (word_num, input_dim). target : ``torch.FloatTensor``, required. the target of the language model, of shape (word_num). Returns ---------- loss: ``torch.FloatTensor``. The NLL loss. """ w_emb = self.word_embed(w_in) w_emb = self.drop(w_emb) out = self.rnn(w_emb).contiguous().view(-1, self.rnn_output) if self.add_proj: out = self.drop(self.relu(self.project(out))) # out = self.drop(self.project(out)) out = self.soft_max(out, target) return out def log_prob(self, w_in): """ Calculate log-probability for the whole dictionary. Parameters ---------- w_in : ``torch.FloatTensor``, required. the input tensor, of shape (word_num, input_dim). Returns ---------- prob: ``torch.FloatTensor``. The full log-probability. 
""" w_emb = self.word_embed(w_in) out = self.rnn(w_emb).contiguous().view(-1, self.rnn_output) if self.add_proj: out = self.relu(self.project(out)) out = self.soft_max.log_prob(out, w_emb.device) return out ================================================ FILE: model_word_ada/__init__.py ================================================ ================================================ FILE: model_word_ada/adaptive.py ================================================ """ .. module:: adaptive :synopsis: adaptive softmax .. moduleauthor:: Liyuan Liu """ import torch from torch import nn from math import sqrt class AdaptiveSoftmax(nn.Module): """ The adaptive softmax layer. Modified from: https://github.com/rosinality/adaptive-softmax-pytorch/blob/master/adasoft.py Parameters ---------- input_size : ``int``, required. The input dimension. cutoff : ``list``, required. The list of cutoff values. """ def __init__(self, input_size, cutoff): super().__init__() self.input_size = input_size self.cutoff = cutoff self.output_size = cutoff[0] + len(cutoff) - 1 self.head = nn.Linear(input_size, self.output_size) self.tail = nn.ModuleList() self.cross_entropy = nn.CrossEntropyLoss(size_average=False) for i in range(len(self.cutoff) - 1): seq = nn.Sequential( nn.Linear(input_size, input_size // 4 ** i, False), nn.Linear(input_size // 4 ** i, cutoff[i + 1] - cutoff[i], False) ) self.tail.append(seq) def rand_ini(self): """ Random Initialization. """ nn.init.xavier_normal_(self.head.weight) for tail in self.tail: nn.init.xavier_normal_(tail[0].weight) nn.init.xavier_normal_(tail[1].weight) def log_prob(self, w_in, device): """ Calculate log-probability for the whole dictionary. Parameters ---------- w_in : ``torch.FloatTensor``, required. the input tensor, of shape (word_num, input_dim). device: ``torch.device``, required. the target device for calculation. Returns ---------- prob: ``torch.FloatTensor``. The full log-probability. 
""" lsm = nn.LogSoftmax(dim=1).to(device) head_out = self.head(w_in) batch_size = head_out.size(0) prob = torch.zeros(batch_size, self.cutoff[-1]).to(device) lsm_head = lsm(head_out) prob.narrow(1, 0, self.output_size).add_(lsm_head.narrow(1, 0, self.output_size).data) for i in range(len(self.tail)): pos = self.cutoff[i] i_size = self.cutoff[i + 1] - pos buffer = lsm_head.narrow(1, self.cutoff[0] + i, 1) buffer = buffer.expand(batch_size, i_size) lsm_tail = lsm(self.tail[i](w_in)) prob.narrow(1, pos, i_size).copy_(buffer.data).add_(lsm_tail.data) return prob def forward(self, w_in, target): """ Calculate the log-likihood w.o. calculate the full distribution. Parameters ---------- w_in : ``torch.FloatTensor``, required. the input tensor, of shape (word_num, input_dim). target : ``torch.FloatTensor``, required. the target of the language model, of shape (word_num). Returns ---------- loss: ``torch.FloatTensor``. The NLL loss. """ batch_size = w_in.size(0) output = 0.0 first_target = target.clone() for i in range(len(self.cutoff) - 1): mask = target.ge(self.cutoff[i]).mul(target.lt(self.cutoff[i + 1])) if mask.sum() > 0: first_target[mask] = self.cutoff[0] + i second_target = target[mask].add(-self.cutoff[i]) second_input = w_in.index_select(0, mask.nonzero().squeeze()) second_output = self.tail[i](second_input) output += self.cross_entropy(second_output, second_target) output += self.cross_entropy(self.head(w_in), first_target) output /= batch_size return output ================================================ FILE: model_word_ada/basic.py ================================================ """ .. module:: basic :synopsis: basic rnn .. moduleauthor:: Liyuan Liu """ import torch import torch.nn as nn import torch.nn.functional as F import model_word_ada.utils as utils class BasicUnit(nn.Module): """ The basic recurrent unit for the vanilla stacked RNNs. Parameters ---------- unit : ``str``, required. The type of rnn unit. input_dim : ``int``, required. 
The input dimension fo the unit. hid_dim : ``int``, required. The hidden dimension fo the unit. droprate : ``float``, required. The dropout ratrio. """ def __init__(self, unit, input_dim, hid_dim, droprate): super(BasicUnit, self).__init__() self.unit_type = unit rnnunit_map = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU} self.batch_norm = (unit == 'bnlstm') self.layer = rnnunit_map[unit](input_dim, hid_dim, 1) self.droprate = droprate self.output_dim = hid_dim self.init_hidden() def init_hidden(self): """ Initialize hidden states. """ self.hidden_state = None def rand_ini(self): """ Random Initialization. """ if not self.batch_norm: utils.init_lstm(self.layer) def forward(self, x): """ Calculate the output. Parameters ---------- x : ``torch.LongTensor``, required. the input tensor, of shape (seq_len, batch_size, input_dim). Returns ---------- output: ``torch.FloatTensor``. The output of RNNs. """ out, new_hidden = self.layer(x, self.hidden_state) self.hidden_state = utils.repackage_hidden(new_hidden) if self.droprate > 0: out = F.dropout(out, p=self.droprate, training=self.training) return out class BasicRNN(nn.Module): """ The multi-layer recurrent networks for the vanilla stacked RNNs. Parameters ---------- layer_num: ``int``, required. The number of layers. unit : ``torch.nn.Module``, required. The type of rnn unit. input_dim : ``int``, required. The input dimension fo the unit. hid_dim : ``int``, required. The hidden dimension fo the unit. droprate : ``float``, required. The dropout ratrio. """ def __init__(self, layer_num, unit, emb_dim, hid_dim, droprate): super(BasicRNN, self).__init__() layer_list = [BasicUnit(unit, emb_dim, hid_dim, droprate)] + [BasicUnit(unit, hid_dim, hid_dim, droprate) for i in range(layer_num - 1)] self.layer = nn.Sequential(*layer_list) self.output_dim = layer_list[-1].output_dim self.unit_type = unit self.init_hidden() def to_params(self): """ To parameters. 
""" return { "rnn_type": "Basic", "unit_type": self.layer[0].unit_type, "layer_num": len(self.layer), "emb_dim": self.layer[0].layer.input_size, "hid_dim": self.layer[0].layer.hidden_size, "droprate": self.layer[0].droprate } def init_hidden(self): """ Initialize hidden states. """ for tup in self.layer.children(): tup.init_hidden() def rand_ini(self): """ Random Initialization. """ for tup in self.layer.children(): tup.rand_ini() def forward(self, x): """ Calculate the output. Parameters ---------- x : ``torch.LongTensor``, required. the input tensor, of shape (seq_len, batch_size, input_dim). Returns ---------- output: ``torch.FloatTensor``. The output of RNNs. """ return self.layer(x) ================================================ FILE: model_word_ada/dataset.py ================================================ """ .. module:: dataset :synopsis: dataset for language modeling .. moduleauthor:: Liyuan Liu """ import torch import torch.nn as nn import torch.nn.functional as F import sys import pickle import random from tqdm import tqdm from torch.utils.data import Dataset class EvalDataset(object): """ Dataset for Language Modeling Parameters ---------- dataset : ``list``, required. The encoded dataset (outputs of preprocess scripts). sequence_length: ``int``, required. Sequence Length. """ def __init__(self, dataset, sequence_length): super(EvalDataset, self).__init__() self.dataset = dataset self.sequence_length = sequence_length self.construct_index() def get_tqdm(self, device): """ construct dataset reader and the corresponding tqdm. Parameters ---------- device: ``torch.device``, required. the target device for the dataset loader. """ return tqdm(self.reader(device), mininterval=2, total=self.index_length, leave=False, file=sys.stdout, ncols=80) def construct_index(self): """ construct index for the dataset. 
""" token_per_batch = self.sequence_length tot_num = len(self.dataset) - 1 res_num = tot_num - tot_num % token_per_batch self.x = list(torch.unbind(torch.LongTensor(self.dataset[0:res_num]).view(-1, self.sequence_length), 0)) self.y = list(torch.unbind(torch.LongTensor(self.dataset[1:res_num+1]).view(-1, self.sequence_length), 0)) self.x.append(torch.LongTensor(self.dataset[res_num:tot_num])) self.y.append(torch.LongTensor(self.dataset[res_num+1:tot_num+1])) self.index_length = len(self.x) self.cur_idx = 0 def reader(self, device): """ construct dataset reader. Parameters ---------- device: ``torch.device``, required. the target device for the dataset loader. Returns ------- reader: ``iterator``. A lazy iterable object """ if self.cur_idx == self.index_length: self.cur_idx = 0 raise StopIteration word_t = self.x[self.cur_idx].to(device).view(-1, 1) label_t = self.y[self.cur_idx].to(device).view(-1, 1) self.cur_idx += 1 yield word_t, label_t class LargeDataset(object): """ Lazy Dataset for Language Modeling Parameters ---------- root : ``str``, required. The root folder for dataset files. range_idx : ``int``, required. The maximum file index for the input files (train_*.pk). batch_size : ``int``, required. Batch size. sequence_length: ``int``, required. Sequence Length. """ def __init__(self, root, range_idx, batch_size, sequence_length): super(LargeDataset, self).__init__() self.root = root self.range_idx = range_idx self.shuffle_list = list(range(0, range_idx)) self.shuffle() self.batch_size = batch_size self.sequence_length = sequence_length self.token_per_batch = self.batch_size * self.sequence_length self.total_batch_num = -1 def shuffle(self): """ shuffle dataset """ random.shuffle(self.shuffle_list) def get_tqdm(self, device): """ construct dataset reader and the corresponding tqdm. Parameters ---------- device: ``torch.device``, required. the target device for the dataset loader. 
""" self.batch_count = 0 self.cur_idx = 0 self.file_idx = 0 self.index_length = 0 if self.total_batch_num <= 0: return tqdm(self.reader(device), mininterval=2, leave=False, file=sys.stdout).__iter__() else: return tqdm(self.reader(device), mininterval=2, total=self.total_batch_num, leave=False, file=sys.stdout, ncols=80).__iter__() def reader(self, device): """ construct dataset reader. Parameters ---------- device: ``torch.device``, required. the target device for the dataset loader. Returns ------- reader: ``iterator``. A lazy iterable object """ while self.file_idx < self.range_idx: self.open_next() while self.cur_idx < self.index_length: word_t = self.x[self.cur_idx].to(device) # label_t = self.y[self.cur_idx].to(device) label_t = self.y[self.cur_idx].to(device) self.cur_idx += 1 yield word_t, label_t self.total_batch_num = self.batch_count self.shuffle() def open_next(self): """ Open the next file. """ self.dataset = pickle.load(open(self.root + 'train_' + str( self.shuffle_list[self.file_idx])+'.pk', 'rb')) res_num = len(self.dataset) - 1 res_num = res_num - res_num % self.token_per_batch self.x = torch.LongTensor(self.dataset[0:res_num]).view(self.batch_size, -1, self.sequence_length).transpose_(0, 1).transpose_(1, 2).contiguous() self.y = torch.LongTensor(self.dataset[1:res_num+1]).view(self.batch_size, -1, self.sequence_length).transpose_(0, 1).transpose_(1, 2).contiguous() self.index_length = self.x.size(0) self.cur_idx = 0 self.batch_count += self.index_length self.file_idx += 1 ================================================ FILE: model_word_ada/densenet.py ================================================ """ .. module:: densenet :synopsis: densernn .. moduleauthor:: Liyuan Liu """ import torch import torch.nn as nn import torch.nn.functional as F import model_word_ada.utils as utils class BasicUnit(nn.Module): """ The basic recurrent unit for the densely connected RNNs. Parameters ---------- unit : ``torch.nn.Module``, required. The type of rnn unit. 
input_dim : ``float``, required. The input dimension fo the unit. increase_rate : ``float``, required. The hidden dimension fo the unit. droprate : ``float``, required. The dropout ratrio. """ def __init__(self, unit, input_dim, increase_rate, droprate): super(BasicUnit, self).__init__() rnnunit_map = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU} self.unit_type = unit self.layer = rnnunit_map[unit](input_dim, increase_rate, 1) if 'lstm' == self.unit_type: utils.init_lstm(self.layer) self.droprate = droprate self.input_dim = input_dim self.increase_rate = increase_rate self.output_dim = input_dim + increase_rate self.init_hidden() def init_hidden(self): """ Initialize hidden states. """ self.hidden_state = None def rand_ini(self): """ Random Initialization. """ return def forward(self, x): """ Calculate the output. Parameters ---------- x : ``torch.LongTensor``, required. the input tensor, of shape (seq_len, batch_size, input_dim). Returns ---------- output: ``torch.FloatTensor``. The output of RNNs. """ if self.droprate > 0: new_x = F.dropout(x, p=self.droprate, training=self.training) else: new_x = x out, new_hidden = self.layer(new_x, self.hidden_state) self.hidden_state = utils.repackage_hidden(new_hidden) out = out.contiguous() return torch.cat([x, out], 2) class DenseRNN(nn.Module): """ The multi-layer recurrent networks for the densely connected RNNs. Parameters ---------- layer_num: ``float``, required. The number of layers. unit : ``torch.nn.Module``, required. The type of rnn unit. input_dim : ``float``, required. The input dimension fo the unit. hid_dim : ``float``, required. The hidden dimension fo the unit. droprate : ``float``, required. The dropout ratrio. 
""" def __init__(self, layer_num, unit, emb_dim, hid_dim, droprate): super(DenseRNN, self).__init__() self.unit_type = unit self.layer_list = [BasicUnit(unit, emb_dim + i * hid_dim, hid_dim, droprate) for i in range(layer_num)] self.layer = nn.Sequential(*self.layer_list) if layer_num > 0 else None self.output_dim = self.layer_list[-1].output_dim if layer_num > 0 else emb_dim self.emb_dim = emb_dim self.init_hidden() def to_params(self): """ To parameters. """ return { "rnn_type": "DenseRNN", "unit_type": self.layer[0].unit_type, "layer_num": len(self.layer), "emb_dim": self.layer[0].input_dim, "hid_dim": self.layer[0].increase_rate, "droprate": self.layer[0].droprate } def init_hidden(self): """ Initialize hidden states. """ for tup in self.layer_list: tup.init_hidden() def rand_ini(self): """ Random Initialization. """ for tup in self.layer_list: tup.rand_ini() def forward(self, x): """ Calculate the output. Parameters ---------- x : ``torch.LongTensor``, required. the input tensor, of shape (seq_len, batch_size, input_dim). Returns ---------- output: ``torch.FloatTensor``. The output of RNNs. """ return self.layer(x) ================================================ FILE: model_word_ada/ldnet.py ================================================ """ .. module:: ldnet :synopsis: LD-Net .. moduleauthor:: Liyuan Liu """ import torch import torch.nn as nn import torch.nn.functional as F import model_word_ada.utils as utils import random class BasicUnit(nn.Module): """ The basic recurrent unit for the densely connected RNNs with layer-wise dropout. Parameters ---------- unit : ``torch.nn.Module``, required. The type of rnn unit. input_dim : ``float``, required. The input dimension fo the unit. increase_rate : ``float``, required. The hidden dimension fo the unit. droprate : ``float``, required. The dropout ratrio. layer_dropout : ``float``, required. The layer-wise dropout ratrio. 
""" def __init__(self, unit, input_dim, increase_rate, droprate, layer_drop = 0): super(BasicUnit, self).__init__() rnnunit_map = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU} self.unit_type = unit self.layer = rnnunit_map[unit](input_dim, increase_rate, 1) if 'lstm' == self.unit_type: utils.init_lstm(self.layer) self.layer_drop = layer_drop self.droprate = droprate self.input_dim = input_dim self.increase_rate = increase_rate self.output_dim = input_dim + increase_rate self.init_hidden() def init_hidden(self): """ Initialize hidden states. """ self.hidden_state = None def rand_ini(self): """ Random Initialization. """ return def forward(self, x, p_out): """ Calculate the output. Parameters ---------- x : ``torch.LongTensor``, required. the input tensor, of shape (seq_len, batch_size, input_dim). p_out : ``torch.LongTensor``, required. the final output tensor for the softmax, of shape (seq_len, batch_size, input_dim). Returns ---------- out: ``torch.FloatTensor``. The undropped outputs of RNNs to the softmax. p_out: ``torch.FloatTensor``. The dropped outputs of RNNs to the next_layer. """ if self.droprate > 0: new_x = F.dropout(x, p=self.droprate, training=self.training) else: new_x = x out, new_hidden = self.layer(new_x, self.hidden_state) self.hidden_state = utils.repackage_hidden(new_hidden) out = out.contiguous() if self.training and random.uniform(0, 1) < self.layer_drop: deep_out = torch.autograd.Variable( torch.zeros(x.size(0), x.size(1), self.increase_rate) ).cuda() else: deep_out = out o_out = torch.cat([p_out, out], 2) d_out = torch.cat([x, deep_out], 2) return d_out, o_out class LDRNN(nn.Module): """ The multi-layer recurrent networks for the densely connected RNNs with layer-wise dropout. Parameters ---------- layer_num: ``float``, required. The number of layers. unit : ``torch.nn.Module``, required. The type of rnn unit. input_dim : ``float``, required. The input dimension fo the unit. hid_dim : ``float``, required. The hidden dimension fo the unit. 
droprate : ``float``, required. The dropout ratrio. layer_dropout : ``float``, required. The layer-wise dropout ratrio. """ def __init__(self, layer_num, unit, emb_dim, hid_dim, droprate, layer_drop): super(LDRNN, self).__init__() self.unit_type = unit self.layer_list = [BasicUnit(unit, emb_dim + i * hid_dim, hid_dim, droprate, layer_drop) for i in range(layer_num)] self.layer_num = layer_num self.layer = nn.ModuleList(self.layer_list) if layer_num > 0 else None self.output_dim = self.layer_list[-1].output_dim if layer_num > 0 else emb_dim self.emb_dim = emb_dim self.init_hidden() def to_params(self): """ To parameters. """ return { "rnn_type": "LDRNN", "unit_type": self.layer[0].unit_type, "layer_num": len(self.layer), "emb_dim": self.layer[0].input_dim, "hid_dim": self.layer[0].increase_rate, "droprate": self.layer[0].droprate, "after_pruned": False } def init_hidden(self): """ Initialize hidden states. """ for tup in self.layer_list: tup.init_hidden() def rand_ini(self): """ Random Initialization. """ for tup in self.layer_list: tup.rand_ini() def forward(self, x): """ Calculate the output. Parameters ---------- x : ``torch.LongTensor``, required. the input tensor, of shape (seq_len, batch_size, input_dim). Returns ---------- output: ``torch.FloatTensor``. The output of RNNs to the Softmax. """ output = x for ind in range(self.layer_num): x, output = self.layer_list[ind](x, output) return output ================================================ FILE: model_word_ada/utils.py ================================================ """ .. module:: utils :synopsis: utils .. moduleauthor:: Liyuan Liu """ import numpy as np import torch import json import torch import torch.nn as nn import torch.nn.init from torch.autograd import Variable def repackage_hidden(h): """ Wraps hidden states in new Variables, to detach them from their history Parameters ---------- h : ``Tuple`` or ``Tensors``, required. Tuple or Tensors, hidden states. 
Returns ------- hidden: ``Tuple`` or ``Tensors``. detached hidden states """ if type(h) == torch.Tensor: return h.detach() else: return tuple(repackage_hidden(v) for v in h) def to_scalar(var): """ convert a tensor to a scalar number """ return var.view(-1).item() def init_embedding(input_embedding): """ random initialize embedding """ bias = np.sqrt(3.0 / input_embedding.size(1)) nn.init.uniform_(input_embedding, -bias, bias) def init_linear(input_linear): """ random initialize linear projection. """ bias = np.sqrt(6.0 / (input_linear.weight.size(0) + input_linear.weight.size(1))) nn.init.uniform_(input_linear.weight, -bias, bias) if input_linear.bias is not None: input_linear.bias.data.zero_() def adjust_learning_rate(optimizer, lr): """ adjust learning to the the new value. Parameters ---------- optimizer : required. pytorch optimizer. float : ``float``, required. the target learning rate. """ for param_group in optimizer.param_groups: param_group['lr'] = lr def init_lstm(input_lstm): """ random initialize lstms """ for ind in range(0, input_lstm.num_layers): weight = eval('input_lstm.weight_ih_l'+str(ind)) bias = np.sqrt(6.0 / (weight.size(0)/4 + weight.size(1))) nn.init.uniform_(weight, -bias, bias) weight = eval('input_lstm.weight_hh_l'+str(ind)) bias = np.sqrt(6.0 / (weight.size(0)/4 + weight.size(1))) nn.init.uniform_(weight, -bias, bias) if input_lstm.bias: for ind in range(0, input_lstm.num_layers): weight = eval('input_lstm.bias_ih_l'+str(ind)) weight.data.zero_() weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1 weight = eval('input_lstm.bias_hh_l'+str(ind)) weight.data.zero_() weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1 ================================================ FILE: pre_seq/encode_data.py ================================================ """ .. module:: encode_data :synopsis: encode data for sequence labeling .. 
moduleauthor:: Liyuan Liu """ import pickle import argparse import os import random import numpy as np from tqdm import tqdm import itertools import functools def encode_dataset(input_file, flm_map, blm_map, gw_map, c_map, y_map): flm_unk = flm_map[''] blm_unk = blm_map[''] gw_unk = gw_map[''] c_con = c_map[' '] c_unk = c_map[''] dataset = list() tmpw_flm, tmpw_blm, tmpw_gw, tmpc, tmpy = list(), list(), list(), list(), list() with open(input_file, 'r') as fin: for line in fin: if line.isspace() or line.startswith('-DOCSTART-'): if len(tmpw_flm) > 0: dataset.append([tmpw_flm, tmpw_blm, tmpw_gw, tmpc, tmpy]) tmpw_flm, tmpw_blm, tmpw_gw, tmpc, tmpy = list(), list(), list(), list(), list() else: line = line.split() tmpw_flm.append(flm_map.get(line[0], flm_unk)) tmpw_blm.append(blm_map.get(line[0], blm_unk)) tmpw_gw.append(gw_map.get(line[0].lower(), gw_unk)) tmpy.append(y_map[line[-1]]) tmpc.append([c_map.get(tup, c_unk) for tup in line[0]]) if len(tmpw_flm) > 0: dataset.append([tmpw_flm, tmpw_blm, tmpw_gw, tmpc, tmpy]) return dataset if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--train_file', default="./data/ner/eng.train.iobes") parser.add_argument('--test_file', default="./data/ner/eng.testb.iobes") parser.add_argument('--dev_file', default="./data/ner/eng.testa.iobes") parser.add_argument('--input_map', default="./data/conll_map.pk") parser.add_argument('--output_file', default="./data/ner_dataset.pk") parser.add_argument('--threshold', type=int, default=1) parser.add_argument('--unk', default='') args = parser.parse_args() with open(args.input_map, 'rb') as f: p_data = pickle.load(f) name_list = ['flm_map', 'blm_map', 'gw_map', 'c_map', 'y_map', 'emb_array'] flm_map, blm_map, gw_map, c_map, y_map, emb_array = [p_data[tup] for tup in name_list] train_dataset = encode_dataset(args.train_file, flm_map, blm_map, gw_map, c_map, y_map) test_dataset = encode_dataset(args.test_file, flm_map, blm_map, gw_map, c_map, y_map) dev_dataset 
if __name__ == "__main__":
    # Build the word / character / label maps for sequence labeling and dump
    # them, together with the embedding table and the two language-model
    # vocabularies, into a single pickle.
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_corpus', default='./data/ner/eng.train.iobes')
    parser.add_argument('--input_embedding', default="./embedding/glove.6B.100d.txt")
    parser.add_argument('--output_map', default="./data/conll_map.pk")
    parser.add_argument('--flm_map', default="./data/one_billion/test.pk")
    parser.add_argument('--blm_map', default="./data/one_billion_reverse/test.pk")
    parser.add_argument('--threshold', type=int, default=5)
    parser.add_argument('--unk', default='unk')
    args = parser.parse_args()

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point these options at trusted pickles.
    with open(args.flm_map, 'rb') as f:
        p_data = pickle.load(f)
        flm_map = p_data['w_map']

    with open(args.blm_map, 'rb') as f:
        p_data = pickle.load(f)
        blm_map = p_data['w_map']

    # Read the pre-trained embeddings: first field is the word, the rest are
    # the vector components. The file is closed as soon as it has been read.
    gw_map = dict()
    embedding_array = list()
    with open(args.input_embedding, 'r') as fin:
        for line in fin:
            line = line.split()
            if not line:
                # A blank line has no word field; skip it instead of failing
                # on line[0].
                continue
            vector = [float(tok) for tok in line[1:] if tok and not tok.isspace()]
            if line[0] == args.unk:
                gw_map[''] = len(gw_map)
            else:
                gw_map[line[0]] = len(gw_map)
            embedding_array.append(vector)

    bias = 2 * np.sqrt(3.0 / len(embedding_array[0]))

    # Extra randomly initialised row stored under the '<\n>' key.
    gw_map['<\n>'] = len(gw_map)
    embedding_array.append([random.random() * bias - bias for tup in embedding_array[0]])

    w_count = dict()
    c_count = dict()
    y_map = dict()
    with open(args.train_corpus, 'r') as fin:
        for line in fin:
            if line.isspace() or line.startswith('-DOCSTART-'):
                # Sentence boundaries are counted as the '\n' character.
                c_count['\n'] = c_count.get('\n', 0) + 1
            else:
                line = line.split()
                for tup in line[0]:
                    c_count[tup] = c_count.get(tup, 0) + 1
                c_count[' '] = c_count.get(' ', 0) + 1

                if line[-1] not in y_map:
                    y_map[line[-1]] = len(y_map)

                # Only words missing from the pre-trained embeddings are
                # counted as candidates for new entries.
                word = line[0].lower()
                if word not in gw_map:
                    w_count[word] = w_count.get(word, 0) + 1

    # Frequent out-of-embedding words get their own randomly initialised row.
    w_set = {k for k, v in w_count.items() if v > args.threshold}
    for k in w_set:
        gw_map[k] = len(gw_map)
        embedding_array.append([random.random() * bias - bias for tup in embedding_array[0]])

    c_set = {k for k, v in c_count.items() if v > args.threshold}
    c_map = {v: k for k, v in enumerate(c_set)}
    c_map[''] = len(c_map)

    # NOTE(review): both assignments use the same '' key, so the second one
    # overwrites the first - confirm the intended special-label keys.
    y_map[''] = len(y_map)
    y_map[''] = len(y_map)

    with open(args.output_map, 'wb') as f:
        pickle.dump({'flm_map': flm_map, 'blm_map': blm_map, 'gw_map': gw_map,
                     'c_map': c_map, 'y_map': y_map, 'emb_array': embedding_array}, f)
def _encode_lines(lines, w_map, w_eof, w_unk):
    """Map every non-blank line to word indices, ending each line with ``w_eof``."""
    encoded = []
    for line in lines:
        if not line or line.isspace():
            continue
        encoded.extend(w_map.get(token, w_unk) for token in line.split())
        encoded.append(w_eof)
    return encoded


def encode_dataset(input_folder, w_map, reverse):
    """
    Encode every file found under ``input_folder`` into one index sequence.

    Parameters
    ----------
    input_folder : ``str``
        Folder walked with ``os.walk``; every file in it is read as text.
    w_map : ``dict``
        Word -> index map. Must contain ``'\\n'`` (appended after each line)
        and ``''`` (used for words missing from the map).
    reverse : ``bool``
        When true, the complete sequence is reversed before it is returned.

    Returns
    -------
    ``list``
        The concatenated word indices of all non-blank lines.
    """
    w_eof = w_map['\n']
    w_unk = w_map['']

    dataset = []
    for root, _dirs, files in os.walk(input_folder):
        for file in tqdm(files):
            with open(os.path.join(root, file)) as fin:
                # Encode while streaming; no intermediate list of all lines.
                dataset.extend(_encode_lines(fin, w_map, w_eof, w_unk))

    if reverse:
        dataset.reverse()

    return dataset


def encode_dataset2file(input_folder, t, w_map, reverse):
    """
    Encode each file under ``input_folder`` and pickle it on its own.

    Parameters
    ----------
    input_folder : ``str``
        Folder walked with ``os.walk``; every file in it is read as text.
    t : ``str``
        Output prefix. File ``i`` is written to ``t + 'train_<i>.pk'``, so a
        folder path has to end with a path separator.
    w_map : ``dict``
        Word -> index map. Must contain ``'\\n'`` (appended after each line)
        and ``''`` (used for words missing from the map).
    reverse : ``bool``
        When true, each file's sequence is reversed before it is written.

    Returns
    -------
    ``int``
        Number of pickle files written.
    """
    w_eof = w_map['\n']
    w_unk = w_map['']

    range_ind = 0
    for root, _dirs, files in os.walk(input_folder):
        for file in tqdm(files):
            with open(os.path.join(root, file), 'r') as fin:
                dataset = _encode_lines(fin, w_map, w_eof, w_unk)

            if reverse:
                dataset.reverse()

            # The destination is the ``t`` argument; plain concatenation keeps
            # the prefix semantics callers rely on.
            with open(t + 'train_' + str(range_ind) + '.pk', 'wb') as f:
                pickle.dump(dataset, f)

            range_ind += 1

    return range_ind
if __name__ == "__main__":
    # Count how often every whitespace-separated token occurs in the files
    # under --input_folder and pickle the resulting dict to --output_map.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--input_folder', default="./data/1b_train")
    arg_parser.add_argument('--output_map', default="./data/1b_map.pk")
    args = arg_parser.parse_args()

    # '\n' counts the number of non-blank lines.
    w_count = {'\n': 0}

    for folder, _subdirs, filenames in os.walk(args.input_folder):
        for filename in tqdm(filenames):
            corpus_path = os.path.join(folder, filename)
            with open(corpus_path) as fin:
                for raw_line in fin:
                    tokens = raw_line.split()
                    # Empty and whitespace-only lines split into no tokens.
                    if tokens:
                        for token in tokens:
                            w_count[token] = w_count.get(token, 0) + 1
                        w_count['\n'] += 1

    with open(args.output_map, 'wb') as f:
        pickle.dump(w_count, f)
if __name__ == "__main__":
    # Fine-tune a trained sequence labeler while regularising the layer
    # weights of its two language models, keep the best checkpoint whose
    # number of positive layer weights is at most --seq_lambda1, then call
    # prune_dense_rnn() on it and save the result.
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=str, default="auto")
    parser.add_argument('--cp_root', default='./checkpoint')
    parser.add_argument('--checkpoint_name', default='p_ner')
    parser.add_argument('--git_tracking', action='store_true')

    parser.add_argument('--corpus', default='./data/ner_dataset.pk')
    parser.add_argument('--load_seq', default='./checkpoint/ner.th')

    parser.add_argument('--lm_hid_dim', type=int, default=300)
    parser.add_argument('--lm_word_dim', type=int, default=300)
    parser.add_argument('--lm_label_dim', type=int, default=1600)
    parser.add_argument('--lm_layer_num', type=int, default=10)
    parser.add_argument('--lm_droprate', type=float, default=0.5)
    parser.add_argument('--lm_rnn_layer', choices=['Basic', 'DenseNet', 'LDNet'], default='LDNet')
    parser.add_argument('--lm_rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm')

    parser.add_argument('--seq_c_dim', type=int, default=30)
    parser.add_argument('--seq_c_hid', type=int, default=150)
    parser.add_argument('--seq_c_layer', type=int, default=1)
    parser.add_argument('--seq_w_dim', type=int, default=100)
    parser.add_argument('--seq_w_hid', type=int, default=300)
    parser.add_argument('--seq_w_layer', type=int, default=1)
    parser.add_argument('--seq_droprate', type=float, default=0.5)
    parser.add_argument('--seq_rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm')
    parser.add_argument('--seq_model', choices=['vanilla', 'lm-aug'], default='lm-aug')
    parser.add_argument('--seq_lambda0', type=float, default=0.05)
    parser.add_argument('--seq_lambda1', type=float, default=2)

    parser.add_argument('--batch_size', type=int, default=10)
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--epoch', type=int, default=200)
    parser.add_argument('--least', type=int, default=50)
    parser.add_argument('--clip', type=float, default=5)
    parser.add_argument('--lr', type=float, default=0.015)
    parser.add_argument('--lr_decay', type=float, default=0.05)
    parser.add_argument('--update', choices=['Adam', 'Adagrad', 'Adadelta', 'SGD'], default='SGD')
    args = parser.parse_args()

    pw = wrapper(os.path.join(args.cp_root, args.checkpoint_name), args.checkpoint_name, enable_git_track=args.git_tracking)

    gpu_index = pw.auto_device() if 'auto' == args.gpu else int(args.gpu)
    device = torch.device("cuda:" + str(gpu_index) if gpu_index >= 0 else "cpu")
    if gpu_index >= 0:
        torch.cuda.set_device(gpu_index)

    logger.info('Loading data from {}.'.format(args.corpus))
    # Close the corpus file once it is loaded instead of leaking the handle.
    with open(args.corpus, 'rb') as corpus_file:
        dataset = pickle.load(corpus_file)
    name_list = ['flm_map', 'blm_map', 'gw_map', 'c_map', 'y_map', 'emb_array', 'train_data', 'test_data', 'dev_data']
    flm_map, blm_map, gw_map, c_map, y_map, emb_array, train_data, test_data, dev_data = [dataset[tup] for tup in name_list]

    logger.info('Building language models and seuqence labeling models.')
    rnn_map = {'Basic': BasicRNN, 'DenseNet': DenseRNN, 'LDNet': functools.partial(LDRNN, layer_drop = 0)}
    flm_rnn_layer = rnn_map[args.lm_rnn_layer](args.lm_layer_num, args.lm_rnn_unit, args.lm_word_dim, args.lm_hid_dim, args.lm_droprate)
    blm_rnn_layer = rnn_map[args.lm_rnn_layer](args.lm_layer_num, args.lm_rnn_unit, args.lm_word_dim, args.lm_hid_dim, args.lm_droprate)
    flm_model = LM(flm_rnn_layer, None, len(flm_map), args.lm_word_dim, args.lm_droprate, label_dim = args.lm_label_dim)
    blm_model = LM(blm_rnn_layer, None, len(blm_map), args.lm_word_dim, args.lm_droprate, label_dim = args.lm_label_dim)
    flm_model_seq = SparseSeqLM(flm_model, False, args.lm_droprate, False)
    blm_model_seq = SparseSeqLM(blm_model, True, args.lm_droprate, False)
    SL_map = {'vanilla': Vanilla_SeqLabel, 'lm-aug': SeqLabel}
    seq_model = SL_map[args.seq_model](flm_model_seq, blm_model_seq, len(c_map), args.seq_c_dim, args.seq_c_hid, args.seq_c_layer, len(gw_map), args.seq_w_dim, args.seq_w_hid, args.seq_w_layer, len(y_map), args.seq_droprate, unit=args.seq_rnn_unit)

    logger.info('Loading pre-trained models from {}.'.format(args.load_seq))
    seq_file = wrapper.restore_checkpoint(args.load_seq)['model']
    seq_model.load_state_dict(seq_file)
    seq_model.to(device)
    crit = CRFLoss(y_map)
    decoder = CRFDecode(y_map)
    evaluator = eval_wc(decoder, 'f1')

    logger.info('Constructing dataset.')
    # NOTE(review): y_map[''] is passed twice with the same key - confirm the
    # intended special-label keys.
    train_dataset, test_dataset, dev_dataset = [SeqDataset(tup_data, flm_map['\n'], blm_map['\n'], gw_map['<\n>'], c_map[' '], c_map['\n'], y_map[''], y_map[''], len(y_map), args.batch_size) for tup_data in [train_data, test_data, dev_data]]

    logger.info('Constructing optimizer.')
    param_dict = filter(lambda t: t.requires_grad, seq_model.parameters())
    optim_map = {'Adam' : optim.Adam, 'Adagrad': optim.Adagrad, 'Adadelta': optim.Adadelta, 'SGD': functools.partial(optim.SGD, momentum=0.9)}
    if args.lr > 0:
        optimizer = optim_map[args.update](param_dict, lr=args.lr)
    else:
        optimizer = optim_map[args.update](param_dict)

    logger.info('Saving configues.')
    pw.save_configue(args)

    logger.info('Setting up training environ.')
    best_f1 = float('-inf')
    # Metrics of the best checkpoint. They start at the same sentinel as
    # best_f1 so the summary after the loop can always be logged.
    best_dev_pre = best_dev_rec = best_dev_acc = float('-inf')
    best_test_f1 = best_test_pre = best_test_rec = float('-inf')
    patience_count = 0
    batch_index = 0
    normalizer = 0
    tot_loss = 0

    dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(seq_model, dev_dataset.get_tqdm(device))
    print(dev_f1)

    logger.info('Start training...')
    for indexs in range(args.epoch):

        logger.info('############')
        logger.info('Epoch: {}'.format(indexs))
        pw.nvidia_memory_map()

        iterator = train_dataset.get_tqdm(device)

        seq_model.train()
        for f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w, f_y, f_y_m, _ in iterator:

            seq_model.zero_grad()
            output = seq_model(f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w)
            loss = crit(output, f_y, f_y_m)

            # Only the CRF loss is tracked for logging; regularisation terms
            # are added afterwards.
            tot_loss += utils.to_scalar(loss)
            normalizer += 1

            if args.seq_lambda0 > 0:
                f_reg0, f_reg1, f_reg3 = flm_model_seq.regularizer()
                b_reg0, b_reg1, b_reg3 = blm_model_seq.regularizer()
                loss += args.seq_lambda0 * (f_reg3 + b_reg3)
                # The second term is applied only while the first returned
                # value, summed over both models, exceeds --seq_lambda1.
                if (f_reg0 + b_reg0 > args.seq_lambda1):
                    loss += args.seq_lambda0 * (f_reg1 + b_reg1)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(seq_model.parameters(), args.clip)
            optimizer.step()

            flm_model_seq.prox()
            blm_model_seq.prox()

            batch_index += 1
            if 0 == batch_index % 100:
                pw.add_loss_vs_batch({'training_loss': tot_loss / (normalizer + 1e-9)}, batch_index, use_logger = False)
                tot_loss = 0
                normalizer = 0

        if args.lr > 0:
            current_lr = args.lr / (1 + (indexs + 1) * args.lr_decay)
            utils.adjust_learning_rate(optimizer, current_lr)

        dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(seq_model, dev_dataset.get_tqdm(device))

        # Number of strictly positive layer weights across both models.
        nonezero_count = (flm_model_seq.rnn.weight_list.data > 0).int().cpu().sum() + (blm_model_seq.rnn.weight_list.data > 0).cpu().int().sum()

        pw.add_loss_vs_batch({'dev_f1': dev_f1, 'none_zero_count': nonezero_count.item()}, indexs, use_logger = True)
        pw.add_loss_vs_batch({'dev_pre': dev_pre, 'dev_rec': dev_rec}, indexs, use_logger = False)

        logger.info('Saving model...')
        pw.save_checkpoint(model = seq_model, is_best = (nonezero_count <= args.seq_lambda1 and dev_f1 > best_f1))

        if nonezero_count <= args.seq_lambda1 and dev_f1 > best_f1:
            test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(seq_model, test_dataset.get_tqdm(device))
            best_f1, best_dev_pre, best_dev_rec, best_dev_acc = dev_f1, dev_pre, dev_rec, dev_acc
            # Remember the test metrics of this checkpoint so that a later,
            # not-sparse-enough evaluation cannot replace them in the summary.
            best_test_f1, best_test_pre, best_test_rec = test_f1, test_pre, test_rec
            pw.add_loss_vs_batch({'tot_loss': tot_loss/(normalizer+1e-9), 'test_f1': test_f1}, indexs, use_logger = True)
            pw.add_loss_vs_batch({'test_pre': test_pre, 'test_rec': test_rec}, indexs, use_logger = False)
            patience_count = 0
        elif dev_f1 > best_f1:
            # Better dev score but too many positive weights: report the test
            # score without updating the best model or the patience counter.
            test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(seq_model, test_dataset.get_tqdm(device))
            pw.add_loss_vs_batch({'tot_loss': tot_loss/(normalizer+1e-9), 'test_f1': test_f1}, indexs, use_logger = True)
            pw.add_loss_vs_batch({'test_pre': test_pre, 'test_rec': test_rec}, indexs, use_logger = False)
        else:
            patience_count += 1
            if patience_count >= args.patience and indexs >= args.least:
                break

    if best_f1 == float('-inf'):
        logger.warning('No checkpoint satisfied the sparsity constraint; best metrics are placeholders.')

    pw.add_loss_vs_batch({'best_test_f1': best_test_f1, 'best_test_pre': best_test_pre, 'best_test_rec': best_test_rec}, 0, use_logger = True, use_writer = False)
    pw.add_loss_vs_batch({'best_dev_f1': best_f1, 'best_dev_pre': best_dev_pre, 'best_dev_rec': best_dev_rec}, 0, use_logger = True, use_writer = False)

    logger.info('Loading best_performing_model.')
    seq_param = pw.restore_best_checkpoint()['model']
    seq_model.load_state_dict(seq_param)
    seq_model.to(device)

    logger.info('Test before deleting layers.')
    test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(seq_model, test_dataset.get_tqdm(device))
    dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(seq_model, dev_dataset.get_tqdm(device))
    pw.add_loss_vs_batch({'best_test_f1': test_f1, 'best_dev_f1': dev_f1}, 1, use_logger = True, use_writer = False)

    logger.info('Deleting layers.')
    # prune_dense_rnn() is run with the model on the CPU, as in the original.
    seq_model.cpu()
    seq_model.prune_dense_rnn()
    seq_model.to(device)

    logger.info('Resulting models display.')
    print(seq_model)

    logger.info('Test after deleting layers.')
    test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(seq_model, test_dataset.get_tqdm(device))
    dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(seq_model, dev_dataset.get_tqdm(device))
    pw.add_loss_vs_batch({'best_test_f1': test_f1, 'best_dev_f1': dev_f1}, 2, use_logger = True, use_writer = False)

    seq_model.cpu()
    logger.info('Saving model...')
    seq_config = seq_model.to_params()
    pw.save_checkpoint(model = seq_model,
                       is_best = True,
                       s_dict = {'config': seq_config,
                                 'flm_map': flm_map,
                                 'blm_map': blm_map,
                                 'gw_map': gw_map,
                                 'c_map': c_map,
                                 'y_map': y_map})
    pw.close()
def evaluate(data_loader, lm_model, limited = 76800):
    """
    Compute the perplexity of ``lm_model`` on ``data_loader``.

    Parameters
    ----------
    data_loader : iterable
        Yields ``(word_t, label_t)`` batches.
    lm_model : callable
        Put into eval mode and has its hidden state reset first; calling it
        on a batch must return an object whose ``.item()`` is the batch loss.
    limited : ``int``, optional
        Stop after the first batch that brings the number of evaluated labels
        above this value. A negative value evaluates every batch.

    Returns
    -------
    ``float``
        ``exp`` of the label-weighted average loss.

    Raises
    ------
    ``ZeroDivisionError``
        If ``data_loader`` yields no labels.
    """
    lm_model.eval()
    lm_model.init_hidden()

    total_loss = 0
    total_len = 0
    for word_t, label_t in data_loader:
        label_t = label_t.view(-1)
        tmp_len = label_t.size(0)
        # Weight each batch loss by its label count.
        total_loss += tmp_len * lm_model(word_t, label_t).item()
        total_len += tmp_len

        if limited >= 0 and total_len > limited:
            break

    return math.exp(total_loss / total_len)


if __name__ == "__main__":
    # Train a language model on the pre-encoded corpus in --dataset_folder.
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=str, default="auto")
    parser.add_argument('--cp_root', default='./checkpoint')
    parser.add_argument('--checkpoint_name', default='ld0')
    parser.add_argument('--git_tracking', action='store_true')

    parser.add_argument('--dataset_folder', default='./data/one_billion/')
    parser.add_argument('--restore_checkpoint', default='')

    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--sequence_length', type=int, default=20)
    parser.add_argument('--hid_dim', type=int, default=300)
    parser.add_argument('--word_dim', type=int, default=300)
    parser.add_argument('--label_dim', type=int, default=1600)
    parser.add_argument('--layer_num', type=int, default=10)
    parser.add_argument('--droprate', type=float, default=0.01)
    parser.add_argument('--add_relu', action='store_true')
    parser.add_argument('--layer_drop', type=float, default=0.5)
    parser.add_argument('--epoch', type=int, default=400)
    parser.add_argument('--clip', type=float, default=5)
    parser.add_argument('--update', choices=['Adam', 'Adagrad', 'Adadelta'], default='Adam', help='adam is the best')
    parser.add_argument('--rnn_layer', choices=['Basic', 'DenseNet', 'LDNet'], default='LDNet')
    parser.add_argument('--rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm')
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--lr_decay', type=float, default=0.1)
    # type=int keeps command-line values numeric like the default list; they
    # are concatenated with an int vocabulary size below.
    parser.add_argument('--cut_off', nargs='+', type=int, default=[4000,40000,200000])
    parser.add_argument('--interval', type=int, default=100)
    parser.add_argument('--epoch_size', type=int, default=4000)
    parser.add_argument('--patience', type=float, default=10)
    args = parser.parse_args()

    pw = wrapper(os.path.join(args.cp_root, args.checkpoint_name), args.checkpoint_name, enable_git_track=args.git_tracking)

    gpu_index = pw.auto_device() if 'auto' == args.gpu else int(args.gpu)
    device = torch.device("cuda:" + str(gpu_index) if gpu_index >= 0 else "cpu")
    if gpu_index >= 0:
        torch.cuda.set_device(gpu_index)

    logger.info('Loading dataset.')
    # Close the pickle file once it is loaded instead of leaking the handle.
    with open(args.dataset_folder + 'test.pk', 'rb') as dataset_file:
        dataset = pickle.load(dataset_file)
    w_map, test_data, range_idx = dataset['w_map'], dataset['test_data'], dataset['range']

    train_loader = LargeDataset(args.dataset_folder, range_idx, args.batch_size, args.sequence_length)
    test_loader = EvalDataset(test_data, args.batch_size)

    logger.info('Building models.')
    rnn_map = {'Basic': BasicRNN, 'DenseNet': DenseRNN, 'LDNet': functools.partial(LDRNN, layer_drop = args.layer_drop)}
    rnn_layer = rnn_map[args.rnn_layer](args.layer_num, args.rnn_unit, args.word_dim, args.hid_dim, args.droprate)
    cut_off = args.cut_off + [len(w_map) + 1]
    if args.label_dim > 0:
        soft_max = AdaptiveSoftmax(args.label_dim, cut_off)
    else:
        soft_max = AdaptiveSoftmax(rnn_layer.output_dim, cut_off)
    lm_model = LM(rnn_layer, soft_max, len(w_map), args.word_dim, args.droprate, label_dim = args.label_dim, add_relu=args.add_relu)
    lm_model.rand_ini()

    logger.info('Building optimizer.')
    optim_map = {'Adam' : optim.Adam, 'Adagrad': optim.Adagrad, 'Adadelta': optim.Adadelta}
    if args.lr > 0:
        optimizer = optim_map[args.update](lm_model.parameters(), lr=args.lr)
    else:
        optimizer = optim_map[args.update](lm_model.parameters())

    if args.restore_checkpoint:
        if os.path.isfile(args.restore_checkpoint):
            logger.info("loading checkpoint: '{}'".format(args.restore_checkpoint))
            model_file = wrapper.restore_checkpoint(args.restore_checkpoint)['model']
            lm_model.load_state_dict(model_file, False)
        else:
            logger.info("no checkpoint found at: '{}'".format(args.restore_checkpoint))
    lm_model.to(device)

    logger.info('Saving configues.')
    pw.save_configue(args)

    logger.info('Setting up training environ.')
    # Holds the lowest summed loss of an --epoch_size window, not a
    # perplexity; the two are monotonically related.
    best_train_ppl = float('inf')
    cur_lr = args.lr
    batch_index = 0
    epoch_loss = 0
    patience = 0
    # Bound before the loop so the KeyboardInterrupt handler can always log.
    indexs = 0

    # All metrics are reported through `pw`; no separate summary writer is
    # created in this script.
    try:
        for indexs in range(args.epoch):

            logger.info('############')
            logger.info('Epoch: {}'.format(indexs))
            pw.nvidia_memory_map()

            lm_model.train()

            for word_t, label_t in train_loader.get_tqdm(device):

                # Reset the hidden state when the loader reports cur_idx == 1.
                if 1 == train_loader.cur_idx:
                    lm_model.init_hidden()

                label_t = label_t.view(-1)

                lm_model.zero_grad()
                loss = lm_model(word_t, label_t)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(lm_model.parameters(), args.clip)
                optimizer.step()

                batch_index += 1

                if 0 == batch_index % args.interval:
                    s_loss = utils.to_scalar(loss)
                    pw.add_loss_vs_batch({'batch_loss': s_loss}, batch_index, use_logger = False)

                epoch_loss += utils.to_scalar(loss)
                if 0 == batch_index % args.epoch_size:
                    epoch_ppl = math.exp(epoch_loss / args.epoch_size)
                    pw.add_loss_vs_batch({'train_ppl': epoch_ppl}, batch_index, use_logger = True)
                    if epoch_loss < best_train_ppl:
                        best_train_ppl = epoch_loss
                        patience = 0
                    else:
                        patience += 1
                    epoch_loss = 0

                    # Decay the learning rate after too many windows without
                    # improvement, then start tracking afresh.
                    if patience > args.patience and cur_lr > 0:
                        patience = 0
                        cur_lr *= args.lr_decay
                        best_train_ppl = float('inf')
                        logger.info('adjust_learning_rate...')
                        utils.adjust_learning_rate(optimizer, cur_lr)

            test_ppl = evaluate(test_loader.get_tqdm(device), lm_model)
            pw.add_loss_vs_batch({'test_ppl': test_ppl}, indexs, use_logger = True)
            pw.save_checkpoint(model = lm_model, optimizer = optimizer, is_best = True)

    except KeyboardInterrupt:

        # Evaluate and save once more so an interrupted run is not lost.
        logger.info('Exiting from training early')
        test_ppl = evaluate(test_loader.get_tqdm(device), lm_model)
        pw.add_loss_vs_batch({'test_ppl': test_ppl}, indexs, use_logger = True)
        pw.save_checkpoint(model = lm_model, optimizer = optimizer, is_best = True)

    pw.close()
default=300) parser.add_argument('--lm_label_dim', type=int, default=-1) parser.add_argument('--lm_layer_num', type=int, default=10) parser.add_argument('--lm_droprate', type=float, default=0.5) parser.add_argument('--lm_rnn_layer', choices=['Basic', 'DenseNet', 'LDNet'], default='LDNet') parser.add_argument('--lm_rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm') parser.add_argument('--seq_c_dim', type=int, default=30) parser.add_argument('--seq_c_hid', type=int, default=150) parser.add_argument('--seq_c_layer', type=int, default=1) parser.add_argument('--seq_w_dim', type=int, default=100) parser.add_argument('--seq_w_hid', type=int, default=300) parser.add_argument('--seq_w_layer', type=int, default=1) parser.add_argument('--seq_droprate', type=float, default=0.5) parser.add_argument('--seq_model', choices=['vanilla', 'lm-aug'], default='lm-aug') parser.add_argument('--seq_rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm') parser.add_argument('--seq_lm_model', choices=['vanilla', 'sparse-lm'], default='vanilla') parser.add_argument('--batch_size', type=int, default=10) parser.add_argument('--patience', type=int, default=15) parser.add_argument('--epoch', type=int, default=200) parser.add_argument('--clip', type=float, default=5) parser.add_argument('--lr', type=float, default=0.015) parser.add_argument('--lr_decay', type=float, default=0.05) parser.add_argument('--update', choices=['Adam', 'Adagrad', 'Adadelta', 'SGD'], default='SGD') args = parser.parse_args() pw = wrapper(os.path.join(args.cp_root, args.checkpoint_name), args.checkpoint_name, enable_git_track=args.git_tracking) gpu_index = pw.auto_device() if 'auto' == args.gpu else int(args.gpu) device = torch.device("cuda:" + str(gpu_index) if gpu_index >= 0 else "cpu") if gpu_index >= 0: torch.cuda.set_device(gpu_index) logger.info('Loading data') dataset = pickle.load(open(args.corpus, 'rb')) name_list = ['flm_map', 'blm_map', 'gw_map', 'c_map', 'y_map', 'emb_array', 'train_data', 
'test_data', 'dev_data'] flm_map, blm_map, gw_map, c_map, y_map, emb_array, train_data, test_data, dev_data = [dataset[tup] for tup in name_list ] logger.info('Loading language model') rnn_map = {'Basic': BasicRNN, 'DenseNet': DenseRNN, 'LDNet': functools.partial(LDRNN, layer_drop = 0)} flm_rnn_layer = rnn_map[args.lm_rnn_layer](args.lm_layer_num, args.lm_rnn_unit, args.lm_word_dim, args.lm_hid_dim, args.lm_droprate) blm_rnn_layer = rnn_map[args.lm_rnn_layer](args.lm_layer_num, args.lm_rnn_unit, args.lm_word_dim, args.lm_hid_dim, args.lm_droprate) flm_model = LM(flm_rnn_layer, None, len(flm_map), args.lm_word_dim, args.lm_droprate, label_dim = args.lm_label_dim) blm_model = LM(blm_rnn_layer, None, len(blm_map), args.lm_word_dim, args.lm_droprate, label_dim = args.lm_label_dim) flm_file = wrapper.restore_checkpoint(args.forward_lm)['model'] flm_model.load_state_dict(flm_file, False) blm_file = wrapper.restore_checkpoint(args.backward_lm)['model'] blm_model.load_state_dict(blm_file, False) slm_map = {'vanilla': BasicSeqLM, 'sparse-lm': SparseSeqLM} flm_model_seq = slm_map[args.seq_lm_model](flm_model, False, args.lm_droprate, True) blm_model_seq = slm_map[args.seq_lm_model](blm_model, True, args.lm_droprate, True) logger.info('Building models') SL_map = {'vanilla':Vanilla_SeqLabel, 'lm-aug': SeqLabel} seq_model = SL_map[args.seq_model](flm_model_seq, blm_model_seq, len(c_map), args.seq_c_dim, args.seq_c_hid, args.seq_c_layer, len(gw_map), args.seq_w_dim, args.seq_w_hid, args.seq_w_layer, len(y_map), args.seq_droprate, unit=args.seq_rnn_unit) seq_model.rand_init() seq_model.load_pretrained_word_embedding(torch.FloatTensor(emb_array)) seq_model.to(device) crit = CRFLoss(y_map) decoder = CRFDecode(y_map) evaluator = eval_wc(decoder, 'f1') logger.info('Constructing dataset') train_dataset, test_dataset, dev_dataset = [SeqDataset(tup_data, flm_map['\n'], blm_map['\n'], gw_map['<\n>'], c_map[' '], c_map['\n'], y_map[''], y_map[''], len(y_map), args.batch_size) for tup_data 
in [train_data, test_data, dev_data]] logger.info('Constructing optimizer') param_dict = filter(lambda t: t.requires_grad, seq_model.parameters()) optim_map = {'Adam' : optim.Adam, 'Adagrad': optim.Adagrad, 'Adadelta': optim.Adadelta, 'SGD': functools.partial(optim.SGD, momentum=0.9)} if args.lr > 0: optimizer=optim_map[args.update](param_dict, lr=args.lr) else: optimizer=optim_map[args.update](param_dict) logger.info('Saving configues.') pw.save_configue(args) logger.info('Setting up training environ.') best_f1 = float('-inf') patience_count = 0 batch_index = 0 normalizer=0 tot_loss = 0 for indexs in range(args.epoch): logger.info('############') logger.info('Epoch: {}'.format(indexs)) pw.nvidia_memory_map() seq_model.train() for f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w, f_y, f_y_m, _ in train_dataset.get_tqdm(device): seq_model.zero_grad() output = seq_model(f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w) loss = crit(output, f_y, f_y_m) tot_loss += utils.to_scalar(loss) normalizer += 1 loss.backward() torch.nn.utils.clip_grad_norm_(seq_model.parameters(), args.clip) optimizer.step() batch_index += 1 if 0 == batch_index % 100: pw.add_loss_vs_batch({'training_loss': tot_loss / (normalizer + 1e-9)}, batch_index, use_logger = False) tot_loss = 0 normalizer = 0 if args.lr > 0: current_lr = args.lr / (1 + (indexs + 1) * args.lr_decay) utils.adjust_learning_rate(optimizer, current_lr) dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(seq_model, dev_dataset.get_tqdm(device)) pw.add_loss_vs_batch({'dev_f1': dev_f1}, indexs, use_logger = True) pw.add_loss_vs_batch({'dev_pre': dev_pre, 'dev_rec': dev_rec}, indexs, use_logger = False) logger.info('Saving model...') pw.save_checkpoint(model = seq_model, is_best = (dev_f1 > best_f1)) if dev_f1 > best_f1: test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(seq_model, test_dataset.get_tqdm(device)) best_f1, best_dev_pre, best_dev_rec, best_dev_acc = dev_f1, dev_pre, dev_rec, dev_acc 
pw.add_loss_vs_batch({'test_f1': test_f1}, indexs, use_logger = True) pw.add_loss_vs_batch({'test_pre': test_pre, 'test_rec': test_rec}, indexs, use_logger = False) patience_count = 0 else: patience_count += 1 if patience_count >= args.patience: break pw.close() ================================================ FILE: train_seq_elmo.py ================================================ from __future__ import print_function import datetime import time import torch import torch.nn as nn import torch.optim as optim import codecs import pickle import math import numpy as np from model_word_ada.LM import LM from model_word_ada.basic import BasicRNN from model_word_ada.densenet import DenseRNN from model_word_ada.ldnet import LDRNN from model_seq.crf import CRFLoss, CRFDecode from model_seq.dataset import SeqDataset from model_seq.evaluator import eval_wc from model_seq.seqlabel import SeqLabel, Vanilla_SeqLabel from model_seq.seqlm import BasicSeqLM from model_seq.elmo import ElmoLM import model_seq.utils as utils from torch_scope import wrapper import argparse import logging import json import os import sys import itertools import functools logger = logging.getLogger(__name__) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=str, default="auto") parser.add_argument('--cp_root', default='./checkpoint') parser.add_argument('--checkpoint_name', default='elmo_ner') parser.add_argument('--git_tracking', action='store_true') parser.add_argument('--corpus', default='./data/ner_dataset.pk') parser.add_argument('--forward_lm', default='./checkpoint/basic0.th') parser.add_argument('--backward_lm', default='./checkpoint/basic_0.th') parser.add_argument('--lm_hid_dim', type=int, default=2048) parser.add_argument('--lm_word_dim', type=int, default=300) parser.add_argument('--lm_label_dim', type=int, default=-1) parser.add_argument('--lm_layer_num', type=int, default=2) parser.add_argument('--lm_droprate', type=float, default=0.5) 
# Remaining command-line options: language-model RNN variant, sequence-labeling
# model sizes, and optimisation hyper-parameters.
parser.add_argument('--lm_rnn_layer', choices=['Basic'], default='Basic')
parser.add_argument('--lm_rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm')

parser.add_argument('--seq_c_dim', type=int, default=30)
parser.add_argument('--seq_c_hid', type=int, default=150)
parser.add_argument('--seq_c_layer', type=int, default=1)
parser.add_argument('--seq_w_dim', type=int, default=100)
parser.add_argument('--seq_w_hid', type=int, default=300)
parser.add_argument('--seq_w_layer', type=int, default=1)
parser.add_argument('--seq_droprate', type=float, default=0.5)
parser.add_argument('--seq_model', choices=['vanilla', 'lm-aug'], default='lm-aug')
parser.add_argument('--seq_rnn_unit', choices=['gru', 'lstm', 'rnn'], default='lstm')
parser.add_argument('--seq_lambda0', type=float, default=0.01)
# The training loop passes args.seq_lambda1 to both language-model
# regularizers whenever seq_lambda0 > 0, so the option has to exist.
# NOTE(review): the meaning of this value is defined by ElmoLM.regularizer,
# which is not visible from this script, so no numeric default is invented
# here -- confirm the expected signature and a sensible default.
parser.add_argument('--seq_lambda1', type=float, default=None)

parser.add_argument('--batch_size', type=int, default=10)
parser.add_argument('--patience', type=int, default=15)
parser.add_argument('--epoch', type=int, default=200)
parser.add_argument('--clip', type=float, default=5)
parser.add_argument('--lr', type=float, default=0.015)
parser.add_argument('--lr_decay', type=float, default=0.05)
parser.add_argument('--update', choices=['Adam', 'Adagrad', 'Adadelta', 'SGD'], default='SGD')
args = parser.parse_args()

# Fail before any data or checkpoint is loaded, instead of raising
# AttributeError on the first training batch.
if args.seq_lambda0 > 0 and args.seq_lambda1 is None:
    parser.error('--seq_lambda1 is required when --seq_lambda0 > 0 '
                 '(pass --seq_lambda0 0 to disable the regularizer)')

pw = wrapper(os.path.join(args.cp_root, args.checkpoint_name), args.checkpoint_name,
             enable_git_track=args.git_tracking)

# 'auto' asks the wrapper for a device index; a negative index selects the CPU.
gpu_index = pw.auto_device() if 'auto' == args.gpu else int(args.gpu)
device = torch.device("cuda:" + str(gpu_index) if gpu_index >= 0 else "cpu")
if gpu_index >= 0:
    torch.cuda.set_device(gpu_index)

logger.info('Loading data')
# NOTE: unpickling can execute arbitrary code; only point --corpus at files
# produced by a trusted pre-processing run.
with open(args.corpus, 'rb') as corpus_file:
    dataset = pickle.load(corpus_file)
name_list = ['flm_map', 'blm_map', 'gw_map', 'c_map', 'y_map', 'emb_array',
             'train_data', 'test_data', 'dev_data']
flm_map, blm_map, gw_map, c_map, y_map, emb_array, train_data, test_data, dev_data = [
    dataset[tup] for tup in name_list]

logger.info('Loading language model')
rnn_map = {'Basic': BasicRNN}
flm_rnn_layer = rnn_map[args.lm_rnn_layer](args.lm_layer_num, args.lm_rnn_unit, args.lm_word_dim,
                                           args.lm_hid_dim, args.lm_droprate)
blm_rnn_layer = rnn_map[args.lm_rnn_layer](args.lm_layer_num, args.lm_rnn_unit, args.lm_word_dim,
                                           args.lm_hid_dim, args.lm_droprate)
flm_model = LM(flm_rnn_layer, None, len(flm_map), args.lm_word_dim, args.lm_droprate,
               label_dim=args.lm_label_dim)
blm_model = LM(blm_rnn_layer, None, len(blm_map), args.lm_word_dim, args.lm_droprate,
               label_dim=args.lm_label_dim)

# Restore the pre-trained forward and backward language models.
# NOTE(review): the positional False is presumably strict=False -- confirm
# against LM.load_state_dict.
flm_file = wrapper.restore_checkpoint(args.forward_lm)['model']
flm_model.load_state_dict(flm_file, False)
blm_file = wrapper.restore_checkpoint(args.backward_lm)['model']
blm_model.load_state_dict(blm_file, False)

# The two wrappers differ only in the second positional argument
# (False for the forward model, True for the backward model).
flm_model_seq = ElmoLM(flm_model, False, args.lm_droprate, True)
blm_model_seq = ElmoLM(blm_model, True, args.lm_droprate, True)

logger.info('Building model')
SL_map = {'vanilla': Vanilla_SeqLabel, 'lm-aug': SeqLabel}
seq_model = SL_map[args.seq_model](flm_model_seq, blm_model_seq,
                                   len(c_map), args.seq_c_dim, args.seq_c_hid, args.seq_c_layer,
                                   len(gw_map), args.seq_w_dim, args.seq_w_hid, args.seq_w_layer,
                                   len(y_map), args.seq_droprate, unit=args.seq_rnn_unit)
seq_model.rand_init()
seq_model.load_pretrained_word_embedding(torch.FloatTensor(emb_array))
seq_model.to(device)
crit = CRFLoss(y_map)
decoder = CRFDecode(y_map)
evaluator = eval_wc(decoder, 'f1')

print('constructing dataset')
# NOTE(review): both label-index arguments look up the same key ''; verify
# against SeqDataset's signature and the keys actually present in y_map.
train_dataset, test_dataset, dev_dataset = [
    SeqDataset(tup_data, flm_map['\n'], blm_map['\n'], gw_map['<\n>'], c_map[' '], c_map['\n'],
               y_map[''], y_map[''], len(y_map), args.batch_size)
    for tup_data in [train_data, test_data, dev_data]]

print('constructing optimizer')
# Materialise the trainable parameters as a list: a bare filter object can
# only be iterated once.
param_dict = [param for param in seq_model.parameters() if param.requires_grad]
optim_map = {'Adam': optim.Adam, 'Adagrad': optim.Adagrad, 'Adadelta': optim.Adadelta,
             'SGD': functools.partial(optim.SGD, momentum=0.9)}
# A non-positive --lr omits the lr argument, leaving the value to the
# optimizer constructor (NOTE(review): some torch releases require lr for
# SGD -- confirm for the pinned version).
if args.lr > 0:
    optimizer = optim_map[args.update](param_dict, lr=args.lr)
else:
    optimizer = optim_map[args.update](param_dict)
# Persist the parsed command-line configuration through the wrapper.
logger.info('Saving configues.')
pw.save_configue(args)

logger.info('Setting up training environ.')
best_f1 = float('-inf')
stale_epochs = 0   # consecutive epochs without a new best dev F1
step_count = 0     # optimizer steps taken across all epochs
loss_sum = 0       # criterion loss accumulated since the last report
loss_count = 0     # batches accumulated since the last report

for epoch_idx in range(args.epoch):
    logger.info('############')
    logger.info('Epoch: {}'.format(epoch_idx))
    pw.nvidia_memory_map()

    seq_model.train()
    for batch in train_dataset.get_tqdm(device):
        f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w, f_y, f_y_m, _ = batch

        seq_model.zero_grad()
        scores = seq_model(f_c, f_p, b_c, b_p, flm_w, blm_w, blm_ind, f_w)
        loss = crit(scores, f_y, f_y_m)

        # Only the criterion's loss feeds the running average; the
        # regularization term added below is not part of the report.
        loss_sum += utils.to_scalar(loss)
        loss_count += 1

        if args.seq_lambda0 > 0:
            # NOTE(review): reads args.seq_lambda1 -- verify that the
            # argument parser of this script defines it.
            lm_penalty = (flm_model_seq.regularizer(args.seq_lambda1)
                          + blm_model_seq.regularizer(args.seq_lambda1))
            loss += args.seq_lambda0 * lm_penalty

        loss.backward()
        torch.nn.utils.clip_grad_norm_(seq_model.parameters(), args.clip)
        optimizer.step()

        # Report the mean training loss every 100 steps, then reset the
        # accumulators.
        step_count += 1
        if step_count % 100 != 0:
            continue
        mean_loss = loss_sum / (loss_count + 1e-9)
        pw.add_loss_vs_batch({'training_loss': mean_loss}, step_count, use_logger=False)
        loss_sum = 0
        loss_count = 0

    # Per-epoch learning-rate decay: lr / (1 + (epoch + 1) * lr_decay).
    if args.lr > 0:
        decayed_lr = args.lr / (1 + (epoch_idx + 1) * args.lr_decay)
        utils.adjust_learning_rate(optimizer, decayed_lr)

    dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(seq_model, dev_dataset.get_tqdm(device))
    pw.add_loss_vs_batch({'dev_f1': dev_f1}, epoch_idx, use_logger=True)
    pw.add_loss_vs_batch({'dev_pre': dev_pre, 'dev_rec': dev_rec}, epoch_idx, use_logger=False)

    logger.info('Saving model...')
    improved = dev_f1 > best_f1
    pw.save_checkpoint(model=seq_model, is_best=improved)

    if not improved:
        # Early stopping: give up after args.patience epochs without a new
        # best dev F1.
        stale_epochs += 1
        if stale_epochs >= args.patience:
            break
        continue

    # New best dev F1: score the test set and remember the dev metrics.
    test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(seq_model, test_dataset.get_tqdm(device))
    best_f1, best_dev_pre, best_dev_rec, best_dev_acc = dev_f1, dev_pre, dev_rec, dev_acc
    pw.add_loss_vs_batch({'test_f1': test_f1}, epoch_idx, use_logger=True)
    pw.add_loss_vs_batch({'test_pre': test_pre, 'test_rec': test_rec}, epoch_idx, use_logger=False)
    stale_epochs = 0

pw.close()