[
  {
    "path": ".amlignore",
    "content": "data/\nexamples/\n"
  },
  {
    "path": ".bumpversion.cfg",
    "content": "[bumpversion]\ncurrent_version = 1.0.0\ncommit = True\ntag = True\nmessage = \"Bump version: {current_version} -> {new_version}\"\n\n[bumpversion:file:setup.py]\nsearch = version='{current_version}'\nreplace = version='{new_version}'\n\n[bumpversion:file:utils_nlp/__init__.py]\nsearch = __version__ = '{current_version}'\nreplace = __version__ = '{new_version}'"
  },
  {
    "path": ".flake8",
    "content": "[flake8]\n# Initial set of rules\n# Feel free to add any new rule here with a description of what it does.\n\n# E203\tWhitespace before ':'\n# E266\tToo many leading '#' for block comment\n# E501\tLine too long (82 > 79 characters)\n# W503\tLine break occurred before a binary operator\n# F403\t'from module import *' used; unable to detect undefined names\n# F405  '<function>' may be undefined, or defined from star imports\n# E402  module level import not at top of file\n# E731  do not assign a lambda expression, use a def\n# F821  undefined name 'get_ipython' --> from generated python files using nbconvert\n# E722: do not use bare except\n# E231: missing white space after \",\" --> black generates autoformat [,] which fails flake8\nignore = E203, E266, W503, F403, F405, E402, E731, F821, E722, E231\n\nmax-line-length = 88\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.md",
    "content": "---\nname: Bug report\nabout: Create a report to help us improve\ntitle: \"[BUG] \"\nlabels: 'bug'\nassignees: ''\n\n---\n\n### Description\n<!--- Describe your bug in detail -->\n\n\n### How do we replicate the bug?\n<!--- Please be as specific as possible (use a list if needed). -->\n<!--- For example: -->\n<!--- * Create a conda environment for gpu -->\n<!--- * Run unit test `test_timer.py` -->\n<!--- * ... -->\n\n\n### Expected behavior (i.e. solution)\n<!--- For example:  -->\n<!--- * The tests for the timer should pass successfully. -->\n\n\n### Other Comments\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.md",
    "content": "---\nname: Feature request\nabout: Suggest an idea for this project\ntitle: \"[FEATURE] \"\nlabels: 'enhancement'\nassignees: ''\n\n---\n\n### Description\n<!--- Describe your expected feature in detail -->\n\n\n### Expected behavior with the suggested feature\n<!--- For example:  -->\n<!--- *Adding algorithm xxx will help people understand more about xxx use case scenarios. -->\n\n\n### Other Comments\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/general-ask.md",
    "content": "---\nname: General ask\nabout: Technical/non-technical asks about the repo\ntitle: \"[ASK] \"\nlabels: ''\nassignees: ''\n\n---\n\n### Description\n<!--- Describe your general ask in detail -->\n\n\n### Other Comments\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE.md",
    "content": "### Description\n<!--- Describe your issue/bug/request in detail -->\n\n\n### In which platform does it happen?\n<!--- Describe the platform where the issue is happening (use a list if needed) -->\n<!--- For example: -->\n<!--- * Azure Ubuntu Data Science Virtual Machine. -->\n<!--- * Other platforms.  -->\n\n\n### How do we replicate the issue?\n<!--- Please be as specific as possible (use a list if needed). -->\n<!--- For example: -->\n<!--- * Create a conda environment for gpu -->\n<!--- * Run unit test `test_timer.py` -->\n<!--- * ... -->\n\n\n### Expected behavior (i.e. solution)\n<!--- For example:  -->\n<!--- * The tests for the timer should pass successfully. -->\n\n\n### Other Comments\n"
  },
  {
    "path": ".github/PULL_REQUEST_TEMPLATE.md",
    "content": "### Description\n<!--- Describe your changes in detail -->\n<!--- Why is this change required? What problem does it solve? -->\n\n\n### Related Issues\n<!--- If it fixes an open issue, please link to the issue here. -->\n\n\n### Checklist:\n<!--- Go over all the following points, and put an `x` in all the boxes that apply. -->\n<!--- If you're unsure about any of these, don't hesitate to ask. We're here to help! -->\n- [ ] My code follows the code style of this project, as detailed in our [contribution guidelines](https://github.com/microsoft/nlp-recipes/blob/master/CONTRIBUTING.md).\n- [ ] I have added tests.\n- [ ] I have updated the documentation accordingly.\n\n\n\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\npip-wheel-metadata/\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# pyenv\n.python-version\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n\n\n##########################\n.DS_Store\n.~*\nUntitled*.ipynb\n*-Copy*.ipynb\n~$*\noutput.ipynb\n.idea/\n*.npz\n*.data\n*.dat\n*.csv\n*.tsv\n*.zip\n.vscode/\ntools/repo_metrics/config.py\n*.jar\n*.item\n*.pkl\nnlp_*.yaml\nnohup.out\ntemp/\ntmp/\nlogs/\nscore.py\n\n# Data\ndata/\nsquad/\nbidaf-question-answering/\n*/question_answering/bidaf.tar.gz\n*/question_answering/bidafenv.yml\n*/question_answering/config.json\n*/question_answering/vocabulary/\n*/question_answering/weights.th\n\n# AML Config\naml_config/\n.azureml/\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n-   repo: https://github.com/ambv/black\n    rev: stable\n    hooks:\n    - id: black\n      language_version: python3.6\n-   repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v1.2.3\n    hooks:\n    - id: flake8\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contribution Guidelines\n\nContributions are welcome! Here are a few things to know:\n\n- [Contribution Guidelines](#contribution-guidelines)\n  - [Microsoft Contributor License Agreement](#microsoft-contributor-license-agreement)\n  - [Steps to Contributing](#steps-to-contributing)\n  - [Coding Guidelines](#coding-guidelines)\n  - [Code of Conduct](#code-of-conduct)\n      - [Do not point fingers](#do-not-point-fingers)\n      - [Provide code feedback based on evidence](#provide-code-feedback-based-on-evidence)\n      - [Ask questions do not give answers](#ask-questions-do-not-give-answers)\n\n## Microsoft Contributor License Agreement\n\nMost contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com.\n\nWhen you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.\n\n## Steps to Contributing\n\nHere are the basic steps to get started with your first contribution. Please reach out with any questions.\n1. Use [open issues](https://github.com/Microsoft/Recommenders/issues) to discuss the proposed changes. Create an issue describing changes if necessary to collect feedback. Also, please use provided labels to tag issues so everyone can easily sort issues of interest.\n2. [Fork the repo](https://help.github.com/articles/fork-a-repo/) so you can make and test local changes.\n3. Create a new branch for the issue. We suggest prefixing the branch with your username and then a descriptive title: (e.g. gramhagen/update_contributing_docs)\n4. Create a test that replicates the issue.\n5. Make code changes.\n6. 
Ensure unit tests pass and code style / formatting is consistent (see [wiki](https://github.com/Microsoft/Recommenders/wiki/Coding-Guidelines#python-and-docstrings-style) for more details).\n7. We use [pre-commit](https://pre-commit.com/) package to run our pre-commit hooks. We use black formatter and flake8 linting on each commit. In order to set up pre-commit on your machine, follow the steps here, please note that you only need to run these steps the first time you use pre-commit for this project.\n   \n   * Update your conda environment, pre-commit is part of the yaml file or just do    \n   ```\n    $ pip install pre-commit\n   ```    \n   * Set up pre-commit by running following command, this will put pre-commit under your .git/hooks directory. \n   ```\n   $ pre-commit install\n   ```\n   ```\n   $ git commit -m \"message\" \n   ```\n   * Each time you commit, git will run the pre-commit hooks (black and flake8 for now) on any python files that are getting committed and are part of the git index.  If black modifies/formats the file, or if flake8 finds any linting errors, the commit will not succeed. You will need to stage the file again if black changed the file, or fix the issues identified by flake8 and stage it again.\n\n   * To run pre-commit on all files just run\n   ```\n   $ pre-commit run --all-files\n   ```\n8. Create a pull request against <b>staging</b> branch.\n\nNote: We use the staging branch to land all new features, so please remember to create the Pull Request against staging. \n\nOnce the features included in a milestone are complete we will merge staging into master and make a release. See the wiki for more detail about our [merge strategy](https://github.com/Microsoft/Recommenders/wiki/Strategy-to-merge-the-code-to-master-branch).\n\n## Coding Guidelines\n\nWe strive to maintain high quality code to make the utilities in the repository easy to understand, use, and extend. We also work hard to maintain a friendly and constructive environment. 
We've found that having clear expectations on the development process and consistent style helps to ensure everyone can contribute and collaborate effectively.\n\nPlease review the [coding guidelines](https://github.com/Microsoft/Recommenders/wiki/Coding-Guidelines) wiki page to see more details about the expectations for development approach and style.\n\n## Code of Conduct\n\nThis project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).\n\nFor more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.\n\nApart from the official Code of Conduct developed by Microsoft, in the Recommenders team we adopt the following behaviors, to ensure a great working environment:\n\n#### Do not point fingers\nLet’s be constructive.\n\n<details>\n<summary><em>Click here to see some examples</em></summary>\n\n\"This method is missing docstrings\" instead of \"YOU forgot to put docstrings\".\n\n</details>\n\n#### Provide code feedback based on evidence \n\nWhen making code reviews, try to support your ideas based on evidence (papers, library documentation, stackoverflow, etc) rather than your personal preferences. \n\n<details>\n<summary><em>Click here to see some examples</em></summary>\n\n\"When reviewing this code, I saw that the Python implementation the metrics are based on classes, however, [scikit-learn](https://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics) and [tensorflow](https://www.tensorflow.org/api_docs/python/tf/metrics) use functions. We should follow the standard in the industry.\"\n\n</details>\n\n\n#### Ask questions do not give answers\nTry to be empathic. \n\n<details>\n<summary><em>Click here to see some examples</em></summary>\n\n* Would it make more sense if ...?\n* Have you considered this ... ?\n\n</details>\n\n"
  },
  {
    "path": "DatasetReferences.md",
    "content": "MICROSOFT PROVIDES THE DATASETS ON AN \"AS IS\" BASIS. MICROSOFT MAKES NO WARRANTIES, EXPRESS OR IMPLIED, GUARANTEES OR CONDITIONS WITH RESPECT TO YOUR USE OF THE DATASETS. TO THE EXTENT PERMITTED UNDER YOUR LOCAL LAW, MICROSOFT DISCLAIMS ALL LIABILITY FOR ANY DAMAGES OR LOSSES, INCLUDING DIRECT, CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE, RESULTING FROM YOUR USE OF THE DATASETS.\n\nThe datasets are provided under the original terms that Microsoft received such datasets. See below for more information about each dataset.\n\n### <a name=\"cnndm\"></a> CNN/Daily Mail (CNN/DM) Dataset\nThe training and evaluation for CNN/DM  dataset is available at https://s3.amazonaws.com/opennmt-models/Summary/cnndm.tar.gz and released under MIT License. This is a processed version of data that's originally released by Hermann et al. (2015) in [\"Teaching machines to read and comprehend\"](https://arxiv.org/abs/1506.03340) and then made available by Kyunghyun Cho at https://cs.nyu.edu/~kcho/DMQA/.\n\n### Preprocessed CNN/Daily Mail (CNN/DM) Dataset by BERTSUM\nThe preprocessed dataset of [CNN/DM dataset](#cnndm), originally published by BERTSUM paper [\"Fine-tune BERT for Extractive Summarization\"](https://arxiv.org/pdf/1903.10318.pdf), can be found at https://github.com/nlpyang/BertSum and released under Apache License 2.0.\n\n\n### Microsoft Research Paraphrase Corpus\n>Original source: https://www.microsoft.com/en-us/download/details.aspx?id=52398\n\n\n### The Multi-Genre NLI Corpus (MultiNLI)\n>The majority of the corpus is released under the [OANC](https://www.anc.org/OANC/license.txt)’s license, The data in the FICTION section falls under several permissive licenses. See the [data description paper](https://www.nyu.edu/projects/bowman/multinli/paper.pdf) for details.\nRedistributing the datasets \"MultiNLI 1.0.zip\", \"MultiNLI Matched.zip\", and \"MultiNLI Mismatched.zip\" with attribution:\nAdina Williams, Nikita Nangia, Samuel R. Bowman. 2018. 
A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference. Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers).\nOriginal source: https://www.nyu.edu/projects/bowman/multinli/\n\n### The Stanford Natural Language Inference (SNLI) Corpus\n>This dataset is provided under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).\nRedistributing the dataset \"snli_1.0.zip\" with attribution:\nSamuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. A large annotated corpus for learning natural language inference. In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing (EMNLP).\nOriginal source: https://nlp.stanford.edu/projects/snli/\nThe dataset is preprocessed to remove unused columns and badly formatted rows.\n\n### Wikigold dataset\n>This dataset is provided under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/deed.ast).\nRedistributing the dataset \"wikigold.conll.txt\" with attribution:\nBalasuriya, Dominic, et al. \"Named entity recognition in wikipedia.\"\nProceedings of the 2009 Workshop on The People's Web Meets NLP: Collaboratively Constructed Semantic Resources. Association for Computational Linguistics, 2009.\nOriginal source: https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold/CONLL-format/data\nThe dataset is preprocessed to fit data format requirement of BERT.\n\n### The Cross-Lingual NLI Corpus (XNLI)\n>The majority of the corpus sentences are released under the [OANC](https://www.anc.org/OANC/license.txt)’s license. The data in the Fiction genre from Captain Blood are under [The_Project_Gutenberg_License](http://www.gutenberg.org/wiki/Gutenberg:The_Project_Gutenberg_License). 
See details in the [XNLI paper](https://arxiv.org/pdf/1809.05053.pdf).\nRedistributing the datasets \"XNLI 1.0.zip\" and \"XNLI-MT 1.0.zip\" with attribution:\nAlexis Conneau, Guillaume Lample, Ruty Rinott, Holger Schwenk, Ves Stoyanov. 2018. XNLI: Evaluating Cross-lingual Sentence Representations. Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing.\nOriginal source: https://www.nyu.edu/projects/bowman/xnli/\nThe dataset is preprocessed to remove unused columns.\n\n### The Stanford Question Answering Dataset (SQuAD)\n>This dataset is provided under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode).\nRedistributing the datasets \"train-v1.1.json\" and \"dev-v1.1.json\" with attribution:\nPranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016. SQuAD: 100,000+ Questions for Machine Comprehension of Text. Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing (EMNLP).\nOriginal source: https://github.com/rajpurkar/SQuAD-explorer\n\n\n### The STSbenchmark dataset\n>Redistributing the dataset \"Stsbenchmark.tar.gz\" with attribution:\nEneko Agirre, Daniel Cer, Mona Diab, Iñigo Lopez-Gazpio, Lucia\n Specia. Semeval-2017 Task 1: Semantic Textual Similarity\n Multilingual and Crosslingual Focused Evaluation. Proceedings of\n SemEval 2017.\n Original source: http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark\n The dataset is preprocessed to remove unused columns.\n>The scores are released under [Commons Attribution - Share Alike 4.0\nInternational License](http://creativecommons.org/licenses/by-sa/4.0/)\n> The text of each dataset has a license of its own, as follows:\n\n>- MSR-Paraphrase, Microsoft Research Paraphrase Corpus. 
In order to use\n  MSRpar, researchers need to agree with the license terms from\n  Microsoft Research:\n  http://research.microsoft.com/en-us/downloads/607d14d9-20cd-47e3-85bc-a2f65cd28042/\n\n>- headlines: Mined from several news sources by European Media Monitor\n  (Best et al. 2005). using the RSS feed. European Media Monitor (EMM)\n  Real Time News Clusters are the top news stories for the last 4\n  hours, updated every ten minutes. The article clustering is fully\n  automatic. The selection and placement of stories are determined\n  automatically by a computer program. This site is a joint project of\n  DG-JRC and DG-COMM. The information on this site is subject to a\n  disclaimer (see\n  http://europa.eu/geninfo/legal_notices_en.htm). Please acknowledge\n  EMM when (re)using this material.\n  http://emm.newsbrief.eu/rss?type=rtn&language=en&duplicates=false\n\n>- deft-news: A subset of news article data in the DEFT\n  project.\n\n>- MSR-Video, Microsoft Research Video Description Corpus.  In order to\n  use MSRvideo, researchers need to agree with the license terms from\n  Microsoft Research:\n  http://research.microsoft.com/en-us/downloads/38cf15fd-b8df-477e-a4e4-a4680caa75af/\n\n>- image: The Image Descriptions data set is a subset of\n  the PASCAL VOC-2008 data set (Rashtchian et al., 2010) . PASCAL\n  VOC-2008 data set consists of 1,000 images and has been used by a\n  number of image description systems. The image captions of the data\n  set are released under a CreativeCommons Attribution-ShareAlike\n  license, the descriptions itself are free.\n\n>- track5.en-en: This text is a subset of the Stanford Natural\n  Language Inference (SNLI) corpus, by The Stanford NLP Group is\n  licensed under a Creative Commons Attribution-ShareAlike 4.0\n  International License. Based on a work at\n  http://shannon.cs.illinois.edu/DenotationGraph/.\n  https://creativecommons.org/licenses/by-sa/4.0/\n\n>- answers-answers: user content from stack-exchange. 
Check the license\n  below in ======ANSWERS-ANSWERS======\n\n>- answers-forums: user content from stack-exchange. Check the license\n  below in ======ANSWERS-FORUMS======\n\n\n\n>======ANSWER-ANSWER======\n\n>Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)\nhttp://creativecommons.org/licenses/by-sa/3.0/\n\n>Attribution Requirements:\n\n>   \"* Visually display or otherwise indicate the source of the content\n      as coming from the Stack Exchange Network. This requirement is\n      satisfied with a discreet text blurb, or some other unobtrusive but\n      clear visual indication.\n\n>    * Ensure that any Internet use of the content includes a hyperlink\n      directly to the original question on the source site on the Network\n      (e.g., http://stackoverflow.com/questions/12345)\n\n>    * Visually display or otherwise clearly indicate the author names for\n      every question and answer used\n\n>    * Ensure that any Internet use of the content includes a hyperlink for\n      each author name directly back to his or her user profile page on the\n      source site on the Network (e.g.,\n      http://stackoverflow.com/users/12345/username), directly to the Stack\n      Exchange domain, in standard HTML (i.e. not through a Tinyurl or other\n      such indirect hyperlink, form of obfuscation or redirection), without\n      any “nofollow” command or any other such means of avoiding detection by\n      search engines, and visible even with JavaScript disabled.\"\n\n>    (https://archive.org/details/stackexchange)\n\n\n\n>======ANSWERS-FORUMS======\n\n\n>Stack Exchange Inc. 
generously made the data used to construct the STS 2015 answer-answer statement pairs available under a Creative Commons Attribution-ShareAlike (cc-by-sa) 3.0 license.\n\n>The license is reproduced below from: https://archive.org/details/stackexchange\n\n>The STS.input.answers-forums.txt file should be redistributed with this LICENSE text and the accompanying files in LICENSE.answers-forums.zip. The tsv files in the zip file contain the additional information that's needed to comply with the license.\n\n>--\n\n>All user content contributed to the Stack Exchange network is cc-by-sa 3.0 licensed, intended to be shared and remixed. We even provide all our data as a convenient data dump.\n\n>http://creativecommons.org/licenses/by-sa/3.0/\n\n>But our cc-by-sa 3.0 licensing, while intentionally permissive, does *require attribution*:\n\n>\"Attribution — You must attribute the work in the manner specified by the author or licensor (but not in any way that suggests that they endorse you or your use of the work).\"\n\n>Specifically the attribution requirements are as follows:\n\n>  1. Visually display or otherwise indicate the source of the content as coming from the Stack Exchange Network. This requirement is satisfied with a discreet text blurb, or some other unobtrusive but clear visual indication.\n>  2. Ensure that any Internet use of the content includes a hyperlink directly to the original question on the source site on the Network (e.g., http://stackoverflow.com/questions/12345)\n\n>  3. Visually display or otherwise clearly indicate the author names for every question and answer so used.\n\n>  4. Ensure that any Internet use of the content includes a hyperlink for each author name directly back to his or her user profile page on the source site on the Network (e.g., http://stackoverflow.com/users/12345/username), directly to the Stack Exchange domain, in standard HTML (i.e. 
not through a Tinyurl or other such indirect hyperlink, form of obfuscation or redirection), without any “nofollow” command or any other such means of avoiding detection by search engines, and visible even with JavaScript disabled.\n\n>Our goal is to maintain the spirit of fair attribution. That means attribution to the website, and more importantly, to the individuals who so generously contributed their time to create that content in the first place!\n\n>For more information, see the Stack Exchange Terms of Service: http://stackexchange.com/legal/terms-of-service\n"
  },
  {
    "path": "LICENSE",
    "content": "    MIT License\r\n\r\n    Copyright (c) Microsoft Corporation. All rights reserved.\r\n\r\n    Permission is hereby granted, free of charge, to any person obtaining a copy\r\n    of this software and associated documentation files (the \"Software\"), to deal\r\n    in the Software without restriction, including without limitation the rights\r\n    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\r\n    copies of the Software, and to permit persons to whom the Software is\r\n    furnished to do so, subject to the following conditions:\r\n\r\n    The above copyright notice and this permission notice shall be included in all\r\n    copies or substantial portions of the Software.\r\n\r\n    THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r\n    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r\n    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r\n    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r\n    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r\n    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r\n    SOFTWARE\r\n"
  },
  {
    "path": "MANIFEST.in",
    "content": "graft utils_nlp\n\nglobal-exclude *.py[cod] __pycache__ *.so *.dylib\n\nexclude README.md\nexclude SETUP.md\nexclude CONTRIBUTING.md\n\n"
  },
  {
    "path": "NOTICE.txt",
    "content": "NOTICES AND INFORMATION\nDo Not Translate or Localize\n\nThis software incorporates material from third parties. Microsoft makes certain\nopen source code available at https://3rdpartysource.microsoft.com, or you may\nsend a check or money order for US $5.00, including the product name, the open\nsource component name, and version number, to:\n\nSource Code Compliance Team\nMicrosoft Corporation\nOne Microsoft Way\nRedmond, WA 98052\nUSA\n\nNotwithstanding any other terms, you may reverse engineer this software to the\nextent required to debug changes to any libraries licensed under the GNU Lesser\nGeneral Public License.\n\nThis software requires you to access or provide third party code that may contain restrictions on how such third party code can be used. You are solely responsible for reviewing any licenses applicable to such code and determining whether your use is permitted by such license\n--\n\nhttps://github.com/huggingface/transformers\n\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. 
For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. 
For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n\n--\n\nhttps://github.com/stanfordnlp/glove\n\n\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. 
For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. 
For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright 2014 The Board of Trustees of The Leland Stanford Junior University\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n\n--\n\nhttps://github.com/facebookresearch/SentEval\n\nBSD License\n\nFor SentEval software\n\nCopyright (c) 2017-present, Facebook, Inc. 
All rights reserved.\n\nRedistribution and use in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n * Redistributions of source code must retain the above copyright notice, this\n   list of conditions and the following disclaimer.\n\n * Redistributions in binary form must reproduce the above copyright notice,\n   this list of conditions and the following disclaimer in the documentation\n   and/or other materials provided with the distribution.\n\n * Neither the name Facebook nor the names of its contributors may be used to\n   endorse or promote products derived from this software without specific\n   prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\nANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\nWARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR\nANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\n(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\nLOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON\nANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\nSOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n--\n\nhttps://github.com/allenai/bi-att-flow\n\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. 
Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. 
For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n\n--\n\nhttps://github.com/nlpyang/BertSum\n\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. 
For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. 
For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n\n--\n\nFor spaCY\n\nThe MIT License (MIT)\n\nCopyright (C) 2016-2020 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in\nall copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\nTHE SOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "<img src=\"NLP-Logo.png\" align=\"right\" alt=\"\" width=\"300\"/>\n\n\n# NLP Best Practices\n\nIn recent years, natural language processing (NLP) has seen quick growth in quality and usability, and this has helped to drive business adoption of artificial intelligence (AI) solutions. In the last few years, researchers have been applying newer deep learning methods to NLP. Data scientists started moving from traditional methods to state-of-the-art (SOTA) deep neural network (DNN) algorithms which use language models pretrained on large text corpora.\n\nThis repository contains examples and best practices for building NLP systems, provided as [Jupyter notebooks](examples) and [utility functions](utils_nlp). The focus of the repository is on state-of-the-art methods and common scenarios that are popular among researchers and practitioners working on problems involving text and language.\n\n## Overview\n\nThe goal of this repository is to build a comprehensive set of tools and examples that leverage recent advances in NLP algorithms, neural architectures, and distributed machine learning systems.\nThe content is based on our past and potential future engagements with customers as well as collaboration with partners, researchers, and the open source community.\n\nWe hope that the tools can significantly reduce the “time to market” by simplifying the experience from defining the business problem to development of solution by orders of magnitude. In addition, the example notebooks would serve as guidelines and showcase best practices and usage of the tools in a wide variety of languages.\n\nIn an era of transfer learning, transformers, and deep architectures, we believe that pretrained models provide a unified solution to many real-world problems and allow handling different tasks and languages easily. 
We will, therefore, prioritize such models, as they achieve state-of-the-art results on several NLP benchmarks like [*GLUE*](https://gluebenchmark.com/leaderboard) and [*SQuAD*](https://rajpurkar.github.io/SQuAD-explorer/) leaderboards. The models can be used in a number of applications ranging from simple text classification to sophisticated intelligent chat bots.\n\nNote that for certain kinds of NLP problems, you may not need to build your own models. Instead, pre-built or easily customizable solutions exist which do not require any custom coding or machine learning expertise. We strongly recommend evaluating if these can sufficiently solve your problem. If these solutions are not applicable, or the accuracy of these solutions is not sufficient, then resorting to more complex and time-consuming custom approaches may be necessary. The following cognitive services offer simple solutions to address common NLP tasks:\n<br><br><b>[Text Analytics](https://azure.microsoft.com/en-us/services/cognitive-services/text-analytics/) </b> is a set of pre-trained REST APIs which can be called for Sentiment Analysis, Key phrase extraction, Language detection, Named Entity Detection, and more. These APIs work out of the box and require minimal expertise in machine learning, but have limited customization capabilities.\n<br><br><b>[QnA Maker](https://azure.microsoft.com/en-us/services/cognitive-services/qna-maker/) </b>is a cloud-based API service that lets you create a conversational question-and-answer layer over your existing data. Use it to build a knowledge base by extracting questions and answers from your semi-structured content, including FAQs, manuals, and documents.\n<br><br><b>[Language Understanding](https://azure.microsoft.com/en-us/services/cognitive-services/language-understanding-intelligent-service/)</b> is a SaaS service to train and deploy a model as a REST API given a user-provided training set. 
You could do Intent Classification as well as Named Entity Extraction by performing simple steps of providing example utterances and labelling them. It supports Active Learning, so your model always keeps learning and improving.\n\n## Target Audience\nFor this repository our target audience includes data scientists and machine learning engineers with varying levels of NLP knowledge as our content is source-only and targets custom machine learning modelling. The utilities and examples provided are intended to be solution accelerators for real-world NLP problems.\n\n## Focus Areas\nThe repository aims to expand NLP capabilities along three separate dimensions\n\n### Scenarios\nWe aim to have end-to-end examples of common tasks and scenarios such as text classification, named entity recognition etc.\n\n### Algorithms\nWe aim to support multiple models for each of the supported scenarios. Currently, transformer-based models are supported across most scenarios. We have been working on integrating the [transformers package](https://github.com/huggingface/transformers) from [Hugging Face](https://huggingface.co/) which allows users to easily load pretrained models and fine-tune them for different tasks.\n\n### Languages\nWe strongly subscribe to the multi-language principles laid down by [\"Emily Bender\"](http://faculty.washington.edu/ebender/papers/Bender-SDSS-2019.pdf)\n* \"Natural language is not a synonym for English\"\n* \"English isn't generic for language, despite what NLP papers might lead you to believe\"\n* \"Always name the language you are working on\" ([Bender rule](https://www.aclweb.org/anthology/Q18-1041/))\n\nThe repository aims to support non-English languages  across all the scenarios. Pre-trained models used in the repository such as BERT, FastText support 100+ languages out of the box. Our goal is to provide end-to-end examples in as many languages as possible. 
We encourage community contributions in this area.\n\n\n\n## Content\nThe following is a summary of the commonly used NLP scenarios covered in the repository. Each scenario is demonstrated in one or more [Jupyter notebook examples](examples) that make use of the core code base of models and repository utilities.\n\n| Scenario                              |  Models | Description|Languages|\n|-------------------------|  ------------------- |-------|---|\n|Text Classification                     |BERT, DistilBERT, XLNet, RoBERTa, ALBERT, XLM| Text classification is a supervised learning method of learning and predicting the category or the class of a document given its text content. |English, Chinese, Hindi, Arabic, German, French, Japanese, Spanish, Dutch|\n|Named Entity Recognition                |BERT| Named entity recognition (NER) is the task of classifying words or key phrases of a text into predefined entities of interest. |English|\n|Text Summarization|BERTSumExt <br> BERTSumAbs <br> UniLM (s2s-ft) <br> MiniLM |Text summarization is a language generation task of summarizing the input text into a shorter paragraph of text.|English|\n|Entailment                              |BERT, XLNet, RoBERTa| Textual entailment is the task of classifying the binary relation between two natural-language texts,  *text* and *hypothesis*, to determine if the *text* agrees with the *hypothesis* or not. |English|\n|Question Answering                      |BiDAF, BERT, XLNet| Question answering (QA) is the task of retrieving or generating a valid answer for a given query in natural language, provided with a passage related to the query. |English|\n|Sentence Similarity                     |BERT, GenSen| Sentence similarity is the process of computing a similarity score given a pair of text documents. 
|English|\n|Embeddings| Word2Vec<br>fastText<br>GloVe| Embedding is the process of converting a word or a piece of text to a continuous vector space of real numbers, usually of low dimension.|English|\n|Sentiment Analysis| Dependency Parser <br>GloVe| Provides an example of training and using Aspect Based Sentiment Analysis with Azure ML and [Intel NLP Architect](http://nlp_architect.nervanasys.com/absa.html).|English|\n## Getting Started\nWhile solving NLP problems, it is always good to start with the prebuilt [Cognitive Services](https://azure.microsoft.com/en-us/services/cognitive-services/directory/lang/). When the needs are beyond the bounds of the prebuilt cognitive service and when you want to search for custom machine learning methods,  you will find this repository  very useful. To get started, navigate to the [Setup Guide](SETUP.md), which lists instructions on how to setup your environment and dependencies.\n\n\n## Azure Machine Learning Service\n[Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/) is a cloud service used to train, deploy, automate, and manage machine learning models, all at the broad scale that the cloud provides. 
AzureML is presented in notebooks across different scenarios to enhance the efficiency of developing Natural Language systems at scale and for various AI model development related tasks like:\n  * [**Accessing Datastores**](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-access-data) to easily read and write your data in Azure storage services such as blob storage or file share.\n  * Scaling up and out on [**Azure Machine Learning Compute**](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute).\n  * [**Automated Machine Learning**](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train) which builds high quality machine learning models by automating model and hyperparameter selection. AutoML explores BERT, BiLSTM, bag-of-words, and word embeddings on the user's dataset to handle text columns.\n  * [**Tracking experiments and monitoring metrics**](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-track-experiments) to enhance the model creation process.\n  * [**Distributed Training**](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-train-ml-models#distributed-training-and-custom-docker-images)\n  * [**Hyperparameter tuning**](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters)\n  * Deploying the trained machine learning model as a web service to [**Azure Container Instance**](https://azure.microsoft.com/en-us/services/container-instances/) for development and test, or for low scale, CPU-based workloads.\n  * Deploying the trained machine learning model as a web service to [**Azure Kubernetes Service**](https://azure.microsoft.com/en-us/services/kubernetes-service/) for high-scale production deployments, providing autoscaling and fast response times.\n\nTo successfully run these notebooks, you will need an [**Azure subscription**](https://azure.microsoft.com/en-us/) or 
can [**try Azure for free**](https://azure.microsoft.com/en-us/free/). There may be other Azure services or products used in the notebooks. Introduction and/or reference of those will be provided in the notebooks themselves.\n\n## Contributing\nWe hope that the open source community would contribute to the content and bring in the latest SOTA algorithm. This project welcomes contributions and suggestions. Before contributing, please see our [contribution guidelines](CONTRIBUTING.md).\n\n## Blog Posts\n\n- [Bootstrap Your Text Summarization Solution with the Latest Release from NLP-Recipes](https://techcommunity.microsoft.com/t5/ai-customer-engineering-team/bootstrap-your-text-summarization-solution-with-the-latest/ba-p/1268809)\n\n- [Text Annotation made easy with Doccano](https://techcommunity.microsoft.com/t5/ai-customer-engineering-team/text-annotation-made-easy-with-doccano/ba-p/1242612)\n\n- [Jumpstart Analyzing your Hindi Text Data using the NLP Repository](https://techcommunity.microsoft.com/t5/ai-customer-engineering-team/jumpstart-analyzing-your-hindi-text-data-using-the-nlp/ba-p/1087851)\n\n- [Speeding up the Development of Natural Language Processing Solutions with Azure Machine Learning](https://techcommunity.microsoft.com/t5/ai-customer-engineering-team/speeding-up-the-development-of-natural-language-processing/ba-p/1042577)\n\n## References\nThe following is a list of related repositories that we like and think are useful for NLP tasks.\n\n|Repository|Description|\n|---|---|\n|[Transformers](https://github.com/huggingface/transformers)|A great PyTorch library from Hugging Face with implementations of popular transformer-based models. 
We've been using their package extensively in this repo and greatly appreciate their effort.|\n|[Azure Machine Learning Notebooks](https://github.com/Azure/MachineLearningNotebooks/)|ML and deep learning examples with Azure Machine Learning.|\n|[AzureML-BERT](https://github.com/Microsoft/AzureML-BERT)|End-to-end recipes for pre-training and fine-tuning BERT using Azure Machine Learning service.|\n|[MASS](https://github.com/microsoft/MASS)|MASS: Masked Sequence to Sequence Pre-training for Language Generation.|\n|[MT-DNN](https://github.com/microsoft/mt-dnn)|Multi-Task Deep Neural Networks for Natural Language Understanding.|\n|[UniLM](https://github.com/microsoft/unilm)|Unified Language Model Pre-training.|\n|[DialoGPT](https://github.com/microsoft/DialoGPT)|DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation|\n\n\n## Build Status\n| Build | Branch | Status |\n| --- | --- | --- |\n| **Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/cpu_integration_tests_linux?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=50&branchName=master) |\n| **Linux CPU** | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/cpu_integration_tests_linux?branchName=staging)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=50&branchName=staging) |\n| **Linux GPU** | master | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/gpu_integration_tests_linux?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=51&branchName=master) |\n| **Linux GPU** | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/gpu_integration_tests_linux?branchName=staging)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=51&branchName=staging) |\n"
  },
  {
    "path": "SETUP.md",
    "content": "# Setup Guide\n\nThis document describes how to setup all the dependencies to run the notebooks in this repository.\n\nThe recommended environment to run these notebooks is the [Azure Data Science Virtual Machine (DSVM)](https://azure.microsoft.com/en-us/services/virtual-machines/data-science-virtual-machines/). Since a considerable number of the algorithms rely on deep learning, it is recommended to use a GPU DSVM.\n\nFor training at scale, operationalization or hyperparameter tuning, it is recommended to use [Azure ML](https://docs.microsoft.com/en-us/azure/machine-learning/service/).\n\n\n## Table of Contents\n\n* [Compute environments](#compute-environments)\n* [Create a cloud-based workstation (Optional)](#Create-a-cloud-based-workstation-optional)\n* [Setup guide for Local or Virtual Machines](#setup-guide-for-local-or-virtual-machines)\n  * [Requirements](#requirements)\n  * [Dependencies setup](#dependencies-setup)\n  * [Register the conda environment in the DSVM JupyterHub](#register-conda-environment-in-dsvm-jupyterhub)\n  * [Installing the Repo's Utils via PIP](#installing-the-repos-utils-via-pip)\n* [Setup guide for docker](#Set-up-guide-for-nvidia-docker)\n\n## Compute Environments\n\nDepending on the type of NLP system and the notebook that needs to be run, there are different computational requirements. Currently, this repository supports **Python CPU** and **Python GPU**. A conda environment YAML file can be generated for either CPU or GPU environments as shown below in the *Dependencies Setup* section.\n\n## Create a cloud-based workstation (Optional)\n\n[Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/)’s Notebook Virtual Machine (VM), is a cloud-based workstation created specifically for data scientists. 
Notebook VM based authoring is directly integrated into Azure Machine Learning service, providing a code-first experience for Python developers to conveniently build and deploy models in the workspace. Developers and data scientists can perform every operation supported by the Azure Machine Learning Python SDK using a familiar Jupyter notebook in a secure, enterprise-ready environment. Notebook VM is secure and easy-to-use, preconfigured for machine learning, and fully customizable. \n\nYou can learn how to create a Notebook VM [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-1st-experiment-sdk-setup#azure) and then follow the same setup as in the [Setup Guide for Local or Virtual Machines](#setup-guide-for-local-or-virtual-machines) directly using the terminal in the Notebook VM.\n\n## Setup Guide for Local or Virtual Machines\n\n### Requirements\n\n* A machine running Linux, MacOS or Windows.\n* On Windows, Microsoft Visual C++ 14.0 is required for building certain packages. Download Microsoft Visual C++ Build Tools [here](https://visualstudio.microsoft.com/downloads/).\n\n* Miniconda or Anaconda with Python version >= 3.6.\n    * This is pre-installed on Azure DSVM such that one can run the following steps directly. To set up on your local machine, [Miniconda](https://docs.conda.io/en/latest/miniconda.html) is a quick way to get started.\n    * It is recommended to update conda to the latest version: `conda update -n base -c defaults conda`\n\n> NOTE: Windows machines are not **FULLY SUPPORTED**. 
Please use at your own risk.\n\n### Dependencies Setup\n\n\nWe provide a script, [generate_conda_file.py](tools/generate_conda_file.py), to generate a conda-environment yaml file\nwhich you can use to create the target environment using the Python version 3.6 with all the correct dependencies.\n\nAssuming the repo is cloned as `nlp-recipes` in the system, to install **a default (Python CPU) environment**:\n\n    cd nlp-recipes\n    python tools/generate_conda_file.py\n    conda env create -f nlp_cpu.yaml\n\nYou can specify the environment name as well with the flag `-n`.\n\nClick on the following menus to see how to install the Python GPU environment:\n\n<details>\n<summary><strong><em>Python GPU environment</em></strong></summary>\n\nAssuming that you have a GPU machine, to install the Python GPU environment, \n1. Check the CUDA **driver** version on your machine by running\n\n        nvidia-smi\n    The top of the output shows the CUDA **driver** version, which is 10.0 in the example below.   \n    +-----------------------------------------------------------------------------+  \n    | NVIDIA-SMI 410.79 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;Driver Version: 410. &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;CUDA Version: 10.0     |  \n    |-------------------------------+----------------------+----------------------+\n2. Decide which cuda **runtime** version you should install.   \nThe cuda **runtime** version is the version of the cudatoolkit that will be installed in the conda environment in the next step, which should be <= the CUDA **driver** version found in step 1.  \nCurrently, this repo uses PyTorch 1.4.0 which is compatible with cuda 9.2 and cuda 10.1. The conda environment file generated in step 3 installs cudatoolkit 10.1 by default. If your CUDA **driver** version is < 10.1, you should add the additional argument \"--cuda_version 9.2\" when calling generate_conda_file.py.   \n\n3. 
Install the GPU environment:  \nIf CUDA **driver** version >= 10.1\n\n        cd nlp-recipes\n        python tools/generate_conda_file.py --gpu\n        conda env create -n nlp_gpu -f nlp_gpu.yaml\n\n    If CUDA **driver** version < 10.1\n\n        cd nlp-recipes\n        python tools/generate_conda_file.py --gpu --cuda_version 9.2\n        conda env create -n nlp_gpu -f nlp_gpu.yaml\n\n4. Enable mixed precision training (optional)  \nMixed precision training is particularly useful if your model takes a long time to train. It usually reduces the training time by 50% and produces the same model quality. To enable mixed precision training, run the following command \n\n        conda activate nlp_gpu\n        git clone https://github.com/NVIDIA/apex.git\n        cd apex\n        pip install -v --no-cache-dir --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext\" ./\n\n    **Troubleshooting**:  \n    If you run into an error message \"RuntimeError: Cuda extensions are being compiled with a version of Cuda that does not match the version used to compile Pytorch binaries.\", you need to make sure your NVIDIA Cuda compiler driver (nvcc) version and your cuda **runtime** version are exactly the same. To check the nvcc version, run   \n\n        nvcc -V\n\n    If the nvcc version is 10.0, it's recommended to upgrade to 10.1 and re-create your conda environment with cudatoolkit=10.1.\n    \n    **Steps to upgrade CUDA **driver** version and nvcc version**  \n    We have tested the following steps. Alternatively, you can follow the official instructions [here](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)  \n    a. Update apt-get and reboot your machine\n\n        sudo apt-get update\n        sudo apt-get upgrade --fix-missing\n        sudo reboot\n    b. Download the CUDA toolkit .run file from https://developer.nvidia.com/cuda-10.1-download-archive-base based on your target platform. 
For example, on a Linux machine with Ubuntu 16.04, run   \n\n        wget https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.105_418.39_linux.run  \n\n    c. Upgrade CUDA driver by running  \n\n        sudo sh cuda_10.1.105_418.39_linux.run\n    First, accept the user agreement.  \n    ![](https://nlpbp.blob.core.windows.net/images/upgrade_cuda_driver/1agree_to_user_agreement.PNG)  \n    Next, choose the components to install.  \n    It's possible that you already have NVIDIA driver 418.39 and CUDA 10.1, but nvcc 10.0. In this case, you can uncheck the \"DRIVER\" box and upgrade nvcc by re-installing CUDA toolkit only.   \n    ![](https://nlpbp.blob.core.windows.net/images/upgrade_cuda_driver/2install_cuda_only.PNG)  \n\n    If you choose to install all components, follow the instructions on the screen to uninstall existing NVIDIA driver and CUDA toolkit first.  \n    ![](https://nlpbp.blob.core.windows.net/images/upgrade_cuda_driver/3install_all.PNG)   \n    Then re-run   \n\n        sudo sh cuda_10.1.105_418.39_linux.run\n    Select \"Yes\" to update the cuda symlink.   \n    ![](https://nlpbp.blob.core.windows.net/images/upgrade_cuda_driver/4Upgrade_symlink.PNG)  \n\n    d. Run the following commands again to make sure you have NVIDIA driver 418.39, CUDA driver 10.1 and nvcc 10.1\n\n        nvidia-smi\n        nvcc -V\n\n    e. Repeat steps 3 & 4 to recreate your conda environment with cudatoolkit **runtime** 10.1 and apex installed for mixed precision training. 
\n\n\n</details>\n\n### Register Conda Environment in DSVM JupyterHub\n\nWe can register our created conda environment to appear as a kernel in the Jupyter notebooks.\n\n    conda activate my_env_name\n    python -m ipykernel install --user --name my_env_name --display-name \"Python (my_env_name)\"\n\nIf you are using the DSVM, you can [connect to JupyterHub](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro#jupyterhub-and-jupyterlab) by browsing to `https://your-vm-ip:8000`.  If you are prompted to enter user name and password, enter the user name and password that you use to log in to your virtual machine. \n\n### Installing the Repo's Utils via PIP\n\n<details>\n    <summary>The utils_nlp module of this repository needs to be installed as a python package in order to be used by the examples. <strong><em>Click to expand and see the details</em></strong> \n    </summary> \n    <p>  \nA setup.py file is provided in order to simplify the installation of this utilities in this repo from the main directory.  \n    \nTo install the package, please run the command below (from directory root)\n\n    pip install -e . \n\nRunning the command tells pip to install the `utils_nlp` package from source in [development mode](https://setuptools.readthedocs.io/en/latest/setuptools.html#development-mode). This just means that any updates to `utils_nlp` source directory will immediately be reflected in the installed package without needing to reinstall; a very useful practice for a package with constant updates.   \n\n> It is also possible to install directly from Github, which is the best way to utilize the `utils_nlp` package in external projects (while still reflecting updates to the source as it's installed as an editable `'-e'` package). \n\n>   `pip install -e  git+git@github.com:microsoft/nlp-recipes.git@master#egg=utils_nlp`  \n\nEither command, from above, makes `utils_nlp` available in your conda virtual environment. 
You can verify it was properly installed by running:  \n\n    pip list  \n    \n\n**NOTE** - The pip installation does not install any of the necessary package dependencies; it is expected that conda will be used as shown above to set up the environment for the utilities being used.\n    </p>\n</details>\n\nThe details of the versioning info can be found at [VERSIONING.md](VERSIONING.md).\n\n# Set up guide for (nvidia) docker\n\n## Pre-requisites\nIn order to use the notebooks within a docker environment, you will need to have [nvidia docker drivers](https://github.com/NVIDIA/nvidia-docker) and [docker](https://docs.docker.com/install/linux/docker-ce/ubuntu/) installed on your computer.\n\n## Building docker image\nA docker file is provided within the [docker](docker/) folder. You can create the image using \n```\n  cd docker\n  docker build -t nlp-recipes .\n```\nThis will create a docker image containing all the dependencies and will name it `nlp-recipes:latest`.\n\n## Running the container\nYou can run the notebook within the container environment using\n```\n  docker run --gpus all -p 8888:8888 nlp-recipes\n```\nThis will map port 8888 of the local machine to port 8888 of the container.\n\n## Troubleshooting\n* If you have permission issues with `docker build` or `docker run`, you might need to run docker with sudo permissions. \n* If you are getting 'port already in use' errors, consider mapping a different port on the local machine to port 8888 on the container, e.g.\n```\ndocker run --gpus all -p 9000:8888 nlp-recipes\n```\n"
  },
  {
    "path": "VERSIONING.md",
    "content": "# Semantic Versioning\n> NOTE: Support for `setuptools_scm` is currently removed due to a known [issue](https://github.com/pypa/setuptools_scm/issues/357) with the way pip installations restrict access to certain SCM metadata during package installation. Support will be restored when `setuptools_scm` and `pip` developers fix this with a patch.\n\nThis library is configured to use\n[setuptools_scm](https://github.com/pypa/setuptools_scm/) to automatically get package version from git commit histories.\n\n**There shouldn't be any references to manually coded versions**.\n\nVerify what git tag to use by running:\n\n```bash\npython setup.py --version\n```\nIt should look something like `0.1.0.dev4+gdfedba7.d20190209`\n\nUsing the information above the master branch, after a merge commit, can be _**Tagged**_ with the above semantic version `0.1.0` (ignoring the `dev4+gdfedba7.d20190209`)  \n\nFor example: \n\n    git tag v0.1.0  \n\nNow verify the semantic version for the package:\n\n    python setup.py --version\n\n\nAll new merged commit on master must have a\n   [Semantic Versioning](https://semver.org/) release version with an\n   accompanying tag.  TL;DR:\n   * `major.minor.patch`\n   * Patch is for bugfix\n   * Minor is for new features\n   * Major is for backwards-incompatible changes\n   * tags should be of the form `v0.1.2`  \n\nInstalling this library into another clean git repository with a tag version, you should get a nice version like `0.2.1`.  \n\nHowever, if you inspect the `__version__` in this repo,\nyou'll get a nice **'dirty'** version number like `'0.2.1.dev0+g850a76d.d20180908'`.  \n\nThis is useful for debugging, building sphinx docs in dev and so on.   \n\nYou should never have to specify a version manually except just tagging your commit from the tag calculation generated by running  \n\n    python setup.py --version \n\n"
  },
  {
    "path": "_config.yml",
    "content": "theme: jekyll-theme-cayman"
  },
  {
    "path": "cgmanifest.json",
    "content": "{\n    \"Registrations\": [\n        {\n            \"component\": {\n                \"type\": \"git\",\n                \"git\": {\n                    \"repositoryUrl\": \"https://github.com/facebookresearch/XLM\",\n                    \"commitHash\": \"\"\n                }\n            },\n            \"license\": \"CC BY-NC 4.0\"\n        },\n        {\n            \"component\": {\n                \"type\": \"git\",\n                \"git\": {\n                    \"repositoryUrl\": \"https://github.com/allenai/bi-att-flow\",\n                    \"commitHash\": \"e444acf13892cf62189b9eac3c7654bd83baf848\"\n                }\n            },\n            \"license\": \"Apache-2.0\"\n        },\n        {\n            \"component\": {\n                \"type\": \"git\",\n                \"git\": {\n                    \"repositoryUrl\": \"https://github.com/stanfordnlp/glove\",\n                    \"commitHash\": \"26f6e18eb117ca7b080d01acb453fd1c9742418d\"\n                }\n            },\n            \"license\": \"Apache-2.0\"\n        },\n        {\n            \"component\": {\n                \"type\": \"git\",\n                \"git\": {\n                    \"repositoryUrl\": \"https://github.com/nlpyang/PreSumm\",\n                    \"commitHash\": \"2df3312582a3a014aacbc1be810841705c67d06e\"\n                }\n            },\n            \"license\": \"MIT License\"\n        }\n    ],\n    \"Version\": 1\n}\n"
  },
  {
    "path": "docker/Dockerfile",
    "content": "FROM nvidia/cuda\n\n# Install Anaconda\n# Non interactive installation instructions can be found \n# https://hub.docker.com/r/continuumio/anaconda/dockerfile\n# https://hub.docker.com/r/continuumio/miniconda/dockerfile\nENV PATH /opt/conda/bin:$PATH\nSHELL [\"/bin/bash\", \"-c\"]\n\nRUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \\\n    libglib2.0-0 libxext6 libsm6 libxrender1 \\\n    git mercurial subversion\n\nRUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda2-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \\\n    /bin/bash ~/miniconda.sh -b -p /opt/conda && \\\n    rm ~/miniconda.sh && \\\n    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \\\n    echo \". /opt/conda/etc/profile.d/conda.sh\" >> ~/.bashrc && \\\n    echo \"conda activate base\" >> ~/.bashrc\n\n# Get the latest version repository\nWORKDIR /root\nRUN apt-get install -y zip && \\\n    wget --quiet https://github.com/microsoft/nlp-recipes/archive/staging.zip -O staging.zip && \\\n    unzip staging.zip  && rm staging.zip \n    \n# Install the packages\nWORKDIR /root/nlp-recipes-staging\nRUN python /root/nlp-recipes-staging/tools/generate_conda_file.py --gpu && \\\n    conda env create -n nlp_gpu -f nlp_gpu.yaml \nRUN source activate nlp_gpu && \\\n    pip install -e . && \\\n    python -m ipykernel install --user --name nlp_gpu --display-name \"Python (nlp_gpu)\"\n\n# Run notebook\nEXPOSE 8888/tcp\nWORKDIR /root/nlp-recipes-staging\nCMD source activate nlp_gpu && \\\n    jupyter notebook --allow-root --ip 0.0.0.0 --port 8888 --no-browser --notebook-dir .\n"
  },
  {
    "path": "docs/Makefile",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# You can set these variables from the command line.\nSPHINXOPTS    =\nSPHINXBUILD   = sphinx-build\nSOURCEDIR     = source\nBUILDDIR      = build\n\n# Put it first so that \"make\" without argument is like \"make help\".\nhelp:\n\t@$(SPHINXBUILD) -M help \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n\n.PHONY: help Makefile\n\n# Catch-all target: route all unknown targets to Sphinx using the new\n# \"make mode\" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).\n%: Makefile\n\t@$(SPHINXBUILD) -M $@ \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)"
  },
  {
    "path": "docs/README.md",
    "content": "# Documentation\n\nTo set up the documentation, first you need to install the dependencies of the CPU environment. To do so, please follow [SETUP.md](../SETUP.md). Then type:\n\n    conda activate nlp_cpu\n    pip install sphinx_rtd_theme\n\n\nTo build the documentation as HTML:\n\n    cd docs\n    make html\n\n"
  },
  {
    "path": "docs/_config.yml",
    "content": "theme: jekyll-theme-cayman"
  },
  {
    "path": "docs/source/azureml.rst",
    "content": ".. _azureml:\n\nAzureML module\n**************************\n\nAzureML module from NLP utilities.\n\nAzureML utils\n===============================\n\n.. automodule:: utils_nlp.azureml.azureml_utils\n    :members:\n    \n\nAzureML utils for BERT\n===============================\n\n.. automodule:: utils_nlp.azureml.azureml_bert_util\n    :members:\n\n\n"
  },
  {
    "path": "docs/source/conf.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# -*- coding: utf-8 -*-\n#\n# Configuration file for the Sphinx documentation builder.\n#\n# This file does only contain a selection of the most common options. For a\n# full list see the documentation:\n# http://www.sphinx-doc.org/en/master/config\n\n# -- Path setup --------------------------------------------------------------\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n#\nimport os\nimport sys\n\nsys.path.insert(0, os.path.abspath(os.path.join(\"..\", \"..\")))\nsys.setrecursionlimit(1500)\n\nfrom utils_nlp import TITLE, VERSION, COPYRIGHT, AUTHOR\n\n# -- Project information -----------------------------------------------------\n\nproject = TITLE\ncopyright = COPYRIGHT\nauthor = AUTHOR\n\n# The short X.Y version\nversion = \".\".join(VERSION.split(\".\")[:2])\n# The full version, including alpha/beta/rc tags\nrelease = VERSION\n\nprefix = \"NLPRecipes\"\n\n# -- General configuration ---------------------------------------------------\n\n# If your documentation needs a minimal Sphinx version, state it here.\n#\n# needs_sphinx = '1.0'\n\n# Add any Sphinx extension module names here, as strings. 
They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = [\n    \"sphinx.ext.autodoc\",\n    \"sphinx.ext.doctest\",\n    \"sphinx.ext.intersphinx\",\n    \"sphinx.ext.ifconfig\",\n    \"sphinx.ext.viewcode\",  # Add links to highlighted source code\n    \"sphinx.ext.napoleon\",  # to render Google format docstrings\n]\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = [\"_templates\"]\n\n# The suffix(es) of source filenames.\n# You can specify multiple suffix as a list of string:\n#\n# source_suffix = ['.rst', '.md']\nsource_suffix = \".rst\"\n\n# The master toctree document.\nmaster_doc = \"index\"\n\n# The language for content autogenerated by Sphinx. Refer to documentation\n# for a list of supported languages.\n#\n# This is also used if you do content translation via gettext catalogs.\n# Usually you set \"language\" from the command line for these cases.\nlanguage = None\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\n# This pattern also affects html_static_path and html_extra_path.\nexclude_patterns = [\"Thumbs.db\", \".DS_Store\"]\n\n\n# The name of the Pygments (syntax highlighting) style to use.\npygments_style = None\n\n\n# -- Options for HTML output -------------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  See the documentation for\n# a list of builtin themes.\n#\nhtml_theme = \"sphinx_rtd_theme\"\n\n# Theme options are theme-specific and customize the look and feel of a theme\n# further.  For a list of options available for each theme, see the\n# documentation.\n#\n# html_theme_options = {}\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. 
They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\n# html_static_path = [\"images\"]\n\n# Custom sidebar templates, must be a dictionary that maps document names\n# to template names.\n#\n# The default sidebars (for documents that don't match any pattern) are\n# defined by theme itself.  Builtin themes are using these templates by\n# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',\n# 'searchbox.html']``.\n#\n# html_sidebars = {}\n\n\n# -- Options for HTMLHelp output ---------------------------------------------\n\n# Output file base name for HTML help builder.\nhtmlhelp_basename = prefix + \"doc\"\n\n\n# -- Options for LaTeX output ------------------------------------------------\n\nlatex_elements = {\n    \"papersize\": \"letterpaper\",\n    \"pointsize\": \"10pt\",\n    \"figure_align\": \"htbp\",\n    \"preamble\": r\"\"\"\n        %% Adding source listings https://en.wikibooks.org/wiki/LaTeX/Source_Code_Listings \n        \\usepackage{listings}\n        \\usepackage{color}\n\n        \\definecolor{mygreen}{rgb}{0,0.6,0}\n        \\definecolor{mygray}{rgb}{0.5,0.5,0.5}\n        \\definecolor{mymauve}{rgb}{0.58,0,0.82}\n\n        \\lstset{ \n        backgroundcolor=\\color{white},   % choose the background color; you must add \\usepackage{color} or \\usepackage{xcolor}; should come as last argument\n        basicstyle=\\footnotesize,        % the size of the fonts that are used for the code\n        breakatwhitespace=false,         % sets if automatic breaks should only happen at whitespace\n        breaklines=true,                 % sets automatic line breaking\n        captionpos=b,                    % sets the caption-position to bottom\n        commentstyle=\\color{mygreen},    % comment style\n        deletekeywords={...},            % if you want to delete keywords from the given language\n        escapeinside={\\%*}{*)},          % if you want to add LaTeX 
within your code\n        extendedchars=true,              % lets you use non-ASCII characters; for 8-bits encodings only, does not work with UTF-8\n        firstnumber=1000,                % start line enumeration with line 1000\n        frame=single,\t                 % adds a frame around the code\n        keepspaces=true,                 % keeps spaces in text, useful for keeping indentation of code (possibly needs columns=flexible)\n        keywordstyle=\\color{blue},       % keyword style\n        language=Python,                 % the language of the code\n        morekeywords={*,...},            % if you want to add more keywords to the set\n        numbers=left,                    % where to put the line-numbers; possible values are (none, left, right)\n        numbersep=5pt,                   % how far the line-numbers are from the code\n        numberstyle=\\tiny\\color{mygray}, % the style that is used for the line-numbers\n        rulecolor=\\color{black},         % if not set, the frame-color may be changed on line-breaks within not-black text (e.g. comments (green here))\n        showspaces=false,                % show spaces everywhere adding particular underscores; it overrides 'showstringspaces'\n        showstringspaces=false,          % underline spaces within strings only\n        showtabs=false,                  % show tabs within strings adding particular underscores\n        stepnumber=2,                    % the step between two line-numbers. If it's 1, each line will be numbered\n        stringstyle=\\color{mymauve},     % string literal style\n        tabsize=2,\t                     % sets default tabsize to 2 spaces\n        title=\\lstname                   % show the filename of files included with \\lstinputlisting; also try caption instead of title\n        }\n\n    \"\"\",\n}\n\n# Grouping the document tree into LaTeX files. 
List of tuples\n# (source start file, target name, title,\n#  author, documentclass [howto, manual, or own class]).\nlatex_documents = [(master_doc, prefix + \".tex\", prefix + \" Documentation\", prefix, \"manual\")]\n\n\n# -- Options for manual page output ------------------------------------------\n\n# One entry per manual page. List of tuples\n# (source start file, name, description, authors, manual section).\nman_pages = [(master_doc, prefix, prefix + \" Documentation\", [author], 1)]\n\n\n# -- Options for Texinfo output ----------------------------------------------\n\n# Grouping the document tree into Texinfo files. List of tuples\n# (source start file, target name, title, author,\n#  dir menu entry, description, category)\ntexinfo_documents = [\n    (\n        master_doc,\n        prefix,\n        prefix + \" Documentation\",\n        author,\n        prefix,\n        \"One line description of project.\",\n        \"Miscellaneous\",\n    )\n]\n\n\n# -- Options for Epub output -------------------------------------------------\n\n# Bibliographic Dublin Core info.\nepub_title = project\n\n# The unique identifier of the text. 
This can be a ISBN number\n# or the project homepage.\n#\n# epub_identifier = ''\n\n# A unique identification for the text.\n#\n# epub_uid = ''\n\n# A list of files that should not be packed into the epub file.\nepub_exclude_files = [\"search.html\"]\n\n\n# -- Extension configuration -------------------------------------------------\n\n# -- Options for intersphinx extension ---------------------------------------\n\n# Example configuration for intersphinx: refer to the Python standard library.\nintersphinx_mapping = {\"https://docs.python.org/\": None}\n\n##################################################\n# Other options\n# html_favicon = os.path.join(html_static_path[0], \"favicon.ico\")\n\n\n# Ensure that __init__() is always documented\n# source: https://stackoverflow.com/a/5599712\ndef skip(app, what, name, obj, would_skip, options):\n    if name == \"__init__\":\n        return False\n    return would_skip\n\n\ndef setup(app):\n    app.connect(\"autodoc-skip-member\", skip)\n"
  },
  {
    "path": "docs/source/index.rst",
    "content": "\nNLP Utilities\n===================================================\n\nThe `NLP repository <https://github.com/microsoft/nlp-recipes>`_ provides examples and best practices for building NLP systems, provided as Jupyter notebooks. \n\nThe module `utils_nlp <https://github.com/microsoft/nlp-recipes/tree/master/utils_nlp>`_ contains functions to simplify common tasks used when developing and \nevaluating NLP systems. \n\n.. toctree::\n   :maxdepth: 1\n   :caption: Contents:\n\n    AzureML <azureml>\n    Common <common>\n    Dataset <dataset>\n    Evaluation <eval>\n    NLP Algorithms <model>\n    NLP Interpretability <interpreter>\n\n\nIndices and tables\n==================\n\n* :ref:`genindex`\n* :ref:`modindex`\n* :ref:`search`\n"
  },
  {
    "path": "examples/README.md",
    "content": "# Examples\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for building Natural Language Processing systems for the following scenarios.\n\n|Category|Applications|Methods|Languages|\n|---| ------------------------ | ------------------- |---|\n|[Text Classification](text_classification)|Topic Classification|BERT, XLNet, RoBERTa, DistilBERT|en, hi, ar|\n|[Named Entity Recognition](named_entity_recognition) |Wikipedia NER|BERT|en|\n|[Text Summarization](text_summarization)|News Summarization, Headline Generation|Extractive: BERTSumExt <br> Abstractive: UniLM (s2s-ft)|en|\n|[Entailment](entailment)|MultiNLI Natural Language Inference|BERT|en|\n|[Question Answering](question_answering) |SQuAD|BiDAF, BERT, XLNet, DistilBERT|en|\n|[Sentence Similarity](sentence_similarity)|STS Benchmark|BERT, GenSen|en|\n|[Embeddings](embeddings)|Custom Embeddings Training|Word2Vec, fastText, GloVe|en|\n|[Annotation](annotation)|Text Annotation|Doccano|en|\n|[Model Explainability](model_explainability)|DNN Layer Explanation|DUUDNM (Guan et al.)|en|\n\n## Data/Telemetry\nThe Azure Machine Learning notebooks collect browser usage data and send it to Microsoft to help improve our products and services. Read Microsoft's [privacy statement to learn more](https://privacy.microsoft.com/en-US/privacystatement).\n\nTo opt out of tracking, a Python [script](../tools/remove_pixelserver.py) under the `tools` folder is also provided. Executing the script will check all notebooks under the `examples` folder, and automatically remove the telemetry cell:\n\n```sh\npython ../tools/remove_pixelserver.py\n```\n"
  },
  {
    "path": "examples/annotation/Doccano.md",
    "content": "# Doccano: Text Annotation Tool\n\n## What is Doccano?\n\n[Doccano](https://github.com/chakki-works/doccano) is one of the best open source tools that provides text annotation features. The latest version supports annotation features for text classification, sequence labeling (NER) and sequence to sequence (machine translation, text summarization). There are many other open source and commercial annotation tools available. Below is a list of some of those tools:\n\n- [Brat](https://brat.nlplab.org/) (open source)\n- [Anafora](https://github.com/weitechen/anafora) (open source)\n- [Prodigy](https://prodi.gy/) (commercial)\n- [LightTag](https://www.lighttag.io/) (commercial)\n\nDoccano needs to be hosted somewhere so that we can collaborate on it. This tutorial walks through how to deploy Doccano on Azure and collaboratively annotate text data for natural language processing tasks.\n\n## Deploy to Azure\n\nDoccano can be deployed to Azure ([Web App for Containers](https://azure.microsoft.com/en-us/services/app-service/containers/) +\n[PostgreSQL database](https://azure.microsoft.com/en-us/services/postgresql/)) by clicking on the button below:\n\n<p align=\"center\">\n  <a href=\"https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fchakki-works%2Fdoccano%2Fmaster%2Fazuredeploy.json\"><img width=180 src=\"https://nlpbp.blob.core.windows.net/images/deploybutton.jpg\" /></a>\n</p>\n\nYou will need to have an existing Azure subscription so that you can create all the Azure resources needed to deploy Doccano. Otherwise you can get a [free Azure account](https://azure.microsoft.com/en-us/offers/ms-azr-0044p/?WT.mc_id=medium-blog-abornst) and then click the deploy button above.\n\nYou will need to specify your subscription and resource group, and fill in the setting details (App Name, Secret Key, etc.) and then deploy. It takes a few minutes to create all needed Azure resources. 
Below is a screenshot of the deployment. \n\n<p align=\"center\">\n  <img src=\"https://nlpbp.blob.core.windows.net/images/deploy_to_azure.jpg\" />\n</p>\n\n## Tutorial\n\n### Useful Links\n\n#### Main Page\n\nAfter the deployment you can navigate to the following URL, where **{`appname`}** is the `App Name` you chose when deploying to Azure:\n\n_**https://{appname}.azurewebsites.net**_\n\nFor example, if your appname is \"**doccano**\", then the link will be\n\n_**https://doccano.azurewebsites.net**_\n\nAnd we will use `doccano` as the app name for this tutorial.\n\n#### Login Page\n\nYou can log in by clicking the `login` button at the top right of the main page, or you can navigate to the page with the link\n\n_**https://doccano.azurewebsites.net/login**_\n\nBoth will bring you to the Doccano login page, where you can log in with the Admin user name and Admin password you configured in the deployment. \n\n#### Admin Page\n\nBy default, only the Admin user is created for you after the deployment. You can add more users and groups and configure the Doccano service by navigating to the admin page.\n\n_**https://doccano.azurewebsites.net/admin**_\n\n<p align=\"center\">\n  <img src=\"https://nlpbp.blob.core.windows.net/images/admin_page.JPG\" />\n</p>\n\n### Create Project\n\nThe first step is to create a new project for annotation. Here we will use an NER annotation task on science fiction books to give you a brief tutorial on Doccano. \n\nAfter logging in with the Admin user name and Admin password, you will be taken to the main project list page of Doccano, which does not contain any projects yet. \n\n<p align=\"center\">\n  <img src=\"https://nlpbp.blob.core.windows.net/images/project_list.jpg\" />\n</p>\n\nTo create your project, make sure you’re on the project list page and click the `Create Project` button. 
As for this tutorial, we name the project as `sequence labeling for books`, write some description, then choose the sequence labeling task type.\n\n<p align=\"center\">\n  <img src=\"https://nlpbp.blob.core.windows.net/images/create_project.jpg\" />\n</p>\n\n### Import Data\n\nAfter creating a project, we will see the \"`Import Data`\" page, or click `Import Data` button in the navigation bar. We should see the following screen:\n<p align=\"center\">\n  <img src=\"https://nlpbp.blob.core.windows.net/images/import_data.jpg\" />\n</p>\n\nWe choose JSONL and click `Select a file` button. Select `books.json` and it would be loaded automatically. Below is the `books.json` file containing lots of science fictions description with different languages. We need to annotate some entities like people name, book title, date and so on. \n\n```json\n{\"text\": \"The Hitchhiker's Guide to the Galaxy (sometimes referred to as HG2G, HHGTTGor H2G2) is a comedy science fiction series created by Douglas Adams. Originally a radio comedy broadcast on BBC Radio 4 in 1978, it was later adapted to other formats, including stage shows, novels, comic books, a 1981 TV series, a 1984 video game, and 2005 feature film.\"}\n{\"text\": \"《三体》是中国大陆作家刘慈欣于2006年5月至12月在《科幻世界》杂志上连载的一部长篇科幻小说，出版后成为中国大陆最畅销的科幻长篇小说之一。2008年，该书的单行本由重庆出版社出版。本书是三体系列（系列原名为：地球往事三部曲）的第一部，该系列的第二部《三体II：黑暗森林》已经于2008年5月出版。2010年11月，第三部《三体III：死神永生》出版发行。 2011年，“地球往事三部曲”在台湾陆续出版。小说的英文版获得美国科幻奇幻作家协会2014年度“星云奖”提名，并荣获2015年雨果奖最佳小说奖。\"}\n{\"text\": \"『銀河英雄伝説』（ぎんがえいゆうでんせつ）は、田中芳樹によるSF小説。また、これを原作とするアニメ、漫画、コンピュータゲーム、朗読、オーディオブック等の関連作品。略称は『銀英伝』（ぎんえいでん）。原作は累計発行部数が1500万部を超えるベストセラー小説である。1982年から2009年6月までに複数の版で刊行され、発行部数を伸ばし続けている。\"}\n```\n\nAfter importing the dataset, you should be able to see the dataset immediately. \n\n### Define labels\n\nClick `Labels` button in left bar to define our own labels. We should see the label editor page. 
On the label editor page, you can create labels by specifying label text, shortcut key, background color and text color.\n\n<p align=\"center\">\n  <img src=\"https://nlpbp.blob.core.windows.net/images/define_labels.jpg\" />\n</p>\n\n### Annotation\n\nNext, we are ready to annotate the texts. Click the `Annotate Data` button in the navigation bar to start annotating the documents. You can just select the text and then use the shortcut key that you have defined to label the entities. \n\n<p align=\"center\">\n  <img src=\"https://nlpbp.blob.core.windows.net/images/annotate.jpg\" />\n</p>\n\n### Export Data\n\nAfter the annotation step, we can download the annotated data. Click the `Edit data` button in the navigation bar, and then click `Export Data`. You should see the screen below:\n\n<p align=\"center\">\n  <img src=\"https://nlpbp.blob.core.windows.net/images/export_data.jpg\" />\n</p>\n\nHere we choose the JSONL format and download the data by clicking the button. Below is the annotated result for our tutorial project.\n\n```json\n{\"id\": 1, \"text\": \"The Hitchhiker's Guide to the Galaxy (sometimes referred to as HG2G, HHGTTGor H2G2) is a comedy science fiction series created by Douglas Adams. 
Originally a radio comedy broadcast on BBC Radio 4 in 1978, it was later adapted to other formats, including stage shows, novels, comic books, a 1981 TV series, a 1984 video game, and 2005 feature film.\", \"annotations\": [{\"label\": 2, \"start_offset\": 0, \"end_offset\": 36, \"user\": 1}, {\"label\": 2, \"start_offset\": 63, \"end_offset\": 67, \"user\": 1}, {\"label\": 2, \"start_offset\": 69, \"end_offset\": 82, \"user\": 1}, {\"label\": 5, \"start_offset\": 89, \"end_offset\": 111, \"user\": 1}, {\"label\": 1, \"start_offset\": 130, \"end_offset\": 143, \"user\": 1}, {\"label\": 5, \"start_offset\": 158, \"end_offset\": 180, \"user\": 1}, {\"label\": 6, \"start_offset\": 184, \"end_offset\": 195, \"user\": 1}, {\"label\": 3, \"start_offset\": 199, \"end_offset\": 203, \"user\": 1}, {\"label\": 5, \"start_offset\": 254, \"end_offset\": 265, \"user\": 1}, {\"label\": 5, \"start_offset\": 267, \"end_offset\": 273, \"user\": 1}, {\"label\": 5, \"start_offset\": 275, \"end_offset\": 286, \"user\": 1}, {\"label\": 3, \"start_offset\": 290, \"end_offset\": 294, \"user\": 1}, {\"label\": 5, \"start_offset\": 295, \"end_offset\": 304, \"user\": 1}, {\"label\": 3, \"start_offset\": 308, \"end_offset\": 312, \"user\": 1}, {\"label\": 5, \"start_offset\": 313, \"end_offset\": 323, \"user\": 1}, {\"label\": 3, \"start_offset\": 329, \"end_offset\": 333, \"user\": 1}, {\"label\": 5, \"start_offset\": 334, \"end_offset\": 346, \"user\": 1}], \"meta\": {}, \"annotation_approver\": \"admin\"}\n{\"id\": 2, \"text\": \"《三体》是中国大陆作家刘慈欣于2006年5月至12月在《科幻世界》杂志上连载的一部长篇科幻小说，出版后成为中国大陆最畅销的科幻长篇小说之一。2008年，该书的单行本由重庆出版社出版。本书是三体系列（系列原名为：地球往事三部曲）的第一部，该系列的第二部《三体II：黑暗森林》已经于2008年5月出版。2010年11月，第三部《三体III：死神永生》出版发行。 2011年，“地球往事三部曲”在台湾陆续出版。小说的英文版获得美国科幻奇幻作家协会2014年度“星云奖”提名，并荣获2015年雨果奖最佳小说奖。\", \"annotations\": [{\"label\": 2, \"start_offset\": 1, \"end_offset\": 3, \"user\": 1}, {\"label\": 4, \"start_offset\": 5, \"end_offset\": 9, \"user\": 1}, {\"label\": 1, \"start_offset\": 11, \"end_offset\": 
14, \"user\": 1}, {\"label\": 3, \"start_offset\": 15, \"end_offset\": 26, \"user\": 1}, {\"label\": 2, \"start_offset\": 28, \"end_offset\": 32, \"user\": 1}, {\"label\": 5, \"start_offset\": 41, \"end_offset\": 47, \"user\": 1}, {\"label\": 4, \"start_offset\": 53, \"end_offset\": 57, \"user\": 1}, {\"label\": 5, \"start_offset\": 61, \"end_offset\": 67, \"user\": 1}, {\"label\": 3, \"start_offset\": 70, \"end_offset\": 74, \"user\": 1}, {\"label\": 6, \"start_offset\": 83, \"end_offset\": 88, \"user\": 1}, {\"label\": 2, \"start_offset\": 105, \"end_offset\": 112, \"user\": 1}, {\"label\": 2, \"start_offset\": 94, \"end_offset\": 98, \"user\": 1}, {\"label\": 2, \"start_offset\": 126, \"end_offset\": 135, \"user\": 1}, {\"label\": 3, \"start_offset\": 139, \"end_offset\": 146, \"user\": 1}, {\"label\": 3, \"start_offset\": 149, \"end_offset\": 157, \"user\": 1}, {\"label\": 2, \"start_offset\": 162, \"end_offset\": 172, \"user\": 1}, {\"label\": 3, \"start_offset\": 179, \"end_offset\": 184, \"user\": 1}, {\"label\": 2, \"start_offset\": 186, \"end_offset\": 193, \"user\": 1}, {\"label\": 4, \"start_offset\": 195, \"end_offset\": 197, \"user\": 1}, {\"label\": 5, \"start_offset\": 202, \"end_offset\": 204, \"user\": 1}, {\"label\": 6, \"start_offset\": 210, \"end_offset\": 220, \"user\": 1}, {\"label\": 3, \"start_offset\": 220, \"end_offset\": 225, \"user\": 1}, {\"label\": 6, \"start_offset\": 227, \"end_offset\": 230, \"user\": 1}, {\"label\": 3, \"start_offset\": 237, \"end_offset\": 242, \"user\": 1}, {\"label\": 6, \"start_offset\": 242, \"end_offset\": 245, \"user\": 1}], \"meta\": {}, \"annotation_approver\": \"admin\"}\n{\"id\": 3, \"text\": \"『銀河英雄伝説』（ぎんがえいゆうでんせつ）は、田中芳樹によるSF小説。また、これを原作とするアニメ、漫画、コンピュータゲーム、朗読、オーディオブック等の関連作品。略称は『銀英伝』（ぎんえいでん）。原作は累計発行部数が1500万部を超えるベストセラー小説である。1982年から2009年6月までに複数の版で刊行され、発行部数を伸ばし続けている。\", \"annotations\": [{\"label\": 2, \"start_offset\": 1, \"end_offset\": 7, \"user\": 1}, {\"label\": 1, \"start_offset\": 23, \"end_offset\": 
30, \"user\": 1}, {\"label\": 5, \"start_offset\": 30, \"end_offset\": 34, \"user\": 1}, {\"label\": 2, \"start_offset\": 85, \"end_offset\": 88, \"user\": 1}, {\"label\": 5, \"start_offset\": 50, \"end_offset\": 52, \"user\": 1}, {\"label\": 5, \"start_offset\": 63, \"end_offset\": 65, \"user\": 1}, {\"label\": 3, \"start_offset\": 130, \"end_offset\": 135, \"user\": 1}, {\"label\": 3, \"start_offset\": 137, \"end_offset\": 144, \"user\": 1}], \"meta\": {}, \"annotation_approver\": \"admin\"}\n```\n\nPlease note that in the exported JSON file, the label for each entity is an entity ID, which is inconvenient if you want to consume the annotations somewhere else. Some post processing is needed if you want to have the entity type value instead of the type ID.\n\n### View Statistics\n\nOne good thing about Doccano is that it also has a dashboard to display annotation progress and label distributions. Click the `Edit data` button in the navigation bar, and then click `Statistics` on the left side of the menu.\n\n<p align=\"center\">\n  <img src=\"https://nlpbp.blob.core.windows.net/images/statistic.jpg\" />\n</p>\n\nCongratulations! You just mastered how to use Doccano for a sequence labeling project.\n"
  },
  {
    "path": "examples/annotation/README.md",
    "content": "# Text Annotation\n\nThis folder contains a tutorial that walks through how to deploy text annotation tool on Azure and how to collaboratively annotate text data for natural language processing tasks. \n\n- **[Doccano](Doccano.md)**\nDoccano is an open source tools that provides three main text annotation features. This tutorial only shows a Named Entity Recognition (NER) annotation task as an example.\n\n"
  },
  {
    "path": "examples/embeddings/README.md",
    "content": "# Word Embedding\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for training word embedding on custom data from scratch.   \nThere are\nthree typical ways for training word embedding:\n[Word2Vec](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf),\n[GloVe](https://nlp.stanford.edu/pubs/glove.pdf), and [fastText](https://arxiv.org/abs/1607.01759).\nAll of the three methods provide pretrained models ([pretrained model with\nWord2Vec](https://code.google.com/archive/p/word2vec/), [pretrained model with\nGlove](https://github.com/stanfordnlp/GloVe), [pretrained model with\nfastText](https://fasttext.cc/docs/en/crawl-vectors.html)).   \nThese pretrained models are trained with\ngeneral corpus like Wikipedia data, Common Crawl data, etc., and may not serve well for situations\nwhere you have a domain-specific language learning problem or there is no pretrained model for the\nlanguage you need to work with.  In this folder, we provide examples of how to apply each of the\nthree methods to train your own word embeddings.  \n\n# What is Word Embedding?\n\nWord embedding is a technique to map words or phrases from a vocabulary to vectors or real numbers.\nThe learned vector representations of words capture  syntactic and semantic word relationships and\ntherefore can be very useful for  tasks like sentence similary, text classifcation, etc.\n\n\n## Summary\n\n\n|Notebook|Environment|Description|Dataset| Language | \n|---|---|---|---|---|\n|[Developing Word Embeddings](embedding_trainer.ipynb)|Local| A notebook shows how to learn word representation with Word2Vec, fastText and Glove|[STS Benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#STS_benchmark_dataset_and_companion_dataset) | en |\n"
  },
  {
    "path": "examples/embeddings/embedding_trainer.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Developing Word Embeddings\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Rather than use pre-trained embeddings (as we did in the sentence similarity baseline_deep_dive [notebook](../sentence_similarity/baseline_deep_dive.ipynb)), we can train word embeddings using our own dataset. In this notebook, we demonstrate the training process for producing word embeddings using the word2vec, GloVe, and fastText models. We'll utilize the STS Benchmark dataset for this task. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Table of Contents\\n\",\n    \"* [Data Loading and Preprocessing](#Load-and-Preprocess-Data)\\n\",\n    \"* [Word2Vec](#Word2Vec)\\n\",\n    \"* [fastText](#fastText)\\n\",\n    \"* [GloVe](#GloVe)\\n\",\n    \"* [Concluding Remarks](#Concluding-Remarks)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import gensim\\n\",\n    \"import sys\\n\",\n    \"import os\\n\",\n    \"\\n\",\n    \"# Set the environment path\\n\",\n    \"sys.path.append(\\\"../..\\\")\\n\",\n    \"\\n\",\n    \"import numpy as np\\n\",\n    \"from utils_nlp.dataset.preprocess import (\\n\",\n    \"    to_lowercase,\\n\",\n    \"    to_spacy_tokens,\\n\",\n    \"    rm_spacy_stopwords,\\n\",\n    \")\\n\",\n    \"from utils_nlp.dataset import stsbenchmark\\n\",\n    \"from utils_nlp.common.timer import Timer\\n\",\n    \"from gensim.models import Word2Vec\\n\",\n    \"from gensim.models.fasttext import FastText\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# Set the path for where your repo is located\\n\",\n    
\"NLP_REPO_PATH = os.path.join('..','..')\\n\",\n    \"\\n\",\n    \"# Set the path for where your datasets are located\\n\",\n    \"BASE_DATA_PATH = os.path.join(NLP_REPO_PATH, \\\"data\\\")\\n\",\n    \"\\n\",\n    \"# Set the path for location to save embeddings\\n\",\n    \"SAVE_FILES_PATH = os.path.join(BASE_DATA_PATH, \\\"trained_word_embeddings\\\")\\n\",\n    \"if not os.path.exists(SAVE_FILES_PATH):\\n\",\n    \"    os.makedirs(SAVE_FILES_PATH)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Load and Preprocess Data\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 401/401 [00:02<00:00, 182KB/s]  \"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Data downloaded to ../../data/raw/stsbenchmark\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Produce a pandas dataframe for the training set\\n\",\n    \"train_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\\\"train\\\")\\n\",\n    \"\\n\",\n    \"# Clean the sts dataset\\n\",\n    \"sts_train = stsbenchmark.clean_sts(train_raw)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        
text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>score</th>\\n\",\n       \"      <th>sentence1</th>\\n\",\n       \"      <th>sentence2</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>5.00</td>\\n\",\n       \"      <td>A plane is taking off.</td>\\n\",\n       \"      <td>An air plane is taking off.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>3.80</td>\\n\",\n       \"      <td>A man is playing a large flute.</td>\\n\",\n       \"      <td>A man is playing a flute.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>3.80</td>\\n\",\n       \"      <td>A man is spreading shreded cheese on a pizza.</td>\\n\",\n       \"      <td>A man is spreading shredded cheese on an uncoo...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>2.60</td>\\n\",\n       \"      <td>Three men are playing chess.</td>\\n\",\n       \"      <td>Two men are playing chess.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>4.25</td>\\n\",\n       \"      <td>A man is playing the cello.</td>\\n\",\n       \"      <td>A man seated is playing the cello.</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"   score                                      sentence1  \\\\\\n\",\n       \"0   5.00                         A plane is taking off.   
\\n\",\n       \"1   3.80                A man is playing a large flute.   \\n\",\n       \"2   3.80  A man is spreading shreded cheese on a pizza.   \\n\",\n       \"3   2.60                   Three men are playing chess.   \\n\",\n       \"4   4.25                    A man is playing the cello.   \\n\",\n       \"\\n\",\n       \"                                           sentence2  \\n\",\n       \"0                        An air plane is taking off.  \\n\",\n       \"1                          A man is playing a flute.  \\n\",\n       \"2  A man is spreading shredded cheese on an uncoo...  \\n\",\n       \"3                         Two men are playing chess.  \\n\",\n       \"4                 A man seated is playing the cello.  \"\n      ]\n     },\n     \"execution_count\": 4,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"sts_train.head(5)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"(5749, 3)\"\n      ]\n     },\n     \"execution_count\": 5,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# Check the size of our dataframe\\n\",\n    \"sts_train.shape\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Training set preprocessing\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Convert all text to lowercase\\n\",\n    \"df_low = to_lowercase(sts_train)  \\n\",\n    \"# Tokenize text\\n\",\n    \"sts_tokenize = to_spacy_tokens(df_low) \\n\",\n    \"# Tokenize with removal of stopwords\\n\",\n    \"sts_train_stop = rm_spacy_stopwords(sts_tokenize) \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {\n    \"scrolled\": true\n   
},\n   \"outputs\": [],\n   \"source\": [\n    \"# Append together the two sentence columns to get a list of all tokenized sentences.\\n\",\n    \"all_sentences =  sts_train_stop[[\\\"sentence1_tokens_rm_stopwords\\\", \\\"sentence2_tokens_rm_stopwords\\\"]]\\n\",\n    \"# Flatten two columns into one list and remove all sentences that are size 0 after tokenization and stop word removal.\\n\",\n    \"sentences = [i for i in all_sentences.values.flatten().tolist() if len(i) > 0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"11498\"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"len(sentences)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Minimum sentence length is 1 tokens\\n\",\n      \"Maximum sentence length is 43 tokens\\n\",\n      \"Median sentence length is 6.0 tokens\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"sentence_lengths = [len(i) for i in sentences]\\n\",\n    \"print(\\\"Minimum sentence length is {} tokens\\\".format(min(sentence_lengths)))\\n\",\n    \"print(\\\"Maximum sentence length is {} tokens\\\".format(max(sentence_lengths)))\\n\",\n    \"print(\\\"Median sentence length is {} tokens\\\".format(np.median(sentence_lengths)))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"[['plane', 'taking', '.'],\\n\",\n       \" ['air', 'plane', 'taking', '.'],\\n\",\n       \" ['man', 'playing', 'large', 'flute', '.'],\\n\",\n       \" ['man', 'playing', 'flute', '.'],\\n\",\n       \" ['man', 'spreading', 'shreded', 
'cheese', 'pizza', '.'],\\n\",\n       \" ['man', 'spreading', 'shredded', 'cheese', 'uncooked', 'pizza', '.'],\\n\",\n       \" ['men', 'playing', 'chess', '.'],\\n\",\n       \" ['men', 'playing', 'chess', '.'],\\n\",\n       \" ['man', 'playing', 'cello', '.'],\\n\",\n       \" ['man', 'seated', 'playing', 'cello', '.']]\"\n      ]\n     },\n     \"execution_count\": 10,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"sentences[:10]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Word2Vec\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Word2vec is a predictive model for learning word embeddings from text (see [original research paper](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)). Word embeddings are learned such that words that share common contexts in the corpus will be close together in the vector space. There are two different model architectures that can be used to produce word2vec embeddings: continuous bag-of-words (CBOW) or continuous skip-gram. The former uses a window of surrounding words (the \\\"context\\\") to predict the current word and the latter uses the current word to predict the surrounding context words. 
See this [tutorial](https://www.guru99.com/word-embedding-word2vec.html#3) on word2vec for more detailed background on the model.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The gensim Word2Vec model has many different parameters (see [here](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec)) but the ones that are useful to know about are:  \\n\",\n    \"- size: length of the word embedding/vector (defaults to 100)\\n\",\n    \"- window: maximum distance between the word being predicted and the current word (defaults to 5)\\n\",\n    \"- min_count: ignores all words that have a frequency lower than this value (defaults to 5)\\n\",\n    \"- workers: number of worker threads used to train the model (defaults to 3)\\n\",\n    \"- sg: training algorithm; 1 for skip-gram and 0 for CBOW (defaults to 0)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Set up a Timer to see how long the model takes to train\\n\",\n    \"t = Timer()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"t.start()\\n\",\n    \"\\n\",\n    \"# Train the Word2vec model\\n\",\n    \"word2vec_model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=3, sg=0)\\n\",\n    \"\\n\",\n    \"t.stop()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Time elapsed: 0.3194\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(\\\"Time elapsed: {}\\\".format(t))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now that the model is trained we can:\\n\",\n    \"\\n\",\n    \"1. 
Query for the word embeddings of a given word. \\n\",\n    \"2. Inspect the model vocabulary\\n\",\n    \"3. Save the word embeddings\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Embedding for apple: [-0.13886626 -0.04330257  0.12527628  0.08564945  0.02040523 -0.10037457\\n\",\n      \" -0.1182736   0.05916803 -0.09810918  0.11094606 -0.00045659 -0.07130833\\n\",\n      \" -0.07526248  0.01439941 -0.01924936 -0.04267681  0.05364342  0.01334886\\n\",\n      \"  0.09927388  0.04298429  0.07616432 -0.09218667  0.13563654  0.13954957\\n\",\n      \"  0.17032589  0.13070972  0.04971378  0.05326121  0.1633883   0.0867981\\n\",\n      \"  0.01025774  0.19571003 -0.11564688  0.00285543 -0.02306972 -0.07086422\\n\",\n      \" -0.03311775  0.16642122  0.10450041  0.11148815 -0.11674852 -0.10021858\\n\",\n      \" -0.00149789 -0.10769422  0.1467818  -0.00330875  0.09308671 -0.12129212\\n\",\n      \"  0.07261119  0.07583102  0.00192156  0.23766024 -0.0063716  -0.10565527\\n\",\n      \" -0.06545153  0.04053855  0.24339062  0.15191206 -0.04718588 -0.05213067\\n\",\n      \"  0.00187512 -0.08648538 -0.05337012  0.15507293 -0.09485061  0.03063929\\n\",\n      \"  0.00369516 -0.20911641  0.09312427  0.03583751  0.07270095  0.18968543\\n\",\n      \"  0.08637197 -0.03679648  0.12222783 -0.11879333 -0.1462169   0.02210324\\n\",\n      \"  0.18023533  0.03193852 -0.02540419  0.01615141  0.12228711 -0.03577682\\n\",\n      \"  0.05543301  0.15039788 -0.01812798  0.10888109 -0.08378831 -0.10893872\\n\",\n      \"  0.04931932  0.03412211  0.05080304 -0.16159546  0.02976557  0.08955383\\n\",\n      \" -0.02231676  0.06976417  0.2003142   0.04647517]\\n\",\n      \"\\n\",\n      \"First 30 vocabulary words: ['plane', 'taking', '.', 'air', 'man', 'playing', 'large', 'flute', 'spreading', 'cheese', 'pizza', 'men', 
'seated', 'fighting', 'smoking', 'piano', 'guitar', 'singing', 'woman', 'person']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# 1. Let's see the word embedding for \\\"apple\\\" by accessing the \\\"wv\\\" attribute and passing in \\\"apple\\\" as the key.\\n\",\n    \"print(\\\"Embedding for apple:\\\", word2vec_model.wv[\\\"apple\\\"])\\n\",\n    \"\\n\",\n    \"# 2. Inspect the model vocabulary by accessing keys of the \\\"wv.vocab\\\" attribute. We'll print the first 20 words.\\n\",\n    \"print(\\\"\\\\nFirst 30 vocabulary words:\\\", list(word2vec_model.wv.vocab)[:20])\\n\",\n    \"\\n\",\n    \"# 3. Save the word embeddings. We can save as binary format (to save space) or ASCII format.\\n\",\n    \"word2vec_model.wv.save_word2vec_format(SAVE_FILES_PATH+\\\"word2vec_model\\\", binary=True)  # binary format\\n\",\n    \"word2vec_model.wv.save_word2vec_format(SAVE_FILES_PATH+\\\"word2vec_model\\\", binary=False)  # ASCII format\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## fastText\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"fastText is an unsupervised algorithm created by Facebook Research for efficiently learning word embeddings (see [original research paper](https://arxiv.org/pdf/1607.04606.pdf)). fastText is significantly different than word2vec or GloVe in that these two algorithms treat each word as the smallest possible unit to find an embedding for. Conversely, fastText assumes that words are formed by an n-gram of characters (i.e. 2-grams of the word \\\"language\\\" would be {la, an, ng, gu, ua, ag, ge}). The embedding for a word is then composed of the sum of these character n-grams. This has advantages when finding word embeddings for rare words and words not present in the dictionary, as these words can still be broken down into character n-grams. Typically, for smaller datasets, fastText performs better than word2vec or GloVe. 
See this [tutorial](https://fasttext.cc/docs/en/unsupervised-tutorial.html) on fastText for more detail.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The gensim fastText model has many different parameters (see [here](https://radimrehurek.com/gensim/models/fasttext.html#gensim.models.fasttext.FastText)) but the ones that are useful to know about are:  \\n\",\n    \"- size: length of the word embedding/vector (defaults to 100)\\n\",\n    \"- window: maximum distance between the word being predicted and the current word (defaults to 5)\\n\",\n    \"- min_count: ignores all words that have a frequency lower than this value (defaults to 5)\\n\",\n    \"- workers: number of worker threads used to train the model (defaults to 3)\\n\",\n    \"- sg: training algorithm- 1 for skip-gram and 0 for CBOW (defaults to 0)\\n\",\n    \"- iter: number of epochs (defaults to 5)\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Set up a Timer to see how long the model takes to train\\n\",\n    \"t = Timer()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"t.start()\\n\",\n    \"\\n\",\n    \"# Train the FastText model\\n\",\n    \"fastText_model = FastText(size=100, window=5, min_count=5, sentences=sentences, iter=5)\\n\",\n    \"\\n\",\n    \"t.stop()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Time elapsed: 9.3665\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(\\\"Time elapsed: {}\\\".format(t))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We can utilize the same attributes as we saw above for word2vec 
due to them both originating from the gensim package\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Embedding for apple: [-0.2255679  -0.15831569  0.03804937  0.47731966  0.47977886 -0.27653983\\n\",\n      \" -0.27343377 -0.4507852  -0.05649747  0.01470412  0.27904618 -0.02155268\\n\",\n      \" -0.02492249 -0.07855172  0.18532543  0.25709668  0.05939932  0.10333744\\n\",\n      \" -0.09892524 -0.61932683 -0.15273307 -0.02246136 -0.06295346 -0.5022594\\n\",\n      \" -0.13407618 -0.10411069  0.13370538  0.11902415 -0.44436237  0.27073038\\n\",\n      \"  0.06540621 -0.02650584 -0.0179158   0.08797703  0.18899101  0.12898529\\n\",\n      \"  0.05865225 -0.18658654 -0.40497953 -0.23991017  0.30457255  0.39893195\\n\",\n      \"  0.2913193  -0.18734889  0.10662938 -0.1165131  -0.42884877  0.31400812\\n\",\n      \"  0.04840293  0.10146416 -0.10285722 -0.21854313 -0.69022155 -0.48051542\\n\",\n      \" -0.17416449  0.12879132  0.12302257 -0.32911557 -0.48828328  0.22531843\\n\",\n      \" -0.35535514 -0.34300882  0.07264371  0.262703   -0.10182904  0.03486007\\n\",\n      \" -0.09019874  0.12621203  0.35632437 -0.10350075  0.3397234  -0.04080832\\n\",\n      \" -0.17116521 -0.20685913  0.18177888  0.19674565  0.00776504 -0.22853185\\n\",\n      \"  0.01387324 -0.33452377  0.1017314  -0.06989139  0.15893722  0.02910445\\n\",\n      \" -0.18428223  0.30011976 -0.05394572 -0.18550391  0.09144824  0.2203982\\n\",\n      \"  0.3605487  -0.0106479   0.729859    0.516405   -0.44636923 -0.4128766\\n\",\n      \" -0.523939   -0.20086594 -0.38725898  0.0440867 ]\\n\",\n      \"\\n\",\n      \"First 30 vocabulary words: ['plane', 'taking', '.', 'air', 'man', 'playing', 'large', 'flute', 'spreading', 'cheese', 'pizza', 'men', 'seated', 'fighting', 'smoking', 'piano', 'guitar', 'singing', 'woman', 'person']\\n\"\n   
  ]\n    }\n   ],\n   \"source\": [\n    \"# 1. Let's see the word embedding for \\\"apple\\\" by accessing the \\\"wv\\\" attribute and passing in \\\"apple\\\" as the key.\\n\",\n    \"print(\\\"Embedding for apple:\\\", fastText_model.wv[\\\"apple\\\"])\\n\",\n    \"\\n\",\n    \"# 2. Inspect the model vocabulary by accessing keys of the \\\"wv.vocab\\\" attribute. We'll print the first 20 words.\\n\",\n    \"print(\\\"\\\\nFirst 30 vocabulary words:\\\", list(fastText_model.wv.vocab)[:20])\\n\",\n    \"\\n\",\n    \"# 3. Save the word embeddings. We can save as binary format (to save space) or ASCII format.\\n\",\n    \"fastText_model.wv.save_word2vec_format(SAVE_FILES_PATH+\\\"fastText_model\\\", binary=True)  # binary format\\n\",\n    \"fastText_model.wv.save_word2vec_format(SAVE_FILES_PATH+\\\"fastText_model\\\", binary=False)  # ASCII format\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## GloVe\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"GloVe is an unsupervised algorithm for obtaining word embeddings created by the Stanford NLP group (see [original research paper](https://nlp.stanford.edu/pubs/glove.pdf)). Training occurs on word-word co-occurrence statistics with the objective of learning word embeddings such that the dot product of two words' embeddings is equal to the words' probability of co-occurrence. See this [tutorial](https://nlp.stanford.edu/projects/glove/) on GloVe for more detailed background on the model. \\n\",\n    \"\\n\",\n    \"Gensim doesn't have an implementation of the GloVe model and the other python packages that implement GloVe are unstable, so we leveraged the code directly from the Stanford NLP [repo](https://github.com/stanfordnlp/GloVe). 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"mkdir -p build\\n\",\n      \"gcc src/glove.c -o build/glove -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic\\n\",\n      \"\\u001b[01m\\u001b[Ksrc/glove.c:\\u001b[m\\u001b[K In function ‘\\u001b[01m\\u001b[Kglove_thread\\u001b[m\\u001b[K’:\\n\",\n      \"\\u001b[01m\\u001b[Ksrc/glove.c:117:9:\\u001b[m\\u001b[K \\u001b[01;35m\\u001b[Kwarning: \\u001b[m\\u001b[Kignoring return value of ‘\\u001b[01m\\u001b[Kfread\\u001b[m\\u001b[K’, declared with attribute warn_unused_result [-Wunused-result]\\n\",\n      \"         fread(&cr, sizeof(CREC), 1, fin);\\n\",\n      \"\\u001b[01;32m\\u001b[K         ^\\u001b[m\\u001b[K\\n\",\n      \"gcc src/shuffle.c -o build/shuffle -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic\\n\",\n      \"\\u001b[01m\\u001b[Ksrc/shuffle.c:\\u001b[m\\u001b[K In function ‘\\u001b[01m\\u001b[Kshuffle_merge\\u001b[m\\u001b[K’:\\n\",\n      \"\\u001b[01m\\u001b[Ksrc/shuffle.c:106:17:\\u001b[m\\u001b[K \\u001b[01;35m\\u001b[Kwarning: \\u001b[m\\u001b[Kignoring return value of ‘\\u001b[01m\\u001b[Kfread\\u001b[m\\u001b[K’, declared with attribute warn_unused_result [-Wunused-result]\\n\",\n      \"                 fread(&array[i], sizeof(CREC), 1, fid[j]);\\n\",\n      \"\\u001b[01;32m\\u001b[K                 ^\\u001b[m\\u001b[K\\n\",\n      \"\\u001b[01m\\u001b[Ksrc/shuffle.c:\\u001b[m\\u001b[K In function ‘\\u001b[01m\\u001b[Kshuffle_by_chunks\\u001b[m\\u001b[K’:\\n\",\n      \"\\u001b[01m\\u001b[Ksrc/shuffle.c:163:9:\\u001b[m\\u001b[K \\u001b[01;35m\\u001b[Kwarning: \\u001b[m\\u001b[Kignoring return value of ‘\\u001b[01m\\u001b[Kfread\\u001b[m\\u001b[K’, declared with attribute warn_unused_result [-Wunused-result]\\n\",\n      \"         fread(&array[i], sizeof(CREC), 1, fin);\\n\",\n   
   \"\\u001b[01;32m\\u001b[K         ^\\u001b[m\\u001b[K\\n\",\n      \"gcc src/cooccur.c -o build/cooccur -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic\\n\",\n      \"\\u001b[01m\\u001b[Ksrc/cooccur.c:\\u001b[m\\u001b[K In function ‘\\u001b[01m\\u001b[Kmerge_files\\u001b[m\\u001b[K’:\\n\",\n      \"\\u001b[01m\\u001b[Ksrc/cooccur.c:267:9:\\u001b[m\\u001b[K \\u001b[01;35m\\u001b[Kwarning: \\u001b[m\\u001b[Kignoring return value of ‘\\u001b[01m\\u001b[Kfread\\u001b[m\\u001b[K’, declared with attribute warn_unused_result [-Wunused-result]\\n\",\n      \"         fread(&new, sizeof(CREC), 1, fid[i]);\\n\",\n      \"\\u001b[01;32m\\u001b[K         ^\\u001b[m\\u001b[K\\n\",\n      \"\\u001b[01m\\u001b[Ksrc/cooccur.c:277:5:\\u001b[m\\u001b[K \\u001b[01;35m\\u001b[Kwarning: \\u001b[m\\u001b[Kignoring return value of ‘\\u001b[01m\\u001b[Kfread\\u001b[m\\u001b[K’, declared with attribute warn_unused_result [-Wunused-result]\\n\",\n      \"     fread(&new, sizeof(CREC), 1, fid[i]);\\n\",\n      \"\\u001b[01;32m\\u001b[K     ^\\u001b[m\\u001b[K\\n\",\n      \"\\u001b[01m\\u001b[Ksrc/cooccur.c:290:9:\\u001b[m\\u001b[K \\u001b[01;35m\\u001b[Kwarning: \\u001b[m\\u001b[Kignoring return value of ‘\\u001b[01m\\u001b[Kfread\\u001b[m\\u001b[K’, declared with attribute warn_unused_result [-Wunused-result]\\n\",\n      \"         fread(&new, sizeof(CREC), 1, fid[i]);\\n\",\n      \"\\u001b[01;32m\\u001b[K         ^\\u001b[m\\u001b[K\\n\",\n      \"gcc src/vocab_count.c -o build/vocab_count -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Define path\\n\",\n    \"glove_model_path = os.path.join(NLP_REPO_PATH, \\\"utils_nlp\\\", \\\"models\\\", \\\"glove\\\")\\n\",\n    \"# Execute shell commands\\n\",\n    \"!cd $glove_model_path && make\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Train GloVe vectors\"\n   ]\n  },\n  {\n   
\"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Training GloVe embeddings requires some data prep and then 4 steps (also documented in the original Stanford NLP repo [here](https://github.com/stanfordnlp/GloVe/tree/master/src)).\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Step 0: Prepare Data**\\n\",\n    \"   \\n\",\n    \"In order to train our GloVe vectors, we first need to save our corpus as a text file with all words separated by 1+ spaces or tabs. Each document/sentence is separated by a new line character.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 20,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Save our corpus as tokens delimited by spaces with new line characters in between sentences.\\n\",\n    \"training_corpus_file_path = os.path.join(SAVE_FILES_PATH, \\\"training-corpus-cleaned.txt\\\")\\n\",\n    \"with open(training_corpus_file_path, 'w', encoding='utf8') as file:\\n\",\n    \"    for sent in sentences:\\n\",\n    \"        file.write(\\\" \\\".join(sent) + \\\"\\\\n\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 21,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Set up a Timer to see how long the model takes to train\\n\",\n    \"t = Timer()\\n\",\n    \"t.start()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Step 1: Build Vocabulary**\\n\",\n    \"\\n\",\n    \"Run the vocab_count executable. There are 3 optional parameters:\\n\",\n    \"1. min-count: lower limit on how many times a word must appear in dataset. Otherwise the word is discarded from our vocabulary.\\n\",\n    \"2. max-vocab: upper bound on the number of vocabulary words to keep\\n\",\n    \"3. 
verbose: 0, 1, or 2 (default)\\n\",\n    \"\\n\",\n    \"Then provide the path to the text file we created in Step 0 followed by a file path that we'll save the vocabulary to \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 22,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"BUILDING VOCABULARY\\r\\n\",\n      \"Processed 0 tokens.\\u001b[0GProcessed 85334 tokens.\\r\\n\",\n      \"Counted 11716 unique words.\\r\\n\",\n      \"Truncating vocabulary at min count 5.\\r\\n\",\n      \"Using vocabulary of size 2943.\\r\\n\",\n      \"\\r\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Define path\\n\",\n    \"vocab_count_exe_path = os.path.join(glove_model_path, \\\"build\\\", \\\"vocab_count\\\")\\n\",\n    \"vocab_file_path = os.path.join(SAVE_FILES_PATH, \\\"vocab.txt\\\")\\n\",\n    \"# Execute shell commands\\n\",\n    \"!$vocab_count_exe_path -min-count 5 -verbose 2 <$training_corpus_file_path> $vocab_file_path\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Step 2: Construct Word Co-occurrence Statistics**\\n\",\n    \"\\n\",\n    \"Run the cooccur executable. There are many optional parameters, but we list the top ones here:\\n\",\n    \"1. symmetric: 0 for only looking at left context, 1 (default) for looking at both left and right context\\n\",\n    \"2. window-size: number of context words to use (default 15)\\n\",\n    \"3. verbose: 0, 1, or 2 (default)\\n\",\n    \"4. vocab-file: path/name of the vocabulary file created in Step 1\\n\",\n    \"5. memory: soft limit for memory consumption, default 4\\n\",\n    \"6. 
max-product: limit the size of dense co-occurrence array by specifying the max product (integer) of the frequency counts of the two co-occurring words\\n\",\n    \"\\n\",\n    \"Then provide the path to the text file we created in Step 0 followed by a file path that we'll save the co-occurrences to\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 23,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"COUNTING COOCCURRENCES\\n\",\n      \"window size: 15\\n\",\n      \"context: symmetric\\n\",\n      \"max product: 13752509\\n\",\n      \"overflow length: 38028356\\n\",\n      \"Reading vocab from file \\\"../../data/trained_word_embeddings/vocab.txt\\\"...loaded 2943 words.\\n\",\n      \"Building lookup table...table contains 8661250 elements.\\n\",\n      \"Processing token: 0\\u001b[0GProcessed 85334 tokens.\\n\",\n      \"Writing cooccurrences to disk......2 files in total.\\n\",\n      \"Merging cooccurrence files: processed 0 lines.\\u001b[39G0 lines.\\u001b[39G100000 lines.\\u001b[0GMerging cooccurrence files: processed 188154 lines.\\n\",\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Define path\\n\",\n    \"cooccur_exe_path = os.path.join(glove_model_path, \\\"build\\\", \\\"cooccur\\\")\\n\",\n    \"cooccurrence_file_path = os.path.join(SAVE_FILES_PATH, \\\"cooccurrence.bin\\\")\\n\",\n    \"# Execute shell commands\\n\",\n    \"!$cooccur_exe_path -memory 4 -vocab-file $vocab_file_path -verbose 2 -window-size 15 <$training_corpus_file_path> $cooccurrence_file_path\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Step 3: Shuffle the Co-occurrences**\\n\",\n    \"\\n\",\n    \"Run the shuffle executable. The parameters are as follows:\\n\",\n    \"1. verbose: 0, 1, or 2 (default)\\n\",\n    \"2. memory: soft limit for memory consumption, default 4\\n\",\n    \"3. 
array-size: limit to the length of the buffer which stores chunks of data to shuffle before writing to disk\\n\",\n    \"\\n\",\n    \"Then provide the path to the co-occurrence file we created in Step 2 followed by a file path that we'll save the shuffled co-occurrences to\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 24,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"SHUFFLING COOCCURRENCES\\r\\n\",\n      \"array size: 255013683\\r\\n\",\n      \"Shuffling by chunks: processed 0 lines.\\u001b[22Gprocessed 188154 lines.\\r\\n\",\n      \"Wrote 1 temporary file(s).\\r\\n\",\n      \"Merging temp files: processed 0 lines.\\u001b[31G188154 lines.\\u001b[0GMerging temp files: processed 188154 lines.\\r\\n\",\n      \"\\r\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Define path\\n\",\n    \"shuffle_exe_path = os.path.join(glove_model_path, \\\"build\\\", \\\"shuffle\\\")\\n\",\n    \"cooccurrence_shuf_file_path = os.path.join(SAVE_FILES_PATH, \\\"cooccurrence.shuf.bin\\\")\\n\",\n    \"# Execute shell commands\\n\",\n    \"!$shuffle_exe_path -memory 4 -verbose 2 <$cooccurrence_file_path> $cooccurrence_shuf_file_path\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Step 4: Train GloVe model**\\n\",\n    \"\\n\",\n    \"Run the glove executable. There are many parameter options, but the top ones are listed below:\\n\",\n    \"1. verbose: 0, 1, or 2 (default)\\n\",\n    \"2. vector-size: dimension of word embeddings (50 is default)\\n\",\n    \"3. threads: number threads, default 8\\n\",\n    \"4. iter: number of iterations, default 25\\n\",\n    \"5. eta: learning rate, default 0.05\\n\",\n    \"6. binary: whether to save binary format (0: text = default, 1: binary, 2: both)\\n\",\n    \"7. x-max: cutoff for weighting function, default is 100\\n\",\n    \"8. 
vocab-file: file containing vocabulary as produced in Step 1\\n\",\n    \"9. save-file: filename to save vectors to \\n\",\n    \"10. input-file: filename with co-occurrences as returned from Step 3\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 25,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"TRAINING MODEL\\n\",\n      \"Read 188154 lines.\\n\",\n      \"Initializing parameters...done.\\n\",\n      \"vector size: 50\\n\",\n      \"vocab size: 2943\\n\",\n      \"x_max: 10.000000\\n\",\n      \"alpha: 0.750000\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 001, cost: 0.078545\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 002, cost: 0.072337\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 003, cost: 0.070195\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 004, cost: 0.066766\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 005, cost: 0.063480\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 006, cost: 0.060623\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 007, cost: 0.058089\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 008, cost: 0.056030\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 009, cost: 0.053907\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 010, cost: 0.051774\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 011, cost: 0.049576\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 012, cost: 0.047385\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 013, cost: 0.045207\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 014, cost: 0.043098\\n\",\n      \"08/13/19 - 05:39.53PM, iter: 015, cost: 0.041065\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Define path\\n\",\n    \"glove_exe_path = os.path.join(glove_model_path, \\\"build\\\", \\\"glove\\\")\\n\",\n    \"glove_vector_file_path = os.path.join(SAVE_FILES_PATH, \\\"GloVe_vectors\\\")\\n\",\n    \"# Execute shell commands\\n\",\n    \"!$glove_exe_path -save-file $glove_vector_file_path -threads 8 -input-file \\\\\\n\",\n    
\"$cooccurrence_shuf_file_path -x-max 10 -iter 15 -vector-size 50 -binary 2 \\\\\\n\",\n    \"-vocab-file $vocab_file_path -verbose 2\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 26,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"t.stop()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 27,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Time elapsed: 3.4293\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(\\\"Time elapsed: {}\\\".format(t))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Inspect Word Vectors\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Like we did above for the word2vec and fastText models, let's now inspect our word embeddings\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 28,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#load in the saved word vectors.\\n\",\n    \"glove_wv = {}\\n\",\n    \"glove_vector_txt_file_path = os.path.join(SAVE_FILES_PATH, \\\"GloVe_vectors.txt\\\")\\n\",\n    \"with open(glove_vector_txt_file_path, encoding='utf-8') as f:\\n\",\n    \"    for line in f:\\n\",\n    \"        split_line = line.split(\\\" \\\")\\n\",\n    \"        glove_wv[split_line[0]] = [float(i) for i in split_line[1:]]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 29,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Embedding for apple: [-0.037004, -0.000665, -0.028638, 0.025758, -0.050187, 0.038694, 0.016966, -0.042032, -0.033963, 0.143667, -0.068749, -0.005046, 0.180022, 0.088593, -0.04615, -0.013351, 0.064172, 0.051637, -0.000885, 0.009899, -0.092548, -0.026595, 0.036515, -0.09158, -0.027992, 0.016924, 
-0.024003, -0.029879, 0.252747, 0.093754, -0.034897, 0.079439, -0.073516, -0.110923, 0.095652, 0.072123, -0.047069, -0.17929, -0.068377, -0.224694, -0.016158, 0.236704, 0.010695, -0.133073, 0.084929, 0.102969, 0.040056, -0.009444, -0.051333, 0.130339]\\n\",\n      \"\\n\",\n      \"First 30 vocabulary words: ['.', ',', 'man', '-', '\\\"', 'woman', \\\"'\\\", 'said', 'dog', 'playing', ':', 'white', 'black', '$', 'killed', 'percent', 'new', 'syria', 'people', 'china']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# 1. Let's see the word embedding for \\\"apple\\\" by passing in \\\"apple\\\" as the key.\\n\",\n    \"print(\\\"Embedding for apple:\\\", glove_wv[\\\"apple\\\"])\\n\",\n    \"\\n\",\n    \"# 2. Inspect the model vocabulary by accessing keys of the \\\"wv.vocab\\\" attribute. We'll print the first 20 words.\\n\",\n    \"print(\\\"\\\\nFirst 30 vocabulary words:\\\", list(glove_wv.keys())[:20])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Concluding Remarks\\n\",\n    \"\\n\",\n    \"In this notebook we have shown how to train word2vec, GloVe, and fastText word embeddings on the STS Benchmark dataset. We also inspected how long each model took to train on our dataset: word2vec took 0.39 seconds, GloVe took 8.16 seconds, and fastText took 10.41 seconds.\\n\",\n    \"\\n\",\n    \"FastText is typically regarded as the best baseline for word embeddings (see [blog](https://medium.com/huggingface/universal-word-sentence-embeddings-ce48ddc8fc3a)) and is a good place to start when generating word embeddings. Now that we generated word embeddings on our dataset, we could also repeat the baseline_deep_dive notebook using these embeddings (versus the pre-trained ones from the internet). 
\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/entailment/README.md",
    "content": "# Natural Language Inference (NLI)  \n\nThis folder provides end-to-end examples of building Natural Language Inference (NLI) models. We\ndemonstrate the best practices of data preprocessing and model building for NLI task and use the\nutility scripts in the [utils_nlp](../../utils_nlp) folder to speed up these processes.  \nNLI is one of many NLP tasks that require robust compositional sentence understanding, but it's\nsimpler compared to other tasks like question answering and machine translation.  \n Currently, we focus on fine-tuning pre-trained BERT model. If you are interested in pre-training your own BERT model, you can view the [AzureML-BERT repo](https://github.com/microsoft/AzureML-BERT), which walks through the process in depth.  We plan to continue adding state-of-the-art models as they come up and welcome community contributions.\n\n## Natural Language Inference\n\nNatural Language Inference or Recognizing Textual Entailment (RTE) is the task of classifying\na pair of premise and hypothesis sentences into three classes: contradiction, neutral, and\nentailment. 
For example,  \n\n|Premise|Hypothesis|Label|\n|-------|----------|-----|\n|A man inspects the uniform of a figure in some East Asian country.|The man is sleeping.|contradiction|\n|An older and younger man smiling.|Two men are smiling and laughing at the cats playing on the floor.|neutral|\n|A soccer game with multiple males playing.|Some men are playing a sport.|entailment|\n\n## Summary\n\n|Notebook|Environment|Description|Dataset| Language | \n|--------|:-----------:|-------|----------|---------| \n|[entailment_multinli_transformers.ipynb](entailment_multinli_transformers.ipynb)|Local|Fine-tuning of pre-trained BERT model for NLI|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)| en | \n|[entailment_xnli_bert_azureml.ipynb](entailment_xnli_bert_azureml.ipynb)|AzureML|**Distributed** fine-tuning of pre-trained BERT model for NLI|[XNLI](https://www.nyu.edu/projects/bowman/xnli/)| en | \n"
  },
  {
    "path": "examples/entailment/entailment_multinli_transformers.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"*Copyright (c) Microsoft Corporation. All rights reserved.*  \\n\",\n    \"\\n\",\n    \"*Licensed under the MIT License.*\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Natural Language Inference on MultiNLI Dataset using Transformers\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Before You Start\\n\",\n    \"\\n\",\n    \"It takes about 4 hours to fine-tune the `bert-large-cased` model on a Standard_NC24rs_v3 Azure Data Science Virtual Machine with 4 NVIDIA Tesla V100 GPUs. \\n\",\n    \"> **Tip:** If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \\n\",\n    \"\\n\",\n    \"\\n\",\n    \"If you run into CUDA out-of-memory error, try reducing the `BATCH_SIZE` and `MAX_SEQ_LENGTH`, but note that model performance will be compromised. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"## Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\\n\",\n    \"QUICK_RUN = False\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Summary\\n\",\n    \"In this notebook, we demostrate fine-tuning pretrained transformer models to perform Natural Language Inference (NLI). We use the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset and the task is to classify sentence pairs into three classes: contradiction, entailment, and neutral.   \\n\",\n    \"To classify a sentence pair, we concatenate the tokens in both sentences and separate the sentences by the special [SEP] token. 
A [CLS] token is prepended to the token list and used as the aggregate sequence representation for the classification task. The NLI task essentially becomes a sequence classification task. For example, the figure below shows how [BERT](https://arxiv.org/abs/1810.04805) classifies sentence pairs. \\n\",\n    \"<img src=\\\"https://nlpbp.blob.core.windows.net/images/bert_two_sentence.PNG\\\">\\n\",\n    \"\\n\",\n    \"We compare the training time and performance of bert-large-cased and xlnet-large-cased. The model used can be set in the **Configurations** section. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import sys, os\\n\",\n    \"nlp_path = os.path.abspath('../../')\\n\",\n    \"if nlp_path not in sys.path:\\n\",\n    \"    sys.path.insert(0, nlp_path)\\n\",\n    \"\\n\",\n    \"import scrapbook as sb\\n\",\n    \"\\n\",\n    \"from tempfile import TemporaryDirectory\\n\",\n    \"\\n\",\n    \"import numpy as np\\n\",\n    \"from sklearn.metrics import classification_report\\n\",\n    \"from sklearn.preprocessing import LabelEncoder\\n\",\n    \"\\n\",\n    \"import torch\\n\",\n    \"\\n\",\n    \"from utils_nlp.models.transformers.sequence_classification import Processor, SequenceClassifier\\n\",\n    \"from utils_nlp.dataset.multinli import load_pandas_df\\n\",\n    \"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\\n\",\n    \"from utils_nlp.common.timer import Timer\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"To see all the models supported by `SequenceClassifier`, call the `list_supported_models` method.  
\\n\",\n    \"**Note**: Although `SequenceClassifier` supports distilbert for single sequence classification, distilbert doesn't support sentence pair classification and can not be used in this notebook\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"SequenceClassifier.list_supported_models()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Configurations\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"MODEL_NAME = \\\"bert-large-cased\\\"\\n\",\n    \"TO_LOWER = False\\n\",\n    \"BATCH_SIZE = 16\\n\",\n    \"\\n\",\n    \"# MODEL_NAME = \\\"xlnet-large-cased\\\"\\n\",\n    \"# TO_LOWER = False\\n\",\n    \"# BATCH_SIZE = 16\\n\",\n    \"\\n\",\n    \"TRAIN_DATA_USED_FRACTION = 1\\n\",\n    \"DEV_DATA_USED_FRACTION = 1\\n\",\n    \"NUM_EPOCHS = 2\\n\",\n    \"WARMUP_STEPS= 2500\\n\",\n    \"\\n\",\n    \"if QUICK_RUN:\\n\",\n    \"    TRAIN_DATA_USED_FRACTION = 0.001\\n\",\n    \"    DEV_DATA_USED_FRACTION = 0.01\\n\",\n    \"    NUM_EPOCHS = 1\\n\",\n    \"    WARMUP_STEPS= 10\\n\",\n    \"\\n\",\n    \"if not torch.cuda.is_available():\\n\",\n    \"    BATCH_SIZE = BATCH_SIZE/2\\n\",\n    \"\\n\",\n    \"RANDOM_SEED = 42\\n\",\n    \"\\n\",\n    \"# model configurations\\n\",\n    \"MAX_SEQ_LENGTH = 128\\n\",\n    \"\\n\",\n    \"# optimizer configurations\\n\",\n    \"LEARNING_RATE= 5e-5\\n\",\n    \"\\n\",\n    \"# data configurations\\n\",\n    \"TEXT_COL_1 = \\\"sentence1\\\"\\n\",\n    \"TEXT_COL_2 = \\\"sentence2\\\"\\n\",\n    \"LABEL_COL = \\\"gold_label\\\"\\n\",\n    \"LABEL_COL_NUM = \\\"gold_label_num\\\"\\n\",\n    \"\\n\",\n    \"CACHE_DIR = TemporaryDirectory().name\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   
\"source\": [\n    \"## Load Data\\n\",\n    \"The MultiNLI dataset comes with three subsets: train, dev_matched, dev_mismatched. The dev_matched dataset are from the same genres as the train dataset, while the dev_mismatched dataset are from genres not seen in the training dataset.   \\n\",\n    \"The `load_pandas_df` function downloads and extracts the zip files if they don't already exist in `local_cache_path` and returns the data subset specified by `file_split`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"train_df = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\\\"train\\\")\\n\",\n    \"dev_df_matched = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\\\"dev_matched\\\")\\n\",\n    \"dev_df_mismatched = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\\\"dev_mismatched\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"dev_df_matched = dev_df_matched.loc[dev_df_matched['gold_label'] != '-']\\n\",\n    \"dev_df_mismatched = dev_df_mismatched.loc[dev_df_mismatched['gold_label'] != '-']\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\\"Training dataset size: {}\\\".format(train_df.shape[0]))\\n\",\n    \"print(\\\"Development (matched) dataset size: {}\\\".format(dev_df_matched.shape[0]))\\n\",\n    \"print(\\\"Development (mismatched) dataset size: {}\\\".format(dev_df_mismatched.shape[0]))\\n\",\n    \"print()\\n\",\n    \"print(train_df[['gold_label', 'sentence1', 'sentence2']].head())\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# sample\\n\",\n    \"train_df = 
train_df.sample(frac=TRAIN_DATA_USED_FRACTION).reset_index(drop=True)\\n\",\n    \"dev_df_matched = dev_df_matched.sample(frac=DEV_DATA_USED_FRACTION).reset_index(drop=True)\\n\",\n    \"dev_df_mismatched = dev_df_mismatched.sample(frac=DEV_DATA_USED_FRACTION).reset_index(drop=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"label_encoder = LabelEncoder()\\n\",\n    \"train_labels = label_encoder.fit_transform(train_df[LABEL_COL])\\n\",\n    \"train_df[LABEL_COL_NUM] = train_labels \\n\",\n    \"num_labels = len(np.unique(train_labels))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Tokenize and Preprocess\\n\",\n    \"Before training, we tokenize and preprocess the sentence texts to convert them into the format required by transformer model classes.  \\n\",\n    \"The `dataset_from_dataframe` method of the `Processor` class performs the following preprocessing steps and returns a Pytorch `DataSet`\\n\",\n    \"* Tokenize input texts using the tokenizer of the pre-trained model specified by `model_name`. 
\\n\",\n    \"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\\n\",\n    \"* Pad or truncate the token lists to the specified max length.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"processor = Processor(model_name=MODEL_NAME, cache_dir=CACHE_DIR, to_lower=TO_LOWER)\\n\",\n    \"\\n\",\n    \"train_dataset = processor.dataset_from_dataframe(\\n\",\n    \"    df=train_df,\\n\",\n    \"    text_col=TEXT_COL_1,\\n\",\n    \"    label_col=LABEL_COL_NUM,\\n\",\n    \"    text2_col=TEXT_COL_2,\\n\",\n    \"    max_len=MAX_SEQ_LENGTH,\\n\",\n    \")\\n\",\n    \"dev_dataset_matched = processor.dataset_from_dataframe(\\n\",\n    \"    df=dev_df_matched,\\n\",\n    \"    text_col=TEXT_COL_1,    \\n\",\n    \"    text2_col=TEXT_COL_2,\\n\",\n    \"    max_len=MAX_SEQ_LENGTH,\\n\",\n    \")\\n\",\n    \"dev_dataset_mismatched = processor.dataset_from_dataframe(\\n\",\n    \"    df=dev_df_mismatched,\\n\",\n    \"    text_col=TEXT_COL_1,    \\n\",\n    \"    text2_col=TEXT_COL_2,\\n\",\n    \"    max_len=MAX_SEQ_LENGTH,\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"train_dataloader = dataloader_from_dataset(\\n\",\n    \"    train_dataset, batch_size=BATCH_SIZE, shuffle=True\\n\",\n    \")\\n\",\n    \"dev_dataloader_matched = dataloader_from_dataset(\\n\",\n    \"    dev_dataset_matched, batch_size=BATCH_SIZE, shuffle=False\\n\",\n    \")\\n\",\n    \"dev_dataloader_mismatched = dataloader_from_dataset(\\n\",\n    \"    dev_dataset_mismatched, batch_size=BATCH_SIZE, shuffle=False\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Train and Predict\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Create Classifier\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   
\"outputs\": [],\n   \"source\": [\n    \"classifier = SequenceClassifier(\\n\",\n    \"    model_name=MODEL_NAME, num_labels=num_labels, cache_dir=CACHE_DIR\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Train Classifier\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"with Timer() as t:\\n\",\n    \"    classifier.fit(\\n\",\n    \"            train_dataloader,\\n\",\n    \"            num_epochs=NUM_EPOCHS,\\n\",\n    \"            learning_rate=LEARNING_RATE,\\n\",\n    \"            warmup_steps=WARMUP_STEPS,\\n\",\n    \"        )\\n\",\n    \"\\n\",\n    \"print(\\\"Training time : {:.3f} hrs\\\".format(t.interval / 3600))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Predict on Test Data\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"with Timer() as t:\\n\",\n    \"    predictions_matched = classifier.predict(dev_dataloader_matched)\\n\",\n    \"print(\\\"Prediction time : {:.3f} hrs\\\".format(t.interval / 3600))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"with Timer() as t:\\n\",\n    \"    predictions_mismatched = classifier.predict(dev_dataloader_mismatched)\\n\",\n    \"print(\\\"Prediction time : {:.3f} hrs\\\".format(t.interval / 3600))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Evaluate\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"predictions_matched = label_encoder.inverse_transform(predictions_matched)\\n\",\n    
\"print(classification_report(dev_df_matched[LABEL_COL], predictions_matched, digits=3))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"predictions_mismatched = label_encoder.inverse_transform(predictions_mismatched)\\n\",\n    \"print(classification_report(dev_df_mismatched[LABEL_COL], predictions_mismatched, digits=3))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Compare Model Performance\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"|Model name|Training time|Scoring time|Matched F1|Mismatched F1|\\n\",\n    \"|:--------:|:-----------:|:----------:|:--------:|:-----------:|\\n\",\n    \"|xlnet-large-cased|5.15 hrs|0.11 hrs|0.887|0.890|\\n\",\n    \"|bert-large-cased|4.01 hrs|0.08 hrs|0.867|0.867|\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"result_matched_dict = classification_report(dev_df_matched[LABEL_COL], predictions_matched, digits=3, output_dict=True)\\n\",\n    \"result_mismatched_dict = classification_report(dev_df_mismatched[LABEL_COL], predictions_mismatched, digits=3, output_dict=True)\\n\",\n    \"sb.glue(\\\"matched_precision\\\", result_matched_dict[\\\"weighted avg\\\"][\\\"precision\\\"])\\n\",\n    \"sb.glue(\\\"matched_recall\\\", result_matched_dict[\\\"weighted avg\\\"][\\\"recall\\\"])\\n\",\n    \"sb.glue(\\\"matched_f1\\\", result_matched_dict[\\\"weighted avg\\\"][\\\"f1-score\\\"])\\n\",\n    \"sb.glue(\\\"mismatched_precision\\\", result_mismatched_dict[\\\"weighted avg\\\"][\\\"precision\\\"])\\n\",\n    \"sb.glue(\\\"mismatched_recall\\\", result_mismatched_dict[\\\"weighted avg\\\"][\\\"recall\\\"])\\n\",\n    \"sb.glue(\\\"mismatched_f1\\\", result_mismatched_dict[\\\"weighted 
avg\\\"][\\\"f1-score\\\"])\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"nlp_gpu\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/entailment/entailment_xnli_bert_azureml.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Natural Language Inference on XNLI Dataset using BERT with Azure Machine Learning\\n\",\n    \"\\n\",\n    \"## 1. Summary\\n\",\n    \"In this notebook, we demonstrate how to fine-tune BERT using distributed training (Horovod) on Azure Machine Learning service to do language inference in English. We use the [XNLI](https://github.com/facebookresearch/XNLI) dataset and to classify sentence pairs into three classes: contradiction, entailment, and neutral.   \\n\",\n    \"\\n\",\n    \"The figure below shows how [BERT](https://arxiv.org/abs/1810.04805) classifies sentence pairs. It concatenates the tokens in each sentence pairs and separates the sentences by the [SEP] token. A [CLS] token is prepended to the token list and used as the aggregate sequence representation for the classification task.\\n\",\n    \"<img src=\\\"https://nlpbp.blob.core.windows.net/images/bert_two_sentence.PNG\\\">\\n\",\n    \"\\n\",\n    \"**Note: To learn how to do pre-training on your own, please reference the [AzureML-BERT repo](https://github.com/microsoft/AzureML-BERT) created by Microsoft.**\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/entailment/entailment_xnli_bert_azureml.png)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Imports\\n\",\n    \"\\n\",\n    \"import sys\\n\",\n    \"\\n\",\n    \"sys.path.append(\\\"../..\\\")\\n\",\n    \"\\n\",\n    \"import os\\n\",\n    \"import shutil\\n\",\n    \"import torch\\n\",\n    \"import json\\n\",\n    \"import pandas as pd\\n\",\n    \"\\n\",\n    \"import azureml.core\\n\",\n    \"from azureml.train.dnn import PyTorch\\n\",\n    \"from azureml.core.runconfig import 
MpiConfiguration\\n\",\n    \"from azureml.core import Experiment\\n\",\n    \"from azureml.widgets import RunDetails\\n\",\n    \"from azureml.core.compute import ComputeTarget, AmlCompute\\n\",\n    \"from azureml.exceptions import ComputeTargetException\\n\",\n    \"from utils_nlp.azureml.azureml_utils import get_or_create_workspace, get_output_files\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# Parameters\\n\",\n    \"\\n\",\n    \"DEBUG = True\\n\",\n    \"NODE_COUNT = 4\\n\",\n    \"NUM_PROCESS = 1\\n\",\n    \"DATA_PERCENT_USED = 1.0\\n\",\n    \"\\n\",\n    \"config_path = (\\n\",\n    \"    \\\"./.azureml\\\"\\n\",\n    \")  # Path to the directory containing config.json with azureml credentials\\n\",\n    \"\\n\",\n    \"# Azure resources\\n\",\n    \"subscription_id = \\\"YOUR_SUBSCRIPTION_ID\\\"\\n\",\n    \"resource_group = \\\"YOUR_RESOURCE_GROUP_NAME\\\"  \\n\",\n    \"workspace_name = \\\"YOUR_WORKSPACE_NAME\\\"  \\n\",\n    \"workspace_region = \\\"YOUR_WORKSPACE_REGION\\\"  # eg: eastus, eastus2.\\n\",\n    \"cluster_name = \\\"gpu-entail\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2. AzureML Setup\\n\",\n    \"\\n\",\n    \"### 2.1 Initialize a Workspace\\n\",\n    \"\\n\",\n    \"The following cell looks to set up the connection to your [Azure Machine Learning service Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace). You can choose to connect to an existing workspace or create a new one. \\n\",\n    \"\\n\",\n    \"**To access an existing workspace:**\\n\",\n    \"1. 
If you have a `config.json` file, you do not need to provide the workspace information; you will only need to update the `config_path` variable that is defined above which contains the file.\\n\",\n    \"2. Otherwise, you will need to supply the following:\\n\",\n    \"    * The name of your workspace\\n\",\n    \"    * Your subscription id\\n\",\n    \"    * The resource group name\\n\",\n    \"\\n\",\n    \"**To create a new workspace:**\\n\",\n    \"\\n\",\n    \"Set the following information:\\n\",\n    \"* A name for your workspace\\n\",\n    \"* Your subscription id\\n\",\n    \"* The resource group name\\n\",\n    \"* [Azure region](https://azure.microsoft.com/en-us/global-infrastructure/regions/) to create the workspace in, such as `eastus2`. \\n\",\n    \"\\n\",\n    \"This will automatically create a new resource group for you in the region provided if a resource group with the name given does not already exist. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ws = get_or_create_workspace(\\n\",\n    \"    config_path=config_path,\\n\",\n    \"    subscription_id=subscription_id,\\n\",\n    \"    resource_group=resource_group,\\n\",\n    \"    workspace_name=workspace_name,\\n\",\n    \"    workspace_region=workspace_region,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\n\",\n    \"    \\\"Workspace name: \\\" + ws.name,\\n\",\n    \"    \\\"Azure region: \\\" + ws.location,\\n\",\n    \"    \\\"Subscription id: \\\" + ws.subscription_id,\\n\",\n    \"    \\\"Resource group: \\\" + ws.resource_group,\\n\",\n    \"    sep=\\\"\\\\n\\\",\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.3 Link AmlCompute Compute Target\\n\",\n    \"\\n\",\n    \"We need to link a [compute 
target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training our model (see [compute options](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#supported-compute-targets) for explanation of the different options). We will use an [AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute) target and link to an existing target (if the cluster_name exists) or create a STANDARD_NC6 GPU cluster (autoscales from 0 to 4 nodes) in this example. Creating a new AmlComputes takes approximately 5 minutes. \\n\",\n    \"\\n\",\n    \"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Found compute target: gpu-entail\\n\",\n      \"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-08-03T13:43:20.068000+00:00', 'errors': None, 'creationTime': '2019-07-27T02:14:46.127092+00:00', 'modifiedTime': '2019-07-27T02:15:07.181277+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6S_V2'}\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"try:\\n\",\n    \"    
compute_target = ComputeTarget(workspace=ws, name=cluster_name)\\n\",\n    \"    print(\\\"Found compute target: {}\\\".format(cluster_name))\\n\",\n    \"except ComputeTargetException:\\n\",\n    \"    print(\\\"Creating new compute target: {}\\\".format(cluster_name))\\n\",\n    \"    compute_config = AmlCompute.provisioning_configuration(\\n\",\n    \"        vm_size=\\\"STANDARD_NC6\\\", max_nodes=NODE_COUNT\\n\",\n    \"    )\\n\",\n    \"    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\\n\",\n    \"    compute_target.wait_for_completion(show_output=True)\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"print(compute_target.get_status().serialize())\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"'./entail_utils\\\\\\\\utils_nlp'\"\n      ]\n     },\n     \"execution_count\": 5,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"project_dir = \\\"./entail_utils\\\"\\n\",\n    \"if DEBUG and os.path.exists(project_dir):\\n\",\n    \"    shutil.rmtree(project_dir)\\n\",\n    \"shutil.copytree(\\\"../../utils_nlp\\\", os.path.join(project_dir, \\\"utils_nlp\\\"))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3. 
Prepare Training Script\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Writing ./entail_utils/train.py\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"%%writefile $project_dir/train.py\\n\",\n    \"import horovod.torch as hvd\\n\",\n    \"import torch\\n\",\n    \"import numpy as np\\n\",\n    \"import time\\n\",\n    \"import argparse\\n\",\n    \"from utils_nlp.common.timer import Timer\\n\",\n    \"from utils_nlp.dataset.xnli_torch_dataset import XnliDataset\\n\",\n    \"from utils_nlp.models.bert.common import Language\\n\",\n    \"from utils_nlp.models.bert.sequence_classification_distributed import (\\n\",\n    \"    BERTSequenceClassifier,\\n\",\n    \")\\n\",\n    \"from sklearn.metrics import classification_report\\n\",\n    \"\\n\",\n    \"print(\\\"Torch version:\\\", torch.__version__)\\n\",\n    \"\\n\",\n    \"hvd.init()\\n\",\n    \"\\n\",\n    \"LANGUAGE_ENGLISH = \\\"en\\\"\\n\",\n    \"TRAIN_FILE_SPLIT = \\\"train\\\"\\n\",\n    \"TEST_FILE_SPLIT = \\\"test\\\"\\n\",\n    \"TO_LOWERCASE = True\\n\",\n    \"PRETRAINED_BERT_LNG = Language.ENGLISH\\n\",\n    \"LEARNING_RATE = 5e-5\\n\",\n    \"WARMUP_PROPORTION = 0.1\\n\",\n    \"BATCH_SIZE = 32\\n\",\n    \"NUM_GPUS = 1\\n\",\n    \"OUTPUT_DIR = \\\"./outputs/\\\"\\n\",\n    \"LABELS = [\\\"contradiction\\\", \\\"entailment\\\", \\\"neutral\\\"]\\n\",\n    \"\\n\",\n    \"## each machine gets it's own copy of data\\n\",\n    \"CACHE_DIR = \\\"./xnli-%d\\\" % hvd.rank()\\n\",\n    \"\\n\",\n    \"parser = argparse.ArgumentParser()\\n\",\n    \"# Training settings\\n\",\n    \"parser.add_argument(\\n\",\n    \"    \\\"--seed\\\", type=int, default=42, metavar=\\\"S\\\", help=\\\"random seed (default: 42)\\\"\\n\",\n    \")\\n\",\n    \"parser.add_argument(\\n\",\n    \"    \\\"--epochs\\\", type=int, default=2, metavar=\\\"S\\\", 
help=\\\"random seed (default: 2)\\\"\\n\",\n    \")\\n\",\n    \"parser.add_argument(\\n\",\n    \"    \\\"--no-cuda\\\", action=\\\"store_true\\\", default=False, help=\\\"disables CUDA training\\\"\\n\",\n    \")\\n\",\n    \"parser.add_argument(\\n\",\n    \"    \\\"--data_percent_used\\\",\\n\",\n    \"    type=float,\\n\",\n    \"    default=1.0,\\n\",\n    \"    metavar=\\\"S\\\",\\n\",\n    \"    help=\\\"data percent used (default: 1.0)\\\",\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"args = parser.parse_args()\\n\",\n    \"args.cuda = not args.no_cuda and torch.cuda.is_available()\\n\",\n    \"\\n\",\n    \"\\\"\\\"\\\"\\n\",\n    \"Note: For example, you have 4 nodes and 4 GPUs each node, so you spawn 16 workers. \\n\",\n    \"Every worker will have a rank [0, 15], and every worker will have a local_rank [0, 3]\\n\",\n    \"\\\"\\\"\\\"\\n\",\n    \"if args.cuda:\\n\",\n    \"    torch.cuda.set_device(hvd.local_rank())\\n\",\n    \"    torch.cuda.manual_seed(args.seed)\\n\",\n    \"\\n\",\n    \"# num_workers - this is equal to number of gpus per machine\\n\",\n    \"kwargs = {\\\"num_workers\\\": NUM_GPUS, \\\"pin_memory\\\": True} if args.cuda else {}\\n\",\n    \"\\n\",\n    \"train_dataset = XnliDataset(\\n\",\n    \"    file_split=TRAIN_FILE_SPLIT,\\n\",\n    \"    cache_dir=CACHE_DIR,\\n\",\n    \"    language=LANGUAGE_ENGLISH,\\n\",\n    \"    to_lowercase=TO_LOWERCASE,\\n\",\n    \"    tok_language=PRETRAINED_BERT_LNG,\\n\",\n    \"    data_percent_used=args.data_percent_used,\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# set the label_encoder for evaluation\\n\",\n    \"label_encoder = train_dataset.label_encoder\\n\",\n    \"num_labels = len(np.unique(train_dataset.labels))\\n\",\n    \"\\n\",\n    \"# Train\\n\",\n    \"classifier = BERTSequenceClassifier(\\n\",\n    \"    language=Language.ENGLISH,\\n\",\n    \"    num_labels=num_labels,\\n\",\n    \"    cache_dir=CACHE_DIR,\\n\",\n    \"    use_distributed=True,\\n\",\n    
\")\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"train_loader = classifier.create_data_loader(\\n\",\n    \"    train_dataset, BATCH_SIZE, mode=\\\"train\\\", **kwargs\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"num_samples = len(train_loader.dataset)\\n\",\n    \"num_batches = int(num_samples / BATCH_SIZE)\\n\",\n    \"num_train_optimization_steps = num_batches * args.epochs\\n\",\n    \"optimizer = classifier.create_optimizer(\\n\",\n    \"    num_train_optimization_steps, lr=LEARNING_RATE, warmup_proportion=WARMUP_PROPORTION\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"with Timer() as t:\\n\",\n    \"    for epoch in range(1, args.epochs + 1):\\n\",\n    \"\\n\",\n    \"        # to allow data shuffling for DistributedSampler\\n\",\n    \"        train_loader.sampler.set_epoch(epoch)\\n\",\n    \"\\n\",\n    \"        # epoch and num_epochs is passed in the fit function to print loss at regular batch intervals\\n\",\n    \"        classifier.fit(\\n\",\n    \"            train_loader,\\n\",\n    \"            epoch=epoch,\\n\",\n    \"            num_epochs=args.epochs,\\n\",\n    \"            bert_optimizer=optimizer,\\n\",\n    \"            num_gpus=NUM_GPUS,\\n\",\n    \"        )\\n\",\n    \"\\n\",\n    \"#if machine has multiple gpus then run predictions on only on 1 gpu since test_dataset is small.\\n\",\n    \"if hvd.rank() == 0:\\n\",\n    \"    NUM_GPUS = 1\\n\",\n    \"    \\n\",\n    \"    test_dataset = XnliDataset(\\n\",\n    \"        file_split=TEST_FILE_SPLIT,\\n\",\n    \"        cache_dir=CACHE_DIR,\\n\",\n    \"        language=LANGUAGE_ENGLISH,\\n\",\n    \"        to_lowercase=TO_LOWERCASE,\\n\",\n    \"        tok_language=PRETRAINED_BERT_LNG,\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"    test_loader = classifier.create_data_loader(test_dataset, mode=\\\"test\\\")\\n\",\n    \"\\n\",\n    \"    # predict\\n\",\n    \"    predictions, pred_labels = classifier.predict(test_loader, NUM_GPUS)\\n\",\n    \"\\n\",\n    \"    
predictions = label_encoder.inverse_transform(predictions)\\n\",\n    \"\\n\",\n    \"    # Evaluate\\n\",\n    \"    results = classification_report(\\n\",\n    \"        pred_labels, predictions, target_names=LABELS, output_dict=True\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"    result_file = os.path.join(OUTPUT_DIR, \\\"results.json\\\")\\n\",\n    \"    with open(result_file, \\\"w+\\\") as fp:\\n\",\n    \"        json.dump(results, fp)\\n\",\n    \"\\n\",\n    \"    # save model\\n\",\n    \"    classifier.save_model()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4. Create a PyTorch Estimator\\n\",\n    \"\\n\",\n    \"BERT is built on PyTorch, so we will use the AzureML SDK's PyTorch estimator to easily submit PyTorch training jobs for both single-node and distributed runs. For more information on the PyTorch estimator, see [How to Train Pytorch Models on AzureML](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-pytorch). 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"mpiConfig = MpiConfiguration()\\n\",\n    \"mpiConfig.process_count_per_node = NUM_PROCESS\\n\",\n    \"\\n\",\n    \"script_params = {\\n\",\n    \"    '--data_percent_used': DATA_PERCENT_USED\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"est = PyTorch(\\n\",\n    \"    source_directory=project_dir,\\n\",\n    \"    compute_target=compute_target,\\n\",\n    \"    entry_script=\\\"train.py\\\",\\n\",\n    \"    script_params = script_params,\\n\",\n    \"    node_count=NODE_COUNT,\\n\",\n    \"    distributed_training=mpiConfig,\\n\",\n    \"    use_gpu=True,\\n\",\n    \"    framework_version=\\\"1.0\\\",\\n\",\n    \"    conda_packages=[\\\"scikit-learn=0.20.3\\\", \\\"numpy\\\", \\\"spacy\\\", \\\"nltk\\\"],\\n\",\n    \"    pip_packages=[\\\"pandas\\\", \\\"seqeval[gpu]\\\", \\\"pytorch-pretrained-bert\\\"],\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 5. Create Experiment and Submit a Job\\n\",\n    \"Submit the estimator object to run your experiment. Results can be monitored using a Jupyter widget. The widget and run are asynchronous and update every 10-15 seconds until job completion.\\n\",\n    \"\\n\",\n    \"**Note**: The experiment takes ~4 hours with 2 NC24 nodes and ~7hours with 4 NC6 nodes. The overhead is due to the communication time between nodes.    
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"experiment = Experiment(ws, name=\\\"NLP-Entailment-BERT\\\")\\n\",\n    \"run = experiment.submit(est)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/vnd.jupyter.widget-view+json\": {\n       \"model_id\": \"c8e7a44fa8804e95b21eea74d7694b1e\",\n       \"version_major\": 2,\n       \"version_minor\": 0\n      },\n      \"text/plain\": [\n       \"_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"RunDetails(run).show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Since the above cell is an async call, the below cell is a blocking call to stop the cells below it to execute.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"run.wait_for_completion()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 6. Analyze Results\\n\",\n    \"\\n\",\n    \"Download result.json from portal and open to view results. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Downloading file outputs/results.json to ./outputs\\\\results.json...\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"file_names = [\\\"outputs/results.json\\\"]\\n\",\n    \"get_output_files(run, \\\"./outputs\\\", file_names=file_names)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"               f1-score  precision    recall  support\\n\",\n      \"contradiction  0.838749   0.859296  0.819162   1670.0\\n\",\n      \"entailment     0.817280   0.877663  0.764671   1670.0\\n\",\n      \"neutral        0.777870   0.719817  0.846108   1670.0\\n\",\n      \"micro avg      0.809980   0.809980  0.809980   5010.0\\n\",\n      \"macro avg      0.811300   0.818925  0.809980   5010.0\\n\",\n      \"weighted avg   0.811300   0.818925  0.809980   5010.0\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"with open(\\\"outputs/results.json\\\", \\\"r\\\") as handle:\\n\",\n    \"    parsed = json.load(handle)\\n\",\n    \"    print(pd.DataFrame.from_dict(parsed).transpose())\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu_transformer_bug_bash)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu_transformer_bug_bash\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.7.3\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/model_explainability/README.md",
    "content": "# Model Explainability\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for explaining and\ninterpreting models. Being able to explain and understand machine learning models not only helps\nguiding further model improvements, but more importantly, it's critical for gaining users' trust in the\nmodels and detecting biases caused by the training data.\n\n## Summary\n\n|Notebook|Environment|Description|Dataset| Language |\n|---|:---:|---|---|---|\n|[DUUDNM](interpret_dnn_layers.ipynb)|Local| Interpreting DNN Layers using Mutual Information.||en|\n"
  },
  {
    "path": "examples/model_explainability/interpret_dnn_layers.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"*Copyright (c) Microsoft Corporation. All rights reserved.*\\n\",\n    \"\\n\",\n    \"*Licensed under the MIT License.*\\n\",\n    \"\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Understand your NLP models\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"0. [Methodology](#0-Methodology)\\n\",\n    \"\\n\",\n    \"    - 0.1 [Multi-level Quantification](#0.1-Multi-level-Quantification)\\n\",\n    \"    - 0.2 [Perturbation-based Approximation](#0.2-Perturbation-based-Approximation)\\n\",\n    \"    \\n\",\n    \"    \\n\",\n    \"1. [How to understand a simple model](#1-How-to-understand-a-simple-model)\\n\",\n    \"\\n\",\n    \"    - 1.1 [Prepare necessary components](#1.1-Prepare-necessary-components)\\n\",\n    \"    - 1.2 [Create an Interpreter instance](#1.2-Create-an-Interpreter-instance)\\n\",\n    \"    - 1.3 [Train the Interpreter](#1.3-Train-the-Interpreter)\\n\",\n    \"    - 1.4 [Show and visualize the results](#1.4-Show-and-visualize-the-results)\\n\",\n    \"    \\n\",\n    \"    \\n\",\n    \"2. 
[How to understand a saved PyTorch model](#2-How-to-understand-a-saved-PyTorch-model)\\n\",\n    \"\\n\",\n    \"    - [2.1 Prepare necessary components](#2.1-Prepare-necessary-components)\\n\",\n    \"    - [2.2 Create an Interpreter instance](#2.2-Create-an-Interpreter-instance)\\n\",\n    \"    - [2.3 Train the Interpreter](#2.3-Train-the-Interpreter)\\n\",\n    \"    - [2.4 Show and visualize the results](#2.4-Show-and-visualize-the-results)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import sys\\n\",\n    \"from tempfile import TemporaryDirectory\\n\",\n    \"\\n\",\n    \"sys.path.append(\\\"../../\\\")\\n\",\n    \"import json\\n\",\n    \"import torch\\n\",\n    \"import logging\\n\",\n    \"from torch import nn\\n\",\n    \"from urllib import request\\n\",\n    \"import scrapbook as sb\\n\",\n    \"from pytorch_pretrained_bert import BertModel, BertTokenizer\\n\",\n    \"\\n\",\n    \"# import utils\\n\",\n    \"from utils_nlp.interpreter.Interpreter import calculate_regularization, Interpreter\\n\",\n    \"\\n\",\n    \"# disable the inner message of pytorch_pretrained_bert\\n\",\n    \"logging.getLogger().setLevel(logging.WARNING)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This is a tutorial on how to utilize the `Interpreter` class to explain certain hidden layers in your NLP models. We provide the explanation by measuring the information of input words ${\\\\bf x}_1$,...,${\\\\bf x}_n$ that is encoded in hidden state ${\\\\bf s} = \\\\Phi({\\\\bf x})$. 
\\n\",\n    \"\\n\",\n    \"The method is described in our *ICML 2019* paper: [Towards a Deep and Unified Understanding of Deep Neural Models in NLP](https://www.microsoft.com/en-us/research/publication/towards-a-deep-and-unified-understanding-of-deep-neural-models-in-nlp/).\\n\",\n    \"In this tutorial, we provide two examples for you to get started quickly.\\n\",\n    \"\\n\",\n    \"## 0 Methodology\\n\",\n    \"\\n\",\n    \"We briefly introduce our algorithms here. In short, we are trying to use Mutual Information to understand $\\\\Phi$, the model or layer we want to understand. You can also refer to our paper [here](https://www.microsoft.com/en-us/research/publication/towards-a-deep-and-unified-understanding-of-deep-neural-models-in-nlp/) for more details on the algorithm.\\n\",\n    \"\\n\",\n    \"### 0.1 Multi-level Quantification\\n\",\n    \"\\n\",\n    \"Suppose the input random variable is $\\\\bf X$ and the hidden random variable ${\\\\bf S} = \\\\Phi({\\\\bf X})$. We can provide a global/corpus-level explanation by evaluating the mutual information of $\\\\bf X$ and $\\\\bf S$:\\n\",\n    \"\\n\",\n    \"$$MI({\\\\bf X};{\\\\bf S})=H({\\\\bf X}) - H({\\\\bf X}|{\\\\bf S})$$\\n\",\n    \"\\n\",\n    \"Where $MI(\\\\cdot;\\\\cdot)$ is the mutual information. $H(\\\\cdot)$ stands for entropy. Because $H({\\\\bf X})$ is a constant only related to input dataset $\\\\bf X$, the only thing we need to consider is $H({\\\\bf X}|{\\\\bf S})$. This conditional entropy can be seen as the global/corpus-level information loss when r.v. $\\\\bf X$ is processed by $\\\\Phi$. 
By definition:\\n\",\n    \"\\n\",\n    \"$$H({\\\\bf X}|{\\\\bf S}) = \\\\int_{{\\\\bf s}\\\\in {\\\\bf S}}p({\\\\bf s})H({\\\\bf X}|{\\\\bf s})d{\\\\bf s}$$\\n\",\n    \"\\n\",\n    \"Then, we can decompose the corpus-level information loss to sentence-level:\\n\",\n    \"\\n\",\n    \"$$H({\\\\bf X}|{\\\\bf s}) = \\\\int_{{\\\\bf x'}\\\\in {\\\\bf X}}p({\\\\bf x}'|{\\\\bf s})H({\\\\bf x}'|{\\\\bf s})d{\\\\bf x}'$$\\n\",\n    \"\\n\",\n    \"If we make an assumption that the inputs of $\\\\Phi$ are independent, we can further decompose the sentence-level information loss to word level:\\n\",\n    \"\\n\",\n    \"$$H({\\\\bf X}|{\\\\bf s}) = \\\\sum_i H({\\\\bf X}_i|{\\\\bf s})$$\\n\",\n    \"$$H({\\\\bf X}_i|{\\\\bf s}) = \\\\int_{{\\\\bf x'}_i\\\\in {\\\\bf X}_i}p({\\\\bf x}_i'|{\\\\bf s})H({\\\\bf x}_i'|{\\\\bf s})d{\\\\bf x}_i'$$\\n\",\n    \"\\n\",\n    \"Note that $H({\\\\bf X}_i|{\\\\bf s})$ stands for the information loss when word ${\\\\bf x}_i$ reaches hidden state ${\\\\bf s}$. Therefore, we can use this value as our explanation. A higher value means that the information of the corresponding word is largely lost, which means that this word is less important to $\\\\bf s$, and vice versa.\\n\",\n    \"\\n\",\n    \"### 0.2 Perturbation-based Approximation\\n\",\n    \"\\n\",\n    \"In order to calculate $H({\\\\bf X}_i|{\\\\bf s})$, we propose a perturbation-based method. Let $\\\\tilde{\\\\bf x}_{i}={\\\\bf x}_{i} +{\\\\boldsymbol \\\\epsilon}_{i}$ denote an input with a certain noise $\\\\boldsymbol{\\\\epsilon}_{i}$. We assume that the noise term is a random variable that follows a Gaussian distribution, ${\\\\boldsymbol{\\\\epsilon}_{i}}\\\\in \\\\mathbb{R}^{K}$ and ${\\\\boldsymbol \\\\epsilon}_i\\\\sim{\\\\mathcal N}({\\\\bf0},{\\\\boldsymbol\\\\Sigma}_{i}=\\\\sigma_{i}^2{\\\\bf I})$. 
\\n\",\n    \"In order to approximate $H({\\\\bf X}_i|{\\\\bf s})$, we first learn an optimal distribution of ${\\\\boldsymbol{\\\\epsilon}} = [{\\\\boldsymbol{\\\\epsilon}}_1^T, {\\\\boldsymbol \\\\epsilon}_2^T, ..., {\\\\boldsymbol \\\\epsilon}_n^T]^T$ with respect to the hidden state \\n\",\n    \"${\\\\bf s}$ with the following loss:\\n\",\n    \"\\n\",\n    \"$$L({\\\\boldsymbol \\\\sigma})=\\\\mathbb{E}_{{\\\\boldsymbol \\\\epsilon}}\\\\Vert\\\\Phi(\\\\tilde{\\\\bf x})-{\\\\bf s}\\\\Vert^2-\\\\lambda\\\\sum_{i=1}^n H(\\\\tilde{\\\\bf X}_{i}|{\\\\bf s})|_{{\\\\boldsymbol\\\\epsilon}_{i}\\\\sim{\\\\mathcal N}({\\\\bf 0},\\\\sigma_{i}^2{\\\\bf I})}$$\\n\",\n    \"\\n\",\n    \"where $\\\\lambda>0$ is a hyper-parameter, ${\\\\boldsymbol \\\\sigma}=[\\\\sigma_1,...,\\\\sigma_n]$, and $\\\\tilde{\\\\bf x} = {\\\\bf x} + \\\\boldsymbol{\\\\epsilon}$. The first term  on the left corresponds to the maximum likelihood estimation (MLE) of the distribution of $\\\\tilde{\\\\bf x}_{i}$ that maximizes $\\\\sum_{i}\\\\sum_{\\\\tilde{\\\\bf x}_{i}}\\\\log p(\\\\tilde{\\\\bf x}_{i}|{\\\\bf s})$, if we consider $\\\\sum_{i}\\\\log p(\\\\tilde{\\\\bf x}_{i}|{\\\\bf s})\\\\propto -\\\\Vert\\\\Phi(\\\\tilde{\\\\bf x})-{\\\\bf s}\\\\Vert^2$. In other words, the first term learns a distribution that generates all potential inputs corresponding to the hidden state ${\\\\bf s}$. The second term on the right encourages a high conditional entropy $H(\\\\tilde{\\\\bf X}_{i}|{\\\\bf s})$, which corresponds to the maximum entropy principle. In other words, the noise $\\\\boldsymbol \\\\epsilon$ needs to enumerate all perturbation directions to reach the representation limit of ${\\\\bf s}$. 
By minimizing the loss above, we can get the optimal ${\\\\sigma}_i$, then we can get the $H(\\\\tilde{\\\\bf X}_i|{\\\\bf s})$:\\n\",\n    \"\\n\",\n    \"$$H(\\\\tilde{\\\\bf X}_{i}|{\\\\bf s})=\\\\frac{K}{2}\\\\log(2\\\\pi e)+K\\\\log\\\\sigma_{i}$$\\n\",\n    \"\\n\",\n    \"Then, we can use $H(\\\\tilde{\\\\bf X}_i|{\\\\bf s})$ to approximate $H({\\\\bf X}_i|{\\\\bf s})$. Again, you can refer to our paper [here](https://www.microsoft.com/en-us/research/publication/towards-a-deep-and-unified-understanding-of-deep-neural-models-in-nlp/) for more details on the algorithm.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1 How to understand a simple model\\n\",\n    \"\\n\",\n    \"In this section, we use a simple linear function as an example to help you become familiar with the usage of the Interpreter utils.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.1 Prepare necessary components\\n\",\n    \"Suppose the $\\\\Phi$ we need to explain is a simple linear function:\\n\",\n    \"$$\\\\Phi(x)=10 \\\\times x[0] + 20 \\\\times x[1] + 5 \\\\times x[2] - 20 \\\\times x[3] - 10 \\\\times x[4]$$\\n\",\n    \"From the definition of $\\\\Phi$, we can see that the weights of the 2nd and the 4th elements in input $x$ are the largest (in absolute value), which means that they contribute the most to the result. 
Therefore, a reasonable explanation should show a similar pattern.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"device = torch.device(\\\"cpu\\\" if not torch.cuda.is_available() else \\\"cuda\\\")\\n\",\n    \"\\n\",\n    \"# Suppose our input is x, and the sentence is simply \\\"1 2 3 4 5\\\"\\n\",\n    \"x_simple = torch.randn(5, 256) / 100\\n\",\n    \"x_simple = x_simple.to(device)\\n\",\n    \"words = [\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\"]\\n\",\n    \"\\n\",\n    \"# Suppose our hidden state s = Phi(x), where\\n\",\n    \"# Phi = 10 * word[0] + 20 * word[1] + 5 * word[2] - 20 * word[3] - 10 * word[4]\\n\",\n    \"def Phi_simple(x):\\n\",\n    \"    W = torch.tensor([10.0, 20.0, 5.0, -20.0, -10.0]).to(device)\\n\",\n    \"    return W @ x\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# Suppose this is our dataset used for training our models\\n\",\n    \"dataset = [torch.randn(5, 256) / 100 for _ in range(100)]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.2 Create an Interpreter instance\\n\",\n    \"\\n\",\n    \"In the following, we'll show you how to calculate the $\\\\sigma_i$ using functions in this library. To explain a certain $\\\\bf x$ and certain $\\\\Phi$, we need to create an Interpreter instance, and pass your $\\\\bf x$, $\\\\Phi$ and regularization term (which is the standard variance of the hidden state r.v. $\\\\bf S$) to it. 
We also provide a simple function to calculate the regularization term that is needed in this method.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"Interpreter()\"\n      ]\n     },\n     \"execution_count\": 3,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# calculate the regularization term\\n\",\n    \"regularization_simple = calculate_regularization(dataset, Phi_simple, device=device)\\n\",\n    \"\\n\",\n    \"# create the interpreter instance\\n\",\n    \"# we recommend you to set hyper-parameter *scale* to 10 * Std[word_embedding_weight]\\n\",\n    \"# 10 * 0.1 in this example\\n\",\n    \"interpreter_simple = Interpreter(\\n\",\n    \"    x=x_simple,\\n\",\n    \"    Phi=Phi_simple,\\n\",\n    \"    regularization=regularization_simple,\\n\",\n    \"    scale=10 * 0.1,\\n\",\n    \"    words=words,\\n\",\n    \")\\n\",\n    \"interpreter_simple.to(device)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.3 Train the Interpreter\\n\",\n    \"\\n\",\n    \"Then, we need to train our interpreter (by minimizing the loss [here](#0.2-Perturbation-based-Approximation)) to let it find the information loss in each input word ${\\\\bf x}_i$ when they reach hidden state $\\\\bf s$. 
You can control the iteration and learning rate when training.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 5000/5000 [00:05<00:00, 976.01it/s] \\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Train the interpreter by optimizing the loss\\n\",\n    \"interpreter_simple.optimize(iteration=5000, lr=0.5, show_progress=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.4 Show and visualize the results\\n\",\n    \"\\n\",\n    \"After training, we can show the sigma (directly speaking, it is the range that every word can change without changing $\\\\bf s$ too much) we have got. Sigma somewhat stands for the information loss of word ${\\\\bf x}_i$ when it reaches $\\\\bf s$.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"array([0.00316059, 0.00158621, 0.00629779, 0.00158636, 0.0030826 ],\\n\",\n       \"      dtype=float32)\"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# Show the sigma we get\\n\",\n    \"sigma_numbers = interpreter_simple.get_sigma()\\n\",\n    \"sigma_numbers\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"image/png\": 
\"iVBORw0KGgoAAAANSUhEUgAAAagAAACsCAYAAAAuVDhiAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAALtUlEQVR4nO3dXYwddRnH8d9v20JJKRZtNYUSkcSQADG0KVVTQwwSLdCgl1xojNEQIxclXBAJiRETr0wMXmka0EhAQXnxAg2CESKIUvvKi6UGsMQGkqVBXiqJCDxenNntnNnZ0132zM4zs99PctLZ//znP888e3p+e84sxREhAACymWi7AAAA6hBQAICUCCgAQEoEFAAgJQIKAJDS8vlMPmn1mli57oymalkSjr3yetsl9MLGcz/cdgmdt+/QZNsldN6qtae1XUIvHDv87NGIWFcdn1dArVx3hi76/m3jq2oJenzn79ouoRf+/IcdbZfQeadf+qO2S+i8jV/b1nYJvfDoVz/5Yt04H/EBAFIioAAAKRFQAICUCCgAQEoEFAAgJQIKAJASAQUASImAAgCkREABAFIioAAAKRFQAICUCCgAQEoEFAAgJQIKAJASAQUASImAAgCkREABAFIioAAAKRFQAICUCCgAQEoEFAAgJQIKAJASAQUASImAAgCkREABAFIioAAAKRFQAICUCCgAQEoEFAAgJQIKAJASAQUASImAAgCkREABAFIioAAAKRFQAICUCCgAQEoEFAAgJQIKAJASAQUASOmEAWX7atu7be9++81/L0ZNAACcOKAiYmdEbI6IzSetPn0xagIAgI/4AAA5EVAAgJQIKABASgQUACAlAgoAkBIBBQBIiYACAKREQAEAUiKgAAApEVAAgJQIKABASgQUACAlAgoAkBIBBQBIiYACAKREQAEAUiKgAAApEVAAgJQIKABASgQUACAlAgoAkBIBBQBIiYACAKREQAEAUiKgAAApEVAAgJQIKABASgQUACAlAgoAkBIBBQBIiYACAKREQAEAUiKgAAApEVAAgJQIKABASgQUACAlAgoAkBIBBQBIiYACAKTkiJj7ZPsVSS82V86CrZV0tO0ieoA+Lhw9HA/6uHBd6OFHI2JddXBeAZWd7d0RsbntOrqOPi4cPRwP+rhwXe4hH/EBAFIioAAAKfUtoHa2XUBP0MeFo4fjQR8XrrM97NU9KABAf/TtHRQAoCcIKABASr0IKNs/tT1p++m2a+kq22fZftj2QdvP2N7Rdk1dZHul7V22DxR9vKntmrrK9jLb+2zf33YtXWX7sO2nbO+3vbvteuarF/egbF8s6Zik2yLigrbr6SLb6yWtj4i9tldL2iPpSxHx95ZL6xTblrQqIo7ZXiHpMUk7IuKvLZfWObavk7RZ0mkRsb3terrI9mFJmyMi+3+oW6sX76Ai4k+SXm27ji6LiJcjYm+x/aakg5LObLeq7omBY8WXK4pH938KXGS2N0i6QtItbdeC9vQioDBets+WtFHSE+1W0k3FR1P7JU1Keigi6OP83SzpeknvtV1Ix4WkB23vsX1128XMFwGFIbZPlXSPpGsj4o226+miiHg3Ii6UtEHSFtt87DwPtrdLmoyIPW3X0gNbI2KTpMskXVPcDukMAgrTinsm90i6IyLubbuerouI1yQ9Imlby6V0zVZJVxb3T+6UdInt29stqZsi4qXiz0lJ90na0m5F80NAQdL0zf1bJR2MiB+2XU9X2V5ne02xfYqkSyU9225V3RIRN0TEhog4W9JVkv4YEV9uuazOsb2q+IUn2V4l6fOSOvWbzr0IKNu/lPQXSefaPmL7623X1EFbJX1Fg59W9xePy9suqoPWS3rY9pOS/qbBPSh+TRpt+Iikx2wfkLRL0m8j4oGWa5qXXvyaOQCgf3rxDgoA0D8EFAAgJQIKAJASAQUASImAAgCkREABAFIioAAAKRFQAICUCCgAQEoEFA
AgJQIKAJASAQUASImAAgCkREABAFIioAAAKRFQAICUCCgAQEoEFAAgJQIKAJASAQUASImAAgCkREABAFJaPp/Jyz748Yh33qrZY8l1R3jojxlf1I3PWGeua9QU4FHjg43Zlh1VytTGnC6rppZR56xMHRqd03E1C4xqkSR5xATPcszUjjl9u2oWm0/vRn+LY7ZTzL5O8UX12FFzZ6ttuoYTPXdGjNv1dZxwjaG+zfFaVN+j40Mz15n5PZ7lez/b/Bn7Z/+Gjzp2lmdPuaQT1DK6svq9o5+c81l/eNZcn6gz/uYvfP3q3BO9QNTuPWG3Ru+t/J3au2f/7yNiW/WIeQVU/O8tnXzRN6WJ4o3X1LPCllyMTbg0Xpo3URor759wZa3S/gkPrzG9Vmmseq7q+UesZbs01aVSPD3NdXOnxiaG506MmuvBOWauq9IxLpVat3/m3HL9xy/Vs5y35vwTM2udqPSlttbaayn1cKK+F/XXVbru4sX6+LXE9JN4wjH0NBi9P6Y/Hphas7z+RCmghtaqWbd2vyrnKm0P9tdvD9VaV/ds+0vXMuO65nKuurrncK7jax5/2Rvanno5nF6/9P2ubE/Nm9qurqvi69ptlZ5bNWtVt6ePKa1Vfemujk2fbagmT69V/kHOlbXk8tj06PT1lUctV84xtbdcf+m8lVfz6lj1vB66Bg/VWB0b1FKzbqn+2dadGiv3bcbVVsbqrkWSTlm+Zq1q8BEfACAlAgoAkBIBBQBIiYACAKREQAEAUiKgAAApEVAAgJQIKABASgQUACAlAgoAkBIBBQBIiYACAKREQAEAUiKgAAApEVAAgJQIKABASgQUACAlAgoAkBIBBQBIyREx98n2A5Jq/9/xLVgr6WjbRfQUvW0GfW0OvW3GYvX1aERsqw7OK6Aysb07Ija3XUcf0dtm0Nfm0NtmtN1XPuIDAKREQAEAUupyQO1su4Aeo7fNoK/NobfNaLWvnb0HBQDoty6/gwIA9BgBBQBIKU1A2d5m+5Dt52x/u2b/ybbvKvY/Yfvs0r4bivFDtr9QjJ1l+2HbB20/Y3vH4l1NHg30daXtXbYPFH29afGuJpdx97a0b5ntfbbvb/4q8mmir7YP237K9n7buxfnSnJpqK9rbN9t+9nitfbTYy06Ilp/SFom6XlJ50g6SdIBSedV5nxL0k+K7ask3VVsn1fMP1nSx4p1lklaL2lTMWe1pH9U1+z7o6G+WtKpxZwVkp6Q9Km2r7UPvS0dd52kX0i6v+3r7EtfJR2WtLbt6+thX38u6RvF9kmS1oyz7izvoLZIei4iXoiItyXdKemLlTlf1KAZknS3pM/ZdjF+Z0T8NyL+Kek5SVsi4uWI2CtJEfGmpIOSzlyEa8mkib5GRBwr5q8oHkvxN23G3ltJsr1B0hWSblmEa8iokb5i/H21fZqkiyXdKkkR8XZEvDbOorME1JmS/lX6+ohmhsn0nIh4R9Lrkj40l2OLt6obNfhpfylppK/FR1D7JU1KeigillpfpeaeszdLul7Se+MvuROa6mtIetD2HttXN1B3dk309RxJr0j6WfGR9C22V42z6CwB5Zqx6k/ls80ZeaztUyXdI+naiHjjfVfYTY30NSLejYgLJW3Q4CepCxZUZTeNvbe2t0uajIg9Cy2uw5p6LdgaEZskXSbpGtsXv/8SO6mJvi6XtEnSjyNio6T/SJpxb2shsgTUEUlnlb7eIOml2ebYXi7pA5JeHXWs7RUahNMdEXFvI5Xn1khfpxRv5x+RNOMfeVwCmujtVklX2j6swUcwl9i+vYniE2vkORsRU39OSrpPS++jvyb6ekTSkdInKHdrEFjj0/bNu+Lm2nJJL2hwA27qBt75lTnXaPgG3q+K7fM1fAPvBR2/mX+bpJvbvr6e9XWdihuhkk6R9Kik7W1fax96Wzn2s1qavyTRxHN2laTVxZxVkh6XtK3ta+16X4t9j0o6t9j+rqQfjLXuthtXas7lGvym3fOSbi
zGvifpymJ7paRfa3CDbpekc0rH3lgcd0jSZcXYZzR4G/qkpP3F4/K2r7MHff2EpH1FX5+W9J22r7Evva2svSQDqom+anCv5EDxeGZqzaX2aOL5KulCSbuL14PfSDp9nDXzTx0BAFLKcg8KAIAhBBQAICUCCgCQEgEFAEiJgAIApERAAQBSIqAAACn9H4nGsPaMWuCSAAAAAElFTkSuQmCC\\n\",\n      \"text/plain\": [\n       \"<Figure size 432x288 with 2 Axes>\"\n      ]\n     },\n     \"metadata\": {\n      \"needs_background\": \"light\"\n     },\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"# Visualize the information loss of our sigma\\n\",\n    \"interpreter_simple.visualize()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We can see that the second and forth words are important to ${\\\\bf s} = \\\\Phi({\\\\bf x})$, which is reasonable because the weights of them are larger.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2 How to understand a saved PyTorch model\\n\",\n    \"\\n\",\n    \"In this section, we will show you how to use our Interpreter in a more complex saved PyTorch model. We use the **3rd layer** of the **pre-trained BERT-base (12 layers) model** for simplicity as an example.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.1 Prepare necessary components\\n\",\n    \"we first load the pre-trained model we need to explain and define the sentence we use in our case. 
Suppose the sentence we want to study is `rare bird has more than enough charm to make it memorable.`, and the layer we need to explain is the 3rd layer.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 231508/231508 [00:00<00:00, 905875.25B/s]\\n\",\n      \"100%|██████████| 407873900/407873900 [00:12<00:00, 32166694.25B/s]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# suppose the sentence is as following\\n\",\n    \"text = \\\"rare bird has more than enough charm to make it memorable.\\\"\\n\",\n    \"\\n\",\n    \"# get the tokenized words.\\n\",\n    \"cache_dir = TemporaryDirectory().name\\n\",\n    \"tokenizer = BertTokenizer.from_pretrained(\\\"bert-base-uncased\\\", cache_dir=cache_dir)\\n\",\n    \"words = [\\\"[CLS]\\\"] + tokenizer.tokenize(text) + [\\\"[SEP]\\\"]\\n\",\n    \"\\n\",\n    \"# load BERT base model\\n\",\n    \"model = BertModel.from_pretrained(\\\"bert-base-uncased\\\", cache_dir=cache_dir).to(device)\\n\",\n    \"for param in model.parameters():\\n\",\n    \"    param.requires_grad = False\\n\",\n    \"model.eval()\\n\",\n    \"\\n\",\n    \"# get the x (here we get x by hacking the code in the pytorch_pretrained_bert package)\\n\",\n    \"tokenized_ids = tokenizer.convert_tokens_to_ids(words)\\n\",\n    \"segment_ids = [0 for _ in range(len(words))]\\n\",\n    \"token_tensor = torch.tensor([tokenized_ids], device=device)\\n\",\n    \"segment_tensor = torch.tensor([segment_ids], device=device)\\n\",\n    \"x_bert = model.embeddings(token_tensor, segment_tensor)[0]\\n\",\n    \"\\n\",\n    \"# extract the Phi we need to explain, suppose the layer we are interested in is layer 3\\n\",\n    \"def generate_BERT_Phi(bert_model: BertModel, layer: int):\\n\",\n    \"    assert (\\n\",\n    \"        1 <= layer <= 12\\n\",\n    \"    ), \\\"model only have 
12 layers, while you want to access layer %d\\\" % (layer)\\n\",\n    \"\\n\",\n    \"    def Phi(x):\\n\",\n    \"        x = x.unsqueeze(0)\\n\",\n    \"        attention_mask = torch.ones(x.shape[:2]).to(x.device)\\n\",\n    \"        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)\\n\",\n    \"        extended_attention_mask = extended_attention_mask.to(dtype=torch.float)\\n\",\n    \"        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0\\n\",\n    \"        # extract the 3rd layer\\n\",\n    \"        model_list = bert_model.encoder.layer[:layer]\\n\",\n    \"        hidden_states = x\\n\",\n    \"        for layer_module in model_list:\\n\",\n    \"            hidden_states = layer_module(hidden_states, extended_attention_mask)\\n\",\n    \"        return hidden_states[0]\\n\",\n    \"\\n\",\n    \"    return Phi\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"Phi_bert = generate_BERT_Phi(model, layer=3)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.2 Create an Interpreter instance\\n\",\n    \"\\n\",\n    \"In the following, we'll show you how to calculate the $\\\\sigma_i$ using functions in this library. To explain a certain $\\\\bf x$ and certain $\\\\Phi$, we need to create an Interpreter instance, and pass your $\\\\bf x$, $\\\\Phi$ and regularization term (which is the standard deviation of the hidden state r.v. $\\\\bf S$) to it. 
Here, we use the regularization term we already calculated for simplicity.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"[0.6720064191778069, 0.5696053129989517, 0.5447734704199672, 0.5538335568288567, 0.6839598078248833, 0.6505799523747332, 0.6138378727376542, 0.6199456982157656, 0.6306355030517169, 0.5243086318591497, 0.5000930128511534, 0.4827939168590007, 0.51908702631163, 0.5422031857051318, 0.558117971090347, 0.5384989781528138, 0.4973997679961822, 0.5335898308359087, 0.6593399660006478, 0.6181028809756937, 0.6041363265472766, 0.5372477429375551, 0.598196971464808, 0.6076224088069293, 0.5076913720444645, 0.645040330374062, 0.4911672467247289, 0.6188230685389587, 0.500837794378735, 0.609637156019712, 0.6584791082546065, 0.5070865130054582, 0.5897822361246311, 0.6407670325047011, 0.6215270116703064, 0.5588143832710171, 0.6331066296213183, 0.5231979565110525, 0.5658921746881577, 0.5968348697164344, 0.6109241047763923, 0.6946546444303081, 0.6124973128849484, 0.5661205470990824, 0.6209094314251753, 0.5050536927131517, 0.9198743437277025, 0.6441395669510548, 0.6859488332096185, 0.601743328628138, 0.6657785886030984, 0.5833936191830211, 0.5898142874989561, 0.4939017091306656, 0.553457465711801, 0.5081231009363393, 0.5591332177596285, 0.5049017773625475, 0.6031726394194478, 0.5235285530111075, 0.5822860647045073, 0.49414878949409907, 0.4067730655843068, 0.542062997693116, 0.6219646972540404, 0.5610470463598181, 0.5953315515783884, 0.6029648761076112, 0.559769448508683, 0.6469309658573977, 0.5950178816092214, 0.6448319036619511, 0.6358337907663586, 0.5693825155687869, 0.6652512647877379, 0.46869892954355613, 0.6353106204720417, 0.4562067289029469, 0.4964502603196801, 0.5206233875458052, 0.6428625806381059, 0.6108864631733767, 0.6486324059870165, 0.5352102313528296, 0.4148418325724406, 
0.510248378037191, 0.5138279230146225, 0.5603926560966265, 0.595676369003732, 0.640714133095125, 0.537345020757705, 0.5880741669140909, 0.5071738799325218, 0.5429150432027633, 0.5119552619987725, 0.5082208016140594, 0.3948878209321561, 0.5077286211275706, 0.5822582643185853, 0.713473444398512, 0.7043491018182539, 0.6912586136177066, 0.5994933104801177, 0.5322452750878545, 0.46650028309002645, 0.35287523313522456, 0.5493517011722789, 0.6587226734253601, 0.5665924293645145, 0.5068284765701566, 0.6302226696199406, 0.6266167971297549, 0.536432292309199, 0.5948000603725734, 0.6469345665904533, 0.5528489765406325, 0.5280362853934059, 0.6481372026819949, 0.6192433717680882, 0.5450068875551102, 0.5288482106690569, 0.39416141845562686, 0.5245808558547367, 0.5259086532041622, 0.393548948727694, 0.48990314446643507, 0.5703322066306044, 0.44862071323671415, 0.582652673791941, 0.521113021353686, 0.681793501539276, 0.5443441656998707, 0.37084482740748426, 0.6015358235542453, 0.5957221796890386, 0.6995457771130384, 0.5453373563264777, 0.4844465750539164, 0.5715930474350235, 0.5046089569638691, 0.6171893091324396, 0.611591297342313, 0.6827167176256492, 0.7376992908309039, 0.5582270652247779, 0.31228514743652797, 0.6263384555591894, 0.5075020071367358, 0.5834912120567404, 0.5358572981041639, 0.5789452180588497, 0.5361360541953356, 0.45794489247599557, 0.6876262699630906, 0.6650665195124484, 0.5126871568466, 0.6423847524196573, 0.5092494090147971, 0.6336159830721736, 0.3386088037608132, 0.5904231572668198, 0.6248617005511456, 0.5190389568435076, 0.6409503576736226, 0.5527542193297937, 0.6216121274931188, 0.6050308962350159, 0.5212559401366526, 0.5019995204048897, 0.5566706919664209, 0.5758882387663521, 0.6057541264876548, 0.5408059995901172, 0.6402653854107861, 0.6924132203189057, 0.3894033882458203, 0.5251399223179439, 0.5592368166825678, 0.5050009958414926, 0.5484504885732712, 0.6359153476486328, 0.6713817908504806, 0.5528231360245106, 0.7047493918228277, 0.6747710082303218, 
0.554946500855997, 0.7382581961284679, 0.5725449044397448, 0.6928065652155618, 0.5869639678552843, 0.5969129932901078, 0.687205195680762, 0.6173225773079768, 0.5625884688571695, 0.554521799472328, 0.599777125422387, 0.6347430361889577, 0.48948299247809623, 0.5300653846822636, 0.47534867458011576, 0.5943453147851882, 0.5653052137388904, 0.3744017641816805, 0.5241624143861529, 0.5400506458238573, 0.5768941593863307, 0.5824398948859419, 0.507294188065184, 0.6074959838986872, 0.5143751868996466, 0.5397642313337914, 0.5562825550742968, 0.5573326668537761, 0.595323709987766, 0.5335037599947999, 0.40645409459315884, 0.38584858519561777, 0.6763269336685823, 0.49864392338741487, 0.5556405102765481, 0.56796873872809, 0.6704090264129464, 0.6365633549192408, 0.491858674581239, 0.571182550422225, 0.39636775642315364, 0.4927436956522672, 0.5325300657983706, 0.5914024175418211, 0.6572771291592268, 0.5704864304921875, 0.595574678005175, 0.5556762243555338, 0.6367940265899822, 0.588938002385068, 0.5494135473896908, 0.5194181997174186, 0.4797671520725825, 0.6518392146394686, 0.46972510885465957, 0.5284412162612128, 0.5815074512359419, 0.5823763772131357, 0.5029126283299797, 0.5109125789183494, 0.3649196005975494, 0.4906738472864752, 0.5488590983504192, 0.49753811165261946, 0.505679716613382, 0.5129037523367018, 0.6311914470548811, 0.5946013817765508, 0.6477204415156932, 0.6515995975676336, 0.5768488525143426, 0.523685718723664, 0.5295288664585905, 0.5009731898317803, 0.5587233774569005, 0.6580833925637624, 0.5373233954142419, 0.6582209087756435, 0.5489848637461159, 0.47337027482398153, 0.5043031106374255, 0.577317464046264, 0.5219168721158063, 0.5408242665558518, 0.6356068922290092, 0.5733903517119527, 0.6016504227165926, 0.5288946338379021, 0.5892373351065394, 0.6342615103769079, 0.6130451086636514, 0.46611514729384973, 0.5921430847081095, 0.5779202188955178, 0.6357414621588608, 0.5030608331783005, 0.6395649333620499, 0.6377276982623341, 0.38973320353067653, 0.606781050411065, 
0.5093703899551436, 0.5811707770744574, 0.7232476284557209, 0.5168817343337841, 0.6638206576529871, 0.48901418733542623, 0.43739760969575475, 0.6015852419136676, 0.6279423808477765, 0.5814834444420861, 0.4437173666943161, 0.415649335926807, 0.5494089646655524, 0.5699369266474212, 0.5716393111700822, 0.7203262934280066, 0.6109806114151356, 0.6086680149215294, 0.6784207314117051, 0.7510138193211784, 0.595752370855507, 0.5096256983929184, 0.6509022221259164, 2.5251604297907906, 0.5996432671274571, 0.5191965465959889, 0.3910454371052007, 0.5252704538224279, 0.5487563320085694, 0.6497572197588998, 0.539947818113331, 0.5727221067381307, 0.5859855369201918, 0.5550066573682404, 0.4052812429107734, 0.5767050515446769, 0.5074083423324552, 0.5181425538650056, 0.5867669850438991, 0.622336928241333, 0.619730307042242, 0.5159150588135848, 0.5677132852737771, 0.6088984959356519, 0.55785345177934, 0.4887743221019428, 0.4488903808698761, 0.5784225594460584, 0.6405666164702244, 0.6272819335447044, 0.5812181962692605, 0.5423161572251998, 0.6681405021562552, 0.5763012190181668, 0.5985999991742619, 0.48662936326087963, 0.8344473533595789, 0.4856559120241086, 0.5897134269617805, 0.5323088631490295, 0.4448512715743428, 0.5975625673827604, 0.5743464047665394, 0.5417000727311233, 0.6900951864217544, 0.5287838019788256, 0.6675197469611557, 0.6185928209906237, 0.35727423668025304, 0.6143233750515678, 0.6060399652428077, 0.5558822642469178, 0.530276102166812, 0.5116897130040144, 0.5977306063905552, 0.5867864648924253, 0.5278337771927558, 0.4806879149051836, 0.5382478622650878, 0.5856067571861844, 0.6306318442056607, 0.49391258821445383, 0.5489518148259469, 0.6131126056440158, 0.5089564625233208, 0.6258332685683875, 0.5562664332792977, 0.6587553832219688, 0.5721395069393773, 0.7316276447951, 0.6815503509226922, 0.6413576967825804, 0.5484064973484217, 0.5643773131034424, 0.6745771327685542, 0.585569843073139, 4.148380916210813, 0.5982397003506298, 0.7019254715223583, 0.5288198822451418, 
0.6240095756994913, 0.5380081185581451, 0.678795541845524, 0.588529980887846, 0.6280178517136987, 0.4657859039190076, 0.5132997470267544, 0.5313934116466171, 0.5266762597476911, 0.6660931196796194, 0.7234808055126262, 0.594061226970833, 0.5515582546731305, 0.5515454825862416, 0.6217069320805647, 0.542655348574746, 0.44244111415482446, 0.462192639977372, 0.5957314889234636, 0.4433120212592345, 0.5494358069465771, 0.6681352256096942, 0.5277128695902392, 0.6099108527298399, 0.6512378710022276, 0.5679475425471902, 0.5546903245945867, 0.6674891410259085, 0.6159889738277975, 0.5735603517089791, 0.5248157975023054, 0.5168292372862809, 0.553843050306285, 0.5985645439688244, 0.39715573701478935, 0.5283503609143612, 0.447520945026398, 0.638239313062452, 0.507419952536012, 0.5760147366093716, 0.5515266316019759, 0.5807239816994251, 0.5599335674825909, 0.5974892666780456, 0.5004662990922536, 0.5888220193638405, 0.4718941619780529, 0.5861252978337973, 0.49223294458949224, 0.6064087992507717, 0.7214565901405107, 0.5484770327034333, 0.5503630111171568, 0.5904485777568088, 0.5939656911153031, 0.5038700884054164, 0.6045103501937796, 0.499118136236574, 0.37193186488077934, 0.5723301608055107, 0.525825703005349, 0.7301040665136092, 0.5140891502639778, 0.5638728483758652, 0.4977489920384463, 0.6471063133748627, 0.6190685639017889, 0.6775629213383808, 0.612278641490936, 0.767169815870711, 0.6605883129662021, 0.6187656212507987, 0.6353017504729984, 0.5077724881945187, 0.6115224371318742, 0.5697210817105954, 0.45587282019630315, 0.6138376361062206, 0.6027143188401093, 0.5812920517773535, 0.5187384796774935, 0.4845791539559439, 0.5139248717515621, 0.5097321564832612, 0.5702922031533869, 0.7662102767707549, 0.5100037970767839, 0.5769086276467204, 0.5918487388928512, 0.4932610414911433, 0.5612912007434486, 0.5019870022881627, 0.5683701006907799, 0.5471292375834348, 0.5163464077571903, 0.5662265235043846, 0.6262520734546206, 0.4045579543142629, 0.5740986318745384, 0.5404645788548182, 
0.5729629704291168, 0.543927143587405, 0.6151617140048048, 0.5022090234235793, 0.5783303424697744, 0.616379942383976, 0.5646497198031761, 0.5213611237979402, 0.5846124785989514, 0.6169145129780825, 0.534197561211705, 0.4465214795463175, 0.5524803477799444, 0.6330414104074786, 0.49492489969592696, 0.5329551473534944, 0.5929020635462238, 0.5238675494097854, 0.5429900230821683, 0.5178870028057245, 0.6226470853153954, 0.5495123138896764, 0.5208366967349471, 0.6039583328156696, 0.5546041461090794, 0.47913038384104656, 0.5260999213211526, 0.5471560408038963, 0.5978561074109934, 0.5814516514009886, 0.6079996257427562, 0.634367589709343, 0.36981740415466857, 0.6970604594957829, 0.5697476683491814, 0.5939516289500889, 0.49108811101156746, 0.6614081220367268, 0.5676172157021177, 0.5633578995510955, 0.6006402768683331, 0.5588224559173837, 0.6092733345232592, 0.5769390725536546, 0.37429565713285773, 0.5444970698964151, 0.4640715629842989, 0.6700378566201302, 0.6034207436950111, 0.6619094031660634, 0.5034445564834289, 0.5866391165503538, 0.6139727787785322, 0.4639254512909701, 0.8315876916202657, 0.475982003413719, 0.4372537729306341, 0.5722447067075571, 0.5882645819759047, 0.5779057015430793, 0.6627630369268039, 0.5916307899922117, 0.5342299317908815, 0.5362378111872667, 0.5442251792617785, 0.6348450888774665, 0.4871357148469115, 0.5614704400230816, 0.6329994610904498, 0.6671062619076585, 0.5660308869774, 0.553753223172401, 0.5062704350286694, 0.6968807651805093, 0.6275035575642098, 0.6714519055052968, 0.5595083618517822, 0.5262363525655602, 0.6077647308803884, 0.5445943891364097, 0.6106190753878226, 0.5087365182910752, 0.6428060786938798, 0.5572595189411257, 0.5982255767281885, 0.6064772943030274, 0.641796870858977, 0.5625580596207618, 0.4024176418766817, 0.5646488254069216, 0.5803300611004657, 0.6054487459605079, 0.39705800608168873, 0.6481802283924225, 0.6511188758003585, 0.5391900380698975, 0.5814531439531853, 0.6471624238647458, 0.49907443129699103, 0.5519686262012745, 
0.5868227143748256, 0.6935372558308686, 0.6449801314172242, 0.49368417061492087, 0.6030356697914756, 0.655457963578686, 0.6140634241247342, 0.7083376470121734, 0.5302490023499564, 0.5004156026909182, 0.5294756142889654, 0.5752491194188754, 0.5515118322540393, 0.6266358404603671, 0.5721215724933211, 0.5709573199551158, 0.6895899657142461, 0.6884385663549084, 0.5552502177229434, 0.5993480716206604, 0.511304209248702, 0.7221669262906404, 0.5022257278527431, 0.5905228187869876, 0.5923767227511361, 0.657322623740696, 0.5205411847873718, 0.6163160251297007, 0.4961048543439325, 0.45154069182902334, 0.5853784934736037, 0.47034049733531025, 0.7105284045363544, 0.5979161016416713, 0.49215242740699805, 0.5667098695380436, 0.5943849976101567, 0.7154269400608548, 0.583378099295273, 0.5835240865444338, 0.6486352085980603, 0.6384153780423892, 0.5384197352251137, 0.5448564471991062, 0.6488850948552147, 0.5294210616757474, 0.517439144119657, 0.5270373136055498, 0.5616827422722539, 0.5868951702028061, 0.5565920647445286, 0.6232391803921958, 0.6020491794954504, 0.46992208568892646, 0.6486807312726811, 0.5613844933224755, 0.555391749084397, 0.5772819796113084, 0.45810557230990834, 0.6398638599377786, 0.5366702136482702, 0.5739951459427881, 0.5982143252184688, 0.5457185094619164, 0.5665062960826616, 0.7118753111945728, 0.5854817189813198, 0.5800307675409715, 0.5544727708631073, 0.3338893382006267, 0.5091045621758792, 0.46165716062730827, 0.6030307388526068, 0.6080089650119052, 0.5194952380136443, 0.6433003128528647, 0.6914219001844641, 0.6480064308291791, 0.7222649910184369, 0.5560988035902191, 0.5104647434087874, 0.679415076191899, 0.5328008354866952, 0.5899761539786094, 0.5390000973833181, 0.3701084043163154, 0.6339217411098885, 0.593559378938782, 0.5144669461423722, 0.5881084008662535, 0.551622833665805, 0.571227894472472, 0.6411525062691923, 0.7151859458336626, 0.6522025417511709, 0.6322082840004087, 0.6860187619791485, 0.626165217142063, 0.5916004662153633, 0.6487841124590062, 
0.5309168041739798, 0.48527289381010597, 0.6020822653764765, 0.6807790880919588, 0.5898577639409405, 0.6344649343044279, 0.6173375657307242, 0.5890823347628427, 0.6726218694450057, 0.5859075938014776, 0.5815875066102324, 0.5758172547583169, 0.48712524314796934, 0.5377782629361193, 0.3938382744806918, 0.676308704853732, 0.6677226491785414, 0.5223299611243529, 0.5495415131548628, 0.6108127620421834, 0.4988800373920646, 0.5072853471520158, 0.5324874634673668, 0.6421144110599337, 0.5641352537195086, 0.4815342678555669, 0.5920600911108078, 0.500374587993995, 0.5367396038113736, 0.5451746772498243, 0.6622860029957304, 0.5333077496098986, 0.5155271033413926, 0.607286350062922, 0.5020017637448351, 0.516500845429085, 0.629443864762706, 0.7105638158452252, 0.5329168640353985, 0.5608956952115493, 0.6469797564951193, 0.5398440588271034, 0.5556897712098293, 0.6163183363303857, 0.610403496455368, 0.5996230941159092, 0.6347018234826138, 0.6549477032938691, 0.6329173885830707, 0.4950253371618349, 0.5230316699129772, 0.5882612702166155, 0.5079731724738321, 0.6786573152179876, 0.46707104070019356, 0.5800420961252108, 0.6110073061061204, 0.7033818294980708, 0.6022168812302134, 0.5288295241084625, 0.5626981078946942, 0.543953278377877, 0.6310410566957821, 0.5937899619372705, 0.6103545565256665, 0.660002389861427, 0.6041752509423339, 0.5308771948309202, 0.572450204884434, 0.5689950110928802, 0.48266978637745767, 0.5245673168231065, 0.6406084034155108, 0.52956038481703, 0.4625944334907985, 0.5517903459951017, 0.5861719638693285, 0.5376046382766501, 0.5671392388053157, 0.5736645873541928, 0.46055001581443344, 0.5001459012929047, 0.7395769297862513]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# here, we load the regularization we already calculated for simplicity\\n\",\n    \"data = request.urlopen(\\\"https://nlpbp.blob.core.windows.net/data/regular.json\\\").read()\\n\",\n    \"regularization_bert = json.loads(data)\\n\",\n    \"print(regularization_bert)\"\n   ]\n  },\n  {\n   
\"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"interpreter_bert = Interpreter(\\n\",\n    \"    x=x_bert, Phi=Phi_bert, regularization=regularization_bert, words=words\\n\",\n    \").to(device)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.3 Train the Interpreter\\n\",\n    \"\\n\",\n    \"Then, we need to train our interpreter (by minimizing the loss [here](#0.2-Perturbation-based-Approximation)) to let it find the information loss in each input word ${\\\\bf x}_i$ when it reaches hidden state $\\\\bf s$. You can control the number of iterations and the learning rate when training.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 5000/5000 [00:58<00:00, 85.92it/s]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"interpreter_bert.optimize(iteration=5000, lr=0.01, show_progress=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.4 Show and visualize the results\\n\",\n    \"\\n\",\n    \"After training, we can show the sigma (directly speaking, it is the range that every word can change without changing $\\\\bf s$ too much) we have got. 
Sigma somewhat stands for the information loss of word ${\\\\bf x}_i$ when it reaches $\\\\bf s$.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"array([0.17860198, 0.14068164, 0.15262878, 0.22471362, 0.20457381,\\n\",\n       \"       0.21281476, 0.18869533, 0.13970219, 0.25510186, 0.22200805,\\n\",\n       \"       0.24051382, 0.1302286 , 0.2824908 , 0.36167043], dtype=float32)\"\n      ]\n     },\n     \"execution_count\": 14,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"sigma_bert = interpreter_bert.get_sigma()\\n\",\n    \"sigma_bert\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"image/png\": \"iVBORw0KGgoAAAANSUhEUgAAAacAAAB4CAYAAABW8D3WAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAATt0lEQVR4nO3de7gdVXnH8e8vF0IAyYUgAhESeFAbLGAJIAoSWiqohWhBqIJNvFFslYpFHy0WI4it0Kr1WtSnDSoWRCsFtFwEAoJGTCB3iCQkCKJoEAOYACF5+8daO2fOzuyz9z5nn32Gk9/nefZz1p5ZM++aNWvmncsOKCIwMzOrkhFD3QAzM7N6Tk5mZlY5Tk5mZlY5Tk5mZlY5Tk5mZlY5Tk5mZlY5o9qpPHKncTFq/B6D1ZZSmzZs7Go8AEa11S0dMWHi2K7HfHZT9/8ZwYiR6nrMZ5/Z3PWYzzy5oesx99p7p67H3LCp+8fK6JFbuh7ztw8/0fWY+03duesxx+3Q3ZgPrv0F69Y9VnpSaGtkjRq/B3ue+bnOtKpFj959X1fjATBhYtdDHn/6QV2P+cijm7oec+zOo7se86E1j3c95up5i7oe8+yLXtH1mAsemdD1mJPHdf+C9cv/eGPXY3567iu7HvO1k4/oarxXHzGj4Tw/1jMzs8pxcjIzs8pxcjIzs8pxcjIzs8pxcjIzs8pxcjIzs8pxcjIzs8pxcjIzs8pxcjIzs8pxcjIzs8pxcjIzs8pxcjIzs8ppmpwknSlpgaQFmzes70abzMxsO9c0OUXEVyJiekRMH7nTuG60yczMtnN+rGdmZpXj5GRmZpXj5GRmZpXj5GRmZpXj5GRmZpXj5GRmZpXj5GRmZpXj5GRmZpXj5GRmZpXj5GRmZpXj5GRmZpXj5GRmZpXj5GRmZpWjiGi9svRb4MF+xJkErOvHcgPhmI7pmI7pmNWOuW9E7F42o63k1F+SFkTE9EEP5JiO6ZiO6ZjDIqYf65mZWeU4OZmZWeV0Kzl9pUtxHNMxHdMxHXMYxOzKOyczM7N2+LGemZlVjpNTiyRNkbSsZPrXJE1rYfkZkq
7rVNzhQNJ4SX+by/3qn6qRtFbSpJLpcyWdMhRtqmvH1j6vAkmzJX1hkGP8OP+dIumtgxmrEyTNkXRuyfRhey4o06/klDtpo6RF+fuLJF0habWkFZJ+IOklfZzQXynpp5IWSbpX0pw8/TRJq9o5SSlpuh2SRraxiS2LiHdFxIpuxasySaPaXGQ8UJkTZVW1OsZbtN31eUS8KhenAJVITv04Vrqq5Bx/nqTlkpbk8/YRefo8SSvztEWSvpOnz5H0yzxtmaST8vRzJP2ipQuSiGj7Q9rJy3JZwE+AswrzDwGOLtarW34lcHAujwSmFebNAK5rIf69wJeAe4D/AhYAy4GPF+qtBc4H7gD+CtgfuB5YCPwIeFmb23wfcBmwBPgOsBMwD5ie6zwFXAD8FDgKOCEvcwfwuWbb1WRbv5q370ZgLPBu4GfAYuC7wE65/puBZXn67W1u29fyspcDxwF3AvcDhwMTgavzts8HDsrLziG9DL0R+Fben5fkti0B/qaPuFcAG4FFuf683K/35TbU3omen+cvy7Fq0+cBnwLuAn4OHF1Y9xl5+iLg0tyup4CLct/MB/bIdfcFbs7tvRnYJ0+fC5xSWOdT+e8I0thbDlwH/KBWjzTmPp7/bszb8o28rs8BPwYeKNTfJce8G1gKzGwwxvfN7f8Uafz+MO+XeXl9J7W4r4t9fkn+LMuxTxvA+aDZ+Dk8b/s9+e9L87KzgS/k8htI55JJwO6kcf2z/Hl1f9pWt9/mA08AT+dyo7buDPxnjntPYZ/MJh0D1wJrgPcCH8h15gMTC+e/+Xk8fQ+YUBivnwRuA/4BOJF0rrgn78/aeJyTx8wtuU3vLjnvtnycDWCf1mIdmffLmPx9ErBXYZumlyw/Bzg3l/+I9A90R9Tv8z7b0IGG/ykNToI0Tk6PAy9ssMwMWktOW4BX5u+1QTEyd1btxLkW+FBhuZuBA3L5COCWNrc5yAdJHrzn0js5BXBqLu8IPAQcQErg3262XX3EfQ44JH//NunEu1uhzieA9+XyUmDvXB7fZow/Jp14F+btEzCTdEB+HvhYYZ8vKgzChcDY/P1M4KO5PIZ00TC1hXE0A1gPTM5t+AlwVHH/5vI3gBMLB8a/5fLrgR8WDoZrgdH5+5eAv877p7bsxYV2XgvMyuV3AFfn8lzKk9MppIQ0AngRaTwXk9NFpAuwD5JO2BPzuq7Ky0wDVuX6o4BdCwf9qtzvUyiM8cL4el0uf490QTAaOLi2P9o8dk8GbiIdN3sAvwD2HMAY7Wv87AqMyvWPA76by7OBLwBvIl0w1k7k3yrs/32Ae9ttV8l+m0E6BzRr6yeBM2rHEOnCZ+fc1lXAC0jJcz35ohz4DPD+XF4CHJPLFwCfLYzXLxXaNYGeC6130TOW55AuoMbmMfEQsFfdvmv5OOtnnxVj/SVwbYN682iSnPL3X5PP+bSYnDpxa/ly0g5ux2eAlZLmke5kLouIp9tcx4MRMT+XT5V0JulA35N08C/J864EkLQL8CrgKkm1dYxpM+ZDEXFnLn8TOLtu/mbS1R7Ay4A1EXF/jv9N0oDqjzURsSiXF5IGzsslfYJ08OwC3JDn3wnMlfRt4H/ajLE0t3U5cHNEhKSlOd6+pJMZEXGLpN0kjcvLXhMRG3P5tcBBhfcr40gJek0LbbgrIh7ObViU494BHCvpQ6Q71YmkO5Zr8zK1baz1C8CfAYcCP8v7eizwG+BZ0p1Orf6f5/KRpAMQUvK7uEk7jwKuiogtwK8l3Vo3fyPpDvB24PiI+F1ux9V5mRWS9sh1BXxS0mtIyWhvUqKA3mOc3P7rc3kp8ExEbCrso3YdBfx3RGwGHpV0G3AYcE0/1tVs/IwDLpN0ACnJji4seywwHXhtRDyRpx0HTCscq7tKekFEPNmPtrXb1snASYX3PjuSEiTArbkNT0paT884XEoa9+NIF4W35emXkS5Kaq4slCcDV0raE9iB3sfI/+ZjamMeX4eT7nZrBnKcte
tG4HxJPyfd4V1Z2D6AyyXVjv+bIuKDxYXzI8AtwG/bCTokzz0j4gJJl5M6+K3AW0hXNe34A4CkqaQ7mMMi4nFJc0mDqVc90lXS7yPikIE0vcn3p/OB3mh+fz1TKG8mnWznAm+MiMWSZpP7LyLOyoPhDcAiSYdExGNtxthS+L6FNE6eK1mmtn1/KEwT6S7uhpL67bRhMzBK0o6kO5/pEfFQfj+5Y8kym+kZzyJd8HykuHJJ50a+dKurX69W5znye1mls+QOhfX3ZXNeR32M4vbV1nE66Sr80Jxo1tKzfcV+BdhUaP/WfRQRW/r5DqPZdrSj2fi5kHRif5OkKaQr7poHgP2Al5DuACD1+5GFi55OatbWzcDJEbGyuFA+rpot20xxn34e+HREXCNpBuluo6bZuWYgx1lbIuIpSYeSXtUcS0qoH46IubnK6RGxoGTRcySdATxJemTc1vmwEy9Zl5OuUtsSEasj4sukq9yDJe3Wz/i7knb4+nw1+roG8Z4A1kh6M2x9yXxwm7H2kXRkLr+FdFXfyH3AVEn7F+p30guAX0kaTTrBASBp/4j4aUScT3rO++IOxbu9FicfSOsKV7lFNwDvye0i/zBm5wbrfDJvR19qJ+p1+e63lV+83QycIumFuQ0TJe3bR/0fk95JQtrG2n5dS8/YnknP1f4dwMmSRuQxN6NufbcDp5KuZpE0sY/Y44Df5MR0LOkOdTAV+/x24DRJIyXtDryG9J5uMIwDfpnLs+vmPUi6c/26pAPztBtJ73QAkDSQi8qaJ0mP55q5AXhfviBB0itaDRAR64HHJR2dJ72N9I6pTLFPZtXNmylpx3xenEF6t1TfxlaPswGLiM0RMS8iPkbaLye3sNhnIuKQiDg6In7UbsxOJKdbgDGS3l2bIOkwScc0WkDSG2o7nnQruhn4fX+CR8Ri0gvF5aRnx3f2Uf104J2SFuf6M9sMdy8wS9IS0uOlL/fRrqdJj/G+L+kO+vdfc+/LP5Fept5ESoQ1l0haqvQrydtJz647YQ4wPW/7v7DtwVTzNWAFcHduw6U0uKLMd3R35nqXNKjze9KPQZaS3gfUH6Rly6wAPgrcmNt7E+lxbyNnA2/Pdd8G/H2e/lXgGEl3kd5R1q56vws8THqhfilpP6wvrG8l6b3TpaTHVZ/uI/blpH5dQBqf9/VRd8Dq+vxI0uPvxaTj+EMR8etBCn0x8M+S7iS946pv10rS9l+VL+jOJo83SSuAszrQhiWkc83+ks7po96FpAuRJbmfLmwzzizScbiE9OOICxrUm0Pa3h+x7X/R+y7g+6QfVlwYEY/UzS89zpR+Kb1Xm+3tk6SX5sexNYfQ+fPZtnHbvNNKC6Xb8usi4uX5+17AZ0lXmU+TrjjfD2wi/drk0cLi55Cy7p8AG0iPTs6r3Z7mq/JzI+Iv+rE9Zl0haZf8uGM30onk1YN4YjfrquI5Pj/S+zzp/fZzpB+FnBkR65R+N7An6T0rpCcqx+XH709FxL+WrHs26TH9e+vnFXXknVPO6qc2mD26ZNpVJdPMnk+ukzSe9B7qQicmG64iYiHpx2Rl82Y0mD5noHH7+1hvMzBO+R9odYqk00gvvx/v5HrNOi0iZuTn6dMKL4bNhovBOsefA3yE9O/N+q7bn8d6ZmZmg8n/bT0zM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6scJyczM6ucUe1UHjnxgIjnNpTMEahsibLpKi1u/dKoflndXsWSBqj59KarLZ2v1jerpI2NYpY0bwDLaduqpRsFalBBDerXZvbVR9v2gcqqbfOlbJ2Nd2OUNatU+VDbdvnGdRutN7ap0LhuyTS10Iayeaqf1vp66vuo52tJfzZYuJ1t7D2/YfAWli
2v1dI+b7L28rl9N66d9feu1UfdbbqntfW2vP76us1ODqVzm/ZW33MLaeHuhYtuiIgTypZoKznFpg2MOewsGJFvuGqjQgLlaSNUmD6ipzyiZJpGFKY3WEaFWFunl0xrc12qHWRbq6qwuAqbptJpPavqWc+IRnULMXqaXVZWoamt1VVd3RF9zq/fhro+kArL915um/gj6rexp+0988uXK9uWnr6Pwi5LJ0sVy8Vh0HB+KtceC0jRa/1by5TEUu/1ls6nJFajuIVy7YBs2O7C/LK6xfW3E2vrNtDfWCpM7336EyqsvzAmaqdJlZd72tpz+m1Yri3TYF3FmMVyMUatxWowbWu0XtunresqXsCpbl3pQq2nbs/k+i3IJZXE7bUthbh1V0f10+rjqtc2qEEbi20pWW+h/c3WW+y3RtvYV7vHjho/iQb8WM/MzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCrHycnMzCpHEdF6Zel6oOH/VreLJgHrhroRw5j7d3C5fweP+3Zwdbp/10XECWUz2kpOVSFpQURMH+p2DFfu38Hl/h087tvB1c3+9WM9MzOrHCcnMzOrnOdrcvrKUDdgmHP/Di737+Bx3w6urvXv8/Kdk5mZDW/P1zsnMzMbxiqXnCSdIGmlpFWSPlwy/zWS7pb0nKRT6uZtlrQof67pXqufP1ro3w9IWiFpiaSbJe1bmDdL0v35M6u7La++Afatx24TLfTvWZKW5j68Q9K0wryP5OVWSjq+uy2vvv72raQpkjYWxu5/dKxREVGZDzASWA3sB+wALAam1dWZAhwEfB04pW7eU0O9DVX+tNi/xwI75fJ7gCtzeSLwQP47IZcnDPU2VeUzkL7N3z12B96/uxbKJwHX5/K0XH8MMDWvZ+RQb1NVPgPs2ynAssFoV9XunA4HVkXEAxHxLHAFMLNYISLWRsQSYMtQNPB5rpX+vTUiNuSv84HJuXw8cFNE/C4iHgduAkr/8dx2aiB9a8210r9PFL7uDNReqM8EroiIZyJiDbAqr8+SgfTtoKlactobeKjw/eE8rVU7Slogab6kN3a2acNCu/37TuD/+rns9mYgfQseu8201L+S/k7SauBi4Ox2lt2ODaRvAaZKukfSbZKO7lSjRnVqRR2ikmntZOh9IuIRSfsBt0haGhGrO9S24aDl/pV0BjAdOKbdZbdTA+lb8NhtpqX+jYgvAl+U9Fbgo8CsVpfdjg2kb39FGruPSToUuFrSgXV3Wv1StTunh4EXF75PBh5pdeGIeCT/fQCYB7yik40bBlrqX0nHAecBJ0XEM+0sux0bSN967DbX7vi7AqjdgXrs9q3ffZsflT6WywtJ765e0pFWDfXLuLqXbqNIL9qn0vNi7sAGdedS+EEE6SX9mFyeBNxP3Uu97f3TSv+SToqrgQPqpk8E1uR+npDLE4d6m6ryGWDfeux2pn8PKJRPBBbk8oH0/kHEA/gHEZ3q291rfUn6QcUvO3VeGPKOKemo1wM/zwfxeXnaBaQrTYDDSJn+D8BjwPI8/VXA0tyxS4F3DvW2VPHTQv/+EHgUWJQ/1xSWfQfpZfIq4O1DvS1V+/S3bz12O9a//w4sz317a/EES7pbXQ2sBF431NtStU9/+xY4OU9fDNwNnNipNvm/EGFmZpVTtXdOZmZmTk5mZlY9Tk5mZlY5Tk5mZlY5Tk5mZlY5Tk5mZlY5Tk5mZlY5Tk5mZlY5/w+/x818iK0BrQAAAABJRU5ErkJggg==\\n\",\n      \"text/plain\": [\n       \"<Figure size 432x288 with 2 Axes>\"\n      ]\n     },\n     \"metadata\": {\n      \"needs_background\": 
\"light\"\n     },\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"interpreter_bert.visualize()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We can see that the word 'rare', 'bird', 'charm', 'memorable' is important to the third layer.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": [\n        0.003160591935738921,\n        0.0015862112632021308,\n        0.00629778765141964,\n        0.0015863593434914947,\n        0.003082597628235817\n       ],\n       \"encoder\": \"json\",\n       \"name\": \"sigma_numbers\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"sigma_numbers\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": [\n        0.17860198020935059,\n        0.14068163931369781,\n        0.15262877941131592,\n        0.22471362352371216,\n        0.20457381010055542,\n        0.21281476318836212,\n        0.18869532644748688,\n        0.1397021859884262,\n        0.25510185956954956,\n        0.22200804948806763,\n        0.24051381647586823,\n        0.13022859394550323,\n        0.2824907898902893,\n        0.36167043447494507\n       ],\n       \"encoder\": \"json\",\n       \"name\": \"sigma_bert\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"sigma_bert\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"# for testing\\n\",\n    \"sb.glue(\\\"sigma_numbers\\\", list(sigma_numbers))\\n\",\n    \"sb.glue(\\\"sigma_bert\\\", 
list(sigma_bert))\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.5\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/named_entity_recognition/README.md",
    "content": "# Named Entity Recognition (NER)\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for building Named\nEntity Recognition models. We use the\nutility scripts in the [utils_nlp](../../utils_nlp) folder to speed up data preprocessing and model building for NER.  \nThe models can be used in a wide variety of applications, such as\ninformation extraction and filtering. NER also plays an important role in other NLP tasks like\nquestion answering and text summarization.  \nCurrently, we focus on fine-tuning a pre-trained BERT\nmodel. We plan to continue adding state-of-the-art models as they come up and welcome community\ncontributions.\n\n## What is Named Entity Recognition (NER)\n\nNamed Entity Recognition (NER) is the task of detecting and classifying real-world objects mentioned\nin text. Common named entities include person names, locations, organizations, etc. The\n[state-of-the-art](https://paperswithcode.com/task/named-entity-recognition-ner) NER methods include\ncombining Long Short-Term Memory neural network with Conditional Random Field (LSTM-CRF) and\npretrained language models like BERT.\n\nNER usually involves assigning an entity label to each word in a sentence as shown in the figure below.   \n<p align=\"center\">\n  <img src=\"https://nlpbp.blob.core.windows.net/images/ner.PNG\" alt=\" Fine-tuned BERT for NER tasks\"/>\n</p>\n\n* O:  Not an entity\n* I-LOC: Location\n* I-ORG: Organization\n* I-PER: Person\n\nThere are a few standard labeling schemes and you can find the details\n[here](http://cs229.stanford.edu/proj2005/KrishnanGanapathy-NamedEntityRecognition.pdf). 
The data\ncan also be labeled with custom entities as required by the use case.\n\n## Summary\n\n|Notebook|Environment|Description|Dataset|Language| \n|---|:---:|---|---|---|\n|[BERT](ner_wikigold_transformer.ipynb)|Local| Fine-tune a pretrained BERT model for token classification.|[wikigold](https://www.aclweb.org/anthology/W09-3302)| English | \n"
  },
  {
    "path": "examples/named_entity_recognition/ner_wikigold_transformer.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"*Copyright (c) Microsoft Corporation. All rights reserved.*  \\n\",\n    \"*Licensed under the MIT License.*\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Named Entity Recognition Using Transformer Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Summary\\n\",\n    \"\\n\",\n    \"This notebook demonstrates how to fine tune a [pretrained Transformer model](https://github.com/huggingface/transformers) for the named entity recognition (NER) task. Utility functions and classes in the NLP Best Practices repo are used to facilitate data preprocessing, model training, model scoring, and model evaluation. \\n\",\n    \"\\n\",\n    \"The pretrained transformer of [BERT (Bidirectional Transformers for Language Understanding)](https://arxiv.org/pdf/1810.04805.pdf) architecture is used in this notebook. [BERT](https://arxiv.org/pdf/1810.04805.pdf) is a powerful pre-trained language model that can be used for multiple NLP tasks, including text classification, question answering, named entity recognition, etc. It's able to achieve state-of-the-art performance with only a few epochs of fine tuning on task specific datasets.\\n\",\n    \"\\n\",\n    \"The figure below illustrates how BERT can be fine tuned for NER tasks. The input data is a list of tokens representing a sentence. In the training data, each token has an entity label. After fine tuning, the model predicts an entity label for each token in a given testing sentence. 
\\n\",\n    \"\\n\",\n    \"<img src=\\\"https://nlpbp.blob.core.windows.net/images/bert_architecture.png\\\">\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"import random\\n\",\n    \"import string\\n\",\n    \"import sys\\n\",\n    \"from tempfile import TemporaryDirectory\\n\",\n    \"\\n\",\n    \"import pandas as pd\\n\",\n    \"import scrapbook as sb\\n\",\n    \"import torch\\n\",\n    \"from seqeval.metrics import classification_report\\n\",\n    \"from sklearn.model_selection import train_test_split\\n\",\n    \"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\\n\",\n    \"from utils_nlp.common.timer import Timer\\n\",\n    \"from utils_nlp.dataset import wikigold\\n\",\n    \"from utils_nlp.dataset.ner_utils import read_conll_file\\n\",\n    \"from utils_nlp.dataset.url_utils import maybe_download\\n\",\n    \"from utils_nlp.models.transformers.named_entity_recognition import (\\n\",\n    \"    TokenClassificationProcessor, TokenClassifier)\\n\",\n    \"from utils_nlp.models.transformers.named_entity_recognition import supported_models as SUPPORTED_MODELS\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Configuration\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The running time shown in this notebook is on a Standard_NC12 Azure Virtual Machine with 2 NVIDIA Tesla K80 GPUs. \\n\",\n    \"> **Tip**: If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \\n\",\n    \"\\n\",\n    \"The table below provides some reference running time on different machine configurations.  
\\n\",\n    \"\\n\",\n    \"|QUICK_RUN|Machine Configurations|Running time|\\n\",\n    \"|:---------|:----------------------|:------------|\\n\",\n    \"|True|4 CPUs, 14GB memory| ~ 2 minutes|\\n\",\n    \"|False|4 CPUs, 14GB memory| ~1.5 hours|\\n\",\n    \"|True|1 NVIDIA Tesla K80 GPU, 12GB GPU memory| ~ 1 minute|\\n\",\n    \"|False|1 NVIDIA Tesla K80 GPU, 12GB GPU memory| ~ 7 minutes |\\n\",\n    \"\\n\",\n    \"If you run into a CUDA out-of-memory error or the jupyter kernel dies constantly, try reducing the `BATCH_SIZE` and `MAX_SEQ_LENGTH`, but note that model performance will be compromised. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\\n\",\n    \"QUICK_RUN = False\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# Wikigold dataset\\n\",\n    \"DATA_URL = (\\n\",\n    \"    \\\"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets\\\"\\n\",\n    \"    \\\"/master/data/wikigold/CONLL-format/data/wikigold.conll.txt\\\"\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"# fraction of the dataset used for testing\\n\",\n    \"TEST_DATA_FRACTION = 0.3\\n\",\n    \"\\n\",\n    \"# sub-sampling ratio\\n\",\n    \"SAMPLE_RATIO = 1\\n\",\n    \"\\n\",\n    \"# the data path used to save the downloaded data file\\n\",\n    \"DATA_PATH = TemporaryDirectory().name\\n\",\n    \"\\n\",\n    \"# the cache data path during fine tuning\\n\",\n    \"CACHE_DIR = TemporaryDirectory().name\\n\",\n    \"\\n\",\n    \"# set random seeds\\n\",\n    \"RANDOM_SEED = 100\\n\",\n    \"torch.manual_seed(RANDOM_SEED)\\n\",\n    \"\\n\",\n    \"# model configurations\\n\",\n    \"NUM_TRAIN_EPOCHS = 5\\n\",\n    \"MODEL_NAME = 
\\\"distilbert-base-cased\\\"\\n\",\n    \"DO_LOWER_CASE = False\\n\",\n    \"MAX_SEQ_LENGTH = 200\\n\",\n    \"TRAILING_PIECE_TAG = \\\"X\\\"\\n\",\n    \"NUM_GPUS = None  # uses all if available\\n\",\n    \"BATCH_SIZE = 16\\n\",\n    \"\\n\",\n    \"# update variables for quick run option\\n\",\n    \"if QUICK_RUN:\\n\",\n    \"    SAMPLE_RATIO = 0.1\\n\",\n    \"    NUM_TRAIN_EPOCHS = 1\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Models that can be used for token classification task\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>supported models</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>albert-base-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>albert-base-v2</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>albert-large-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>albert-large-v2</td>\\n\",\n       \"    
</tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>albert-xlarge-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>...</th>\\n\",\n       \"      <td>...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>65</th>\\n\",\n       \"      <td>xlm-roberta-large-finetuned-conll02-spanish</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>66</th>\\n\",\n       \"      <td>xlm-roberta-large-finetuned-conll03-english</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>67</th>\\n\",\n       \"      <td>xlm-roberta-large-finetuned-conll03-german</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>68</th>\\n\",\n       \"      <td>xlnet-base-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>69</th>\\n\",\n       \"      <td>xlnet-large-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"<p>70 rows × 1 columns</p>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"                               supported models\\n\",\n       \"0                                albert-base-v1\\n\",\n       \"1                                albert-base-v2\\n\",\n       \"2                               albert-large-v1\\n\",\n       \"3                               albert-large-v2\\n\",\n       \"4                              albert-xlarge-v1\\n\",\n       \"..                                          
...\\n\",\n       \"65  xlm-roberta-large-finetuned-conll02-spanish\\n\",\n       \"66  xlm-roberta-large-finetuned-conll03-english\\n\",\n       \"67   xlm-roberta-large-finetuned-conll03-german\\n\",\n       \"68                             xlnet-base-cased\\n\",\n       \"69                            xlnet-large-cased\\n\",\n       \"\\n\",\n       \"[70 rows x 1 columns]\"\n      ]\n     },\n     \"execution_count\": 4,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"pd.DataFrame({\\\"supported models\\\": SUPPORTED_MODELS})\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Get Training & Testing Dataset\\n\",\n    \"\\n\",\n    \"The dataset used in this notebook is the [wikigold dataset](https://www.aclweb.org/anthology/W09-3302). The wikigold dataset consists of 145 manually labelled Wikipedia articles, including 1841 sentences and 40k tokens in total. The dataset can be directly downloaded from [here](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold). 
\\n\",\n    \"\\n\",\n    \"In the following cell, we download the data file, parse the tokens and labels, sample a given number of sentences, and split the dataset for training and testing.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 96.0/96.0 [00:00<00:00, 4.02kKB/s]\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Maximum sequence length is: 144\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# download data\\n\",\n    \"file_name = DATA_URL.split(\\\"/\\\")[-1]  # a name for the downloaded file\\n\",\n    \"maybe_download(DATA_URL, file_name, DATA_PATH)\\n\",\n    \"data_file = os.path.join(DATA_PATH, file_name)\\n\",\n    \"\\n\",\n    \"# parse CoNll file\\n\",\n    \"sentence_list, labels_list = read_conll_file(data_file, sep=\\\" \\\")\\n\",\n    \"\\n\",\n    \"# sub-sample (optional)\\n\",\n    \"random.seed(RANDOM_SEED)\\n\",\n    \"sample_size = int(SAMPLE_RATIO * len(sentence_list))\\n\",\n    \"sentence_list, labels_list = list(\\n\",\n    \"    zip(*random.sample(list(zip(sentence_list, labels_list)), k=sample_size))\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"# train-test split\\n\",\n    \"train_sentence_list, test_sentence_list, train_labels_list, test_labels_list = train_test_split(\\n\",\n    \"    sentence_list, labels_list, test_size=TEST_DATA_FRACTION, random_state=RANDOM_SEED\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The following is an example input sentence of the training set.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n  
   \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>sentence</th>\\n\",\n       \"      <th>labels</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>[The, origin, of, Agotes, (, or, Cagots, ), is...</td>\\n\",\n       \"      <td>[O, O, O, I-MISC, O, O, I-MISC, O, O, O, O]</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>[-DOCSTART-]</td>\\n\",\n       \"      <td>[O]</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>[It, provides, full, -, and, part-time, polyte...</td>\\n\",\n       \"      <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>[Since, she, was, the, daughter, of, the, grea...</td>\\n\",\n       \"      <td>[O, O, O, O, O, O, O, O, I-MISC, O, O, O, I-MI...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>[The, goals, were, two, posts, ,, with, no, cr...</td>\\n\",\n       \"      <td>[O, O, O, O, O, O, O, O, O, O]</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      
<th>5</th>\\n\",\n       \"      <td>[At, one, point, ,, so, many, orders, had, bee...</td>\\n\",\n       \"      <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>6</th>\\n\",\n       \"      <td>[Left, camp, in, July, 1972, ,, and, was, deal...</td>\\n\",\n       \"      <td>[O, O, O, O, O, O, O, O, O, O, O, I-ORG, I-ORG...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>7</th>\\n\",\n       \"      <td>[She, fled, again, to, Abra, ,, where, she, wa...</td>\\n\",\n       \"      <td>[O, O, O, O, I-LOC, O, O, O, O, O, O]</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>8</th>\\n\",\n       \"      <td>[As, the, younger, sibling, ,, Ben, was, const...</td>\\n\",\n       \"      <td>[O, O, O, O, O, I-PER, O, O, O, O, O, O, O, O,...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>9</th>\\n\",\n       \"      <td>[Milepost, 1, :, granite, masonry, arch, over,...</td>\\n\",\n       \"      <td>[O, O, O, O, O, O, O, I-LOC, I-LOC, O]</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"                                            sentence  \\\\\\n\",\n       \"0  [The, origin, of, Agotes, (, or, Cagots, ), is...   \\n\",\n       \"1                                       [-DOCSTART-]   \\n\",\n       \"2  [It, provides, full, -, and, part-time, polyte...   \\n\",\n       \"3  [Since, she, was, the, daughter, of, the, grea...   \\n\",\n       \"4  [The, goals, were, two, posts, ,, with, no, cr...   \\n\",\n       \"5  [At, one, point, ,, so, many, orders, had, bee...   \\n\",\n       \"6  [Left, camp, in, July, 1972, ,, and, was, deal...   \\n\",\n       \"7  [She, fled, again, to, Abra, ,, where, she, wa...   \\n\",\n       \"8  [As, the, younger, sibling, ,, Ben, was, const...   
\\n\",\n       \"9  [Milepost, 1, :, granite, masonry, arch, over,...   \\n\",\n       \"\\n\",\n       \"                                              labels  \\n\",\n       \"0        [O, O, O, I-MISC, O, O, I-MISC, O, O, O, O]  \\n\",\n       \"1                                                [O]  \\n\",\n       \"2  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  \\n\",\n       \"3  [O, O, O, O, O, O, O, O, I-MISC, O, O, O, I-MI...  \\n\",\n       \"4                     [O, O, O, O, O, O, O, O, O, O]  \\n\",\n       \"5  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  \\n\",\n       \"6  [O, O, O, O, O, O, O, O, O, O, O, I-ORG, I-ORG...  \\n\",\n       \"7              [O, O, O, O, I-LOC, O, O, O, O, O, O]  \\n\",\n       \"8  [O, O, O, O, O, I-PER, O, O, O, O, O, O, O, O,...  \\n\",\n       \"9             [O, O, O, O, O, O, O, I-LOC, I-LOC, O]  \"\n      ]\n     },\n     \"execution_count\": 7,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# Show example sentences from input\\n\",\n    \"pd.DataFrame({\\\"sentence\\\": sentence_list, \\\"labels\\\": labels_list}).head(10)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      
<th>token</th>\\n\",\n       \"      <th>label</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>In</td>\\n\",\n       \"      <td>O</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>1999</td>\\n\",\n       \"      <td>O</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>,</td>\\n\",\n       \"      <td>O</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>the</td>\\n\",\n       \"      <td>O</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>Caloi</td>\\n\",\n       \"      <td>I-PER</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>5</th>\\n\",\n       \"      <td>family</td>\\n\",\n       \"      <td>O</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>6</th>\\n\",\n       \"      <td>sold</td>\\n\",\n       \"      <td>O</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>7</th>\\n\",\n       \"      <td>the</td>\\n\",\n       \"      <td>O</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>8</th>\\n\",\n       \"      <td>majority</td>\\n\",\n       \"      <td>O</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>9</th>\\n\",\n       \"      <td>of</td>\\n\",\n       \"      <td>O</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>10</th>\\n\",\n       \"      <td>Caloi</td>\\n\",\n       \"      <td>I-ORG</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"       token  label\\n\",\n       \"0         In      
O\\n\",\n       \"1       1999      O\\n\",\n       \"2          ,      O\\n\",\n       \"3        the      O\\n\",\n       \"4      Caloi  I-PER\\n\",\n       \"5     family      O\\n\",\n       \"6       sold      O\\n\",\n       \"7        the      O\\n\",\n       \"8   majority      O\\n\",\n       \"9         of      O\\n\",\n       \"10     Caloi  I-ORG\"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# Show example tokens from input\\n\",\n    \"pd.DataFrame({\\\"token\\\": train_sentence_list[0], \\\"label\\\": train_labels_list[0]}).head(11)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"> If your data is unlabeled, try using an annotation tool to simplify the process of labeling. The example [here](../annotation/Doccano.md) introduces [Doccano](https://github.com/chakki-works/doccano) and shows how it can be used for NER annotation.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Create PyTorch Datasets and Dataloaders\\n\",\n    \"Given the tokenized input and corresponding labels, we use a custom processor to convert our input lists into a PyTorch dataset that can be used with our token classifier. 
Next, we create PyTorch dataloaders for training and testing.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/vnd.jupyter.widget-view+json\": {\n       \"model_id\": \"ea57217fe6394812af03defcdaffe4db\",\n       \"version_major\": 2,\n       \"version_minor\": 0\n      },\n      \"text/plain\": [\n       \"HBox(children=(IntProgress(value=0, description='Downloading', max=411, style=ProgressStyle(description_width=…\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"application/vnd.jupyter.widget-view+json\": {\n       \"model_id\": \"00884141779a4ddead34204d5ea01b41\",\n       \"version_major\": 2,\n       \"version_minor\": 0\n      },\n      \"text/plain\": [\n       \"HBox(children=(IntProgress(value=0, description='Downloading', max=213450, style=ProgressStyle(description_wid…\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:root:Token lists with length > 512 will be truncated\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:root:Token lists with length > 512 will be truncated\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"processor = TokenClassificationProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE, cache_dir=CACHE_DIR)\\n\",\n    \"\\n\",\n    \"label_map = TokenClassificationProcessor.create_label_map(\\n\",\n    \"    label_lists=labels_list, trailing_piece_tag=TRAILING_PIECE_TAG\\n\",\n    \")\\n\",\n 
   \"\\n\",\n    \"train_dataset = processor.preprocess(\\n\",\n    \"    text=train_sentence_list,\\n\",\n    \"    max_len=MAX_SEQ_LENGTH,\\n\",\n    \"    labels=train_labels_list,\\n\",\n    \"    label_map=label_map,\\n\",\n    \"    trailing_piece_tag=TRAILING_PIECE_TAG,\\n\",\n    \")\\n\",\n    \"train_dataloader = dataloader_from_dataset(\\n\",\n    \"    train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True, distributed=False\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"test_dataset = processor.preprocess(\\n\",\n    \"    text=test_sentence_list,\\n\",\n    \"    max_len=MAX_SEQ_LENGTH,\\n\",\n    \"    labels=test_labels_list,\\n\",\n    \"    label_map=label_map,\\n\",\n    \"    trailing_piece_tag=TRAILING_PIECE_TAG,\\n\",\n    \")\\n\",\n    \"test_dataloader = dataloader_from_dataset(\\n\",\n    \"    test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False, distributed=False\\n\",\n    \")\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Train Model\\n\",\n    \"\\n\",\n    \"There are two steps to train a NER model using pretrained transformer model: 1) Instantiate a TokenClassifier class which is a wrapper of a transformer-based network, and 2) Fit the model using the preprocessed training dataloader. 
The member method `fit` of TokenClassifier class is used to fine-tune the model.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/vnd.jupyter.widget-view+json\": {\n       \"model_id\": \"7cd3a9259b5c42638e8580f9fbae27db\",\n       \"version_major\": 2,\n       \"version_minor\": 0\n      },\n      \"text/plain\": [\n       \"HBox(children=(IntProgress(value=0, description='Downloading', max=263273408, style=ProgressStyle(description_…\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"Training time : 0.060 hrs\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Instantiate a TokenClassifier class for NER using pretrained transformer model\\n\",\n    \"model = TokenClassifier(\\n\",\n    \"    model_name=MODEL_NAME,\\n\",\n    \"    num_labels=len(label_map),\\n\",\n    \"    cache_dir=CACHE_DIR\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"# Fine tune the model using the training dataset\\n\",\n    \"with Timer() as t:\\n\",\n    \"    model.fit(\\n\",\n    \"        train_dataloader=train_dataloader,\\n\",\n    \"        num_epochs=NUM_TRAIN_EPOCHS,\\n\",\n    \"        num_gpus=NUM_GPUS,\\n\",\n    \"        local_rank=-1,\\n\",\n    \"        weight_decay=0.0,\\n\",\n    \"        learning_rate=5e-5,\\n\",\n    \"        adam_epsilon=1e-8,\\n\",\n    \"        warmup_steps=0,\\n\",\n    \"        verbose=False,\\n\",\n    \"        seed=RANDOM_SEED\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"print(\\\"Training time : {:.3f} hrs\\\".format(t.interval / 3600))\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Evaluate on Testing Dataset\\n\",\n    \"\\n\",\n    \"The `predict` method of the TokenClassifier returns a Numpy ndarray of raw 
predictions. The shape of the ndarray is \\\\[`number_of_examples`, `sequence_length`, `number_of_labels`\\\\]. Each value in the ndarray is not normalized. Post-process will be needed to get the probability for each class label. Function `get_predicted_token_labels` will process the raw prediction and output the predicted labels for each token.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Scoring: 100%|██████████| 35/35 [00:06<00:00,  6.14it/s]\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Prediction time : 0.002 hrs\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"with Timer() as t:\\n\",\n    \"    preds = model.predict(\\n\",\n    \"        test_dataloader=test_dataloader,\\n\",\n    \"        num_gpus=None,\\n\",\n    \"        verbose=True\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"print(\\\"Prediction time : {:.3f} hrs\\\".format(t.interval / 3600))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Get the true token labels of the testing dataset:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"true_labels = model.get_true_test_labels(label_map=label_map, dataset=test_dataset)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Get the predicted labels for each token by calling member method `get_predicted_token_labels`, and generate the classification report.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n 
    \"output_type\": \"stream\",\n     \"text\": [\n      \"           precision    recall  f1-score   support\\n\",\n      \"\\n\",\n      \"      ORG       0.72      0.76      0.74       274\\n\",\n      \"     MISC       0.67      0.73      0.70       221\\n\",\n      \"      LOC       0.79      0.84      0.81       317\\n\",\n      \"      PER       0.90      0.93      0.92       257\\n\",\n      \"\\n\",\n      \"micro avg       0.76      0.82      0.79      1069\\n\",\n      \"macro avg       0.77      0.82      0.79      1069\\n\",\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"predicted_labels = model.get_predicted_token_labels(\\n\",\n    \"    predictions=preds,\\n\",\n    \"    label_map=label_map,\\n\",\n    \"    dataset=test_dataset\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"report = classification_report(true_labels, \\n\",\n    \"              predicted_labels, \\n\",\n    \"              digits=2\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"print(report)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Score Example Sentences\\n\",\n    \"Finally, we test the model on some random input sentences.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:root:Token lists with length > 512 will be truncated\\n\",\n      \"Scoring: 100%|██████████| 1/1 [00:00<00:00, 25.31it/s]\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \" Is it true that Jane works at Microsoft?\\n\",\n      \"       tokens labels\\n\",\n      \"0          Is      O\\n\",\n      \"1          it      O\\n\",\n      \"2        true      O\\n\",\n      \"3        that      O\\n\",\n      \"4        Jane  I-PER\\n\",\n      \"5       works      O\\n\",\n      \"6          at      
O\\n\",\n      \"7  Microsoft?  I-ORG\\n\",\n      \"\\n\",\n      \" Joe now lives in Copenhagen.\\n\",\n      \"        tokens labels\\n\",\n      \"0          Joe  I-PER\\n\",\n      \"1          now      O\\n\",\n      \"2        lives      O\\n\",\n      \"3           in      O\\n\",\n      \"4  Copenhagen.  I-LOC\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# test\\n\",\n    \"sample_text = [    \\n\",\n    \"    \\\"Is it true that Jane works at Microsoft?\\\",\\n\",\n    \"    \\\"Joe now lives in Copenhagen.\\\"\\n\",\n    \"]\\n\",\n    \"sample_tokens = [x.split() for x in sample_text]\\n\",\n    \"\\n\",\n    \"sample_dataset = processor.preprocess(\\n\",\n    \"    text=sample_tokens,\\n\",\n    \"    max_len=MAX_SEQ_LENGTH,\\n\",\n    \"    labels=None,\\n\",\n    \"    label_map=label_map,\\n\",\n    \"    trailing_piece_tag=TRAILING_PIECE_TAG,\\n\",\n    \")\\n\",\n    \"sample_dataloader = dataloader_from_dataset(\\n\",\n    \"    sample_dataset, batch_size=BATCH_SIZE, num_gpus=None, shuffle=False, distributed=False\\n\",\n    \")\\n\",\n    \"preds = model.predict(\\n\",\n    \"        test_dataloader=sample_dataloader,\\n\",\n    \"        num_gpus=None,\\n\",\n    \"        verbose=True\\n\",\n    \")\\n\",\n    \"predicted_labels = model.get_predicted_token_labels(\\n\",\n    \"    predictions=preds,\\n\",\n    \"    label_map=label_map,\\n\",\n    \"    dataset=sample_dataset\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"for i in range(len(sample_text)):\\n\",\n    \"    print(\\\"\\\\n\\\", sample_text[i])\\n\",\n    \"    print(pd.DataFrame({\\\"tokens\\\": sample_tokens[i] , \\\"labels\\\":predicted_labels[i]}))  \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## For Testing\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": 
{},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": 0.77,\n       \"encoder\": \"json\",\n       \"name\": \"precision\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"precision\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": 0.82,\n       \"encoder\": \"json\",\n       \"name\": \"recall\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"recall\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": 0.79,\n       \"encoder\": \"json\",\n       \"name\": \"f1\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"f1\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"report_splits = report.split('\\\\n')[-2].split()\\n\",\n    \"\\n\",\n    \"sb.glue(\\\"precision\\\", float(report_splits[2]))\\n\",\n    \"sb.glue(\\\"recall\\\", float(report_splits[3]))\\n\",\n    \"sb.glue(\\\"f1\\\", float(report_splits[4]))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   
\"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "examples/question_answering/README.md",
    "content": "# Question Answering (QA)\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for building\nquestion answering models. These models can be used in a wide variety of applications, such as\nsearch engines and virtual assistants.\n\n\n## What is Question Answering?\n\nQuestion Answering is a classical NLP task which consists of determining the relevant \"answer\"\n(snippet of text out of a provided passage) that answers a user's \"question\". This task is a subset\nof Machine Comprehension, or measuring how well a machine comprehends a passage of text. The\nStanford Question Answering Dataset ([SQuAD](https://rajpurkar.github.io/SQuAD-explorer/))\nleaderboard displays the state-of-the-art models in this space. Traditional QA models are variants\nof Bidirectional Recurrent Neural Networks (BRNN).\n\n## Summary\n\n|Notebook|Environment|Description|Dataset | Language |\n|---|---|---|---|----|\n|[Deployed QA System in Under 20 minutes](question_answering_system_bidaf_quickstart.ipynb)|Azure Container Instances| Learn how to deploy a QA system in under 20 minutes using Azure Container Instances (ACI) and a popular AllenNLP pre-trained model called BiDAF.|[SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)| English |\n|[BiDAF Deep Dive](bidaf_aml_deep_dive.ipynb)|Azure ML| Learn about the architecture of the BiDAF model and how to train it from scratch using the AllenNLP library on the AzureML platform.|[SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) | English |\n|[Pretrained BERT SQuAD Deep Dive](pretrained-BERT-SQuAD-deep-dive-aml.ipynb)|Azure ML| Learn about the mechanism of the BERT model in an end-to-end pipeline on the AzureML platform and how to fine-tune it from scratch using distributed training with Horovod. Also shows the improvement in model performance from hyper-parameter tuning.|[SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)| English |\n\n"
  },
  {
    "path": "examples/question_answering/bert_run_squad_azureml.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Run BERT on SQuAD.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport argparse\nimport collections\nimport logging\nimport json\nimport math\nimport os\nimport random\nimport pickle\nimport socket\nfrom tqdm import tqdm, trange\n\nimport numpy as np\nimport torch\nfrom torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\nfrom torch.utils.data.distributed import DistributedSampler\nimport torch.multiprocessing as mp\n\nfrom pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer\nfrom pytorch_pretrained_bert.modeling import BertForQuestionAnswering\nfrom pytorch_pretrained_bert.optimization import BertAdam\nfrom pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE\n\nfrom azureml.core.run import Run\nfrom evaluate_squad import evaluate\nfrom azureml_bert_util import *\n\nlogging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', \n                    datefmt = '%m/%d/%Y %H:%M:%S',\n                    level = logging.INFO)\nlogger = logging.getLogger(__name__)\n\nclass SquadExample(object):\n    \"\"\"A single training/test example for the Squad dataset.\"\"\"\n\n    def __init__(self,\n                
 qas_id,\n                 question_text,\n                 doc_tokens,\n                 orig_answer_text=None,\n                 start_position=None,\n                 end_position=None):\n        self.qas_id = qas_id\n        self.question_text = question_text\n        self.doc_tokens = doc_tokens\n        self.orig_answer_text = orig_answer_text\n        self.start_position = start_position\n        self.end_position = end_position\n\n    def __str__(self):\n        return self.__repr__()\n\n    def __repr__(self):\n        s = \"\"\n        s += \"qas_id: %s\" % (self.qas_id)\n        s += \", question_text: %s\" % (\n            self.question_text)\n        s += \", doc_tokens: [%s]\" % (\" \".join(self.doc_tokens))\n        if self.start_position:\n            s += \", start_position: %d\" % (self.start_position)\n        if self.start_position:\n            s += \", end_position: %d\" % (self.end_position)\n        return s\n\n\nclass InputFeatures(object):\n    \"\"\"A single set of features of data.\"\"\"\n\n    def __init__(self,\n                 unique_id,\n                 example_index,\n                 doc_span_index,\n                 tokens,\n                 token_to_orig_map,\n                 token_is_max_context,\n                 input_ids,\n                 input_mask,\n                 segment_ids,\n                 start_position=None,\n                 end_position=None):\n        self.unique_id = unique_id\n        self.example_index = example_index\n        self.doc_span_index = doc_span_index\n        self.tokens = tokens\n        self.token_to_orig_map = token_to_orig_map\n        self.token_is_max_context = token_is_max_context\n        self.input_ids = input_ids\n        self.input_mask = input_mask\n        self.segment_ids = segment_ids\n        self.start_position = start_position\n        self.end_position = end_position\n\n\ndef read_squad_examples(input_file, is_training):\n    \"\"\"Read a SQuAD json file into a list of 
SquadExample.\"\"\"\n    with open(input_file, \"r\", encoding='utf-8') as reader:\n        input_data = json.load(reader)[\"data\"]\n\n    def is_whitespace(c):\n        if c == \" \" or c == \"\\t\" or c == \"\\r\" or c == \"\\n\" or ord(c) == 0x202F:\n            return True\n        return False\n\n    examples = []\n    for entry in input_data:\n        for paragraph in entry[\"paragraphs\"]:\n            paragraph_text = paragraph[\"context\"]\n            doc_tokens = []\n            char_to_word_offset = []\n            prev_is_whitespace = True\n            for c in paragraph_text:\n                if is_whitespace(c):\n                    prev_is_whitespace = True\n                else:\n                    if prev_is_whitespace:\n                        doc_tokens.append(c)\n                    else:\n                        doc_tokens[-1] += c\n                    prev_is_whitespace = False\n                char_to_word_offset.append(len(doc_tokens) - 1)\n\n            for qa in paragraph[\"qas\"]:\n                qas_id = qa[\"id\"]\n                question_text = qa[\"question\"]\n                start_position = None\n                end_position = None\n                orig_answer_text = None\n                if is_training:\n                    if len(qa[\"answers\"]) != 1:\n                        raise ValueError(\n                            \"For training, each question should have exactly 1 answer.\")\n                    answer = qa[\"answers\"][0]\n                    orig_answer_text = answer[\"text\"]\n                    answer_offset = answer[\"answer_start\"]\n                    answer_length = len(orig_answer_text)\n                    start_position = char_to_word_offset[answer_offset]\n                    end_position = char_to_word_offset[answer_offset + answer_length - 1]\n                    # Only add answers where the text can be exactly recovered from the\n                    # document. 
If this CAN'T happen it's likely due to weird Unicode\n                    # stuff so we will just skip the example.\n                    #\n                    # Note that this means for training mode, every example is NOT\n                    # guaranteed to be preserved.\n                    actual_text = \" \".join(doc_tokens[start_position:(end_position + 1)])\n                    cleaned_answer_text = \" \".join(\n                        whitespace_tokenize(orig_answer_text))\n                    if actual_text.find(cleaned_answer_text) == -1:\n                        logger.warning(\"Could not find answer: '%s' vs. '%s'\",\n                                           actual_text, cleaned_answer_text)\n                        continue\n\n                example = SquadExample(\n                    qas_id=qas_id,\n                    question_text=question_text,\n                    doc_tokens=doc_tokens,\n                    orig_answer_text=orig_answer_text,\n                    start_position=start_position,\n                    end_position=end_position)\n                examples.append(example)\n    return examples\n\n\ndef convert_examples_to_features(examples, tokenizer, max_seq_length,\n                                 doc_stride, max_query_length, is_training):\n    \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n\n    unique_id = 1000000000\n\n    features = []\n    for (example_index, example) in enumerate(examples):\n        query_tokens = tokenizer.tokenize(example.question_text)\n\n        if len(query_tokens) > max_query_length:\n            query_tokens = query_tokens[0:max_query_length]\n\n        tok_to_orig_index = []\n        orig_to_tok_index = []\n        all_doc_tokens = []\n        for (i, token) in enumerate(example.doc_tokens):\n            orig_to_tok_index.append(len(all_doc_tokens))\n            sub_tokens = tokenizer.tokenize(token)\n            for sub_token in sub_tokens:\n                
tok_to_orig_index.append(i)\n                all_doc_tokens.append(sub_token)\n\n        tok_start_position = None\n        tok_end_position = None\n        if is_training:\n            tok_start_position = orig_to_tok_index[example.start_position]\n            if example.end_position < len(example.doc_tokens) - 1:\n                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1\n            else:\n                tok_end_position = len(all_doc_tokens) - 1\n            (tok_start_position, tok_end_position) = _improve_answer_span(\n                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,\n                example.orig_answer_text)\n\n        # The -3 accounts for [CLS], [SEP] and [SEP]\n        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3\n\n        # We can have documents that are longer than the maximum sequence length.\n        # To deal with this we do a sliding window approach, where we take chunks\n        # of the up to our max length with a stride of `doc_stride`.\n        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name\n            \"DocSpan\", [\"start\", \"length\"])\n        doc_spans = []\n        start_offset = 0\n        while start_offset < len(all_doc_tokens):\n            length = len(all_doc_tokens) - start_offset\n            if length > max_tokens_for_doc:\n                length = max_tokens_for_doc\n            doc_spans.append(_DocSpan(start=start_offset, length=length))\n            if start_offset + length == len(all_doc_tokens):\n                break\n            start_offset += min(length, doc_stride)\n\n        for (doc_span_index, doc_span) in enumerate(doc_spans):\n            tokens = []\n            token_to_orig_map = {}\n            token_is_max_context = {}\n            segment_ids = []\n            tokens.append(\"[CLS]\")\n            segment_ids.append(0)\n            for token in query_tokens:\n                tokens.append(token)\n                
segment_ids.append(0)\n            tokens.append(\"[SEP]\")\n            segment_ids.append(0)\n\n            for i in range(doc_span.length):\n                split_token_index = doc_span.start + i\n                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]\n\n                is_max_context = _check_is_max_context(doc_spans, doc_span_index,\n                                                       split_token_index)\n                token_is_max_context[len(tokens)] = is_max_context\n                tokens.append(all_doc_tokens[split_token_index])\n                segment_ids.append(1)\n            tokens.append(\"[SEP]\")\n            segment_ids.append(1)\n\n            input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n            # The mask has 1 for real tokens and 0 for padding tokens. Only real\n            # tokens are attended to.\n            input_mask = [1] * len(input_ids)\n\n            # Zero-pad up to the sequence length.\n            while len(input_ids) < max_seq_length:\n                input_ids.append(0)\n                input_mask.append(0)\n                segment_ids.append(0)\n\n            assert len(input_ids) == max_seq_length\n            assert len(input_mask) == max_seq_length\n            assert len(segment_ids) == max_seq_length\n\n            start_position = None\n            end_position = None\n            if is_training:\n                # For training, if our document chunk does not contain an annotation\n                # we throw it out, since there is nothing to predict.\n                doc_start = doc_span.start\n                doc_end = doc_span.start + doc_span.length - 1\n                if (example.start_position < doc_start or\n                        example.end_position < doc_start or\n                        example.start_position > doc_end or example.end_position > doc_end):\n                    continue\n\n                doc_offset = len(query_tokens) + 2\n                
start_position = tok_start_position - doc_start + doc_offset\n                end_position = tok_end_position - doc_start + doc_offset\n\n            if example_index < 20:\n                logger.info(\"*** Example ***\")\n                logger.info(\"unique_id: %s\" % (unique_id))\n                logger.info(\"example_index: %s\" % (example_index))\n                logger.info(\"doc_span_index: %s\" % (doc_span_index))\n                logger.info(\"tokens: %s\" % \" \".join(tokens))\n                logger.info(\"token_to_orig_map: %s\" % \" \".join([\n                    \"%d:%d\" % (x, y) for (x, y) in token_to_orig_map.items()]))\n                logger.info(\"token_is_max_context: %s\" % \" \".join([\n                    \"%d:%s\" % (x, y) for (x, y) in token_is_max_context.items()\n                ]))\n                logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n                logger.info(\n                    \"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n                logger.info(\n                    \"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n                if is_training:\n                    answer_text = \" \".join(tokens[start_position:(end_position + 1)])\n                    logger.info(\"start_position: %d\" % (start_position))\n                    logger.info(\"end_position: %d\" % (end_position))\n                    logger.info(\n                        \"answer: %s\" % (answer_text))\n\n            features.append(\n                InputFeatures(\n                    unique_id=unique_id,\n                    example_index=example_index,\n                    doc_span_index=doc_span_index,\n                    tokens=tokens,\n                    token_to_orig_map=token_to_orig_map,\n                    token_is_max_context=token_is_max_context,\n                    input_ids=input_ids,\n                    input_mask=input_mask,\n                    
segment_ids=segment_ids,\n                    start_position=start_position,\n                    end_position=end_position))\n            unique_id += 1\n\n    return features\n\n\ndef _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,\n                         orig_answer_text):\n    \"\"\"Returns tokenized answer spans that better match the annotated answer.\"\"\"\n\n    # The SQuAD annotations are character based. We first project them to\n    # whitespace-tokenized words. But then after WordPiece tokenization, we can\n    # often find a \"better match\". For example:\n    #\n    #   Question: What year was John Smith born?\n    #   Context: The leader was John Smith (1895-1943).\n    #   Answer: 1895\n    #\n    # The original whitespace-tokenized answer will be \"(1895-1943).\". However\n    # after tokenization, our tokens will be \"( 1895 - 1943 ) .\". So we can match\n    # the exact answer, 1895.\n    #\n    # However, this is not always possible. Consider the following:\n    #\n    #   Question: What country is the top exporter of electronics?\n    #   Context: The Japanese electronics industry is the largest in the world.\n    #   Answer: Japan\n    #\n    # In this case, the annotator chose \"Japan\" as a character sub-span of\n    # the word \"Japanese\". Since our WordPiece tokenizer does not split\n    # \"Japanese\", we just use \"Japanese\" as the annotation. 
This is fairly rare\n    # in SQuAD, but does happen.\n    tok_answer_text = \" \".join(tokenizer.tokenize(orig_answer_text))\n\n    for new_start in range(input_start, input_end + 1):\n        for new_end in range(input_end, new_start - 1, -1):\n            text_span = \" \".join(doc_tokens[new_start:(new_end + 1)])\n            if text_span == tok_answer_text:\n                return (new_start, new_end)\n\n    return (input_start, input_end)\n\n\ndef _check_is_max_context(doc_spans, cur_span_index, position):\n    \"\"\"Check if this is the 'max context' doc span for the token.\"\"\"\n\n    # Because of the sliding window approach taken to scoring documents, a single\n    # token can appear in multiple documents. E.g.\n    #  Doc: the man went to the store and bought a gallon of milk\n    #  Span A: the man went to the\n    #  Span B: to the store and bought\n    #  Span C: and bought a gallon of\n    #  ...\n    #\n    # Now the word 'bought' will have two scores from spans B and C. 
We only\n    # want to consider the score with \"maximum context\", which we define as\n    # the *minimum* of its left and right context (the *sum* of left and\n    # right context will always be the same, of course).\n    #\n    # In the example the maximum context for 'bought' would be span C since\n    # it has 1 left context and 3 right context, while span B has 4 left context\n    # and 0 right context.\n    best_score = None\n    best_span_index = None\n    for (span_index, doc_span) in enumerate(doc_spans):\n        end = doc_span.start + doc_span.length - 1\n        if position < doc_span.start:\n            continue\n        if position > end:\n            continue\n        num_left_context = position - doc_span.start\n        num_right_context = end - position\n        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length\n        if best_score is None or score > best_score:\n            best_score = score\n            best_span_index = span_index\n\n    return cur_span_index == best_span_index\n\n\n\nRawResult = collections.namedtuple(\"RawResult\",\n                                   [\"unique_id\", \"start_logits\", \"end_logits\"])\n\n\ndef write_predictions(all_examples, all_features, all_results, n_best_size,\n                      max_answer_length, do_lower_case, output_prediction_file,\n                      output_nbest_file, verbose_logging):\n    \"\"\"Write final predictions to the json file.\"\"\"\n    logger.info(\"Writing predictions to: %s\" % (output_prediction_file))\n    logger.info(\"Writing nbest to: %s\" % (output_nbest_file))\n\n    example_index_to_features = collections.defaultdict(list)\n    for feature in all_features:\n        example_index_to_features[feature.example_index].append(feature)\n\n    unique_id_to_result = {}\n    for result in all_results:\n        unique_id_to_result[result.unique_id] = result\n\n    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name\n        
\"PrelimPrediction\",\n        [\"feature_index\", \"start_index\", \"end_index\", \"start_logit\", \"end_logit\"])\n\n    all_predictions = collections.OrderedDict()\n    all_nbest_json = collections.OrderedDict()\n    for (example_index, example) in enumerate(all_examples):\n        features = example_index_to_features[example_index]\n\n        prelim_predictions = []\n        for (feature_index, feature) in enumerate(features):\n            result = unique_id_to_result[feature.unique_id]\n\n            start_indexes = _get_best_indexes(result.start_logits, n_best_size)\n            end_indexes = _get_best_indexes(result.end_logits, n_best_size)\n            for start_index in start_indexes:\n                for end_index in end_indexes:\n                    # We could hypothetically create invalid predictions, e.g., predict\n                    # that the start of the span is in the question. We throw out all\n                    # invalid predictions.\n                    if start_index >= len(feature.tokens):\n                        continue\n                    if end_index >= len(feature.tokens):\n                        continue\n                    if start_index not in feature.token_to_orig_map:\n                        continue\n                    if end_index not in feature.token_to_orig_map:\n                        continue\n                    if not feature.token_is_max_context.get(start_index, False):\n                        continue\n                    if end_index < start_index:\n                        continue\n                    length = end_index - start_index + 1\n                    if length > max_answer_length:\n                        continue\n                    prelim_predictions.append(\n                        _PrelimPrediction(\n                            feature_index=feature_index,\n                            start_index=start_index,\n                            end_index=end_index,\n                            
start_logit=result.start_logits[start_index],\n                            end_logit=result.end_logits[end_index]))\n\n        prelim_predictions = sorted(\n            prelim_predictions,\n            key=lambda x: (x.start_logit + x.end_logit),\n            reverse=True)\n\n        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name\n            \"NbestPrediction\", [\"text\", \"start_logit\", \"end_logit\"])\n\n        seen_predictions = {}\n        nbest = []\n        for pred in prelim_predictions:\n            if len(nbest) >= n_best_size:\n                break\n            feature = features[pred.feature_index]\n\n            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]\n            orig_doc_start = feature.token_to_orig_map[pred.start_index]\n            orig_doc_end = feature.token_to_orig_map[pred.end_index]\n            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]\n            tok_text = \" \".join(tok_tokens)\n\n            # De-tokenize WordPieces that have been split off.\n            tok_text = tok_text.replace(\" ##\", \"\")\n            tok_text = tok_text.replace(\"##\", \"\")\n\n            # Clean whitespace\n            tok_text = tok_text.strip()\n            tok_text = \" \".join(tok_text.split())\n            orig_text = \" \".join(orig_tokens)\n\n            final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)\n            if final_text in seen_predictions:\n                continue\n\n            seen_predictions[final_text] = True\n            nbest.append(\n                _NbestPrediction(\n                    text=final_text,\n                    start_logit=pred.start_logit,\n                    end_logit=pred.end_logit))\n\n        # In very rare edge cases we could have no valid predictions. 
So we\n        # just create a nonce prediction in this case to avoid failure.\n        if not nbest:\n            nbest.append(\n                _NbestPrediction(text=\"empty\", start_logit=0.0, end_logit=0.0))\n\n        assert len(nbest) >= 1\n\n        total_scores = []\n        for entry in nbest:\n            total_scores.append(entry.start_logit + entry.end_logit)\n\n        probs = _compute_softmax(total_scores)\n\n        nbest_json = []\n        for (i, entry) in enumerate(nbest):\n            output = collections.OrderedDict()\n            output[\"text\"] = entry.text\n            output[\"probability\"] = probs[i]\n            output[\"start_logit\"] = entry.start_logit\n            output[\"end_logit\"] = entry.end_logit\n            nbest_json.append(output)\n\n        assert len(nbest_json) >= 1\n\n        all_predictions[example.qas_id] = nbest_json[0][\"text\"]\n        all_nbest_json[example.qas_id] = nbest_json\n\n    with open(output_prediction_file, \"w\") as writer:\n        writer.write(json.dumps(all_predictions, indent=4) + \"\\n\")\n\n    with open(output_nbest_file, \"w\") as writer:\n        writer.write(json.dumps(all_nbest_json, indent=4) + \"\\n\")\n\n\ndef get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):\n    \"\"\"Project the tokenized prediction back to the original text.\"\"\"\n\n    # When we created the data, we kept track of the alignment between original\n    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. 
So\n    # now `orig_text` contains the span of our original text corresponding to the\n    # span that we predicted.\n    #\n    # However, `orig_text` may contain extra characters that we don't want in\n    # our prediction.\n    #\n    # For example, let's say:\n    #   pred_text = steve smith\n    #   orig_text = Steve Smith's\n    #\n    # We don't want to return `orig_text` because it contains the extra \"'s\".\n    #\n    # We don't want to return `pred_text` because it's already been normalized\n    # (the SQuAD eval script also does punctuation stripping/lower casing but\n    # our tokenizer does additional normalization like stripping accent\n    # characters).\n    #\n    # What we really want to return is \"Steve Smith\".\n    #\n    # Therefore, we have to apply a semi-complicated alignment heuristic between\n    # `pred_text` and `orig_text` to get a character-to-character alignment. This\n    # can fail in certain cases in which case we just return `orig_text`.\n\n    def _strip_spaces(text):\n        ns_chars = []\n        ns_to_s_map = collections.OrderedDict()\n        for (i, c) in enumerate(text):\n            if c == \" \":\n                continue\n            ns_to_s_map[len(ns_chars)] = i\n            ns_chars.append(c)\n        ns_text = \"\".join(ns_chars)\n        return (ns_text, ns_to_s_map)\n\n    # We first tokenize `orig_text`, strip whitespace from the result\n    # and `pred_text`, and check if they are the same length. If they are\n    # NOT the same length, the heuristic has failed. 
If they are the same\n    # length, we assume the characters are one-to-one aligned.\n    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)\n\n    tok_text = \" \".join(tokenizer.tokenize(orig_text))\n\n    start_position = tok_text.find(pred_text)\n    if start_position == -1:\n        if verbose_logging:\n            logger.info(\n                \"Unable to find text: '%s' in '%s'\" % (pred_text, orig_text))\n        return orig_text\n    end_position = start_position + len(pred_text) - 1\n\n    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)\n    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)\n\n    if len(orig_ns_text) != len(tok_ns_text):\n        if verbose_logging:\n            logger.info(\"Length not equal after stripping spaces: '%s' vs '%s'\",\n                            orig_ns_text, tok_ns_text)\n        return orig_text\n\n    # We then project the characters in `pred_text` back to `orig_text` using\n    # the character-to-character alignment.\n    tok_s_to_ns_map = {}\n    for (i, tok_index) in tok_ns_to_s_map.items():\n        tok_s_to_ns_map[tok_index] = i\n\n    orig_start_position = None\n    if start_position in tok_s_to_ns_map:\n        ns_start_position = tok_s_to_ns_map[start_position]\n        if ns_start_position in orig_ns_to_s_map:\n            orig_start_position = orig_ns_to_s_map[ns_start_position]\n\n    if orig_start_position is None:\n        if verbose_logging:\n            logger.info(\"Couldn't map start position\")\n        return orig_text\n\n    orig_end_position = None\n    if end_position in tok_s_to_ns_map:\n        ns_end_position = tok_s_to_ns_map[end_position]\n        if ns_end_position in orig_ns_to_s_map:\n            orig_end_position = orig_ns_to_s_map[ns_end_position]\n\n    if orig_end_position is None:\n        if verbose_logging:\n            logger.info(\"Couldn't map end position\")\n        return orig_text\n\n    output_text = orig_text[orig_start_position:(orig_end_position + 
1)]\n    return output_text\n\n\ndef _get_best_indexes(logits, n_best_size):\n    \"\"\"Get the n-best logits from a list.\"\"\"\n    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)\n\n    best_indexes = []\n    for i in range(len(index_and_score)):\n        if i >= n_best_size:\n            break\n        best_indexes.append(index_and_score[i][0])\n    return best_indexes\n\n\ndef _compute_softmax(scores):\n    \"\"\"Compute softmax probability over raw logits.\"\"\"\n    if not scores:\n        return []\n\n    max_score = None\n    for score in scores:\n        if max_score is None or score > max_score:\n            max_score = score\n\n    exp_scores = []\n    total_sum = 0.0\n    for score in scores:\n        x = math.exp(score - max_score)\n        exp_scores.append(x)\n        total_sum += x\n\n    probs = []\n    for score in exp_scores:\n        probs.append(score / total_sum)\n    return probs\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n\n    ## Required parameters\n    parser.add_argument(\"--bert_model\", default=None, type=str, required=True,\n                        help=\"Bert pre-trained model selected in the list: bert-base-uncased, \"\n                                \"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.\")\n    parser.add_argument(\"--output_dir\", default=None, type=str, required=True,\n                        help=\"The output directory where the model checkpoints will be written.\")\n\n    ## Other parameters\n    parser.add_argument(\"--train_file\", default=None, type=str, help=\"SQuAD json for training. E.g., train-v1.1.json\")\n    parser.add_argument(\"--predict_file\", default=None, type=str,\n                        help=\"SQuAD json for predictions. 
E.g., dev-v1.1.json or test-v1.1.json\")\n    parser.add_argument(\"--max_seq_length\", default=384, type=int,\n                        help=\"The maximum total input sequence length after WordPiece tokenization. Sequences \"\n                                \"longer than this will be truncated, and sequences shorter than this will be padded.\")\n    parser.add_argument(\"--doc_stride\", default=128, type=int,\n                        help=\"When splitting up a long document into chunks, how much stride to take between chunks.\")\n    parser.add_argument(\"--max_query_length\", default=64, type=int,\n                        help=\"The maximum number of tokens for the question. Questions longer than this will \"\n                                \"be truncated to this length.\")\n    parser.add_argument(\"--do_train\", default=False, action='store_true', help=\"Whether to run training.\")\n    parser.add_argument(\"--do_predict\", default=False, action='store_true', help=\"Whether to run eval on the dev set.\")\n    parser.add_argument(\"--train_batch_size\", default=32, type=int, help=\"Total batch size for training.\")\n    parser.add_argument(\"--predict_batch_size\", default=8, type=int, help=\"Total batch size for predictions.\")\n    parser.add_argument(\"--learning_rate\", default=5e-5, type=float, help=\"The initial learning rate for Adam.\")\n    parser.add_argument(\"--num_train_epochs\", default=3.0, type=float,\n                        help=\"Total number of training epochs to perform.\")\n    parser.add_argument(\"--warmup_proportion\", default=0.1, type=float,\n                        help=\"Proportion of training to perform linear learning rate warmup for. 
E.g., 0.1 = 10% \"\n                                \"of training.\")\n    parser.add_argument(\"--n_best_size\", default=20, type=int,\n                        help=\"The total number of n-best predictions to generate in the nbest_predictions.json \"\n                                \"output file.\")\n    parser.add_argument(\"--max_answer_length\", default=30, type=int,\n                        help=\"The maximum length of an answer that can be generated. This is needed because the start \"\n                                \"and end predictions are not conditioned on one another.\")\n    parser.add_argument(\"--verbose_logging\", default=False, action='store_true',\n                        help=\"If true, all of the warnings related to data processing will be printed. \"\n                                \"A number of warnings are expected for a normal SQuAD evaluation.\")\n    parser.add_argument('--seed', \n                        type=int, \n                        default=42,\n                        help=\"random seed for initialization\")\n    parser.add_argument('--init_gradient_accumulation_steps',\n                        type=int,\n                        default=1,\n                        help=\"Initial number of updates steps to accumulate before performing a backward/update pass.\")\n    parser.add_argument('--target_gradient_accumulation_steps',\n                        type=int,\n                        default=1,\n                        help=\"Target number of updates steps to accumulate before performing a backward/update pass.\")\n    parser.add_argument('--accumulation_warmup_proportion',default=0.2, type=float,\n                        help=\"Proportion of training to ramp up gradient_accumulation_steps for. E.g., 0.1 = 10% \")\n    parser.add_argument(\"--do_lower_case\",\n                        default=True,\n                        action='store_true',\n                        help=\"Whether to lower case the input text. 
True for uncased models, False for cased models.\")\n    parser.add_argument('--step_per_log',\n                        type=int, default=100,\n                        help='Number of updates steps to log metrics.')\n    parser.add_argument('--fp16',\n                        default=False,\n                        action='store_true',\n                        help=\"Whether to use 16-bit float precision instead of 32-bit\")\n    parser.add_argument('--loss_scale',\n                        type=float, default=0,\n                        help=\"Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\\n\"\n                             \"0 (default value): dynamic loss scaling.\\n\"\n                             \"Positive power of 2: static loss scaling value.\\n\")\n\n    args = parser.parse_args()\n\n    # get the Azure ML run object\n    run = Run.get_context()\n\n    comm = DistributedCommunicator(accumulation_step=args.init_gradient_accumulation_steps)\n    rank = comm.rank\n    local_rank = comm.local_rank\n    world_size = comm.world_size\n\n    torch.cuda.set_device(local_rank)\n    device = torch.device(\"cuda\", local_rank)\n    is_master = rank == 0\n    logger.info(\"world size: {}, local rank: {}, global rank: {}, fp16: {}\".format(world_size, local_rank, rank, args.fp16))\n\n    random.seed(args.seed)\n    np.random.seed(args.seed)\n    torch.manual_seed(args.seed)\n\n    if not args.do_train and not args.do_predict:\n        raise ValueError(\"At least one of `do_train` or `do_predict` must be True.\")\n\n    if args.do_train:\n        if not args.train_file:\n            raise ValueError(\n                \"If `do_train` is True, then `train_file` must be specified.\")\n    if args.do_predict:\n        if not args.predict_file:\n            raise ValueError(\n                \"If `do_predict` is True, then `predict_file` must be specified.\")\n\n    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):\n        
raise ValueError(\"Output directory () already exists and is not empty.\")\n    os.makedirs(args.output_dir, exist_ok=True)\n    output_model_file = os.path.join(args.output_dir, \"pytorch_model.bin\")\n\n    tokenizer = BertTokenizer.from_pretrained(args.bert_model)\n\n    # Prepare model\n    model = BertForQuestionAnswering.from_pretrained(args.bert_model,\n                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank))\n    \n    if args.fp16:\n        model.half()\n    \n    model.to(device)\n    comm.register_model(model, args.fp16)\n    if args.do_train:\n        train_examples = read_squad_examples(input_file=args.train_file, is_training=True)\n        num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs)\n\n        param_optimizer = list(model.named_parameters())\n\n        # hack to remove pooler, which is not used\n        # thus it produce None grad that break apex\n        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]\n\n        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n        optimizer_grouped_parameters = [\n            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n            ]\n        t_total = num_train_steps // world_size\n\n        if args.fp16:\n            try:\n                from apex.optimizers import FP16_Optimizer\n                from apex.optimizers import FusedAdam\n            except ImportError:\n                raise ImportError(\"Please install apex from https://www.github.com/nvidia/apex to run this.\")\n\n            optimizer = FusedAdam(optimizer_grouped_parameters,\n                                lr=args.learning_rate,\n                                bias_correction=False,\n                                max_grad_norm=1.0)\n            if 
args.loss_scale == 0:\n                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)\n            else:\n                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)\n        else:\n            optimizer = BertAdam(optimizer_grouped_parameters,\n                            lr=args.learning_rate,\n                            warmup=args.warmup_proportion,\n                            t_total=t_total)\n\n        if is_master:\n            run.log('lr', np.float(args.learning_rate))\n\n        cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(\n            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))\n        train_features = None\n        try:\n            with open(cached_train_features_file, \"rb\") as reader:\n                train_features = pickle.load(reader)\n        except:\n            train_features = convert_examples_to_features(\n                examples=train_examples,\n                tokenizer=tokenizer,\n                max_seq_length=args.max_seq_length,\n                doc_stride=args.doc_stride,\n                max_query_length=args.max_query_length,\n                is_training=True)\n            if rank == 0:\n                logger.info(\"  Saving train features into cached file %s\", cached_train_features_file)\n                with open(cached_train_features_file, \"wb\") as writer:\n                    pickle.dump(train_features, writer)\n        logger.info(\"***** Running training *****\")\n        logger.info(\"  Num orig examples = %d\", len(train_examples))\n        logger.info(\"  Num split examples = %d\", len(train_features))\n        logger.info(\"  Batch size = %d\", args.train_batch_size)\n        logger.info(\"  Num steps = %d\", num_train_steps)\n        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)\n        all_input_mask = 
torch.tensor([f.input_mask for f in train_features], dtype=torch.long)\n        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)\n        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)\n        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)\n        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,\n                                   all_start_positions, all_end_positions)\n        if world_size > 1:\n            train_sampler = DistributedSampler(train_data,num_replicas=world_size, rank=rank)\n        else:\n            train_sampler = RandomSampler(train_data)\n\n        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)\n        global_step, tr_loss = 0, 0\n        model.train()\n        for _ in trange(int(args.num_train_epochs), desc=\"Epoch\"):\n            for _, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")):\n                batch = tuple(t.to(device) for t in batch)\n                input_ids, input_mask, segment_ids, start_positions, end_positions = batch\n                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)\n                tr_loss += loss.item()\n                if args.fp16:\n                    optimizer.backward(loss)\n                else:\n                    loss.backward()\n                global_step += 1\n                if comm.synchronize():\n                    lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion)\n                    for param_group in optimizer.param_groups:\n                        param_group['lr'] = lr_this_step\n                    optimizer.step()\n                    model.zero_grad()\n                    comm.set_accumulation_step(adjust_gradient_accumulation_steps(\n                        
global_step/t_total, args.init_gradient_accumulation_steps,\n                        args.target_gradient_accumulation_steps, args.accumulation_warmup_proportion))\n\n                if is_master and (global_step + 1) % args.step_per_log == 0:\n                    run.log('train_loss', np.float(tr_loss / args.step_per_log))\n                    tr_loss = 0\n        if is_master:\n            # Save a trained model\n            torch.save(model.state_dict(), output_model_file)\n                        \n\n    if args.do_predict and is_master:\n        # Load a trained model that you have fine-tuned\n        model_state_dict = torch.load(output_model_file)\n        model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)\n        model.to(device)\n\n        eval_examples = read_squad_examples(\n            input_file=args.predict_file, is_training=False)\n        eval_features = convert_examples_to_features(\n            examples=eval_examples,\n            tokenizer=tokenizer,\n            max_seq_length=args.max_seq_length,\n            doc_stride=args.doc_stride,\n            max_query_length=args.max_query_length,\n            is_training=False)\n\n        logger.info(\"***** Running predictions *****\")\n        logger.info(\"  Num orig examples = %d\", len(eval_examples))\n        logger.info(\"  Num split examples = %d\", len(eval_features))\n        logger.info(\"  Batch size = %d\", args.predict_batch_size)\n\n        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\n        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\n        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\n        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)\n        # Run prediction for full data\n        
eval_sampler = SequentialSampler(eval_data)\n        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)\n\n        model.eval()\n        all_results = []\n        logger.info(\"Start evaluating\")\n        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc=\"Evaluating\"):\n            if len(all_results) % 1000 == 0:\n                logger.info(\"Processing example: %d\" % (len(all_results)))\n            input_ids = input_ids.to(device)\n            input_mask = input_mask.to(device)\n            segment_ids = segment_ids.to(device)\n            with torch.no_grad():\n                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)\n            for i, example_index in enumerate(example_indices):\n                start_logits = batch_start_logits[i].detach().cpu().tolist()\n                end_logits = batch_end_logits[i].detach().cpu().tolist()\n                eval_feature = eval_features[example_index.item()]\n                unique_id = int(eval_feature.unique_id)\n                all_results.append(RawResult(unique_id=unique_id,\n                                             start_logits=start_logits,\n                                             end_logits=end_logits))\n        output_prediction_file = os.path.join(args.output_dir, \"predictions.json\")\n        output_nbest_file = os.path.join(args.output_dir, \"nbest_predictions.json\")\n        write_predictions(eval_examples, eval_features, all_results,\n                          args.n_best_size, args.max_answer_length,\n                          args.do_lower_case, output_prediction_file,\n                          output_nbest_file, args.verbose_logging)\n\n        with open(args.predict_file) as predict_file:\n            dataset_json = json.load(predict_file)\n            dataset = dataset_json['data']\n        with open(output_prediction_file) as prediction_file:\n            
predictions = json.load(prediction_file)\n        \n        result = evaluate(dataset, predictions)\n        for key in result.keys():\n            logger.info(\"  %s = %s\", key, str(result[key]))\n        run.log('exact_match', result['exact_match'])\n        run.log('f1', result['f1'])\n\n\nif __name__ == \"__main__\":\n    main()\n\n\n# Original source: https://github.com/microsoft/AzureML-BERT/blob/master/PyTorch/run_squad_azureml.py"
  },
  {
    "path": "examples/question_answering/bidaf_aml_deep_dive.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Copyright (c) Microsoft Corporation. All rights reserved.\\n\",\n    \"\\n\",\n    \"Licensed under the MIT License.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# BiDAF Model Deep Dive on AzureML\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/question_answering/bidaf_aml_deep_dive.png)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This notebook demonstrates a deep dive into a popular question-answering (QA) model, Bi-Directional Attention Flow (BiDAF). We use [AllenNLP](https://allennlp.org/), an open-source NLP research library built on top of PyTorch, to train the BiDAF model from scratch on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset, using Azure Machine Learning ([AzureML](https://azure.microsoft.com/en-us/services/machine-learning-service/)). \\n\",\n    \"\\n\",\n    \"The following capabilities are highlighted in this notebook:  \\n\",\n    \"- AmlCompute\\n\",\n    \"- Datastore\\n\",\n    \"- Logging\\n\",\n    \"- AllenNLP library\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Table of Contents\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"1. [Introduction](#1.-Introduction)  \\n\",\n    \"    * 1.1 [SQuAD Dataset](#1.1-SQuAD-Dataset)  \\n\",\n    \"    * 1.2 [BiDAF Model](#1.2-BiDAF-Model)  \\n\",\n    \"    * 1.3 [AllenNLP](#1.3-AllenNLP)  \\n\",\n    \"2. 
[AzureML Setup](#2.-AzureML-Setup)  \\n\",\n    \"    * 2.1 [Link to or create a `Workspace`](#2.1-Link-to-or-create-a-Workspace)  \\n\",\n    \"    * 2.2 [Set up an `Experiment` and Logging](#2.2-Set-up-an-Experiment-and-Logging)  \\n\",\n    \"    * 2.3 [Link `AmlCompute` compute target](#2.3-Link-AmlCompute-Compute-Target)  \\n\",\n    \"    * 2.4 [Upload Files to `Datastore`](#2.4-Upload-Files-to-Datastore)  \\n\",\n    \"3. [Prepare Training Script](#3.-Prepare-Training-Script) \\n\",\n    \"4. [Create a PyTorch Estimator](#4.-Create-a-PyTorch-Estimator)\\n\",\n    \"5. [Submit a Job](#5.-Submit-a-Job)  \\n\",\n    \"6. [Inspect Results of Run](#6.-Inspect-Results-of-Run)  \\n\",\n    \"    * 6.1 [Evaluation on SQuAD](#6.1-Evaluation-on-SQuAD)\\n\",\n    \"    * 6.2 [Try the Best Model](#6.2-Try-the-Best-Model)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1. Introduction\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.1 SQuAD Dataset\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset was released in 2016 and has become a benchmarking dataset for machine comprehension tasks. It contains a set of more than 100,000 question-context tuples along with their answers, extracted from Wikipedia articles. 90,000 of the question-context tuples make up the training set and the remaining 10,000 compose the development set. The answers are spans in the context (given passage) and are evaluated against human-labeled answers. 
Two metrics are used for evaluation: Exact Match (EM) and F1 score.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.2 BiDAF Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The [BiDAF](https://www.semanticscholar.org/paper/Bidirectional-Attention-Flow-for-Machine-Seo-Kembhavi/007ab5528b3bd310a80d553cccad4b78dc496b02) model achieved state-of-the-art performance on the SQuAD dataset in 2017 and is a well-respected, performant baseline for QA. The BiDAF network is a \\\"hierarchical multi-stage architecture for modeling representations of the context at different levels of granularity. BiDAF includes character-level, word-level, and phrase-level embeddings, and uses bi-directional attention flow to allow for query-aware context representations\\\". \\n\",\n    \"\\n\",\n    \"The network contains six different layers, as described by [Seo et al, 2017](https://www.semanticscholar.org/paper/Bidirectional-Attention-Flow-for-Machine-Seo-Kembhavi/007ab5528b3bd310a80d553cccad4b78dc496b02):\\n\",\n    \"\\n\",\n    \"1. **Character Embedding Layer**: character-level CNNs to embed each word\\n\",\n    \"2. **Word Embedding Layer**: word embeddings using pre-trained GloVe word vectors\\n\",\n    \"3. **Phrase Embedding Layer**: LSTM on top of the previous layers to model the temporal interactions between words\\n\",\n    \"4. **Attention Flow Layer**: Fuses information from the context and query words. Unlike previous models, \\\"the attention flow layer is not used to summarize the query and context into a single feature vectors. Instead, the attention vectors at each time step, along with embeddings from previous layers, are allowed to flow through to the subsequent modeling layers\\\", reducing information loss.\\n\",\n    \"5. 
**Modeling Layer**: produces a matrix of contextual information about the word with respect to the entire context paragraph and query\\n\",\n    \"6. **Output Layer**: predicts the start and end indices of the phrase in the paragraph\\n\",\n    \"\\n\",\n    \"The following figure displays the architecture of the BiDAF network.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![](https://nlpbp.blob.core.windows.net/images/BiDAF_model.png)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.3 AllenNLP\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The notebook demonstrates how to use the BiDAF implementation provided by [AllenNLP](https://www.semanticscholar.org/paper/A-Deep-Semantic-Natural-Language-Processing-Gardner-Grus/a5502187140cdd98d76ae711973dbcdaf1fef46d), an open-source NLP research library built on top of PyTorch. AllenNLP is a product of the Allen Institute for Artificial Intelligence and is used widely across different universities and top companies (including Facebook Research and Amazon Alexa). They maintain a robust and active [Github repository](https://github.com/allenai/allennlp) as well as a [website](https://allennlp.org/) with documentation and demos. Their model is a reimplementation of the original BiDAF model and they report a higher EM score and faster training times than the original BiDAF system (68.3 EM score versus 67.7 and 10x speedup, taking ~4 hours on a p2.xlarge). The AllenNLP library is mainly designed for use through the command line (and most tutorials use this method), but can also be used programmatically. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The AllenNLP library focuses on the creation of NLP pipelines with easily interchangeable building blocks. 
The general pipeline steps are as follows:  \\n\",\n    \"- DatasetReader: defines how to extract information from your data and convert it into Instance objects that will be used by the model  \\n\",\n    \"- Iterator: takes the instances produced by the DatasetReader and batches them for training\\n\",\n    \"- Model\\n\",\n    \"- Trainer: trains the model and records metrics  \\n\",\n    \"- Predictor: takes raw strings and produces predictions\\n\",\n    \"\\n\",\n    \"Each step is loosely-coupled, making it easy to swap different options for each step. While it is possible to construct your own AllenNLP objects (see this [tutorial](https://mlexplained.com/2019/01/30/an-in-depth-tutorial-to-allennlp-from-basics-to-elmo-and-bert/) for a great deep-dive into constructing your own AllenNLP pipeline), the easiest way is to utilize the JSON-like parameter constructor methods provided by most AllenNLP objects. For example, rather than\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))\\n\",\n    \"```\\n\",\n    \"we can use  \\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"lstm_params = Params({\\n\",\n    \"    \\\"type\\\": \\\"lstm\\\",\\n\",\n    \"    \\\"input_size\\\": EMBEDDING_DIM,\\n\",\n    \"    \\\"hidden_size\\\": HIDDEN_DIM\\n\",\n    \"})\\n\",\n    \"\\n\",\n    \"lstm = Seq2SeqEncoder.from_params(lstm_params)\\n\",\n    \"```\\n\",\n    \"This provides two advantages:  \\n\",\n    \"1. Experiments can be declaratively specified in a separate [configuration file](https://github.com/allenai/allennlp/blob/master/tutorials/tagger/README.md#using-config-files)  \\n\",\n    \"2. 
Experiments can be easily changed with no coding, just by changing the entry in the config file\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**AllenNLP Resources:**\\n\",\n    \"\\n\",\n    \"The following resources are recommended for understanding how the AllenNLP library works and being able to implement your own models and pipelines\\n\",\n    \"\\n\",\n    \"- Information about the provided AllenNLP models: https://allennlp.org/models\\n\",\n    \"- Using configuration files: https://github.com/allenai/allennlp/blob/master/tutorials/tagger/README.md#using-config-files   \\n\",\n    \"- In-depth discussion of each AllenNLP object used and how to construct your own specialized ones: https://mlexplained.com/2019/01/30/an-in-depth-tutorial-to-allennlp-from-basics-to-elmo-and-bert/  \\n\",\n    \"- AllenNLP's Part-of-Speech-Tagging tutorial showcasing how to use their methods programmatically: https://allennlp.org/tutorials   \\n\",\n    \"- Short AllenNLP programmatic tutorial: https://github.com/titipata/allennlp-tutorial/blob/master/allennlp_tutorial.ipynb  \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \\n\",\n      \"[GCC 7.3.0]\\n\",\n      \"Azure ML SDK Version: 1.0.48\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Imports\\n\",\n    \"import sys\\n\",\n    \"import os\\n\",\n    \"import shutil\\n\",\n    \"sys.path.append(\\\"../../\\\")\\n\",\n    \"import json\\n\",\n    \"from urllib.request import urlretrieve\\n\",\n    \"import scrapbook as sb\\n\",\n    \"\\n\",\n    \"#import utils\\n\",\n    \"from utils_nlp.common.timer import Timer\\n\",\n    \"from utils_nlp.azureml import azureml_utils\\n\",\n    \"\\n\",\n    \"import azureml as aml\\n\",\n    
\"from azureml.core import Datastore, Experiment\\n\",\n    \"from azureml.core.compute import ComputeTarget, AmlCompute\\n\",\n    \"from azureml.exceptions import ComputeTargetException\\n\",\n    \"from azureml.train.dnn import PyTorch\\n\",\n    \"from azureml.widgets import RunDetails\\n\",\n    \"from azureml.core.conda_dependencies import CondaDependencies\\n\",\n    \"from azureml.exceptions import ComputeTargetException\\n\",\n    \"from allennlp.predictors import Predictor\\n\",\n    \"\\n\",\n    \"print(\\\"System version: {}\\\".format(sys.version))\\n\",\n    \"print(\\\"Azure ML SDK Version:\\\", aml.core.VERSION)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"PROJECT_FOLDER = \\\"./bidaf-question-answering\\\"\\n\",\n    \"SQUAD_FOLDER = \\\"./squad\\\"\\n\",\n    \"BIDAF_CONFIG_PATH = \\\".\\\"\\n\",\n    \"LOGS_FOLDER = '.'\\n\",\n    \"NUM_EPOCHS = 25\\n\",\n    \"PIP_PACKAGES = [\\n\",\n    \"        \\\"allennlp==0.8.4\\\",\\n\",\n    \"        \\\"azureml-sdk==1.0.48\\\",\\n\",\n    \"        \\\"https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz\\\",\\n\",\n    \"    ]\\n\",\n    \"CONDA_PACKAGES = [\\\"jsonnet\\\", \\\"cmake\\\", \\\"regex\\\", \\\"pytorch\\\", \\\"torchvision\\\"]\\n\",\n    \"config_path = (\\n\",\n    \"    \\\"./.azureml\\\"\\n\",\n    \")  # Path to the directory containing config.json with azureml credentials\\n\",\n    \"\\n\",\n    \"# Azure resources\\n\",\n    \"subscription_id = \\\"YOUR_SUBSCRIPTION_ID\\\"\\n\",\n    \"resource_group = \\\"YOUR_RESOURCE_GROUP_NAME\\\"  \\n\",\n    \"workspace_name = \\\"YOUR_WORKSPACE_NAME\\\"  \\n\",\n    \"workspace_region = \\\"YOUR_WORKSPACE_REGION\\\" #Possible values eastus, eastus2 and so on.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   
\"source\": [\n    \"## 2. AzureML Setup\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now, we set up the necessary components for running this as an AzureML experiment\\n\",\n    \"1. Create or link to an existing `Workspace`\\n\",\n    \"2. Set up an `Experiment` with `logging`\\n\",\n    \"3. Create or attach existing `AmlCompute`\\n\",\n    \"4. Upload our data to a `Datastore`\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.1 Link to or create a Workspace\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The following cell looks to set up the connection to your [Azure Machine Learning service Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace). You can choose to connect to an existing workspace or create a new one. \\n\",\n    \"\\n\",\n    \"**To access an existing workspace:**\\n\",\n    \"1. If you have a `config.json` file, you do not need to provide the workspace information; you will only need to update the `config_path` variable that is defined above which contains the file.\\n\",\n    \"2. Otherwise, you will need to supply the following:\\n\",\n    \"    * The name of your workspace\\n\",\n    \"    * Your subscription id\\n\",\n    \"    * The resource group name\\n\",\n    \"\\n\",\n    \"**To create a new workspace:**\\n\",\n    \"\\n\",\n    \"Set the following information:\\n\",\n    \"* A name for your workspace\\n\",\n    \"* Your subscription id\\n\",\n    \"* The resource group name\\n\",\n    \"* [Azure region](https://azure.microsoft.com/en-us/global-infrastructure/regions/) to create the workspace in, such as `eastus2`. \\n\",\n    \"\\n\",\n    \"This will automatically create a new resource group for you in the region provided if a resource group with the name given does not already exist. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Performing interactive authentication. Please follow the instructions on the terminal.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING - Note, we have launched a browser for you to login. For old experience with device code, use \\\"az login --use-device-code\\\"\\n\",\n      \"WARNING - You have logged in. Now let us find all the subscriptions to which you have access...\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Interactive authentication successfully completed.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"ws = azureml_utils.get_or_create_workspace(\\n\",\n    \"    config_path=config_path,\\n\",\n    \"    subscription_id=subscription_id,\\n\",\n    \"    resource_group=resource_group,\\n\",\n    \"    workspace_name=workspace_name,\\n\",\n    \"    workspace_region=workspace_region,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\n\",\n    \"    \\\"Workspace name: \\\" + ws.name,\\n\",\n    \"    \\\"Azure region: \\\" + ws.location,\\n\",\n    \"    \\\"Subscription id: \\\" + ws.subscription_id,\\n\",\n    \"    \\\"Resource group: \\\" + ws.resource_group,\\n\",\n    \"    sep=\\\"\\\\n\\\",\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.2 Set up an Experiment and Logging\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Next, we set up an `Experiment` named bidaf-question-answering, add logging capabilities, and create a local folder that will be the 
source directory for the AzureML run.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Make a folder for the project\\n\",\n    \"os.makedirs(PROJECT_FOLDER, exist_ok=True)\\n\",\n    \"\\n\",\n    \"# Set up an experiment\\n\",\n    \"experiment_name = \\\"NLP-QA-BiDAF-deepdive\\\"\\n\",\n    \"experiment = Experiment(ws, experiment_name)\\n\",\n    \"\\n\",\n    \"# Add logging to our experiment\\n\",\n    \"run = experiment.start_logging(snapshot_directory=PROJECT_FOLDER)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.3 Link AmlCompute Compute Target\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We need to link a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training our model (see [compute options](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#supported-compute-targets) for an explanation of the different options). We will use an [AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute) target and link to an existing target (if the cluster_name exists) or create a STANDARD_NC6 GPU cluster (autoscales from 0 to 4 nodes) in this example. Creating a new AmlCompute cluster takes approximately 5 minutes. \\n\",\n    \"\\n\",\n    \"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. 
Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Found existing compute target.\\n\",\n      \"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-23T16:18:34.392000+00:00', 'errors': None, 'creationTime': '2019-07-09T16:20:30.625908+00:00', 'modifiedTime': '2019-07-09T16:20:46.601973+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# choose your cluster\\n\",\n    \"cluster_name = \\\"gpu-test\\\"\\n\",\n    \"\\n\",\n    \"try:\\n\",\n    \"    compute_target = ComputeTarget(workspace=ws, name=cluster_name)\\n\",\n    \"    print(\\\"Found existing compute target.\\\")\\n\",\n    \"except ComputeTargetException:\\n\",\n    \"    print(\\\"Creating a new compute target...\\\")\\n\",\n    \"    compute_config = AmlCompute.provisioning_configuration(\\n\",\n    \"        vm_size=\\\"STANDARD_NC6\\\", max_nodes=4\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"    # create the cluster\\n\",\n    \"    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\\n\",\n    \"\\n\",\n    \"    compute_target.wait_for_completion(show_output=True)\\n\",\n    \"\\n\",\n    \"# use get_status() to get a detailed status for the current AmlCompute.\\n\",\n    
\"print(compute_target.get_status().serialize())\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.4 Upload Files to Datastore\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This step uploads our local files to a `Datastore` so that the data is accessible from the remote compute target. A DataStore is backed either by a Azure File Storage (default option) or Azure Blob Storage ([how to decide between these options](https://docs.microsoft.com/en-us/azure/storage/common/storage-decide-blobs-files-disks)) and data is made accessible by mounting or copying data to the compute target. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"First, we download the SQuAD data files and save to a folder called squad.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"('./squad/squad_dev.json', <http.client.HTTPMessage at 0x2646892de10>)\"\n      ]\n     },\n     \"execution_count\": 7,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"os.makedirs(SQUAD_FOLDER, exist_ok=True)  # make squad folder locally\\n\",\n    \"\\n\",\n    \"urlretrieve(\\n\",\n    \"    \\\"https://allennlp.s3.amazonaws.com/datasets/squad/squad-train-v1.1.json\\\",\\n\",\n    \"    filename=SQUAD_FOLDER+\\\"/squad_train.json\\\",\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"urlretrieve(\\n\",\n    \"    \\\"https://allennlp.s3.amazonaws.com/datasets/squad/squad-dev-v1.1.json\\\",\\n\",\n    \"    filename=SQUAD_FOLDER+\\\"/squad_dev.json\\\",\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We also copy our AllenNLP configuration file (bidaf_config.json) into this squad folder so that it can be uploaded to the 
`Datastore` and accessed during training. As described in [Section 1.3](#1.3-AllenNLP), this configuration file allows us to easily specify the parameters for instantiating AllenNLP objects. This file contains a dictionary of dictionaries. The top level contains 4 main keys: dataset_reader, model, iterator, and trainer (plus keys for train_data_path, validation_data_path, and evaluate_on_test). If you look carefully at [Section 1.3](#1.3-AllenNLP), these correspond to the AllenNLP object building blocks. Each of these keys maps to a dictionary of parameters. For instance, the trainer dictionary contains keys to specify the number of epochs, learning rate scheduler, optimizer, etc. The parameter settings provided here are the ones suggested by AllenNLP for the BiDAF model; however, below we demonstrate how to override these parameters without having to change this configuration file directly.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"'./squad\\\\\\\\bidaf_config.json'\"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"shutil.copy(BIDAF_CONFIG_PATH+'/bidaf_config.json', SQUAD_FOLDER)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now we upload both the SQuAD data files as well as the configuration file to the datastore. `ws.datastores` lists all options for datastores and `ds.account_name` gets the name of the datastore that can be used to find it in the Azure portal. 
Once we have selected the appropriate datastore, we use the `upload()` method to upload all files from the squad local folder to a folder on the datastore called squad_data.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Uploading an estimated of 3 files\\n\",\n      \"Uploading ./squad\\\\bidaf_config.json\\n\",\n      \"Uploading ./squad\\\\squad_dev.json\\n\",\n      \"Uploading ./squad\\\\squad_train.json\\n\",\n      \"Uploaded ./squad\\\\bidaf_config.json, 1 files out of an estimated total of 3\\n\",\n      \"Uploaded ./squad\\\\squad_dev.json, 2 files out of an estimated total of 3\\n\",\n      \"Uploaded ./squad\\\\squad_train.json, 3 files out of an estimated total of 3\\n\",\n      \"Uploaded 3 files\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"$AZUREML_DATAREFERENCE_09a567b57ea546b697d8d7ce1bcf2d86\"\n      ]\n     },\n     \"execution_count\": 9,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# Select a specific datastore or you can call ws.get_default_datastore()\\n\",\n    \"datastore_name = \\\"workspacefilestore\\\"\\n\",\n    \"ds = ws.datastores[datastore_name]\\n\",\n    \"\\n\",\n    \"# Upload files in squad data folder to the datastore\\n\",\n    \"ds.upload(\\n\",\n    \"    src_dir=SQUAD_FOLDER, target_path=\\\"squad_data\\\", overwrite=True, show_progress=True\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3. 
Prepare Training Script\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Here, we create a simple training script that uses AllenNLP's `train_model_from_file()` function containing the following parameters:  \\n\",\n    \"- parameter_filename (str) : A json parameter file specifying an AllenNLP experiment\\n\",\n    \"- serialization_dir (str): The directory in which to save results and logs\\n\",\n    \"- overrides (str): A JSON string that we will use to override values in the input parameter file\\n\",\n    \"- file_friendly_logging (bool, optional): If True, we make our output more friendly to saved model files\\n\",\n    \"- recover (bool, optional): If True, we will try to recover a training run from an existing serialization\\n\",\n    \"\\n\",\n    \"Our training script parameters are: the location of the data folder, name of the configuration file, and JSON string with any overrides for the configuration file. See the [documentation](https://github.com/allenai/allennlp/blob/9a13ab570025a0c1659986009d2abddb2e652020/allennlp/commands/train.py) on AllenNLP `train_model_from_file()` function for more details.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Overwriting ./bidaf-question-answering/train.py\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"%%writefile $PROJECT_FOLDER/train.py\\n\",\n    \"import torch\\n\",\n    \"import argparse\\n\",\n    \"import os\\n\",\n    \"import shutil\\n\",\n    \"from allennlp.common import Params\\n\",\n    \"from allennlp.commands.train import train_model_from_file\\n\",\n    \"\\n\",\n    \"def main():\\n\",\n    \"    # get command-line arguments\\n\",\n    \"    parser = argparse.ArgumentParser()\\n\",\n    \"    parser.add_argument('--data_folder', type=str, \\n\",\n    \"                        
help='Folder where data is stored')\\n\",\n    \"    parser.add_argument('--config_name', type=str, \\n\",\n    \"                        help='Name of json configuration file')\\n\",\n    \"    parser.add_argument('--overrides', type=str, \\n\",\n    \"                        help='Override parameters on config file')\\n\",\n    \"    args = parser.parse_args()\\n\",\n    \"    squad_folder = os.path.join(args.data_folder, \\\"squad_data\\\")\\n\",\n    \"    serialization_folder = \\\"./logs\\\" #save to the run logs folder\\n\",\n    \"    \\n\",\n    \"    #delete log file if it already exists\\n\",\n    \"    if os.path.isdir(serialization_folder):\\n\",\n    \"        shutil.rmtree(serialization_folder)\\n\",\n    \"        \\n\",\n    \"    train_model_from_file(parameter_filename = os.path.join(squad_folder, args.config_name),\\n\",\n    \"           overrides = args.overrides,\\n\",\n    \"           serialization_dir = serialization_folder,\\n\",\n    \"           file_friendly_logging = True,\\n\",\n    \"           recover = False)\\n\",\n    \"\\n\",\n    \"if __name__ == \\\"__main__\\\":\\n\",\n    \"    main()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4. Create a PyTorch Estimator\\n\",\n    \"\\n\",\n    \"AllenNLP is built on PyTorch, so we will use the AzureML SDK's PyTorch estimator to easily submit PyTorch training jobs for both single-node and distributed runs. For more information on the PyTorch estimator, see [How to Train Pytorch Models on AzureML](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-pytorch). 
First we set up a .yml file with the necessary dependencies.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"'bidafenv.yml'\"\n      ]\n     },\n     \"execution_count\": 11,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"myenv = CondaDependencies.create(\\n\",\n    \"    conda_packages= CONDA_PACKAGES,\\n\",\n    \"    pip_packages= PIP_PACKAGES,\\n\",\n    \"    python_version=\\\"3.6.8\\\",\\n\",\n    \")\\n\",\n    \"myenv.add_channel(\\\"conda-forge\\\")\\n\",\n    \"myenv.add_channel(\\\"pytorch\\\")\\n\",\n    \"\\n\",\n    \"conda_env_file_name = \\\"bidafenv.yml\\\"\\n\",\n    \"myenv.save_to_file(PROJECT_FOLDER, conda_env_file_name)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We next define any parameters in the configuration file that we want to override for this specific training run. We demonstrate overriding the num_epochs parameter to perform 25 epochs (rather than 20 epochs as set in bidaf_config.json). The AllenNLP training function expects that overrides are a JSON string, so we convert our dictionary into a JSON string before passing it in as an argument to our training script.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"overrides = {\\\"trainer\\\":{'num_epochs': NUM_EPOCHS}}\\n\",\n    \"overrides = json.dumps(overrides)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Define the parameters to pass to the training script, the project folder, compute target, conda dependencies file, and the name of the training script. Notice that we set `use_gpu` equal to True. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING - If environment_definition or conda_dependencies_file is specified, Azure ML will not install any framework related packages on behalf of the user.\\n\",\n      \"WARNING - framework_version is not specified, defaulting to version 1.1.\\n\",\n      \"WARNING - You have specified to install packages in your run. Note that Azure ML also installs the following packages on your behalf: ['torchvision']. \\n\",\n      \"This may lead to unexpected package installation errors. Take a look at `estimator.conda_dependencies` to understand what packages are installed by Azure ML.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"script_params = {\\n\",\n    \"    \\\"--data_folder\\\": ds.as_mount(),\\n\",\n    \"    \\\"--config_name\\\": \\\"bidaf_config.json\\\",\\n\",\n    \"    \\\"--overrides\\\": overrides,\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"estimator = PyTorch(\\n\",\n    \"    source_directory=PROJECT_FOLDER,\\n\",\n    \"    script_params=script_params,\\n\",\n    \"    compute_target=compute_target,\\n\",\n    \"    entry_script=\\\"train.py\\\",\\n\",\n    \"    use_gpu=True,\\n\",\n    \"    conda_dependencies_file=\\\"bidafenv.yml\\\",\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 5. Submit a Job\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Submit the estimator object to run your experiment. Results can be monitored using a Jupyter widget. 
The widget and run are asynchronous and update every 10-15 seconds until job completion.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Run(Experiment: bidaf-question-answering,\\n\",\n      \"Id: bidaf-question-answering_1563899344_bce3c688,\\n\",\n      \"Type: azureml.scriptrun,\\n\",\n      \"Status: Starting)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"run = experiment.submit(estimator)\\n\",\n    \"print(run)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/vnd.jupyter.widget-view+json\": {\n       \"model_id\": \"3da61f9cf1a84f91ae23925843b584d7\",\n       \"version_major\": 2,\n       \"version_minor\": 0\n      },\n      \"text/plain\": [\n       \"_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"RunDetails(run).show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#wait for the run to complete before continuing in the notebook\\n\",\n    \"run.wait_for_completion() \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Cancel the Job**\\n\",\n    \"\\n\",\n    \"Interrupting/restarting the Jupyter kernel will not properly cancel the run, which can lead to wasted compute resources. 
To avoid this, we recommend explicitly canceling a run with the following code:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#run.cancel()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 6. Inspect Results of Run \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"AllenNLP's training saves all intermediate and final results to the serialization_dir (defined in train.py). In order to inspect the results as well as use the trained model, we will download the files from the run logs using the `download_files()` command.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"run.download_files(prefix=\\\"./logs\\\", output_directory=LOGS_FOLDER)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 6.1 Evaluation on SQuAD\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The metrics.json file contains the final metrics. We can load this file and extract the final SQuAD dev set EM score (key is 'best_validation_em'). 
AllenNLP reports an EM score of 68.3, so depending on the parameters specified in your config file, expect a score in that range.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": 0.6152317880794702,\n       \"encoder\": \"json\",\n       \"name\": \"validation_EM\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"validation_EM\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"0.6152317880794702\"\n      ]\n     },\n     \"execution_count\": 19,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"with open(LOGS_FOLDER+\\\"/logs/metrics.json\\\") as f:\\n\",\n    \"    metrics = json.load(f)\\n\",\n    \"\\n\",\n    \"sb.glue(\\\"validation_EM\\\", metrics[\\\"best_validation_em\\\"])\\n\",\n    \"metrics[\\\"best_validation_em\\\"]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 6.2 Try the Best Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"In order to use our model, we need to create an AllenNLP [Predictor](https://github.com/allenai/allennlp/blob/master/allennlp/predictors/predictor.py) object. We instantiate this object from an archive path. An archive comprises a Model and its experimental configuration file. 
After training a model, the archive is saved to the serialization_dir (whose path is set in train.py).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 20,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING - _jsonnet not loaded, treating ./logs\\\\config.json as json\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"model = Predictor.from_path(LOGS_FOLDER+\\\"/logs\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The Predictor object allows us to directly pass in a question and passage (behind the scenes it converts this to Instance objects using the DatasetReader). We define an example passage/question, call the model's `predict()` function, and finally extract the `best_span_str` attribute which contains the answer to our query.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 21,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"passage = \\\"Machine Comprehension (MC), answering questions about a given context, \\\\\\n\",\n    \"requires modeling complex interactions between the context and the query. Recently,\\\\\\n\",\n    \"attention mechanisms have been successfully extended to MC. Typically these mechanisms\\\\\\n\",\n    \"use attention to summarize the query and context into a single vector, couple \\\\\\n\",\n    \"attentions temporally, and often form a uni-directional attention. In this paper \\\\\\n\",\n    \"we introduce the Bi-Directional Attention Flow (BIDAF) network, a multi-stage \\\\\\n\",\n    \"hierarchical process that represents the context at different levels of granularity \\\\\\n\",\n    \"and uses a bi-directional attention flow mechanism to achieve a query-aware context \\\\\\n\",\n    \"representation without early summarization. 
Our experimental evaluations show that \\\\\\n\",\n    \"our model achieves the state-of-the-art results in Stanford QA (SQuAD) and \\\\\\n\",\n    \"CNN/DailyMail Cloze Test datasets.\\\"\\n\",\n    \"\\n\",\n    \"question = \\\"What dataset does BIDAF achieve state-of-the-art results on?\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 22,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"result = model.predict(question, passage)[\\\"best_span_str\\\"]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 23,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"'Stanford QA'\"\n      ]\n     },\n     \"execution_count\": 23,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"result\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/question_answering/bidaf_config.json",
    "content": "{\r\n  \"dataset_reader\": {\r\n    \"type\": \"squad\",\r\n    \"token_indexers\": {\r\n      \"tokens\": {\r\n        \"type\": \"single_id\",\r\n        \"lowercase_tokens\": true\r\n      },\r\n      \"token_characters\": {\r\n        \"type\": \"characters\",\r\n        \"character_tokenizer\": {\r\n          \"byte_encoding\": \"utf-8\",\r\n          \"start_tokens\": [259],\r\n          \"end_tokens\": [260]\r\n        },\r\n        \"min_padding_length\": 5\r\n      }\r\n    }\r\n  },\r\n  \"train_data_path\": \"https://allennlp.s3.amazonaws.com/datasets/squad/squad-train-v1.1.json\",\r\n  \"validation_data_path\": \"https://allennlp.s3.amazonaws.com/datasets/squad/squad-dev-v1.1.json\",\r\n  \"evaluate_on_test\": true,\r\n  \"model\": {\r\n    \"type\": \"bidaf\",\r\n    \"text_field_embedder\": {\r\n      \"token_embedders\": {\r\n        \"tokens\": {\r\n          \"type\": \"embedding\",\r\n          \"pretrained_file\": \"https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.100d.txt.gz\",\r\n          \"embedding_dim\": 100,\r\n          \"trainable\": false\r\n        },\r\n        \"token_characters\": {\r\n          \"type\": \"character_encoding\",\r\n          \"embedding\": {\r\n            \"num_embeddings\": 262,\r\n            \"embedding_dim\": 16\r\n          },\r\n          \"encoder\": {\r\n            \"type\": \"cnn\",\r\n            \"embedding_dim\": 16,\r\n            \"num_filters\": 100,\r\n            \"ngram_filter_sizes\": [5]\r\n          },\r\n          \"dropout\": 0.2\r\n        }\r\n      }\r\n    },\r\n    \"num_highway_layers\": 2,\r\n    \"phrase_layer\": {\r\n      \"type\": \"lstm\",\r\n      \"bidirectional\": true,\r\n      \"input_size\": 200,\r\n      \"hidden_size\": 100,\r\n      \"num_layers\": 1\r\n    },\r\n    \"similarity_function\": {\r\n      \"type\": \"linear\",\r\n      \"combination\": \"x,y,x*y\",\r\n      \"tensor_1_dim\": 200,\r\n      \"tensor_2_dim\": 200\r\n    },\r\n    
\"modeling_layer\": {\r\n      \"type\": \"lstm\",\r\n      \"bidirectional\": true,\r\n      \"input_size\": 800,\r\n      \"hidden_size\": 100,\r\n      \"num_layers\": 2,\r\n      \"dropout\": 0.2\r\n    },\r\n    \"span_end_encoder\": {\r\n      \"type\": \"lstm\",\r\n      \"bidirectional\": true,\r\n      \"input_size\": 1400,\r\n      \"hidden_size\": 100,\r\n      \"num_layers\": 1\r\n    },\r\n    \"dropout\": 0.2\r\n  },\r\n  \"iterator\": {\r\n    \"type\": \"bucket\",\r\n    \"sorting_keys\": [[\"passage\", \"num_tokens\"], [\"question\", \"num_tokens\"]],\r\n    \"batch_size\": 40\r\n  },\r\n\r\n  \"trainer\": {\r\n    \"num_epochs\": 20, \r\n    \"grad_norm\": 5.0,\r\n    \"patience\": 10,\r\n    \"validation_metric\": \"+em\",\r\n    \"cuda_device\": 0,\r\n    \"learning_rate_scheduler\": {\r\n      \"type\": \"reduce_on_plateau\",\r\n      \"factor\": 0.5,\r\n      \"mode\": \"max\",\r\n      \"patience\": 2\r\n    },\r\n    \"optimizer\": {\r\n      \"type\": \"adam\",\r\n      \"betas\": [0.9, 0.9]\r\n    }\r\n  }\r\n}"
  },
  {
    "path": "examples/question_answering/pretrained-BERT-SQuAD-deep-dive-aml.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Copyright (c) Microsoft Corporation. All rights reserved.\\n\",\n    \"\\n\",\n    \"Licensed under the MIT License.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Question Answering: Fine-Tune BERT on AzureML (PyTorch)\\n\",\n    \"**BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding** [\\\\[1\\\\]](#References)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/question_answering/pretrained_BERT_SQuAD_deep_dive_aml.png)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This notebook contains an end-to-end walkthrough using [Azure Machine Learning service (AzureML)](https://azure.microsoft.com/en-us/services/machine-learning-service/) to fine-tune the pretrained [PyTorch implementation](https://github.com/huggingface/pytorch-pretrained-BERT) of [Google's TensorFlow repository for the BERT model](https://github.com/google-research/bert) developed by Hugging Face. 
\\n\",\n    \"\\n\",\n    \"**Note: To learn how to do pre-training on your own, please reference the [AzureML-BERT repo](https://github.com/microsoft/AzureML-BERT) created by Microsoft.**\\n\",\n    \"\\n\",\n    \"This notebook will walk through the following:\\n\",\n    \"- Download the SQuAD dataset on a remote compute and store the dataset in Azure Blob storage\\n\",\n    \"- Fine-tune BERT with distributed PyTorch by Horovod for SQuAD dataset using GPU clusters provided by AzureML\\n\",\n    \"- Further fine-tune BERT with AzureML's hyperparameter tuning \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## What is BERT?\\n\",\n    \"\\n\",\n    \"[BERT (Bidirectional Encoder Representations from Transformers)](https://arxiv.org/abs/1810.04805) is a powerful pre-trained language model that achieves state-of-the-art results in a wide variety of NLP tasks, including Question Answering (SQuAD v1.1), Natural Language Inference (MNLI), Text Classification, Named Entity Recognition, etc., with only a few epochs of fine-tuning on task-specific datasets. The key technical innovation of BERT is applying bidirectional training to the Transformer, a popular attention model that learns contextual relations between words (or sub-words) in a text.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## How to fine-tune BERT for QA\\n\",\n    \"\\n\",\n    \"The figure below shows how BERT can be fine-tuned for Question Answering (QA) tasks. BERT takes the question-passage pairs in the SQuAD dataset as inputs, and `[SEP]` is a special separator token that separates the question from the passage. 
At the output layer, it outputs `Start/End` to denote the answer in the paragraph.\\n\",\n    \"\\n\",\n    \"<img src=\\\"https://nlpbp.blob.core.windows.net/images/bertqa.PNG\\\" height=400 width=400>\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## What is the SQuAD dataset?\\n\",\n    \"\\n\",\n    \"\\\"Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable\\\". [\\\\[2\\\\]](#References)\\n\",\n    \"\\n\",\n    \"\\\"SQuAD 1.1, the previous version of the SQuAD dataset, contains 100,000+ question-answer pairs on 500+ articles\\\". [\\\\[2\\\\]](#References) More details from [https://rajpurkar.github.io/SQuAD-explorer/](https://rajpurkar.github.io/SQuAD-explorer/).\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Table of Contents\\n\",\n    \"\\n\",\n    \"<ol start=\\\"0\\\">\\n\",\n    \"  <li> [Prerequisites: Global Settings](#0.-Prerequisites:-Global-Settings)</li>\\n\",\n    \"  <li> [Data Loading](#1.-Data-Loading)\\n\",\n    \"  <ul style=\\\"list-style: none;\\\"><li>[1.1 Default AzureML datastore](#1.1-Default-AzureML-datastore)</li>\\n\",\n    \"        <li>[1.2 Download training dataset - SQuAD](#1.2-Download-training-dataset---SQuAD)</li>\\n\",\n    \"        <li>[1.3 Upload to Azure blob storage](#1.3-Upload-to-Azure-blob-storage)</li>\\n\",\n    \"    </ul>\\n\",\n    \"  </li><br>\\n\",\n    \"  <li> [Fine tuning BERT with Distributed Training by Horovod](#2.-Fine-tuning-BERT-with-Distributed-Training-by-Horovod)\\n\",\n    \"    <ul style=\\\"list-style: none;\\\">\\n\",\n    \"        <li> [2.1 Create or Attach Existing AmlCompute](#2.1-Create-a-GPU-remote-compute-target)</li>\\n\",\n    \"      
  <li> [2.2 Create a project directory](#2.2-Create-a-project-directory)  </li>\\n\",\n    \"        <li> [2.3 Prepare your training script](#2.3-Prepare-your-training-script) </li> \\n\",\n    \"        <li> [2.4 Create a PyTorch estimator for fine-tuning](#2.4-Create-a-PyTorch-estimator-for-fine-tuning) </li> \\n\",\n    \"        <li> [2.5 Create an experiment](#2.5-Create-an-experiment)  </li>\\n\",\n    \"        <li> [2.6 Submit and Monitor your run](#2.6-Submit-and-Monitor-your-run)  </li>\\n\",\n    \"      </ul>\\n\",\n    \"    </li><br>\\n\",\n    \"    <li>[Fine Tuning BERT with Hyperparameter Tuning](#3-Fine-Tuning-BERT-with-Hyperparameter-Tuning)\\n\",\n    \"        <ul style=\\\"list-style: none;\\\">\\n\",\n    \"            <li> [3.1 Start a hyperparameter sweep](#3.1-Start-a-hyperparameter-sweep)</li>\\n\",\n    \"            <li> [3.2 Monitor HyperDrive runs](#3.2-Monitor-HyperDrive-runs)</li>\\n\",\n    \"            <li> [3.3 Find and register the best model](#3.3-Find-and-register-the-best-model)</li>\\n\",\n    \"        </ul>\\n\",\n    \"    </li><br>\\n\",\n    \"    <li>[References](#References)</li>\\n\",\n    \"</ol>\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 0. Prerequisites: Global Settings\\n\",\n    \"You will need to do the following to be successful with the rest of the notebook:\\n\",\n    \"- Have an existing Azure subscription. 
You can get started for free [here](https://azure.microsoft.com/free/)\\n\",\n    \"- Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning service (AzureML)\\n\",\n    \"\\n\",\n    \"- Make sure the [AzureML Python SDK](https://pypi.org/project/azureml-sdk/) is installed with notebooks and contrib add ons.\\n\",\n    \"```\\n\",\n    \"conda create -n azureml -y Python=3.6\\n\",\n    \"source activate azureml\\n\",\n    \"pip install --upgrade azureml-sdk[notebooks,contrib] \\n\",\n    \"conda install ipywidgets\\n\",\n    \"jupyter nbextension install --py --user azureml.widgets\\n\",\n    \"jupyter nbextension enable azureml.widgets --user --py\\n\",\n    \"```\\n\",\n    \"- Import the required packages\\n\",\n    \"- Set Environment Variables\\n\",\n    \"- Connect to an [Azure Machine Learning service workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace#create-a-workspace)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Run the following cell to make sure you have installed all the packages.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import sys\\n\",\n    \"sys.path.append(\\\"../../\\\")\\n\",\n    \"import urllib, os\\n\",\n    \"from utils_nlp.azureml import azureml_utils\\n\",\n    \"import math\\n\",\n    \"import json \\n\",\n    \"import pandas as pd\\n\",\n    \"import papermill as pm\\n\",\n    \"#package for flattening json in pandas df\\n\",\n    \"from pandas.io.json import json_normalize\\n\",\n    \"import shutil\\n\",\n    \"import scrapbook as sb\\n\",\n    \"# Check core SDK version number\\n\",\n    \"import azureml.core\\n\",\n    \"from azureml.core import Datastore\\n\",\n    \"from 
azureml.core import Experiment\\n\",\n    \"from azureml.core.compute import ComputeTarget, AmlCompute\\n\",\n    \"from azureml.core.compute_target import ComputeTargetException\\n\",\n    \"from azureml.core.runconfig import MpiConfiguration\\n\",\n    \"from azureml.telemetry import set_diagnostics_collection\\n\",\n    \"from azureml.train.dnn import PyTorch\\n\",\n    \"from azureml.train.hyperdrive import *\\n\",\n    \"from azureml.widgets import RunDetails\\n\",\n    \"\\n\",\n    \"print(\\\"SDK version:\\\", azureml.core.VERSION)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The following parameters are set as variables to be used throughout the notebooks.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# Model configuration\\n\",\n    \"DATA_FOLDER = './squad'\\n\",\n    \"PROJECT_FOLDER = './transformers'\\n\",\n    \"EXPERIMENT_NAME = 'NLP-QA-BERT-deepdive'\\n\",\n    \"BERT_MODEL = 'bert-large-uncased'\\n\",\n    \"TARGET_GRADIENT_STEPS = 16\\n\",\n    \"INIT_GRADIENT_STEPS = 2\\n\",\n    \"MAX_SEQ_LENGTH = 384\\n\",\n    \"NUM_TRAIN_EPOCHS = 2.0\\n\",\n    \"NODE_COUNT = 2\\n\",\n    \"TRAIN_SCRIPT_PATH = 'bert_run_squad_azureml.py'\\n\",\n    \"MAX_TOTAL_RUNS = 8\\n\",\n    \"MAX_CONCURRENT_RUNS = 4\\n\",\n    \"BERT_UTIL_PATH = '../../utils_nlp/azureml/azureml_bert_util.py'\\n\",\n    \"EVALUATE_SQAD_PATH = '../../utils_nlp/eval/evaluate_squad.py'\\n\",\n    \"\\n\",\n    \"# Azure resources\\n\",\n    \"subscription_id = \\\"YOUR_SUBSCRIPTION_ID\\\"\\n\",\n    \"resource_group = \\\"YOUR_RESOURCE_GROUP_NAME\\\"  \\n\",\n    \"workspace_name = \\\"YOUR_WORKSPACE_NAME\\\"  \\n\",\n    \"workspace_region = \\\"YOUR_WORKSPACE_REGION\\\" #Possible values eastus, eastus2 and so on.\\n\",\n    \"AZUREML_CONFIG_PATH = \\\"./.azureml\\\"\\n\",\n    
\"AZUREML_VERBOSE = False\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Initialize workspace**\\n\",\n    \"\\n\",\n    \"The following cell sets up the connection to your [Azure Machine Learning service Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace). You can choose to connect to an existing workspace or create a new one. \\n\",\n    \"\\n\",\n    \"**To access an existing workspace:**\\n\",\n    \"1. If you have a `config.json` file, you do not need to provide the workspace information; you only need to update the `AZUREML_CONFIG_PATH` variable defined above so that it points to the location of that file.\\n\",\n    \"2. Otherwise, you will need to supply the following:\\n\",\n    \"    * The name of your workspace\\n\",\n    \"    * Your subscription id\\n\",\n    \"    * The resource group name\\n\",\n    \"\\n\",\n    \"**To create a new workspace:**\\n\",\n    \"\\n\",\n    \"Set the following information:\\n\",\n    \"* A name for your workspace\\n\",\n    \"* Your subscription id\\n\",\n    \"* The resource group name\\n\",\n    \"* [Azure region](https://azure.microsoft.com/en-us/global-infrastructure/regions/) to create the workspace in, such as `eastus2`. \\n\",\n    \"\\n\",\n    \"This will automatically create a new resource group for you in the region provided if a resource group with the name given does not already exist. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"if os.path.exists(AZUREML_CONFIG_PATH):\\n\",\n    \"    ws = azureml_utils.get_or_create_workspace(config_path=AZUREML_CONFIG_PATH)\\n\",\n    \"else:\\n\",\n    \"    ws = azureml_utils.get_or_create_workspace(\\n\",\n    \"        subscription_id=subscription_id,\\n\",\n    \"        resource_group=resource_group,\\n\",\n    \"        workspace_name=workspace_name,\\n\",\n    \"        workspace_region=workspace_region,\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"if AZUREML_VERBOSE:\\n\",\n    \"    print('Workspace name: ' + ws.name, \\n\",\n    \"          'Azure region: ' + ws.location, \\n\",\n    \"          'Subscription id: ' + ws.subscription_id, \\n\",\n    \"          'Resource group: ' + ws.resource_group, sep='\\\\n')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Diagnostics**\\n\",\n    \"\\n\",\n    \"Opt-in diagnostics for better experience, quality, and security of future releases.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Turning diagnostics collection on. \\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"set_diagnostics_collection(send_diagnostics=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1. Data Loading\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"In this section, we will\\n\",\n    \"1. Connect to the default AzureML datastore\\n\",\n    \"2. Download and load the dataset\\n\",\n    \"3. 
Upload the training set to the default blob storage of the workspace\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"data_folder = DATA_FOLDER\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.1 Default AzureML datastore\\n\",\n    \"\\n\",\n    \"To make data accessible for remote training, AzureML provides a convenient way to do so via a [Datastore](https://docs.microsoft.com/azure/machine-learning/service/how-to-access-data). The datastore provides a mechanism for you to upload/download data to Azure Storage, and interact with it from your remote compute targets.\\n\",\n    \"\\n\",\n    \"Each workspace is associated with a default Azure Blob datastore named `'workspaceblobstore'`. We use this default datastore to collect the SQuAD training data.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"AzureBlob maidaipberteas6144514557 azureml-blobstore-cf97de17-8d21-437f-8b4c-298560f34ecd $AZUREML_DATAREFERENCE_workspaceblobstore\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"ds = ws.get_default_datastore()\\n\",\n    \"if AZUREML_VERBOSE:\\n\",\n    \"    print(ds.datastore_type, ds.account_name, ds.container_name, ds.as_mount())\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.2 Download training dataset - SQuAD\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The SQuAD dataset can be downloaded with the following links and should be saved in a blob storage.\\n\",\n    \"- [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)\\n\",\n    \"- 
[dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"('./squad\\\\\\\\dev-v1.1.json', <http.client.HTTPMessage at 0x1569b645f28>)\"\n      ]\n     },\n     \"execution_count\": 7,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"os.makedirs('./squad', exist_ok=True)\\n\",\n    \"urllib.request.urlretrieve('https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json', filename=os.path.join(data_folder, 'train-v1.1.json'))\\n\",\n    \"urllib.request.urlretrieve('https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json', filename=os.path.join(data_folder, 'dev-v1.1.json'))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The SQuAD dataset contains question-answer pairs on 500+ articles. For each observation in the training set, we have a **context, question, and text**. 
An example shown as below: [source](https://towardsdatascience.com/building-a-question-answering-system-part-1-9388aadff507)\\n\",\n    \"<img src=\\\"https://nlpbp.blob.core.windows.net/images/squad.png\\\">\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"The structure of an example in the training data.\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>context</th>\\n\",\n       \"      <th>qas</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>Architecturally, the school has a Catholic cha...</td>\\n\",\n       \"      <td>[{'answers': [{'answer_start': 515, 'text': 'S...</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"                                             context  \\\\\\n\",\n       \"0  Architecturally, the school has a Catholic cha...   \\n\",\n       \"\\n\",\n       \"                                                 qas  \\n\",\n       \"0  [{'answers': [{'answer_start': 515, 'text': 'S...  
\"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"#load json object\\n\",\n    \"with open(os.path.join(data_folder, 'train-v1.1.json')) as train_file:\\n\",\n    \"    train = json.load(train_file)\\n\",\n    \"for paragraph in train['data'][0]['paragraphs']:\\n\",\n    \"    answer_question = paragraph['qas']\\n\",\n    \"    context = paragraph['context']\\n\",\n    \"    paragraph = paragraph\\n\",\n    \"    break\\n\",\n    \"print(\\\"The structure of an example in the training data.\\\")\\n\",\n    \"json_normalize(paragraph).head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"The structure of question answer pairs in the above example.\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>answers</th>\\n\",\n       \"      <th>id</th>\\n\",\n       \"      <th>question</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>[{'answer_start': 515, 'text': 'Saint Bernadet...</td>\\n\",\n       \"      
<td>5733be284776f41900661182</td>\\n\",\n       \"      <td>To whom did the Virgin Mary allegedly appear i...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>[{'answer_start': 188, 'text': 'a copper statu...</td>\\n\",\n       \"      <td>5733be284776f4190066117f</td>\\n\",\n       \"      <td>What is in front of the Notre Dame Main Building?</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>[{'answer_start': 279, 'text': 'the Main Build...</td>\\n\",\n       \"      <td>5733be284776f41900661180</td>\\n\",\n       \"      <td>The Basilica of the Sacred heart at Notre Dame...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>[{'answer_start': 381, 'text': 'a Marian place...</td>\\n\",\n       \"      <td>5733be284776f41900661181</td>\\n\",\n       \"      <td>What is the Grotto at Notre Dame?</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>[{'answer_start': 92, 'text': 'a golden statue...</td>\\n\",\n       \"      <td>5733be284776f4190066117e</td>\\n\",\n       \"      <td>What sits on top of the Main Building at Notre...</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"                                             answers  \\\\\\n\",\n       \"0  [{'answer_start': 515, 'text': 'Saint Bernadet...   \\n\",\n       \"1  [{'answer_start': 188, 'text': 'a copper statu...   \\n\",\n       \"2  [{'answer_start': 279, 'text': 'the Main Build...   \\n\",\n       \"3  [{'answer_start': 381, 'text': 'a Marian place...   \\n\",\n       \"4  [{'answer_start': 92, 'text': 'a golden statue...   
\\n\",\n       \"\\n\",\n       \"                         id                                           question  \\n\",\n       \"0  5733be284776f41900661182  To whom did the Virgin Mary allegedly appear i...  \\n\",\n       \"1  5733be284776f4190066117f  What is in front of the Notre Dame Main Building?  \\n\",\n       \"2  5733be284776f41900661180  The Basilica of the Sacred heart at Notre Dame...  \\n\",\n       \"3  5733be284776f41900661181                  What is the Grotto at Notre Dame?  \\n\",\n       \"4  5733be284776f4190066117e  What sits on top of the Main Building at Notre...  \"\n      ]\n     },\n     \"execution_count\": 9,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"print(\\\"The structure of question answer pairs in the above example.\\\")\\n\",\n    \"json_normalize(answer_question).head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.3 Upload to Azure blob storage\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The following code will upload the SQuAD dataset to the path ./squad on the default datastore.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Uploading an estimated of 2 files\\n\",\n      \"Target already exists. Skipping upload for squad\\\\dev-v1.1.json\\n\",\n      \"Target already exists. 
Skipping upload for squad\\\\train-v1.1.json\\n\",\n      \"Uploaded 0 files\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"$AZUREML_DATAREFERENCE_972d18f476b34d26a1ffd6a11b473114\"\n      ]\n     },\n     \"execution_count\": 10,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"ds.upload(src_dir='./squad', target_path='./squad', show_progress=AZUREML_VERBOSE)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2. Fine-tuning BERT with Distributed Training by Horovod\\n\",\n    \"We can reference the datastore to access the SQuAD dataset and start fine-tuning the model by exploring the power of distributed training on AzureML GPU clusters.\\n\",\n    \"\\n\",\n    \"Once you've created your workspace and set up your development environment, training a model in Azure Machine Learning involves the following steps:\\n\",\n    \"1. Create a GPU remote compute target\\n\",\n    \"2. Create a project directory\\n\",\n    \"3. Prepare your training script\\n\",\n    \"4. Create an Estimator object\\n\",\n    \"5. Submit the estimator to an experiment object under the workspace\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.1 Create a GPU remote compute target\\n\",\n    \"\\n\",\n    \"We need to create a GPU [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to perform the fine-tuning. In this example, we create an AmlCompute cluster as our training compute resource. 
The table below lists the specifications of the relevant Azure VM sizes.\\n\",\n    \"\\n\",\n    \"    \\n\",\n    \"|    VM Size    \\t| CPU \\t|   GPU   \\t| Storage (SSD) \\t| GPU memory \\t| InfiniBand  \\t|\\n\",\n    \"|:-------------:\\t|:---:\\t|:-------:\\t|:-------------:\\t|:----------:\\t|:----------:\\t|\\n\",\n    \"|  Standard_NC6 \\t|  6  \\t| 1 x K80 \\t|    340 GiB    \\t|    8 GiB   \\t|      No   \\t|\\n\",\n    \"| Standard_NC12 \\t|  12 \\t| 2 x K80 \\t|    680 GiB    \\t|   16 GiB   \\t|      No   \\t|\\n\",\n    \"| Standard_NC24 \\t|  24 \\t| 4 x K80 \\t|    1440 GiB   \\t|   32 GiB   \\t|      No   \\t|\\n\",\n    \"| Standard_NC24r \\t|  24 \\t| 4 x K80 \\t|    1440 GiB   \\t|   32 GiB   \\t|      Yes   \\t|\\n\",\n    \"| Standard_NC6s_v3 \\t|  6  \\t| 1 x V100 \\t|    736 GiB    \\t|   16 GiB   \\t|      No   \\t|\\n\",\n    \"| Standard_NC12s_v3 |  12 \\t| 2 x V100 \\t|    1474 GiB   \\t|   32 GiB   \\t|      No   \\t|\\n\",\n    \"| Standard_NC24s_v3 |  24 \\t| 4 x V100 \\t|    2948 GiB   \\t|   64 GiB   \\t|      No   \\t|\\n\",\n    \"| Standard_NC24rs_v3|  24 \\t| 4 x V100 \\t|    2948 GiB   \\t|   64 GiB   \\t|      Yes   \\t|\\n\",\n    \"\\n\",\n    \"The following code creates a cluster for you if one does not already exist in your workspace.\\n\",\n    \"\\n\",\n    \"***We strongly recommend using NCv3-series (NVIDIA Tesla V100) VMs to fine-tune on the SQuAD dataset. 
You will need to request quota of NCv3-series for your AzureML subscription.***\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Found existing compute target.\\n\",\n      \"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-22T22:38:04.496000+00:00', 'errors': None, 'creationTime': '2019-07-12T19:59:45.933132+00:00', 'modifiedTime': '2019-07-12T20:00:01.793458+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# choose a name for your cluster\\n\",\n    \"cluster_name = \\\"bertncrs24\\\"\\n\",\n    \"\\n\",\n    \"try:\\n\",\n    \"    gpu_compute_target = ComputeTarget(workspace=ws, name=cluster_name)\\n\",\n    \"    print('Found existing compute target.')\\n\",\n    \"except ComputeTargetException:\\n\",\n    \"    print('Creating a new compute target...')\\n\",\n    \"    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC24rs_v3',\\n\",\n    \"                                                           max_nodes=4)\\n\",\n    \"\\n\",\n    \"    # create the cluster\\n\",\n    \"    gpu_compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\\n\",\n    \"\\n\",\n    \"    gpu_compute_target.wait_for_completion(show_output=True)\\n\",\n    \"\\n\",\n    \"# use get_status() to get a detailed status for the current AmlCompute. 
\\n\",\n    \"print(gpu_compute_target.get_status().serialize())\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.2 Create a project directory\\n\",\n    \"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"project_folder = PROJECT_FOLDER\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Make a local clone of the original [PyTorch reimplementation](https://github.com/huggingface/pytorch-pretrained-BERT) repository\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"!git clone -b v0.4.0 https://github.com/huggingface/transformers.git\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.3 Prepare your training script\\n\",\n    \"Let us prepare the training script to run the fine-tuning script `run_squad.py` from [the Hugging Face repository](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_squad.py). Please refer to the [repo](https://github.com/huggingface/pytorch-pretrained-BERT#fine-tuning-with-bert-running-the-examples) for more details about the script. \\n\",\n    \"\\n\",\n    \"The original `run_squad.py` script uses the PyTorch distributed launch utility to launch multiple processes across nodes and GPUs. 
Here we use a [modified version](https://github.com/microsoft/AzureML-BERT/blob/master/finetune/run_squad_azureml.py) of this file provided by the [AzureML-BERT repo](https://github.com/microsoft/AzureML-BERT) from Microsoft to be able to launch multiple processes across nodes and GPUs with an AzureML built-in MPI backend.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Let's retrieve and copy the training script [bert_run_squad_azureml.py](.\\\\bert_run_squad_azureml.py), the evaluation script for SQuAD v1.1 [evaluate-v1.1.py](../../utils_nlp/eval/evaluate_squad.py) and the helper utility script for Horovod [azureml_bert_util.py](../../utils_nlp/azureml/azureml_bert_util.py) into our project directory.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"shutil.copy(EVALUATE_SQAD_PATH, project_folder)\\n\",\n    \"shutil.copy(BERT_UTIL_PATH, project_folder)\\n\",\n    \"shutil.copy(TRAIN_SCRIPT_PATH, project_folder)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.4 Create a PyTorch estimator for fine-tuning\\n\",\n    \"Let us create a new PyTorch estimator to run the fine-tuning script `run_squad_azureml.py`. To use AzureML's tracking and metrics capabilities, we need to add a small amount of AzureML code inside the training script.\\n\",\n    \"\\n\",\n    \"In `run_squad_azureml.py`, we will log some metrics to our AzureML run. 
To do so, we will access the AzureML run object within the script:\\n\",\n    \"```Python\\n\",\n    \"from azureml.core.run import Run\\n\",\n    \"run = Run.get_context()\\n\",\n    \"```\\n\",\n    \"Further within `run_squad_azureml.py`, we log learning rate, training loss and prediction scores the model achieves as:\\n\",\n    \"```Python\\n\",\n    \"run.log('lr', np.float(args.learning_rate))\\n\",\n    \"...\\n\",\n    \"\\n\",\n    \"for step, batch in enumerate(tqdm(train_dataloader, desc=\\\"Iteration\\\")): \\n\",\n    \"    ...\\n\",\n    \"    run.log('train_loss', np.float(loss))\\n\",\n    \"\\n\",\n    \"..\\n\",\n    \"```\\n\",\n    \"These run metrics will become particularly important when we begin hyperparameter tuning our model in the \\\"Tune model hyperparameters\\\" section below.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then, AzureML PyTorch estimator can be defined as below. We use `azuremlsamples/bert:torch-1.0.0-apex-cuda9` as the base docker image with [dockerfile](./dockerfile). In this example, we use STANDARD_NC24rs_v3 which has 4 GPUs. 
Thus, we can set `process_count_per_node=4`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"mpiConfig=MpiConfiguration()\\n\",\n    \"mpiConfig.process_count_per_node=4\\n\",\n    \"\\n\",\n    \"estimator = PyTorch(source_directory=project_folder,\\n\",\n    \"                    compute_target=gpu_compute_target,\\n\",\n    \"                    script_params = {\\n\",\n    \"                          '--bert_model':BERT_MODEL,\\n\",\n    \"                          '--do_train' : '',\\n\",\n    \"                          '--do_predict': '',\\n\",\n    \"                          '--train_file': ds.path('squad/train-v1.1.json').as_mount(),\\n\",\n    \"                          '--predict_file': ds.path('squad/dev-v1.1.json').as_mount(),\\n\",\n    \"                          '--max_seq_length': MAX_SEQ_LENGTH,\\n\",\n    \"                          '--train_batch_size': 8,\\n\",\n    \"                          '--learning_rate': 6.8e-5,\\n\",\n    \"                          '--num_train_epochs': NUM_TRAIN_EPOCHS,\\n\",\n    \"                          '--doc_stride': 128,\\n\",\n    \"                          '--seed': 32,\\n\",\n    \"                          '--init_gradient_accumulation_steps':INIT_GRADIENT_STEPS,\\n\",\n    \"                          '--target_gradient_accumulation_steps':TARGET_GRADIENT_STEPS,\\n\",\n    \"                          '--accumulation_warmup_proportion':0.25,\\n\",\n    \"                          '--output_dir': './outputs',\\n\",\n    \"                          '--loss_scale':256,\\n\",\n    \"                    },\\n\",\n    \"                    custom_docker_image='azuremlsamples/bert:torch-1.0.0-apex-cuda9',\\n\",\n    \"                    entry_script='bert_run_squad_azureml.py',\\n\",\n    \"                    node_count=NODE_COUNT,\\n\",\n    \"                    distributed_training=mpiConfig,\\n\",\n    \"      
              framework_version='1.1',\\n\",\n    \"                    use_gpu=True)\\n\",\n    \"estimator._estimator_config.environment.python.user_managed_dependencies=True\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Note: You can try with `--bert_model:'bert-base-uncased'` to run a smaller BERT model faster.**\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.5 Create an experiment\\n\",\n    \"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed PyTorch tutorial. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"experiment_name = EXPERIMENT_NAME\\n\",\n    \"experiment = Experiment(ws, name=experiment_name)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.6 Submit and Monitor your run\\n\",\n    \"AzureML can automatically create figures of the loss and time, which makes it easier to track performance, as in the following figure showing the train loss vs. 
the number of iterations:\\n\",\n    \"![train_loss_bert](https://nlpbp.blob.core.windows.net/images/train_loss_bert.PNG)\\n\",\n    \"\\n\",\n    \"The Jupyter widget looks like this:\\n\",\n    \"![train_loss_bert](https://nlpbp.blob.core.windows.net/images/bert_widget.PNG)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 74,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/vnd.jupyter.widget-view+json\": {\n       \"model_id\": \"edeffbf4c94047f48699af91c45a27b9\",\n       \"version_major\": 2,\n       \"version_minor\": 0\n      },\n      \"text/plain\": [\n       \"_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"run = experiment.submit(estimator)\\n\",\n    \"RunDetails(run).show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"_ = run.wait_for_completion(show_output=AZUREML_VERBOSE) # Block until complete\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Cancel the job**\\n\",\n    \"\\n\",\n    \"You can cancel the job manually to make sure you do not waste resources.\\n\",\n    \" ```python\\n\",\n    \"# Cancel the job with id.\\n\",\n    \"job_id = \\\"BERT-SQuAD_1562612876_bab5b3af\\\"\\n\",\n    \"run = get_run(experiment, job_id)\\n\",\n    \"\\n\",\n    \"# Cancel jobs.\\n\",\n    \"run.cancel()\\n\",\n    \"```\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"To achieve an F1 score over **90.5** and an Exact-Match score over **83.5** with the `SQuAD v1.1` dataset, **2** epochs are required when fine-tuning with the `BERT large` model. 
Below please find the elapsed time using different Azure GPU VMs and configurations. \\n\",\n    \"\\n\",\n    \"The default configuration in this notebook uses 2 `STANDARD_NC24rs_v3` (8 x V100) with `fp16` enabled. The training phase should take **22 mins** to complete 2 epochs. \\n\",\n    \"\\n\",\n    \"|  GPU counts \\t|    1 GPU    \\t|         2 GPU \\t| 4 GPU      \\t| 8 GPU      \\t|\\n\",\n    \"|------------:\\t|:-----------:\\t|--------------:\\t|------------\\t|------------\\t|\\n\",\n    \"| NCv3-series \\t|     340 mins  |    180 mins \\t    |    80 mins \\t|   48 mins \\t|\\n\",\n    \"| NCv3 with fp16|     140 mins  |    79 mins \\t    |    38 mins \\t|   22 mins \\t|\\n\",\n    \"\\n\",\n    \"The performance with different VMs with `fp16` enabled (Duration = training time + preparing time):\\n\",\n    \"\\n\",\n    \"|  VM Size \\t|  GPU counts|    Node counts|    Duration    \\t|         F1 \\t| EM     \\t| Pretrained BERT      \\t|\\n\",\n    \"|------------:\\t|:-----------:\\t|--------------:\\t|------------\\t|------------\\t|------------\\t|------------\\t|\\n\",\n    \"| NC6sv3 |   4 |  4 |  31 mins  |    88.24 \\t    |    80.59 \\t|   Base \\t|\\n\",\n    \"| NC6sv3 |   4 |  4 |  80 mins  |    90.78 \\t    |    83.96 \\t|   Large \\t|\\n\",\n    \"| NC24rsv3 |  4 |   1 |  19 mins  |    86.18 \\t    |    77.90 \\t|   Base \\t|\\n\",\n    \"| NC24rsv3 |  4 |   1 |  46 mins  |    90.53 \\t    |    83.56 \\t|   Large \\t|\\n\",\n    \"| NC24rsv3 |  8 |   2 |  19 mins  |    87.47 \\t    |    79.52 \\t|   Base \\t|\\n\",\n    \"| NC24rsv3 |  8 |   2 |  32 mins  |    90.57 \\t    |    83.58 \\t|   Large \\t|\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3 Fine-Tuning BERT with Hyperparameter Tuning\\n\",\n    \"\\n\",\n    \"We would also like to optimize our hyperparameter, `learning rate`, using Azure Machine Learning's hyperparameter tuning capabilities.\"\n   ]\n  },\n  {\n   \"cell_type\": 
\"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 3.1 Start a hyperparameter sweep\\n\",\n    \"First, we will define the hyperparameter space to sweep over. In this example we will use random sampling to try different configuration sets of hyperparameters to maximize our primary metric, the F1 score (`f1`). For simplicity, we tune the BERT base model with `'--bert_model':'bert-base-uncased'` and `node_count=1`.\\n\",\n    \"\\n\",\n    \"We can also try with `BayesianParameterSampling` with suggested `max_total_runs=20`.\\n\",\n    \"```Python\\n\",\n    \"param_sampling = BayesianParameterSampling( {\\n\",\n    \"         'learning_rate': uniform(5e-5, 9e-5),\\n\",\n    \"    }\\n\",\n    \")\\n\",\n    \"```\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"param_sampling = RandomParameterSampling( {\\n\",\n    \"         'learning_rate': uniform(5e-5, 9e-5),\\n\",\n    \"    }\\n\",\n    \")\\n\",\n    \"hyperdrive_config = HyperDriveConfig(estimator=estimator,\\n\",\n    \"                                         hyperparameter_sampling=param_sampling, \\n\",\n    \"                                         primary_metric_name='f1',\\n\",\n    \"                                         primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,\\n\",\n    \"                                         max_total_runs=MAX_TOTAL_RUNS,\\n\",\n    \"                                         max_concurrent_runs=MAX_CONCURRENT_RUNS)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Finally, launch the hyperparameter tuning job.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# start the HyperDrive run\\n\",\n    \"hyperdrive_run = experiment.submit(hyperdrive_config)\\n\",\n    \"RunDetails(hyperdrive_run).show()\"\n   ]\n  },\n  {\n 
  \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 3.2 Monitor HyperDrive runs\\n\",\n    \"We can monitor the progress of the runs with the following Jupyter widget. \\n\",\n    \"![](https://nlpbp.blob.core.windows.net/images/bert_tune.PNG)\\n\",\n    \"![](https://nlpbp.blob.core.windows.net/images/bert_tune2.PNG)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"_ = hyperdrive_run.wait_for_completion(show_output=AZUREML_VERBOSE) # Block until complete\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"You can see the experiment progress from this notebook by using `azureml.widgets.RunDetails(hd_run).show()` or check from the Azure portal with the url link you can get by running `hd_run.get_portal_url()`.\\n\",\n    \"To load an existing Hyperdrive run, use `hd_run = hd.HyperDriveRun(exp, <user-run-id>, hyperdrive_run_config=hd_run_config)`. You also can cancel a run with `hd_run.cancel()`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Cancel the hyper drive run to save the resources**\\n\",\n    \" ```python\\n\",\n    \"# Cancel the hyper drive\\n\",\n    \"hyperdrive_run.cancel()\\n\",\n    \" ```\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 3.3 Find and register the best model\\n\",\n    \"Once all the runs complete, we can find the run that produced the model with the highest F1 score. The F1 score with default learning rate is **86.18** in [Submit and Monitor your run](#2.6-Submit-and-Monitor-your-run) . The best F1 score is **87.01** after tuning with `learning rate=0.000090` with random sampling. 
With Bayesian sampling, the best F1 score is **86.87** after tuning with `learning rate=0.0000896`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 20,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Run(Experiment: BERT-SQuAD,\\n\",\n      \"Id: BERT-SQuAD_1562966635446_2,\\n\",\n      \"Type: azureml.scriptrun,\\n\",\n      \"Status: Completed)\\n\",\n      \"Best Run is:\\n\",\n      \"  F1 score: 87.01 \\n\",\n      \"  Learning rate: 0.000090\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"best_run = hyperdrive_run.get_best_run_by_primary_metric()\\n\",\n    \"best_run_metrics = best_run.get_metrics()\\n\",\n    \"print('Best Run is:\\\\n  F1 score: %.2f \\\\n  Learning rate: %f' % (best_run_metrics['f1'], best_run_metrics['lr']))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Persist properties of the run so we can access the logged metrics later\\n\",\n    \"sb.glue(\\\"f1\\\", best_run_metrics['f1'])\\n\",\n    \"sb.glue(\\\"learning_rate\\\", best_run_metrics['lr'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## References\\n\",\n    \"\\n\",\n    \"1. Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina, [*BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding*](https://arxiv.org/abs/1810.04805), ACL, 2018.\\n\",\n    \"2. Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, Percy Liang, [*SQuAD: 100,000+ Questions for Machine Comprehension of Text*](https://arxiv.org/abs/1606.05250), EMNLP, 2016. 
Dataset available at [https://rajpurkar.github.io/SQuAD-explorer/](https://rajpurkar.github.io/SQuAD-explorer/).\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"authors\": [\n   {\n    \"name\": \"minxia\"\n   }\n  ],\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.5\"\n  },\n  \"msauthor\": \"minxia\"\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/question_answering/question_answering_squad_transformers.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Copyright (c) Microsoft Corporation. All rights reserved.\\n\",\n    \"\\n\",\n    \"Licensed under the MIT License.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Question Answering on the SQuAD Dataset using Transformers Models\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Before You Start\\n\",\n    \"\\n\",\n    \"The running time shown in this notebook is on a Standard_NC24rs_v3 Azure Data Science Virtual Machine with 4 NVIDIA Tesla V100 GPUs. \\n\",\n    \"> **Tip**: If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \\n\",\n    \"\\n\",\n    \"The table below provides some reference running time of BERT on different machine configurations.  \\n\",\n    \"\\n\",\n    \"|QUICK_RUN|Machine Configurations|Running time|\\n\",\n    \"|:---------|:----------------------|:------------|\\n\",\n    \"|True|4 **CPU**s, 14GB memory| ~ 10 minutes |\\n\",\n    \"|True|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 3 minutes |\\n\",\n    \"|False|4 NVIDIA Tesla K80 GPUs, 48GB GPU memory| ~ 18 hours |\\n\",\n    \"|False|4 NVIDIA Tesla V100 GPUs, 64GB GPU memory, without RDMA (NC24s)| ~ 7 hours|\\n\",\n    \"|False|4 NVIDIA Tesla V100 GPUs, 64GB GPU memory, with RDMA (NC24**r**s)| ~ 4 hours|\\n\",\n    \"\\n\",\n    \"If you run into CUDA out-of-memory error, try reducing the `PER_GPU_BATCH_SIZE` and increasing the `GRADIENT_ACCUMULATION_STEPS`. 
As long as `PER_GPU_BATCH_SIZE` * `GRADIENT_ACCUMULATION_STEPS` doesn't change, the effective **per gpu** batch size is the same as larger `PER_GPU_BATCH_SIZE` and smaller `GRADIENT_ACCUMULATION_STEPS`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"## Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\\n\",\n    \"QUICK_RUN = False\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Summary\\n\",\n    \"This notebook demonstrates how to fine tune [pre-trained transformers models](https://github.com/huggingface/transformers) for extractive question answering task. Utility functions and classes in the NLP Best Practices repo are used to facilitate data preprocessing, model training, model scoring, result postprocessing, and model evaluation. \\n\",\n    \"\\n\",\n    \"The following models are currently supported:\\n\",\n    \"* BERT: [Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)\\n\",\n    \"* XLNet: [Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/pdf/1906.08237.pdf)\\n\",\n    \"* DistilBert: [A small, fast, cheap and light Transformer model based on Bert architecture](https://medium.com/huggingface/distilbert-8cf3380435b5)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"import sys\\n\",\n    \"\\n\",\n    \"import scrapbook as sb\\n\",\n    \"import torch\\n\",\n    \"\\n\",\n    \"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\\n\",\n    \"from utils_nlp.common.timer import Timer\\n\",\n    \"from utils_nlp.dataset.squad import load_pandas_df\\n\",\n    \"from utils_nlp.eval.question_answering import evaluate_qa\\n\",\n    \"from 
utils_nlp.models.transformers.datasets import QADataset\\n\",\n    \"from utils_nlp.models.transformers.question_answering import (\\n\",\n    \"    AnswerExtractor,\\n\",\n    \"    QAProcessor,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Configurations\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"To get all the transformer models supporting question answering, call `AnswerExtractor.list_supported_models()`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"['bert-base-uncased',\\n\",\n       \" 'bert-large-uncased',\\n\",\n       \" 'bert-base-cased',\\n\",\n       \" 'bert-large-cased',\\n\",\n       \" 'bert-base-multilingual-uncased',\\n\",\n       \" 'bert-base-multilingual-cased',\\n\",\n       \" 'bert-base-chinese',\\n\",\n       \" 'bert-base-german-cased',\\n\",\n       \" 'bert-large-uncased-whole-word-masking',\\n\",\n       \" 'bert-large-cased-whole-word-masking',\\n\",\n       \" 'bert-large-uncased-whole-word-masking-finetuned-squad',\\n\",\n       \" 'bert-large-cased-whole-word-masking-finetuned-squad',\\n\",\n       \" 'bert-base-cased-finetuned-mrpc',\\n\",\n       \" 'xlnet-base-cased',\\n\",\n       \" 'xlnet-large-cased',\\n\",\n       \" 'distilbert-base-uncased',\\n\",\n       \" 'distilbert-base-uncased-distilled-squad']\"\n      ]\n     },\n     \"execution_count\": 2,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"AnswerExtractor.list_supported_models()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Max sequence 
length: 384\\n\",\n      \"Document stride: 128\\n\",\n      \"Per gpu batch size: 4\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"MODEL_NAME = \\\"bert-large-cased-whole-word-masking\\\"\\n\",\n    \"DO_LOWER_CASE = False\\n\",\n    \"\\n\",\n    \"# MODEL_NAME = \\\"xlnet-large-cased\\\"\\n\",\n    \"# DO_LOWER_CASE = False\\n\",\n    \"\\n\",\n    \"# MODEL_NAME = \\\"distilbert-base-uncased\\\"\\n\",\n    \"# DO_LOWER_CASE = True\\n\",\n    \"\\n\",\n    \"TRAIN_DATA_USED_PERCENT = 1\\n\",\n    \"DEV_DATA_USED_PERCENT = 1\\n\",\n    \"NUM_EPOCHS = 2\\n\",\n    \"\\n\",\n    \"MAX_SEQ_LENGTH = 384\\n\",\n    \"DOC_STRIDE = 128\\n\",\n    \"PER_GPU_BATCH_SIZE = 4\\n\",\n    \"GRADIENT_ACCUMULATION_STEPS = 1\\n\",\n    \"NUM_GPUS = torch.cuda.device_count()\\n\",\n    \"\\n\",\n    \"if QUICK_RUN:\\n\",\n    \"    TRAIN_DATA_USED_PERCENT = 0.001\\n\",\n    \"    DEV_DATA_USED_PERCENT = 0.01\\n\",\n    \"    NUM_EPOCHS = 1\\n\",\n    \"    \\n\",\n    \"    MAX_SEQ_LENGTH = 128\\n\",\n    \"    DOC_STRIDE = 64\\n\",\n    \"    PER_GPU_BATCH_SIZE = 1\\n\",\n    \"\\n\",\n    \"print(\\\"Max sequence length: {}\\\".format(MAX_SEQ_LENGTH))\\n\",\n    \"print(\\\"Document stride: {}\\\".format(DOC_STRIDE))\\n\",\n    \"print(\\\"Per gpu batch size: {}\\\".format(PER_GPU_BATCH_SIZE))\\n\",\n    \"\\n\",\n    \"RANDOM_SEED = 42\\n\",\n    \"SQUAD_VERSION = \\\"v1.1\\\" \\n\",\n    \"CACHE_DIR = \\\"./temp\\\"\\n\",\n    \"\\n\",\n    \"MAX_QUESTION_LENGTH = 64\\n\",\n    \"LEARNING_RATE = 3e-5\\n\",\n    \"\\n\",\n    \"DOC_TEXT_COL = \\\"doc_text\\\"\\n\",\n    \"QUESTION_TEXT_COL = \\\"question_text\\\"\\n\",\n    \"ANSWER_START_COL = \\\"answer_start\\\"\\n\",\n    \"ANSWER_TEXT_COL = \\\"answer_text\\\"\\n\",\n    \"QA_ID_COL = \\\"qa_id\\\"\\n\",\n    \"IS_IMPOSSIBLE_COL = \\\"is_impossible\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Load Data\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   
\"metadata\": {},\n   \"source\": [\n    \"### The SQuAD Dataset\\n\",\n    \"Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable. [\\\\[1, 2\\\\]](#References)\\n\",\n    \"\\n\",\n    \"<img src=\\\"https://nlpbp.blob.core.windows.net/images/squad.png\\\">\\n\",\n    \"\\n\",\n    \"There has been two versions of SQuAD datasets. SQuAD 1.1 contains 100,000+ question-answer pairs on 500+ articles. SQuAD 2.0 adds 50,000 new, unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. These datasets are available at [https://rajpurkar.github.io/SQuAD-explorer/](https://rajpurkar.github.io/SQuAD-explorer/). Each dataset comes with a training dataset and a development dataset. \\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The utility function `load_pandas_df` downloads the dataset specified by `squad_version` and `file_split` to `local_cache_path` if it doesn't exist already.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 7.82k/7.82k [00:00<00:00, 20.6kKB/s]\\n\",\n      \"100%|██████████| 1.02k/1.02k [00:00<00:00, 19.9kKB/s]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"train_df = load_pandas_df(local_cache_path=CACHE_DIR, squad_version=SQUAD_VERSION, file_split=\\\"train\\\")\\n\",\n    \"dev_df = load_pandas_df(local_cache_path=CACHE_DIR, squad_version=SQUAD_VERSION, file_split=\\\"dev\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      
\"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>doc_text</th>\\n\",\n       \"      <th>question_text</th>\\n\",\n       \"      <th>answer_start</th>\\n\",\n       \"      <th>answer_text</th>\\n\",\n       \"      <th>qa_id</th>\\n\",\n       \"      <th>is_impossible</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>Architecturally, the school has a Catholic cha...</td>\\n\",\n       \"      <td>To whom did the Virgin Mary allegedly appear i...</td>\\n\",\n       \"      <td>515</td>\\n\",\n       \"      <td>Saint Bernadette Soubirous</td>\\n\",\n       \"      <td>5733be284776f41900661182</td>\\n\",\n       \"      <td>False</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>Architecturally, the school has a Catholic cha...</td>\\n\",\n       \"      <td>What is in front of the Notre Dame Main Building?</td>\\n\",\n       \"      <td>188</td>\\n\",\n       \"      <td>a copper statue of Christ</td>\\n\",\n       \"      <td>5733be284776f4190066117f</td>\\n\",\n       \"      <td>False</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <td>2</td>\\n\",\n       \"      <td>Architecturally, the school has a Catholic cha...</td>\\n\",\n       
\"      <td>The Basilica of the Sacred heart at Notre Dame...</td>\\n\",\n       \"      <td>279</td>\\n\",\n       \"      <td>the Main Building</td>\\n\",\n       \"      <td>5733be284776f41900661180</td>\\n\",\n       \"      <td>False</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <td>3</td>\\n\",\n       \"      <td>Architecturally, the school has a Catholic cha...</td>\\n\",\n       \"      <td>What is the Grotto at Notre Dame?</td>\\n\",\n       \"      <td>381</td>\\n\",\n       \"      <td>a Marian place of prayer and reflection</td>\\n\",\n       \"      <td>5733be284776f41900661181</td>\\n\",\n       \"      <td>False</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <td>4</td>\\n\",\n       \"      <td>Architecturally, the school has a Catholic cha...</td>\\n\",\n       \"      <td>What sits on top of the Main Building at Notre...</td>\\n\",\n       \"      <td>92</td>\\n\",\n       \"      <td>a golden statue of the Virgin Mary</td>\\n\",\n       \"      <td>5733be284776f4190066117e</td>\\n\",\n       \"      <td>False</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"                                            doc_text  \\\\\\n\",\n       \"0  Architecturally, the school has a Catholic cha...   \\n\",\n       \"1  Architecturally, the school has a Catholic cha...   \\n\",\n       \"2  Architecturally, the school has a Catholic cha...   \\n\",\n       \"3  Architecturally, the school has a Catholic cha...   \\n\",\n       \"4  Architecturally, the school has a Catholic cha...   \\n\",\n       \"\\n\",\n       \"                                       question_text  answer_start  \\\\\\n\",\n       \"0  To whom did the Virgin Mary allegedly appear i...           515   \\n\",\n       \"1  What is in front of the Notre Dame Main Building?           
188   \\n\",\n       \"2  The Basilica of the Sacred heart at Notre Dame...           279   \\n\",\n       \"3                  What is the Grotto at Notre Dame?           381   \\n\",\n       \"4  What sits on top of the Main Building at Notre...            92   \\n\",\n       \"\\n\",\n       \"                               answer_text                     qa_id  \\\\\\n\",\n       \"0               Saint Bernadette Soubirous  5733be284776f41900661182   \\n\",\n       \"1                a copper statue of Christ  5733be284776f4190066117f   \\n\",\n       \"2                        the Main Building  5733be284776f41900661180   \\n\",\n       \"3  a Marian place of prayer and reflection  5733be284776f41900661181   \\n\",\n       \"4       a golden statue of the Virgin Mary  5733be284776f4190066117e   \\n\",\n       \"\\n\",\n       \"   is_impossible  \\n\",\n       \"0          False  \\n\",\n       \"1          False  \\n\",\n       \"2          False  \\n\",\n       \"3          False  \\n\",\n       \"4          False  \"\n      ]\n     },\n     \"execution_count\": 6,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"train_df.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"     
 <th></th>\\n\",\n       \"      <th>doc_text</th>\\n\",\n       \"      <th>question_text</th>\\n\",\n       \"      <th>answer_start</th>\\n\",\n       \"      <th>answer_text</th>\\n\",\n       \"      <th>qa_id</th>\\n\",\n       \"      <th>is_impossible</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>Super Bowl 50 was an American football game to...</td>\\n\",\n       \"      <td>Which NFL team represented the AFC at Super Bo...</td>\\n\",\n       \"      <td>[177, 177, 177]</td>\\n\",\n       \"      <td>[Denver Broncos, Denver Broncos, Denver Broncos]</td>\\n\",\n       \"      <td>56be4db0acb8001400a502ec</td>\\n\",\n       \"      <td>False</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>Super Bowl 50 was an American football game to...</td>\\n\",\n       \"      <td>Which NFL team represented the NFC at Super Bo...</td>\\n\",\n       \"      <td>[249, 249, 249]</td>\\n\",\n       \"      <td>[Carolina Panthers, Carolina Panthers, Carolin...</td>\\n\",\n       \"      <td>56be4db0acb8001400a502ed</td>\\n\",\n       \"      <td>False</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <td>2</td>\\n\",\n       \"      <td>Super Bowl 50 was an American football game to...</td>\\n\",\n       \"      <td>Where did Super Bowl 50 take place?</td>\\n\",\n       \"      <td>[403, 355, 355]</td>\\n\",\n       \"      <td>[Santa Clara, California, Levi's Stadium, Levi...</td>\\n\",\n       \"      <td>56be4db0acb8001400a502ee</td>\\n\",\n       \"      <td>False</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <td>3</td>\\n\",\n       \"      <td>Super Bowl 50 was an American football game to...</td>\\n\",\n       \"      <td>Which NFL team won Super Bowl 50?</td>\\n\",\n       \"      <td>[177, 177, 177]</td>\\n\",\n       \"   
   <td>[Denver Broncos, Denver Broncos, Denver Broncos]</td>\\n\",\n       \"      <td>56be4db0acb8001400a502ef</td>\\n\",\n       \"      <td>False</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <td>4</td>\\n\",\n       \"      <td>Super Bowl 50 was an American football game to...</td>\\n\",\n       \"      <td>What color was used to emphasize the 50th anni...</td>\\n\",\n       \"      <td>[488, 488, 521]</td>\\n\",\n       \"      <td>[gold, gold, gold]</td>\\n\",\n       \"      <td>56be4db0acb8001400a502f0</td>\\n\",\n       \"      <td>False</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"                                            doc_text  \\\\\\n\",\n       \"0  Super Bowl 50 was an American football game to...   \\n\",\n       \"1  Super Bowl 50 was an American football game to...   \\n\",\n       \"2  Super Bowl 50 was an American football game to...   \\n\",\n       \"3  Super Bowl 50 was an American football game to...   \\n\",\n       \"4  Super Bowl 50 was an American football game to...   \\n\",\n       \"\\n\",\n       \"                                       question_text     answer_start  \\\\\\n\",\n       \"0  Which NFL team represented the AFC at Super Bo...  [177, 177, 177]   \\n\",\n       \"1  Which NFL team represented the NFC at Super Bo...  [249, 249, 249]   \\n\",\n       \"2                Where did Super Bowl 50 take place?  [403, 355, 355]   \\n\",\n       \"3                  Which NFL team won Super Bowl 50?  [177, 177, 177]   \\n\",\n       \"4  What color was used to emphasize the 50th anni...  [488, 488, 521]   \\n\",\n       \"\\n\",\n       \"                                         answer_text  \\\\\\n\",\n       \"0   [Denver Broncos, Denver Broncos, Denver Broncos]   \\n\",\n       \"1  [Carolina Panthers, Carolina Panthers, Carolin...   
\\n\",\n       \"2  [Santa Clara, California, Levi's Stadium, Levi...   \\n\",\n       \"3   [Denver Broncos, Denver Broncos, Denver Broncos]   \\n\",\n       \"4                                 [gold, gold, gold]   \\n\",\n       \"\\n\",\n       \"                      qa_id  is_impossible  \\n\",\n       \"0  56be4db0acb8001400a502ec          False  \\n\",\n       \"1  56be4db0acb8001400a502ed          False  \\n\",\n       \"2  56be4db0acb8001400a502ee          False  \\n\",\n       \"3  56be4db0acb8001400a502ef          False  \\n\",\n       \"4  56be4db0acb8001400a502f0          False  \"\n      ]\n     },\n     \"execution_count\": 7,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"dev_df.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"train_df = train_df.sample(frac=TRAIN_DATA_USED_PERCENT).reset_index(drop=True)\\n\",\n    \"dev_df = dev_df.sample(frac=DEV_DATA_USED_PERCENT).reset_index(drop=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`QADataset` is a standard question answering dataset for downstream processing.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"train_dataset = QADataset(\\n\",\n    \"    df=train_df,\\n\",\n    \"    doc_text_col=DOC_TEXT_COL,\\n\",\n    \"    question_text_col=QUESTION_TEXT_COL,\\n\",\n    \"    qa_id_col=QA_ID_COL,\\n\",\n    \"    is_impossible_col=IS_IMPOSSIBLE_COL,\\n\",\n    \"    answer_start_col=ANSWER_START_COL,\\n\",\n    \"    answer_text_col=ANSWER_TEXT_COL\\n\",\n    \")\\n\",\n    \"dev_dataset = QADataset(\\n\",\n    \"    df=dev_df,\\n\",\n    \"    doc_text_col=DOC_TEXT_COL,\\n\",\n    \"    question_text_col=QUESTION_TEXT_COL,\\n\",\n    \"    qa_id_col=QA_ID_COL,\\n\",\n    \"    
is_impossible_col=IS_IMPOSSIBLE_COL,\\n\",\n    \"    answer_start_col=ANSWER_START_COL,\\n\",\n    \"    answer_text_col=ANSWER_TEXT_COL\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Tokenize and Preprocess Data\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The `QAProcessor.preprocess` tokenizes the input paragraph, question, and answer texts, and converts them into the format required by pre-trained transformer models, involving the following steps:\\n\",\n    \"* Tokenization.\\n\",\n    \"* Convert character-based answer span indices to token-based indices.\\n\",\n    \"* Truncate the question token list if it's longer than `max_question_length`.\\n\",\n    \"* Split the paragraph into multiple segments if it's longer than `max_seq_length` - `max_question_length` - 3. (The \\\"-3\\\" is for the special [CLS] token and two [SEP] tokens.)\\n\",\n    \"* Add the special tokens [CLS] and [SEP].\\n\",\n    \"* Pad the concatenated token sequence to `max_seq_length` if it's shorter.\\n\",\n    \"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\\n\",\n    \"\\n\",\n    \"`QAProcessor.preprocess` returns a PyTorch Dataset. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required when postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 213450/213450 [00:00<00:00, 2918674.41B/s]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"qa_processor = QAProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)\\n\",\n    \"train_dataset = qa_processor.preprocess(\\n\",\n    \"    train_dataset,\\n\",\n    \"    is_training=True,\\n\",\n    \"    max_question_length=MAX_QUESTION_LENGTH,\\n\",\n    \"    max_seq_length=MAX_SEQ_LENGTH,\\n\",\n    \"    doc_stride=DOC_STRIDE,\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"# we keep a copy of the original dev_dataset as it is needed for evaluation\\n\",\n    \"dev_dataset_processed = qa_processor.preprocess(\\n\",\n    \"    dev_dataset,\\n\",\n    \"    is_training=False,\\n\",\n    \"    max_question_length=MAX_QUESTION_LENGTH,\\n\",\n    \"    max_seq_length=MAX_SEQ_LENGTH,\\n\",\n    \"    doc_stride=DOC_STRIDE,\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"train_dataloader = dataloader_from_dataset(\\n\",\n    \"    train_dataset, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\\n\",\n    \")\\n\",\n    \"dev_dataloader = dataloader_from_dataset(\\n\",\n    \"    dev_dataset_processed, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Fine-tune AnswerExtractor\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"qa_extractor = AnswerExtractor(model_name=MODEL_NAME, cache_dir=CACHE_DIR)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"with Timer() as t:\\n\",\n    \"    
qa_extractor.fit(train_dataloader,\\n\",\n    \"                     num_epochs=NUM_EPOCHS,\\n\",\n    \"                     learning_rate=LEARNING_RATE,\\n\",\n    \"                     gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\\n\",\n    \"                     seed=RANDOM_SEED,\\n\",\n    \"                     cache_model=True)\\n\",\n    \"print(\\\"Training time : {:.3f} hrs\\\".format(t.interval / 3600)) \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Predict\\n\",\n    \"Note that the `AnswerExtractor.predict` only outputs the probabilities of each token being the start and end of the answer span. `postprocess_bert_answer` and  `postprocess_xlnet_answer` are two helper functions for postprocessing these probabilities and generating the final answers. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Evaluating: 100%|██████████| 661/661 [04:42<00:00,  2.64it/s]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"qa_results = qa_extractor.predict(dev_dataloader)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Postprocess and Generate the Final Answers\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"final_answers, answer_probs, nbest_answers = qa_processor.postprocess(\\n\",\n    \"    qa_results,\\n\",\n    \"    examples_file=\\\"./cached_qa_features/cached_examples_test.jsonl\\\",\\n\",\n    \"    features_file=\\\"./cached_qa_features/cached_features_test.jsonl\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      
\"Paragraph:\\n\",\n      \"In August 1999, ABC premiered a special series event, Who Wants to Be a Millionaire, a game show based on the British program of the same title. Hosted throughout its ABC tenure by Regis Philbin, the program became a major ratings success throughout its initial summer run, which led ABC to renew Millionaire as a regular series, returning on January 18, 2000. At its peak, the program aired as much as six nights a week. Buoyed by Millionaire, during the 1999–2000 season, ABC became the first network to move from third to first place in the ratings during a single television season. Millionaire ended its run on the network's primetime lineup after three years in 2002, with Buena Vista Television relaunching the show as a syndicated program (under that incarnation's original host Meredith Vieira) in September of that year.\\n\",\n      \"\\n\",\n      \"Question:\\n\",\n      \"Who originally hosted Who Wants to Be a Millionaire for ABC?\\n\",\n      \"\\n\",\n      \"Ground truth answers:\\n\",\n      \"['Regis Philbin', 'Regis Philbin', 'Regis Philbin']\\n\",\n      \"\\n\",\n      \"Predicted answer:\\n\",\n      \"Regis Philbin\\n\",\n      \"\\n\",\n      \"Top N best answers\\n\",\n      \"[OrderedDict([('text', 'Regis Philbin'), ('probability', 0.9916362000840248), ('start_logit', 7.9423394203186035), ('end_logit', 7.618710517883301)]), OrderedDict([('text', 'Hosted throughout its ABC tenure by Regis Philbin'), ('probability', 0.004141487486401103), ('start_logit', 2.464038133621216), ('end_logit', 7.618710517883301)]), OrderedDict([('text', 'Regis Philbin,'), ('probability', 0.0035518662225726477), ('start_logit', 7.9423394203186035), ('end_logit', 1.9868273735046387)]), OrderedDict([('text', 'Philbin'), ('probability', 0.0003250549061321865), ('start_logit', -0.08077805489301682), ('end_logit', 7.618710517883301)]), OrderedDict([('text', 'Regis'), ('probability', 0.00019582582917568986), ('start_logit', 7.9423394203186035), 
('end_logit', -0.9111754298210144)]), OrderedDict([('text', 'Regis Phil'), ('probability', 4.149332683962915e-05), ('start_logit', 7.9423394203186035), ('end_logit', -2.4628684520721436)]), OrderedDict([('text', 'by Regis Philbin'), ('probability', 3.332539919374182e-05), ('start_logit', -2.358452320098877), ('end_logit', 7.618710517883301)]), OrderedDict([('text', 'bin'), ('probability', 2.8618519913389162e-05), ('start_logit', -2.5107181072235107), ('end_logit', 7.618710517883301)]), OrderedDict([('text', 'Hosted throughout its ABC tenure by Regis Philbin,'), ('probability', 1.483407878101762e-05), ('start_logit', 2.464038133621216), ('end_logit', 1.9868273735046387)]), OrderedDict([('text', 'Regis Philbin, the program became a major ratings success throughout its initial summer run'), ('probability', 1.1565367338388186e-05), ('start_logit', 7.9423394203186035), ('end_logit', -3.7403860092163086)]), OrderedDict([('text', 'throughout its ABC tenure by Regis Philbin'), ('probability', 8.641061698026031e-06), ('start_logit', -3.7082467079162598), ('end_logit', 7.618710517883301)]), OrderedDict([('text', 'Regis Philbin, the program became a major ratings success throughout its initial summer run,'), ('probability', 4.340281561528615e-06), ('start_logit', 7.9423394203186035), ('end_logit', -4.720461845397949)]), OrderedDict([('text', 'Millionaire, a game show based on the British program of the same title. Hosted throughout its ABC tenure by Regis Philbin'), ('probability', 2.581499164789754e-06), ('start_logit', -4.9164018630981445), ('end_logit', 7.618710517883301)]), OrderedDict([('text', 'game show based on the British program of the same title. 
Hosted throughout its ABC tenure by Regis Philbin'), ('probability', 1.5768661257791343e-06), ('start_logit', -5.409332752227783), ('end_logit', 7.618710517883301)]), OrderedDict([('text', 'Philbin,'), ('probability', 1.1642894253705209e-06), ('start_logit', -0.08077805489301682), ('end_logit', 1.9868273735046387)]), OrderedDict([('text', 'Hosted throughout its ABC tenure by Regis'), ('probability', 8.178505594859486e-07), ('start_logit', 2.464038133621216), ('end_logit', -0.9111754298210144)]), OrderedDict([('text', 'Meredith Vieira'), ('probability', 2.1176539810270548e-07), ('start_logit', 0.06787821650505066), ('end_logit', 0.13378390669822693)]), OrderedDict([('text', 'Hosted throughout its ABC tenure by Regis Phil'), ('probability', 1.7329348591843886e-07), ('start_logit', 2.464038133621216), ('end_logit', -2.4628684520721436)]), OrderedDict([('text', 'by Regis Philbin,'), ('probability', 1.1936571067088074e-07), ('start_logit', -2.358452320098877), ('end_logit', 1.9868273735046387)]), OrderedDict([('text', 'bin,'), ('probability', 1.0250649806025293e-07), ('start_logit', -2.5107181072235107), ('end_logit', 1.9868273735046387)])]\\n\",\n      \"-------------------------------------------------------------------------------------------------------------------\\n\",\n      \"Paragraph:\\n\",\n      \"In 2004, ABC's average viewership declined by ten ratings points, landing the network in fourth place, behind NBC, CBS and Fox (by the following year, the combined season-ending average audience share of ABC, NBC and CBS represented only 32% of U.S. households). However, during the 2004–05 season, the network experienced unexpected success with new series such as Desperate Housewives, Lost and Grey's Anatomy as well as reality series Dancing with the Stars, which helped ABC rise to second place, jumping ahead of CBS, but behind a surging Fox. 
On April 21, 2004, Disney announced a restructuring of its Disney Media Networks division with Anne Sweeney being named president of ABC parent Disney–ABC Television Group, and ESPN president George Bodenheimer becoming co-CEO of the division with Sweeney, as well as president of ABC Sports. On December 7, 2005, ABC Sports and ESPN signed an eight-year broadcast rights agreement with NASCAR, allowing ABC and ESPN to broadcast 17 Nextel Cup races each season (comprising just over half of the 36 races held annually) effective with the 2006 season.\\n\",\n      \"\\n\",\n      \"Question:\\n\",\n      \"Who was named president of Disney-ABC television group in 2004?\\n\",\n      \"\\n\",\n      \"Ground truth answers:\\n\",\n      \"['Anne Sweeney', 'Anne Sweeney', 'Anne Sweeney']\\n\",\n      \"\\n\",\n      \"Predicted answer:\\n\",\n      \"Anne Sweeney\\n\",\n      \"\\n\",\n      \"Top N best answers\\n\",\n      \"[OrderedDict([('text', 'Anne Sweeney'), ('probability', 0.9955952018782683), ('start_logit', 8.357232093811035), ('end_logit', 8.356441497802734)]), OrderedDict([('text', 'Sweeney'), ('probability', 0.0036178995163272075), ('start_logit', 2.7397849559783936), ('end_logit', 8.356441497802734)]), OrderedDict([('text', 'Anne Sweeney being named president of ABC parent Disney–ABC Television Group,'), ('probability', 0.00015457112079484626), ('start_logit', 8.357232093811035), ('end_logit', -0.4140002131462097)]), OrderedDict([('text', 'Anne Sweeney being named president of ABC parent Disney–ABC Television Group'), ('probability', 0.00013177008073689449), ('start_logit', 8.357232093811035), ('end_logit', -0.5735959410667419)]), OrderedDict([('text', 'Anne Sweeney being named president of ABC parent Disney–ABC Television Group, and ESPN president George Bodenheimer becoming co-CEO of the division with Sweeney'), ('probability', 0.00010324790082242362), ('start_logit', 8.357232093811035), ('end_logit', -0.8175216317176819)]), OrderedDict([('text', 'Anne'), 
('probability', 8.954744862928307e-05), ('start_logit', 8.357232093811035), ('end_logit', -0.9598858952522278)]), OrderedDict([('text', 'Disney announced a restructuring of its Disney Media Networks division with Anne Sweeney'), ('probability', 8.742559416104907e-05), ('start_logit', -0.9830758571624756), ('end_logit', 8.356441497802734)]), OrderedDict([('text', 'On April 21, 2004, Disney announced a restructuring of its Disney Media Networks division with Anne Sweeney'), ('probability', 5.3945594647410474e-05), ('start_logit', -1.4658879041671753), ('end_logit', 8.356441497802734)]), OrderedDict([('text', 'Anne Sweeney being named president of ABC parent Disney–ABC Television Group, and ESPN president George Bodenheimer'), ('probability', 5.264419046702813e-05), ('start_logit', 8.357232093811035), ('end_logit', -1.4910986423492432)]), OrderedDict([('text', 'April 21, 2004, Disney announced a restructuring of its Disney Media Networks division with Anne Sweeney'), ('probability', 2.4206634534182337e-05), ('start_logit', -2.2672371864318848), ('end_logit', 8.356441497802734)]), OrderedDict([('text', 'Disney Media Networks division with Anne Sweeney'), ('probability', 2.0267835830057236e-05), ('start_logit', -2.444828748703003), ('end_logit', 8.356441497802734)]), OrderedDict([('text', '2004, Disney announced a restructuring of its Disney Media Networks division with Anne Sweeney'), ('probability', 1.651158022899612e-05), ('start_logit', -2.6498019695281982), ('end_logit', 8.356441497802734)]), OrderedDict([('text', 'Anne Sweeney being named president'), ('probability', 1.5100346698474555e-05), ('start_logit', 8.357232093811035), ('end_logit', -2.7399368286132812)]), OrderedDict([('text', 'Anne Sweeney being named'), ('probability', 1.048714962404428e-05), ('start_logit', 8.357232093811035), ('end_logit', -3.104503870010376)]), OrderedDict([('text', 'Anne Sweeney being named president of ABC'), ('probability', 7.763457777829983e-06), ('start_logit', 
8.357232093811035), ('end_logit', -3.405226707458496)]), OrderedDict([('text', 'Anne Sweeney being named president of ABC parent Disney–ABC'), ('probability', 7.279500152601964e-06), ('start_logit', 8.357232093811035), ('end_logit', -3.469592332839966)]), OrderedDict([('text', 'with Anne Sweeney'), ('probability', 4.5077168678184705e-06), ('start_logit', -3.948073148727417), ('end_logit', 8.356441497802734)]), OrderedDict([('text', 'Anne Sweeney being named president of ABC parent Disney–ABC Television Group, and ESPN president George'), ('probability', 3.6106978117933016e-06), ('start_logit', 8.357232093811035), ('end_logit', -4.170753479003906)]), OrderedDict([('text', 'Anne Sweeney being named president of ABC parent Disney–ABC Television'), ('probability', 3.450058674894645e-06), ('start_logit', 8.357232093811035), ('end_logit', -4.216263294219971)]), OrderedDict([('text', 'Sweeney being named president of ABC parent Disney–ABC Television Group,'), ('probability', 5.616969448093071e-07), ('start_logit', 2.7397849559783936), ('end_logit', -0.4140002131462097)])]\\n\",\n      \"-------------------------------------------------------------------------------------------------------------------\\n\",\n      \"Paragraph:\\n\",\n      \"In addition, there are $2 million worth of other ancillary events, including a week-long event at the Santa Clara Convention Center, a beer, wine and food festival at Bellomy Field at Santa Clara University, and a pep rally. A professional fundraiser will aid in finding business sponsors and individual donors, but still may need the city council to help fund the event. 
Additional funding will be provided by the city council, which has announced plans to set aside seed funding for the event.\\n\",\n      \"\\n\",\n      \"Question:\\n\",\n      \"Where was a beer, wine and food festival held at prior to the Super Bowl?\\n\",\n      \"\\n\",\n      \"Ground truth answers:\\n\",\n      \"['Bellomy Field', 'Bellomy Field', 'Santa Clara Convention Center']\\n\",\n      \"\\n\",\n      \"Predicted answer:\\n\",\n      \"Bellomy Field at Santa Clara University\\n\",\n      \"\\n\",\n      \"Top N best answers\\n\",\n      \"[OrderedDict([('text', 'Bellomy Field at Santa Clara University'), ('probability', 0.8911798723290278), ('start_logit', 8.530954360961914), ('end_logit', 8.044594764709473)]), OrderedDict([('text', 'Santa Clara University'), ('probability', 0.05653501302954448), ('start_logit', 5.773268222808838), ('end_logit', 8.044594764709473)]), OrderedDict([('text', 'Bellomy Field'), ('probability', 0.04355919507587527), ('start_logit', 8.530954360961914), ('end_logit', 5.026169300079346)]), OrderedDict([('text', 'Bellomy Field at Santa Clara University,'), ('probability', 0.0041427350628246784), ('start_logit', 8.530954360961914), ('end_logit', 2.6734046936035156)]), OrderedDict([('text', 'at Bellomy Field at Santa Clara University'), ('probability', 0.0026144465466759878), ('start_logit', 2.699460506439209), ('end_logit', 8.044594764709473)]), OrderedDict([('text', 'Bellomy Field at Santa Clara'), ('probability', 0.0007185308301180144), ('start_logit', 8.530954360961914), ('end_logit', 0.9215018153190613)]), OrderedDict([('text', 'Santa Clara University,'), ('probability', 0.00026280842737466214), ('start_logit', 5.773268222808838), ('end_logit', 2.6734046936035156)]), OrderedDict([('text', 'Santa Clara Convention Center, a beer, wine and food festival at Bellomy Field at Santa Clara University'), ('probability', 0.0002477774410439148), ('start_logit', 0.3431837260723114), ('end_logit', 8.044594764709473)]), 
OrderedDict([('text', 'Field at Santa Clara University'), ('probability', 0.00016740039422735777), ('start_logit', -0.04895868897438049), ('end_logit', 8.044594764709473)]), OrderedDict([('text', 'at Bellomy Field'), ('probability', 0.0001277892271562228), ('start_logit', 2.699460506439209), ('end_logit', 5.026169300079346)]), OrderedDict([('text', 'at Santa Clara University'), ('probability', 0.00010075950695038713), ('start_logit', -0.556610643863678), ('end_logit', 8.044594764709473)]), OrderedDict([('text', 'Bellomy Field at'), ('probability', 5.6814224825865046e-05), ('start_logit', 8.530954360961914), ('end_logit', -1.615920066833496)]), OrderedDict([('text', 'Bellomy Field at Santa Clara University, and a pep rally.'), ('probability', 5.503602065575232e-05), ('start_logit', 8.530954360961914), ('end_logit', -1.647718906402588)]), OrderedDict([('text', 'University'), ('probability', 4.749428214965224e-05), ('start_logit', -1.308737874031067), ('end_logit', 8.044594764709473)]), OrderedDict([('text', 'Santa Clara'), ('probability', 4.55824363904097e-05), ('start_logit', 5.773268222808838), ('end_logit', 0.9215018153190613)]), OrderedDict([('text', 'Bellomy'), ('probability', 3.587220941206712e-05), ('start_logit', 8.530954360961914), ('end_logit', -2.0757439136505127)]), OrderedDict([('text', 'Bellomy Field at Santa'), ('probability', 3.2615110636404645e-05), ('start_logit', 8.530954360961914), ('end_logit', -2.170931100845337)]), OrderedDict([('text', 'beer, wine and food festival at Bellomy Field at Santa Clara University'), ('probability', 2.7231470259308925e-05), ('start_logit', -1.8649739027023315), ('end_logit', 8.044594764709473)]), OrderedDict([('text', 'Clara University'), ('probability', 2.310549787851485e-05), ('start_logit', -2.0292766094207764), ('end_logit', 8.044594764709473)]), OrderedDict([('text', 'Bell'), ('probability', 1.992087697312563e-05), ('start_logit', 8.530954360961914), ('end_logit', -2.663938522338867)])]\\n\",\n      
\"-------------------------------------------------------------------------------------------------------------------\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"for i in [0, 10, 100]:\\n\",\n    \"    print('Paragraph:')\\n\",\n    \"    print(dev_df.iloc[i]['doc_text'])\\n\",\n    \"    print()\\n\",\n    \"    print('Question:')\\n\",\n    \"    print(dev_df.iloc[i]['question_text'])\\n\",\n    \"    print()\\n\",\n    \"    print('Ground truth answers:')\\n\",\n    \"    print(dev_df.iloc[i]['answer_text'])\\n\",\n    \"    print()\\n\",\n    \"    print('Predicted answer:')\\n\",\n    \"    print(final_answers[dev_df.iloc[i]['qa_id']])\\n\",\n    \"    print()\\n\",\n    \"    print('Top N best answers')\\n\",\n    \"    print(nbest_answers[dev_df.iloc[i]['qa_id']])\\n\",\n    \"    print('-------------------------------------------------------------------------------------------------------------------')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Evaluate\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Question answering task is usually evaluated on two metrics: exact match (EM) and F1 score.   \\n\",\n    \"The exact match is computed by first performing some simple normalization (e.g. remove punctuation and convert to lower case) on the ground truth and predicted answers and check if they match exactly after normalization.   \\n\",\n    \"F1 score is computed from token-level precision and recall by comparing the ground truth and predicted answers. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"{\\n\",\n      \"  \\\"exact\\\": 86.6414380321665,\\n\",\n      \"  \\\"f1\\\": 92.68221713064221,\\n\",\n      \"  \\\"total\\\": 10570,\\n\",\n      \"  \\\"HasAns_exact\\\": 86.6414380321665,\\n\",\n      \"  \\\"HasAns_f1\\\": 92.68221713064221,\\n\",\n      \"  \\\"HasAns_total\\\": 10570\\n\",\n      \"}\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"evaluation_result = evaluate_qa(actual_dataset=dev_dataset,\\n\",\n    \"                                preds=final_answers)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The table below compares running time and model performance of BERT, XLNet, and DistilBert on Standard_NC24rs_v3 DSVM.\\n\",\n    \"\\n\",\n    \"|Model name|Training time|Scoring time|Exact Match (EM)|F1 score|\\n\",\n    \"|:---------|:------------|:-----------|:--------------|--------|\\n\",\n    \"|bert-large-cased-whole-word-masking| 3.4 hrs| ~ 5 mins|86.64|92.68|\\n\",\n    \"|xlnet-large-cased|5.2 hrs| ~ 10 mins|84.67|91.69|\\n\",\n    \"|distilbert-base-uncased|0.66 hr| ~ 1 min|76.62|84.71|\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"sb.glue(\\\"exact\\\", evaluation_result[\\\"exact\\\"])\\n\",\n    \"sb.glue(\\\"f1\\\", evaluation_result[\\\"f1\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## References\\n\",\n    \"\\n\",\n    \"1. Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, Percy Liang, [*SQuAD: 100,000+ Questions for Machine Comprehension of Text*](https://arxiv.org/abs/1606.05250), EMNLP, 2016.\\n\",\n    \"2. 
Pranav Rajpurkar, Robin Jia, Percy Liang, [*Know What You Don't Know: Unanswerable Questions for SQuAD*](https://arxiv.org/abs/1806.03822), ACL, 2018.\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"nlp_gpu\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/question_answering/question_answering_system_bidaf_quickstart.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Create a Question Answering (QA) System in Under 20 Minutes\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This notebook demonstrates how to create a Question Answering (QA) webservice in under 20 minutes. We use Azure Machine Learning ([AzureML](https://azure.microsoft.com/en-us/services/machine-learning-service/)) Service to deploy a pre-trained [AllenNLP model](https://allennlp.org/models\\n\",\n    \"), [BiDAF](https://www.semanticscholar.org/paper/Bidirectional-Attention-Flow-for-Machine-Seo-Kembhavi/007ab5528b3bd310a80d553cccad4b78dc496b02\\n\",\n    \"), using Azure Container Instances ([ACI](https://azure.microsoft.com/en-us/services/container-instances/)).\"\n   ]\n  },\n    {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/question_answering/bidaf_quickstart.png)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Table of Contents\\n\",\n    \"\\n\",\n    \"1. [Deploy Model](#1.-Deploy-Model)\\n\",\n    \"    - [1.1 Link to or Create a Workspace](#1.1-Link-to-or-Create-a-Workspace)\\n\",\n    \"    - [1.2 Register BiDAF model for Deployment](#1.2-Register-BiDAF-model-for-Deployment)  \\n\",\n    \"    - [1.3 Create Scoring Script](#1.3-Create-Scoring-Script)  \\n\",\n    \"    - [1.4 Create a YAML File for the Environment](#1.4-Create-a-YAML-File-for-the-Environment)  \\n\",\n    \"    - [1.5 Image Creation](#1.5-Image-Creation)\\n\",\n    \"    - [1.6 Deploy the Image as a Web Service to Azure Container Instance](#1.6-Deploy-the-Image-as-a-Web-Service-to-Azure-Container-Instance)\\n\",\n    \"    \\n\",\n    \"2. 
[Test Deployed Webservice](#2.-Test-Deployed-Webservice)\\n\",\n    \"    - [2.1 Real-time Scoring](#2.1-Real-time-Scoring)\\n\",\n    \"    - [2.2 Batch Scoring](#2.2-Batch-Scoring)  \\n\",\n    \"    \\n\",\n    \"3. [Conclusion](#Conclusion)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import sys\\n\",\n    \"sys.path.append(\\\"../../\\\")\\n\",\n    \"import json\\n\",\n    \"import urllib\\n\",\n    \"import scrapbook as sb\\n\",\n    \"\\n\",\n    \"#import utils\\n\",\n    \"from utils_nlp.common.timer import Timer\\n\",\n    \"from utils_nlp.azureml import azureml_utils\\n\",\n    \"\\n\",\n    \"from azureml.core.webservice import AciWebservice, Webservice\\n\",\n    \"from azureml.core.image import ContainerImage\\n\",\n    \"from azureml.core.conda_dependencies import CondaDependencies\\n\",\n    \"from azureml.core.model import Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"CPU_CORES = 1\\n\",\n    \"MEMORY_GB = 8\\n\",\n    \"DEPLOYMENT_PYTHON_VERSION = '3.6.8'\\n\",\n    \"DEPLOYMENT_CONDA_PACKAGES = ['jsonnet','cmake','regex','pytorch','torchvision']\\n\",\n    \"DEPLOYMENT_PIP_PACKAGES = ['allennlp==0.8.4','azureml-sdk==1.0.48']\\n\",\n    \"CONTAINER_TAGS = {'area': \\\"nlp\\\", 'type': \\\"question-answering BiDAF\\\"}\\n\",\n    \"MODEL_TAGS = {\\\"bidaf\\\": \\\"demo\\\"}\\n\",\n    \"config_path = (\\n\",\n    \"    \\\"./.azureml\\\"\\n\",\n    \")  # Path to the directory containing config.json with azureml credentials\\n\",\n    \"\\n\",\n    \"webservice_name = \\\"aci-bidaf-service\\\" #name for webservice; must be unique within your workspace\\n\",\n    \"\\n\",\n    \"# Azure resources\\n\",\n    \"subscription_id = \\\"YOUR_SUBSCRIPTION_ID\\\"\\n\",\n    \"resource_group = 
\\\"YOUR_RESOURCE_GROUP_NAME\\\"  \\n\",\n    \"workspace_name = \\\"YOUR_WORKSPACE_NAME\\\"  \\n\",\n    \"workspace_region = \\\"YOUR_WORKSPACE_REGION\\\" #Possible values eastus, eastus2 and so on.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1. Deploy Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.1 Link to or Create a Workspace\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The following cell looks to set up the connection to your [Azure Machine Learning service Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace). You can choose to connect to an existing workspace or create a new one. \\n\",\n    \"\\n\",\n    \"**To access an existing workspace:**\\n\",\n    \"1. If you have a `config.json` file, you do not need to provide the workspace information; you will only need to update the `config_path` variable that is defined above which contains the file.\\n\",\n    \"2. Otherwise, you will need to supply the following:\\n\",\n    \"    * The name of your workspace\\n\",\n    \"    * Your subscription id\\n\",\n    \"    * The resource group name\\n\",\n    \"\\n\",\n    \"**To create a new workspace:**\\n\",\n    \"\\n\",\n    \"Set the following information:\\n\",\n    \"* A name for your workspace\\n\",\n    \"* Your subscription id\\n\",\n    \"* The resource group name\\n\",\n    \"* [Azure region](https://azure.microsoft.com/en-us/global-infrastructure/regions/) to create the workspace in, such as `eastus2`. \\n\",\n    \"\\n\",\n    \"This will automatically create a new resource group for you in the region provided if a resource group with the name given does not already exist. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"ws = azureml_utils.get_or_create_workspace(\\n\",\n    \"    config_path=config_path,\\n\",\n    \"    subscription_id=subscription_id,\\n\",\n    \"    resource_group=resource_group,\\n\",\n    \"    workspace_name=workspace_name,\\n\",\n    \"    workspace_region=workspace_region,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print('Workspace name: ' + ws.name, \\n\",\n    \"      'Azure region: ' + ws.location, \\n\",\n    \"      'Subscription id: ' + ws.subscription_id, \\n\",\n    \"      'Resource group: ' + ws.resource_group, sep='\\\\n')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.2 Register BiDAF model for Deployment\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This step downloads the pre-trained [AllenNLP](https://allennlp.org/models) pretrained model and registers the model in our Workspace. The pre-trained AllenNLP model we use is called Bidirectional Attention Flow for Machine Comprehension ([BiDAF](https://www.semanticscholar.org/paper/Bidirectional-Attention-Flow-for-Machine-Seo-Kembhavi/007ab5528b3bd310a80d553cccad4b78dc496b02\\n\",\n    \")) It achieved state-of-the-art performance on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset in 2017 and is a well-respected, performant baseline for QA. AllenNLP's pre-trained BIDAF model is trained on the SQuAD training set and achieves an EM score of 68.3 on the SQuAD development set. 
See the [BIDAF deep dive notebook](https://github.com/microsoft/nlp-recipes/examples/question_answering/bidaf_deep_dive.ipynb\\n\",\n    \") for more information on this algorithm and AllenNLP implementation.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"x config.json\\n\",\n      \"x vocabulary/\\n\",\n      \"x vocabulary/non_padded_namespaces.txt\\n\",\n      \"x vocabulary/tokens.txt\\n\",\n      \"x weights.th\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"bidaf_model_url = 'https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-model-2017.09.15-charpad.tar.gz'\\n\",\n    \"urllib.request.urlretrieve(bidaf_model_url, filename=\\\"bidaf.tar.gz\\\")\\n\",\n    \"!tar xvzf bidaf.tar.gz\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Registering a model means registering one or more files that make up a model (in our case, we register all the files contained in the downloaded .tar.gz file). Here we demonstrate how to register a model using the AzureML SDK, but see the [model registration](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where#registermodel\\n\",\n    \") documentation for other registration methods.\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"**Note**: If you have already registered the model, you need not re-register it. 
Rather, just retrieve the pre-existing model in your Workspace with `bidaf_model = Model(ws, name='bidaf')`\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Registering model bidaf\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"bidaf_model = Model.register(workspace = ws,\\n\",\n    \"                       model_path =\\\"bidaf.tar.gz\\\",\\n\",\n    \"                       model_name = \\\"bidaf\\\",\\n\",\n    \"                       tags = MODEL_TAGS,\\n\",\n    \"                       description = \\\"BiDAF Pretrained Model\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.3 Create Scoring Script\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"In this section we show an example of an entry script, score.py, which is called from the deployed webservice. The script must contain:\\n\",\n    \"\\n\",\n    \"1. init() - This function loads the model in a global object.  \\n\",\n    \"2. run() - This function is used for model prediction. The inputs and outputs to run() typically use JSON for serialization and deserialization. \\n\",\n    \"\\n\",\n    \"Our scoring script allows for both real-time and batch prediction. Each observation is a dictionary with two keys: _question_ and _passage_. With batch prediction we pass in a list of observations and use AllenNLP's `predict_batch_json()` method. 
For real-time prediction we pass in a single observation and use AllenNLP's `predict()` method.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Overwriting score.py\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"%%writefile score.py\\n\",\n    \"import json\\n\",\n    \"from allennlp.predictors import Predictor\\n\",\n    \"from azureml.core.model import Model\\n\",\n    \"\\n\",\n    \"def init():\\n\",\n    \"    global model\\n\",\n    \"    bidaf_dir_path = Model.get_model_path('bidaf')\\n\",\n    \"    model = Predictor.from_path(bidaf_dir_path)\\n\",\n    \"\\n\",\n    \"def run(rawdata):\\n\",\n    \"    try:\\n\",\n    \"        data = json.loads(rawdata)\\n\",\n    \"        \\n\",\n    \"        # if one question-passage pair was passed\\n\",\n    \"        if type(data) == dict:\\n\",\n    \"            passage = data['passage']\\n\",\n    \"            question = data['question']\\n\",\n    \"            result = model.predict(question, passage)[\\\"best_span_str\\\"]\\n\",\n    \"        \\n\",\n    \"        # if multiple question-passage pairs were passed\\n\",\n    \"        elif type(data) == list:\\n\",\n    \"            result = model.predict_batch_json(data)\\n\",\n    \"            result = [i[\\\"best_span_str\\\"] for i in result]\\n\",\n    \"\\n\",\n    \"    except Exception as e:\\n\",\n    \"        result = str(e)\\n\",\n    \"        return json.dumps({\\\"error\\\": result})\\n\",\n    \"    return json.dumps({\\\"result\\\":result})\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.4 Create a YAML File for the Environment \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"To ensure the fit results are consistent with the training 
results, the SDK dependency versions need to be the same as the environment that trains the model. The following cells create a file, bidafenv.yml, which specifies the dependencies from the run.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"'bidafenv.yml'\"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"myenv = CondaDependencies.create(conda_packages= DEPLOYMENT_CONDA_PACKAGES,\\n\",\n    \"                                 pip_packages= DEPLOYMENT_PIP_PACKAGES, \\n\",\n    \"                                 python_version = DEPLOYMENT_PYTHON_VERSION)\\n\",\n    \"myenv.add_channel('conda-forge')\\n\",\n    \"myenv.add_channel('pytorch')\\n\",\n    \"\\n\",\n    \"conda_env_file_name = 'bidafenv.yml'\\n\",\n    \"myenv.save_to_file('.', conda_env_file_name)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.5 Image Creation\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"In this step we create a container image, which is a wrapper containing the entry script, the YAML file with package dependencies, and the model. The created image is then deployed as a webservice in the next step. 
This step can take up to 10 minutes and even longer if the model is large.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Creating image\\n\",\n      \"Running.......................................................................................................................................\\n\",\n      \"Succeeded\\n\",\n      \"Image creation operation finished for image bidaf-image:36, operation \\\"Succeeded\\\"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"image_config = ContainerImage.image_configuration(execution_script = \\\"score.py\\\",\\n\",\n    \"                                                  runtime = \\\"python\\\",\\n\",\n    \"                                                  conda_file = conda_env_file_name,\\n\",\n    \"                                                  description = \\\"Image with BiDAF model\\\",\\n\",\n    \"                                                  tags = CONTAINER_TAGS)\\n\",\n    \"\\n\",\n    \"image = ContainerImage.create(name = \\\"bidaf-image\\\",\\n\",\n    \"                              models = [bidaf_model],\\n\",\n    \"                              image_config = image_config,\\n\",\n    \"                              workspace = ws)\\n\",\n    \"\\n\",\n    \"image.wait_for_creation(show_output = True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"If the above step fails, then use the below command to see logs\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# print(image.image_build_log_uri)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.6 Deploy the Image as a Web Service to Azure Container 
Instance\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Azure Container Instances are mostly used for deploying your models as a web service if one or more of the following conditions are true:  \\n\",\n    \"1. You need to quickly deploy and validate your model.\\n\",\n    \"2. You are testing a model that is under development.  \\n\",\n    \"\\n\",\n    \"\\n\",\n    \"To set them up properly, we need to indicate the number of CPU cores and the amount of memory we want to allocate to our web service.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#Set the web service configuration\\n\",\n    \"aci_config = AciWebservice.deploy_configuration(cpu_cores = CPU_CORES, \\n\",\n    \"                                               memory_gb = MEMORY_GB)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The final step to deploying our web service is to call WebService.deploy_from_image(). This function uses the Docker image and the deployment configuration we created above to perform the following:  \\n\",\n    \"1. Deploy the docker image to an Azure Container Instance\\n\",\n    \"2. Call the init() function in our scoring file\\n\",\n    \"3. Provide an HTTP endpoint for scoring calls  \\n\",\n    \"\\n\",\n    \"The deploy_from_image method requires the following parameters:\\n\",\n    \"1. workspace: the workspace containing the service\\n\",\n    \"2. name: a unique name used to identify the service in the workspace\\n\",\n    \"3. image: a docker image object that contains the environment needed for scoring/inference\\n\",\n    \"4. 
deployment_config: a configuration object describing the compute type\\n\",\n    \"\\n\",\n    \"**Note**: The web service creation can take a few minutes\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Creating service\\n\",\n      \"Running.............................................\\n\",\n      \"SucceededACI service creation operation finished, operation \\\"Succeeded\\\"\\n\",\n      \"Healthy\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# deploy image as web service\\n\",\n    \"aci_service = Webservice.deploy_from_image(workspace = ws, \\n\",\n    \"                                           name = webservice_name,\\n\",\n    \"                                           image = image,\\n\",\n    \"                                           deployment_config = aci_config)\\n\",\n    \"\\n\",\n    \"aci_service.wait_for_deployment(show_output = True)\\n\",\n    \"print(aci_service.state)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Fetch logs to debug in case of failures.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# print(aci_service.get_logs())\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"If you want to reuse an existing service versus creating a new one, call the webservice with the name of the service. You can look up all the deployed webservices under deployment in the Azure Portal. 
Below is an example:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# aci_service = Webservice(workspace=ws, name='<<service-name>>')\\n\",\n    \"\\n\",\n    \"# to use the webservice\\n\",\n    \"# aci_service.run()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Conclusion**: Now we have a deployed webservice and deploying the model took less than 20 minutes!\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2. Test Deployed Webservice\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Depending on the needs of our QA system, we can either do real-time or batch scoring. We show an example of both types of scoring below using the following example [passage](https://www.semanticscholar.org/paper/Bidirectional-Attention-Flow-for-Machine-Seo-Kembhavi/007ab5528b3bd310a80d553cccad4b78dc496b02) and questions:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"passage = \\\"Machine Comprehension (MC), answering questions about a given context, \\\\\\n\",\n    \"requires modeling complex interactions between the context and the query. Recently,\\\\\\n\",\n    \"attention mechanisms have been successfully extended to MC. Typically these mechanisms\\\\\\n\",\n    \"use attention to summarize the query and context into a single vector, couple \\\\\\n\",\n    \"attentions temporally, and often form a uni-directional attention. 
In this paper \\\\\\n\",\n    \"we introduce the Bi-Directional Attention Flow (BIDAF) network, a multi-stage \\\\\\n\",\n    \"hierarchical process that represents the context at different levels of granularity \\\\\\n\",\n    \"and uses a bi-directional attention flow mechanism to achieve a query-aware context \\\\\\n\",\n    \"representation without early summarization. Our experimental evaluations show that \\\\\\n\",\n    \"our model achieves the state-of-the-art results in Stanford QA (SQuAD) and\\\\\\n\",\n    \"CNN/DailyMail Cloze Test datasets.\\\"\\n\",\n    \"\\n\",\n    \"question1 = \\\"What is BIDAF?\\\"\\n\",\n    \"question2 = \\\"What datasets does BIDAF achieve state-of-the-art results on?\\\"\\n\",\n    \"question3 = \\\"What do attention mechanisms do?\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.1 Real-time Scoring\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We prepare data for predicting answers for one passage-question pair by creating a dictionary with _question_ and _passage_ keys\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"data = {\\\"passage\\\": passage, \\\"question\\\":question1}\\n\",\n    \"data = json.dumps(data)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Time elapsed: 0.5916\\n\",\n      \"Answer: Bi-Directional Attention Flow\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"with Timer() as t:\\n\",\n    \"    score = aci_service.run(input_data=data)\\n\",\n    \"    t.stop()\\n\",\n    \"    print(\\\"Time elapsed: {}\\\".format(t))\\n\",\n    \"    \\n\",\n    \"result = json.loads(score)\\n\",\n    \"try:\\n\",\n    \"    output = 
result[\\\"result\\\"]\\n\",\n    \"    sb.glue(\\\"answer\\\", output)\\n\",\n    \"    print(\\\"Answer:\\\", output)\\n\",\n    \"except:\\n\",\n    \"    print(result[\\\"error\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We see that the model responded to the question \\\"What is BiDAF?\\\" with \\\"Bi-Directional Attention Flow\\\".\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.2 Batch Scoring\\n\",\n    \"\\n\",\n    \"We prepare the data for batch scoring by creating a list of dictionaries with _passage_ and _question_ keys.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"data_multiple = [{\\\"passage\\\": passage, \\\"question\\\":i} for i in [question1, question2, question3]]\\n\",\n    \"data_multiple = json.dumps(data_multiple)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Time elapsed: 0.5267\\n\",\n      \"['Bi-Directional Attention Flow', 'Stanford QA (SQuAD) andCNN/DailyMail Cloze Test', 'have been successfully extended to MC']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"with Timer() as t:\\n\",\n    \"    score = aci_service.run(input_data=data_multiple)\\n\",\n    \"    t.stop()\\n\",\n    \"    print(\\\"Time elapsed: {}\\\".format(t))\\n\",\n    \"    \\n\",\n    \"result = json.loads(score)\\n\",\n    \"try:\\n\",\n    \"    output = result[\\\"result\\\"]\\n\",\n    \"    print(output)\\n\",\n    \"except:\\n\",\n    \"    print(result[\\\"error\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We see that the model responded to the question \\\"What is BiDAF?\\\" with 
\\\"Bi-Directional Attention Flow\\\", the question \\\"What datasets does BIDAF achieve state-of-the-art results on?\\\" with \\\"Stanford QA (SQuAD) and CNN/DailyMail Cloze Test\\\", and the question \\\"What do attention mechanisms do?\\\" with \\\"summarize the query and context into a single vector, couple attentions temporally, and often form a uni-directional attention\\\". All these answers make sense given the passage and demonstrate that the AllenNLP pre-trained model is a good model for a deployed QA system. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Conclusion\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This notebook demonstrated how to produce a fast QA service in under 20 minutes using Azure Container Instances (ACI). We deployed a popular pre-trained model, BiDAF, provided by AllenNLP, which was state-of-the-art in 2017 and performs well on our example queries. \"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.4\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/sentence_similarity/README.md",
    "content": "# Sentence Similarity\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for building\nsentence similarity models. The [gensen](../../utils_nlp/models/gensen) and [pretrained\nembeddings](../../utils_nlp/models/pretrained_embeddings) utility scripts are used to speed up the\nmodel building process in the notebooks.  \nThe sentence similarity scores can be used in a wide\nvariety of applications, such as search/retrieval, nearest-neighbor or kernel-based classification\nmethods, recommendations, and ranking tasks.\n\n## What is sentence similarity\n\nSentence similarity or semantic textual similarity is a measure of how similar two pieces of text\nare, or to what degree they express the same meaning. Related tasks include paraphrase or duplicate\nidentification, search, and matching applications. The common methods used for text similarity range\nfrom simple word-vector dot products to pairwise classification, and more recently, deep neural\nnetworks.\n\nSentence similarity is normally calculated by the following two steps:\n\n1. obtaining the embeddings of the sentences\n\n2. taking the cosine similarity between them as shown in the following figure([source](https://tfhub.dev/google/universal-sentence-encoder/1)):\n\n    ![Sentence Similarity](https://nlpbp.blob.core.windows.net/images/example-similarity.png)\n\n## Summary\n\n|Notebook|Environment|Description|Dataset|\n|---|---|---|---|\n|[Creating a Baseline model](baseline_deep_dive.ipynb)| Local| A baseline model is a basic solution that serves as a point of reference for comparing other models to. 
The baseline model's performance gives us an indication of how much better our models can perform relative to a naive approach.|[STS Benchmark](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#STS_benchmark_dataset_and_companion_dataset)|\n|[BERT Sentence Encoder](bert_encoder.ipynb)|Local|In this notebook, we show how to extract features from pretrained BERT as sentence embeddings.|Handcrafted sample data|\n|[BERT with SentEval](bert_senteval.ipynb)|AzureML|In this notebook, we show how to use SentEval to compare the performance of BERT sequence encodings with various pooling strategies on a sentence similarity task. We leverage AzureML  resources such as Datastore and AmlCompute to autoscale our compute cluster and run the experiments in parallel.|[STS Benchmark](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#STS_benchmark_dataset_and_companion_dataset)|\n|Gensen | [Local](gensen_local.ipynb), [AzureML](gensen_aml_deep_dive.ipynb)|This notebook serves as an introduction to an end-to-end NLP solution for sentence similarity building one of the State of the Art models, GenSen. We provide two notebooks. One, which runs on the AzureML platform.  We show the advantages of AzureML when training large NLP models with GPU in this notebook. 
The other example walks through using a GPU enabled VM to train and score Gensen.|[SNLI](https://nlp.stanford.edu/projects/snli/)|\n|[Automated Machine Learning(AutoML) with Deployment on Azure Container Instance](automl_local_deployment_aci.ipynb)|Azure Container Instances|This notebook shows users how to use AutoML on local machine and deploy the model as a webservice to Azure Container Instances (ACI) to get a sentence similarity score.|[STS Benchmark](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#STS_benchmark_dataset_and_companion_dataset)|\n|[Google Universal Sentence Encoder with Azure Machine Learning Pipeline, AutoML with Deployment on Azure Kubernetes Service](automl_with_pipelines_deployment_aks.ipynb)|AzureML| This notebook shows a user how to use AzureML pipelines and deploy the pipeline output model as a webservice to Azure Kubernetes Service which can be used as an end point to get sentence similarity scores.|[STS Benchmark](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#STS_benchmark_dataset_and_companion_dataset)|\n\n## Using GPU vs Azure ML Compute\nWe did a comparative study to make it easier for you to choose between a GPU enabled Azure VM\nand Azure ML compute. The table below provides the cost vs performance trade-off for\neach of the choices.\n\n* The \"Azure VM\" column refers to the running time of the [gensen local](gensen_local.ipynb) notebook. All the other columns refer to the [gensen AzureML](gensen_aml_deep_dive.ipynb) notebook.\n* Both the Azure VM and each Azure ML Compute node are Standard_NC6 with 1 NVIDIA Tesla K80 GPU with 12 GB GPU memory.\n* The total time in the table stands for the training time + setup time.\n* Cost is the estimated cost of running the Azure ML Compute Job or the VM up-time.\n\n**Please note:** These were the estimated cost for running these notebooks as of July 1st, 2019. 
Please\nlook at the [Azure Pricing Calculator](https://azure.microsoft.com/en-us/pricing/calculator/) to see the most up-to-date pricing information.\n\n|---|Azure VM| AML 1 Node| AML 2 Nodes | AML 4 Nodes | AML 8 Nodes|\n|---|---|---|---|---|---|\n|Training Loss|4.91|4.81|4.78|4.77|4.58|\n|Total Time|1h 05m|1h 54m|1h 44m|1h 26m|1h 07m|\n|Cost|$1.12|$2.71|$4.68|$7.9|$12.1|\n"
  },
  {
    "path": "examples/sentence_similarity/automl_local_deployment_aci.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>\\n\",\n    \"\\n\",\n    \"<i>Licensed under the MIT License.</i>\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Local Automated Machine Learning Model with ACI Deployment for Predicting Sentence Similarity\"\n   ]\n  },\n    {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/sentence_similarity/automl_local_deployment_aci.png)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This notebook demonstrates how to use [Azure Machine Learning Service's](https://azure.microsoft.com/en-us/services/machine-learning-service/\\n\",\n    \") Automated Machine Learning ([AutoML](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-automated-ml\\n\",\n    \")) locally to automate machine learning model selection and tuning and how to use Azure Container Instance ([ACI](https://azure.microsoft.com/en-us/services/container-instances/\\n\",\n    \")) for deployment. We utilize the STS Benchmark dataset to predict sentence similarity and utilize AutoML's text preprocessing features.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Table of Contents\\n\",\n    \"1. [Introduction](#1.-Introduction)  \\n\",\n    \"     [1.1 What is Azure AutoML?](#1.1-What-is-Azure-AutoML?)  \\n\",\n    \"     [1.2 Modeling Problem](#1.2-Modeling-Problem)  \\n\",\n    \"1. [Data Preparation](#2.-Data-Preparation)  \\n\",\n    \"1. 
[Create AutoML Run](#3.-Create-AutoML-Run)    \\n\",\n    \"    [3.1 Link to or create a Workspace](#3.1-Link-to-or-create-a-Workspace)  \\n\",\n    \"    [3.2 Create AutoMLConfig object](#3.2-Create-AutoMLConfig-object)  \\n\",\n    \"    [3.3 Run Experiment](#3.3-Run-Experiment)      \\n\",\n    \"1. [Deploy Sentence Similarity Model](#4.-Deploy-Sentence-Similarity-Model)  \\n\",\n    \"    [4.1 Retrieve the Best Model](#4.1-Retrieve-the-Best-Model)  \\n\",\n    \"    [4.2 Register the Fitted Model for Deployment](#4.2-Register-the-Fitted-Model-for-Deployment)   \\n\",\n    \"    [4.3 Create an Entry Script](#4.3-Create-an-Entry-Script)   \\n\",\n    \"    [4.4 Create a YAML File for the Environment](#4.4-Create-a-YAML-File-for-the-Environment)  \\n\",\n    \"    [4.5 Create a Container Image](#4.5-Create-a-Container-Image)    \\n\",\n    \"    [4.6 Deploy the Image as a Web Service to Azure Container Instance](#4.6-Deploy-the-Image-as-a-Web-Service-to-Azure-Container-Instance)  \\n\",\n    \"    [4.7 Test Deployed Model](#4.7-Test-Deployed-Model)     \\n\",\n    \"1. [Clean](#5-Clean)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.1 What is Azure AutoML?\\n\",\n    \"\\n\",\n    \"Automated machine learning ([AutoML](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-automated-ml)) is a capability of Microsoft's [Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/\\n\",\n    \"). The goal of AutoML is to improve the productivity of data scientists and democratize AI by allowing for the rapid development and deployment of machine learning models. To achieve this goal, AutoML automates the process of selecting a ML model and tuning the model. All the user is required to provide is a dataset (suitable for a classification, regression, or time-series forecasting problem) and a metric to optimize in choosing the model and hyperparameters. 
The user is also given the ability to set time and cost constraints for the model selection and tuning.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![](https://nlpbp.blob.core.windows.net/images/automl.PNG)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The AutoML model selection and tuning process can be easily tracked through the Azure portal or directly in python notebooks through the use of widgets. AutoML quickly selects a high-quality machine learning model tailored for your prediction problem. In this notebook, we walk through the steps of preparing data, setting up an AutoML experiment, and evaluating the results of our best model. More information about running AutoML experiments in Python can be found [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train). \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.2 Modeling Problem\\n\",\n    \"\\n\",\n    \"The regression problem we will demonstrate is predicting sentence similarity scores on the STS Benchmark dataset. The [STS Benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#STS_benchmark_dataset_and_companion_dataset) contains a selection of English datasets that were used in Semantic Textual Similarity (STS) tasks 2012-2017. The dataset contains 8,628 sentence pairs with a human-labeled score representing the sentences' similarity (ranging from 0, for no meaning overlap, to 5, meaning equivalence). 
The sentence pairs will be embedded using AutoML's built-in preprocessing, so we'll pass the sentences directly into the model.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Turning diagnostics collection on. \\n\",\n      \"System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\\n\",\n      \"Azure ML SDK Version: 1.0.48\\n\",\n      \"Pandas version: 0.23.4\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Set the environment path to find NLP\\n\",\n    \"import sys\\n\",\n    \"\\n\",\n    \"sys.path.append(\\\"../../\\\")\\n\",\n    \"import time\\n\",\n    \"import os\\n\",\n    \"import pandas as pd\\n\",\n    \"import shutil\\n\",\n    \"import numpy as np\\n\",\n    \"import torch\\n\",\n    \"import sys\\n\",\n    \"from scipy.stats import pearsonr\\n\",\n    \"from scipy.spatial import distance\\n\",\n    \"from sklearn.externals import joblib\\n\",\n    \"import json\\n\",\n    \"import scrapbook as sb\\n\",\n    \"\\n\",\n    \"# Import utils\\n\",\n    \"from utils_nlp.azureml import azureml_utils\\n\",\n    \"from utils_nlp.dataset import stsbenchmark\\n\",\n    \"from utils_nlp.dataset.preprocess import (\\n\",\n    \"    to_lowercase,\\n\",\n    \"    to_spacy_tokens,\\n\",\n    \"    rm_spacy_stopwords,\\n\",\n    \")\\n\",\n    \"from utils_nlp.common.timer import Timer\\n\",\n    \"\\n\",\n    \"# Tensorflow dependencies for Google Universal Sentence Encoder\\n\",\n    \"import tensorflow_hub as hub\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# AzureML packages\\n\",\n    \"import azureml as aml\\n\",\n    \"import logging\\n\",\n    \"from azureml.telemetry import set_diagnostics_collection\\n\",\n    \"\\n\",\n    \"set_diagnostics_collection(send_diagnostics=True)\\n\",\n    \"from azureml.train.automl 
import AutoMLConfig\\n\",\n    \"from azureml.core.experiment import Experiment\\n\",\n    \"from azureml.widgets import RunDetails\\n\",\n    \"from azureml.train.automl.run import AutoMLRun\\n\",\n    \"from azureml.core.webservice import AciWebservice, Webservice\\n\",\n    \"from azureml.core.image import ContainerImage\\n\",\n    \"from azureml.core.conda_dependencies import CondaDependencies\\n\",\n    \"\\n\",\n    \"print(\\\"System version: {}\\\".format(sys.version))\\n\",\n    \"print(\\\"Azure ML SDK Version:\\\", aml.core.VERSION)\\n\",\n    \"print(\\\"Pandas version: {}\\\".format(pd.__version__))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"BASE_DATA_PATH = \\\"../../data\\\"\\n\",\n    \"CPU_CORES = 1\\n\",\n    \"MEMORY_GB = 8\\n\",\n    \"\\n\",\n    \"# Define the settings for AutoML\\n\",\n    \"automl_task = \\\"regression\\\"\\n\",\n    \"automl_iteration_timeout = 15\\n\",\n    \"automl_iterations = 50\\n\",\n    \"automl_metric = \\\"spearman_correlation\\\"\\n\",\n    \"automl_preprocess = True\\n\",\n    \"automl_model_blacklist = ['XGBoostRegressor']\\n\",\n    \"\\n\",\n    \"config_path = (\\n\",\n    \"    \\\"./.azureml\\\"\\n\",\n    \")  # Path to the directory containing config.json with azureml credentials\\n\",\n    \"\\n\",\n    \"webservice_name = \\\"aci-automl-service\\\" #name for webservice; must be unique within your workspace\\n\",\n    \"\\n\",\n    \"# Azure resources\\n\",\n    \"subscription_id = \\\"YOUR_SUBSCRIPTION_ID\\\"\\n\",\n    \"resource_group = \\\"YOUR_RESOURCE_GROUP_NAME\\\"  \\n\",\n    \"workspace_name = \\\"YOUR_WORKSPACE_NAME\\\"  \\n\",\n    \"workspace_region = \\\"YOUR_WORKSPACE_REGION\\\" #Possible values eastus, eastus2 and so on.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n  
 \"source\": [\n    \"automl_settings = {\\n\",\n    \"    \\\"task\\\": automl_task,  # type of task: classification, regression or forecasting\\n\",\n    \"    \\\"debug_log\\\": \\\"automated_ml_errors.log\\\",\\n\",\n    \"    \\\"path\\\": \\\"./automated-ml-regression\\\",\\n\",\n    \"    \\\"iteration_timeout_minutes\\\": automl_iteration_timeout,  # How long each iteration can take before moving on\\n\",\n    \"    \\\"iterations\\\": automl_iterations,  # Number of algorithm options to try\\n\",\n    \"    \\\"primary_metric\\\": automl_metric,  # Metric to optimize\\n\",\n    \"    \\\"preprocess\\\": automl_preprocess,  # Whether dataset preprocessing should be applied\\n\",\n    \"    \\\"blacklist_models\\\": automl_model_blacklist #exclude this model due to installation issues\\n\",\n    \"}\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 2. Data Preparation\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## STS Benchmark Dataset\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"As described above, the STS Benchmark dataset contains 8.6K sentence pairs along with a human-annotated score for how similar the two sentences are. We will load the training, development (validation), and test sets provided by STS Benchmark and preprocess the data (lowercase the text, drop irrelevant columns, and rename the remaining columns) using the utils contained in this repo. 
Each dataset will ultimately have three columns: _sentence1_ and _sentence2_ which contain the text of the sentences in the sentence pair, and _score_ which contains the human-annotated similarity score of the sentence pair.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████████████████████████████████████████████████████████████████████████████| 401/401 [00:02<00:00, 146KB/s]\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Data downloaded to ../../data\\\\raw\\\\stsbenchmark\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████████████████████████████████████████████████████████████████████████████| 401/401 [00:02<00:00, 172KB/s]\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Data downloaded to ../../data\\\\raw\\\\stsbenchmark\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████████████████████████████████████████████████████████████████████████████| 401/401 [00:02<00:00, 182KB/s]\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Data downloaded to ../../data\\\\raw\\\\stsbenchmark\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Load in the raw datasets as pandas dataframes\\n\",\n    \"train_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\\\"train\\\")\\n\",\n    \"dev_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\\\"dev\\\")\\n\",\n    \"test_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\\\"test\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 
5,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Clean each dataset by lowercasing text, removing irrelevant columns,\\n\",\n    \"# and renaming the remaining columns\\n\",\n    \"train_clean = stsbenchmark.clean_sts(train_raw)\\n\",\n    \"dev_clean = stsbenchmark.clean_sts(dev_raw)\\n\",\n    \"test_clean = stsbenchmark.clean_sts(test_raw)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Convert all text to lowercase\\n\",\n    \"train = to_lowercase(train_clean)\\n\",\n    \"dev = to_lowercase(dev_clean)\\n\",\n    \"test = to_lowercase(test_clean)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Training set has 5749 sentences\\n\",\n      \"Development set has 1500 sentences\\n\",\n      \"Testing set has 1379 sentences\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(\\\"Training set has {} sentences\\\".format(len(train)))\\n\",\n    \"print(\\\"Development set has {} sentences\\\".format(len(dev)))\\n\",\n    \"print(\\\"Testing set has {} sentences\\\".format(len(test)))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" 
class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>score</th>\\n\",\n       \"      <th>sentence1</th>\\n\",\n       \"      <th>sentence2</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>5.00</td>\\n\",\n       \"      <td>a plane is taking off.</td>\\n\",\n       \"      <td>an air plane is taking off.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>3.80</td>\\n\",\n       \"      <td>a man is playing a large flute.</td>\\n\",\n       \"      <td>a man is playing a flute.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>3.80</td>\\n\",\n       \"      <td>a man is spreading shreded cheese on a pizza.</td>\\n\",\n       \"      <td>a man is spreading shredded cheese on an uncoo...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>2.60</td>\\n\",\n       \"      <td>three men are playing chess.</td>\\n\",\n       \"      <td>two men are playing chess.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>4.25</td>\\n\",\n       \"      <td>a man is playing the cello.</td>\\n\",\n       \"      <td>a man seated is playing the cello.</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"   score                                      sentence1  \\\\\\n\",\n       \"0   5.00                         a plane is taking off.   \\n\",\n       \"1   3.80                a man is playing a large flute.   \\n\",\n       \"2   3.80  a man is spreading shreded cheese on a pizza.   
\\n\",\n       \"3   2.60                   three men are playing chess.   \\n\",\n       \"4   4.25                    a man is playing the cello.   \\n\",\n       \"\\n\",\n       \"                                           sentence2  \\n\",\n       \"0                        an air plane is taking off.  \\n\",\n       \"1                          a man is playing a flute.  \\n\",\n       \"2  a man is spreading shredded cheese on an uncoo...  \\n\",\n       \"3                         two men are playing chess.  \\n\",\n       \"4                 a man seated is playing the cello.  \"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"train.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 3. Create AutoML Run\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"AutoML can be used for classification, regression or timeseries experiments. Each experiment type has corresponding machine learning models and metrics that can be optimized (see [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train)) and the options will be delineated below. As a first step we connect to an existing workspace or create one if it doesn't exist.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3.1 Link to or create a Workspace\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The following cell looks to set up the connection to your [Azure Machine Learning service Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace). You can choose to connect to an existing workspace or create a new one. \\n\",\n    \"\\n\",\n    \"**To access an existing workspace:**\\n\",\n    \"1. 
If you have a `config.json` file, you do not need to provide the workspace information; you will only need to update the `config_path` variable that is defined above which contains the file.\\n\",\n    \"2. Otherwise, you will need to supply the following:\\n\",\n    \"    * The name of your workspace\\n\",\n    \"    * Your subscription id\\n\",\n    \"    * The resource group name\\n\",\n    \"\\n\",\n    \"**To create a new workspace:**\\n\",\n    \"\\n\",\n    \"Set the following information:\\n\",\n    \"* A name for your workspace\\n\",\n    \"* Your subscription id\\n\",\n    \"* The resource group name\\n\",\n    \"* [Azure region](https://azure.microsoft.com/en-us/global-infrastructure/regions/) to create the workspace in, such as `eastus2`. \\n\",\n    \"\\n\",\n    \"This will automatically create a new resource group for you in the region provided if a resource group with the name given does not already exist. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Performing interactive authentication. Please follow the instructions on the terminal.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING - Note, we have launched a browser for you to login. For old experience with device code, use \\\"az login --use-device-code\\\"\\n\",\n      \"WARNING - You have logged in. 
Now let us find all the subscriptions to which you have access...\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Interactive authentication successfully completed.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"ws = azureml_utils.get_or_create_workspace(\\n\",\n    \"    config_path=config_path,\\n\",\n    \"    subscription_id=subscription_id,\\n\",\n    \"    resource_group=resource_group,\\n\",\n    \"    workspace_name=workspace_name,\\n\",\n    \"    workspace_region=workspace_region,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\n\",\n    \"    \\\"Workspace name: \\\" + ws.name,\\n\",\n    \"    \\\"Azure region: \\\" + ws.location,\\n\",\n    \"    \\\"Subscription id: \\\" + ws.subscription_id,\\n\",\n    \"    \\\"Resource group: \\\" + ws.resource_group,\\n\",\n    \"    sep=\\\"\\\\n\\\",\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3.2 Create AutoMLConfig object\\n\",\n    \"Next, we specify the parameters for the AutoMLConfig class. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**task**  \\n\",\n    \"AutoML supports the following base learners for the regression task: Elastic Net, Light GBM, Gradient Boosting, Decision Tree, K-nearest Neighbors, LARS Lasso, Stochastic Gradient Descent, Random Forest, Extremely Randomized Trees, XGBoost, DNN Regressor, Linear Regression. In addition, AutoML also supports two kinds of ensemble methods: voting (weighted average of the output of multiple base learners) and stacking (training a second \\\"metalearner\\\" which uses the base algorithms' predictions to predict the target variable). 
Specific base learners can be included or excluded in the parameters for the AutoMLConfig class (whitelist_models and blacklist_models) and the voting/stacking ensemble options can be specified as well (enable_voting_ensemble and enable_stack_ensemble)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**preprocess**  \\n\",\n    \"AutoML also has advanced preprocessing methods, eliminating the need for users to perform this manually. Data is automatically scaled and normalized but an additional parameter in the AutoMLConfig class enables the use of more advanced techniques including imputation, generating additional features, transformations, word embeddings, etc. (full list found [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-create-portal-experiments#preprocess)). Note that algorithm-specific preprocessing will be applied even if preprocess=False. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**primary_metric**  \\n\",\n    \"The regression metrics available are the following: Spearman Correlation (spearman_correlation), Normalized RMSE (normalized_root_mean_squared_error), Normalized MAE (normalized_mean_absolute_error), and R2 score (r2_score) \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Constraints:**  \\n\",\n    \"There is a cost_mode parameter to set cost prediction modes (see options [here](https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl.automlconfig?view=azure-ml-py)). 
To set constraints on time there are multiple parameters including experiment_exit_score (target score to exit the experiment after achieving), experiment_timeout_minutes (maximum amount of time for all combined iterations), and iterations (total number of different algorithm and parameter combinations to try).\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Note**: we are directly passing in sentence pairs as data because we are relying upon AutoML's built-in preprocessing (by setting preprocess = True in the AutoMLConfig parameters) to perform the embedding step.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"X_train = train.drop(\\\"score\\\", axis=1).values\\n\",\n    \"y_train = train[\\\"score\\\"].values.flatten()\\n\",\n    \"X_validation = dev.drop(\\\"score\\\", axis=1).values\\n\",\n    \"y_validation = dev[\\\"score\\\"].values.flatten()\\n\",\n    \"\\n\",\n    \"# local compute\\n\",\n    \"automated_ml_config = AutoMLConfig(\\n\",\n    \"    X=X_train,\\n\",\n    \"    y=y_train,\\n\",\n    \"    X_valid=X_validation,\\n\",\n    \"    y_valid=y_validation,\\n\",\n    \"    verbosity=logging.ERROR,\\n\",\n    \"    **automl_settings  # where the autoML main settings are defined\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3.3 Run Experiment\\n\",\n    \"\\n\",\n    \"Run the experiment locally and inspect the results using a widget\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"experiment = Experiment(ws, \\\"NLP-SS-automl\\\")\\n\",\n    \"local_run = experiment.submit(automated_ml_config, show_output=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [],\n   
\"source\": [\n    \"#local_run.cancel()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The results of the completed run can be visualized in two ways. First, by using a RunDetails widget as shown in the cell below. Second, by accessing the [Azure portal](https://portal.azure.com), selecting your workspace, clicking on _Experiments_ and then selecting the name and run number of the experiment you want to inspect. Both these methods will show the results and duration for each iteration (algorithm tried), a visualization of the results, and information about the run including the compute target, primary metric, etc.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Inspect the run details using the provided widget\\n\",\n    \"RunDetails(local_run).show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![](https://nlpbp.blob.core.windows.net/images/autoMLwidget.PNG)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 4. Deploy Sentence Similarity Model\\n\",\n    \"Deploying an Azure Machine Learning model as a web service creates a REST API. You can send data to this API and receive the prediction returned by the model.\\n\",\n    \"In general, you create a webservice by deploying a model as an image to a Compute Target.\\n\",\n    \"\\n\",\n    \"Some of the Compute Targets are: \\n\",\n    \"1. Azure Container Instance\\n\",\n    \"2. Azure Kubernetes Service\\n\",\n    \"3. Local web service\\n\",\n    \"\\n\",\n    \"The general workflow for deploying a model is as follows:\\n\",\n    \"1. Register a model\\n\",\n    \"2. Prepare to deploy\\n\",\n    \"3. Deploy the model to the compute target\\n\",\n    \"4. 
Test the deployed model (webservice)\\n\",\n    \"\\n\",\n    \"In this notebook, we walk you through the process of creating a webservice running on Azure Container Instance by deploying an AutoML model as an image. ACI is typically used for low scale, CPU-based workloads. (You can find more information on deploying and serving models [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where))\\n\",\n    \"\\n\",\n    \"## 4.1 Retrieve the Best Model\\n\",\n    \"Now we can identify the model that maximized performance on a given metric (spearman correlation in our case) using the `get_output` method which returns the best_run (AutoMLRun object with information about the experiment) and fitted_model ([Pipeline](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-getting-started.ipynb) object) across all iterations. Overloads on `get_output` allow you to retrieve the best run and fitted model for any logged metric or for a particular iteration. \\n\",\n    \"\\n\",\n    \"The different steps that make up the pipeline can be accessed through `fitted_model.named_steps` and information about data preprocessing is available through `fitted_model.named_steps['datatransformer'].get_featurization_summary()`\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"best_run, fitted_model = local_run.get_output()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.2 Register the Fitted Model for Deployment\\n\",\n    \"\\n\",\n    \"Registering a model means registering one or more files that make up a model. The Machine Learning models are registered in your current Azure Machine Learning Workspace. The model can either come from Azure Machine Learning or another location, such as your local machine. 
\\n\",\n    \"Below we show how a model is registered from the results of an experiment run. If neither metric nor iteration are specified in the register_model call, the iteration with the best primary metric is registered.\\n\",\n    \"\\n\",\n    \"See other ways to register a model [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Registering model AutoML29e4bb98ebest\\n\",\n      \"AutoML29e4bb98ebest\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"description = \\\"AutoML Model\\\"\\n\",\n    \"tags = {\\\"area\\\": \\\"nlp\\\", \\\"type\\\": \\\"sentence similarity automl\\\"}\\n\",\n    \"name = \\\"automl\\\"\\n\",\n    \"model = local_run.register_model(description=description, tags=tags)\\n\",\n    \"\\n\",\n    \"print(local_run.model_id)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.3 Create an Entry Script\\n\",\n    \"In this section we show an example of an entry script, which is called from the deployed webservice. `score.py` is our entry script. The script must contain:\\n\",\n    \"1. init() - This function loads the model in a global object.\\n\",\n    \"2. run() - This function is used for model prediction. The inputs and outputs to `run()` typically use JSON for serialization and deserialization. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Overwriting score.py\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"%%writefile score.py\\n\",\n    \"import pickle\\n\",\n    \"import json\\n\",\n    \"import numpy\\n\",\n    \"import azureml.train.automl\\n\",\n    \"from sklearn.externals import joblib\\n\",\n    \"from azureml.core.model import Model\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def init():\\n\",\n    \"    global model\\n\",\n    \"    model_path = Model.get_model_path(\\n\",\n    \"        model_name=\\\"<<modelid>>\\\"\\n\",\n    \"    )  # this name is model.id of model that we want to deploy\\n\",\n    \"    # deserialize the model file back into a sklearn model\\n\",\n    \"    model = joblib.load(model_path)\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def run(rawdata):\\n\",\n    \"    try:\\n\",\n    \"        data = json.loads(rawdata)[\\\"data\\\"]\\n\",\n    \"        data = numpy.array(data)\\n\",\n    \"        result = model.predict(data)\\n\",\n    \"    except Exception as e:\\n\",\n    \"        result = str(e)\\n\",\n    \"        return json.dumps({\\\"error\\\": result})\\n\",\n    \"    return json.dumps({\\\"result\\\": result.tolist()})\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Substitute the actual model id in the script file.\\n\",\n    \"script_file_name = \\\"score.py\\\"\\n\",\n    \"\\n\",\n    \"with open(script_file_name, \\\"r\\\") as cefr:\\n\",\n    \"    content = cefr.read()\\n\",\n    \"\\n\",\n    \"with open(script_file_name, \\\"w\\\") as cefw:\\n\",\n    \"    cefw.write(content.replace(\\\"<<modelid>>\\\", local_run.model_id))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.4 Create a 
YAML File for the Environment\\n\",\n    \"\\n\",\n    \"To ensure the fit results are consistent with the training results, the SDK dependency versions need to be the same as the environment that trains the model. The following cells create a file, automlenv.yml, which specifies the dependencies from the run.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ml_run = AutoMLRun(experiment=experiment, run_id=local_run.id)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 20,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"best_iteration = int(\\n\",\n    \"    best_run.id.split(\\\"_\\\")[-1]\\n\",\n    \")  # get the appended iteration number for the best model\\n\",\n    \"dependencies = ml_run.get_run_sdk_dependencies(iteration=best_iteration)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 21,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"{'azureml-widgets': '1.0.48',\\n\",\n       \" 'azureml-train': '1.0.48',\\n\",\n       \" 'azureml-train-restclients-hyperdrive': '1.0.48',\\n\",\n       \" 'azureml-train-core': '1.0.48',\\n\",\n       \" 'azureml-train-automl': '1.0.48',\\n\",\n       \" 'azureml-telemetry': '1.0.48',\\n\",\n       \" 'azureml-sdk': '1.0.48',\\n\",\n       \" 'azureml-pipeline': '1.0.48',\\n\",\n       \" 'azureml-pipeline-steps': '1.0.48',\\n\",\n       \" 'azureml-pipeline-core': '1.0.48',\\n\",\n       \" 'azureml-mlflow': '1.0.48',\\n\",\n       \" 'azureml-dataprep': '1.1.8',\\n\",\n       \" 'azureml-dataprep-native': '13.0.0',\\n\",\n       \" 'azureml-core': '1.0.48',\\n\",\n       \" 'azureml-automl-core': '1.0.48'}\"\n      ]\n     },\n     \"execution_count\": 21,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"dependencies\"\n   ]\n  },\n  {\n   \"cell_type\": 
\"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Add dependencies in the yaml file from the above cell. You must specify the version of \\\"azureml-sdk[automl]\\\" while creating the yaml file.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 22,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"'automlenv.yml'\"\n      ]\n     },\n     \"execution_count\": 22,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"myenv = CondaDependencies.create(\\n\",\n    \"    conda_packages=[\\\"numpy\\\", \\\"scikit-learn==0.21.2\\\", \\\"py-xgboost<=0.80\\\", \\\"pandas==0.24.2\\\"],\\n\",\n    \"    pip_packages=[\\\"azureml-sdk[automl]==1.0.48.*\\\"],\\n\",\n    \"    python_version=\\\"3.6.8\\\",\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"conda_env_file_name = \\\"automlenv.yml\\\"\\n\",\n    \"myenv.save_to_file(\\\".\\\", conda_env_file_name)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.5 Create a Container Image\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"In this step we create a container image which is wrapper containing the entry script, yaml file with package dependencies and the model. The created image is then deployed as a webservice in the next step. 
This step can take up to 10 minutes and even longer if the model is large.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 23,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Creating image\\n\",\n      \"Running...................................................\\n\",\n      \"Succeeded\\n\",\n      \"Image creation operation finished for image automl-image:20, operation \\\"Succeeded\\\"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"image_config = ContainerImage.image_configuration(\\n\",\n    \"    execution_script=script_file_name,\\n\",\n    \"    runtime=\\\"python\\\",\\n\",\n    \"    conda_file=conda_env_file_name,\\n\",\n    \"    description=\\\"Image with automl model\\\",\\n\",\n    \"    tags={\\\"area\\\": \\\"nlp\\\", \\\"type\\\": \\\"sentencesimilarity automl\\\"},\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"image = ContainerImage.create(\\n\",\n    \"    name=\\\"automl-image\\\",\\n\",\n    \"    # this is the model object\\n\",\n    \"    models=[model],\\n\",\n    \"    image_config=image_config,\\n\",\n    \"    workspace=ws,\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"image.wait_for_creation(show_output=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"If the above step fails, then use the below command to see logs\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 24,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# print(image.image_build_log_uri)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.6 Deploy the Image as a Web Service to Azure Container Instance\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Azure Container Instances are mostly used for deploying your models as a web service if one or more of the following 
conditions are true:\\n\",\n    \"1. You need to quickly deploy and validate your model.\\n\",\n    \"2. You are testing a model that is under development.\\n\",\n    \"\\n\",\n    \"To set them up properly, we need to indicate the number of CPU cores and the amount of memory we want to allocate to our web service.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 25,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Set the web service configuration\\n\",\n    \"aci_config = AciWebservice.deploy_configuration(\\n\",\n    \"    cpu_cores=CPU_CORES, memory_gb=MEMORY_GB\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The final step to deploying our web service is to call `WebService.deploy_from_image()`. This function uses the Docker image and the deployment configuration we created above to perform the following:\\n\",\n    \"1. Deploy the docker image to an Azure Container Instance\\n\",\n    \"2. Call the init() function in our scoring file\\n\",\n    \"3. Provide an HTTP endpoint for scoring calls\\n\",\n    \"\\n\",\n    \"The deploy_from_image method requires the following parameters:\\n\",\n    \"\\n\",\n    \"1. workspace: the workspace containing the service\\n\",\n    \"2. name: a unique name used to identify the service in the workspace\\n\",\n    \"3. image: a docker image object that contains the environment needed for scoring/inference\\n\",\n    \"4. 
deployment_config: a configuration object describing the compute type\\n\",\n    \"\\n\",\n    \"**Note:** The web service creation can take a few minutes \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 26,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Creating service\\n\",\n      \"Running............................\\n\",\n      \"SucceededACI service creation operation finished, operation \\\"Succeeded\\\"\\n\",\n      \"Healthy\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# deploy image as web service\\n\",\n    \"aci_service = Webservice.deploy_from_image(\\n\",\n    \"    workspace=ws, name=webservice_name, image=image, deployment_config=aci_config\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"aci_service.wait_for_deployment(show_output=True)\\n\",\n    \"print(aci_service.state)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Fetch logs to debug in case of failures.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 27,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# print(aci_service.get_logs())\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"If you want to reuse an existing service versus creating a new one, call the webservice with the name. You can look up all the deployed webservices under deployment in the Azure Portal. 
Below is an example:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 28,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# aci_service = Webservice(workspace=ws, name='<<service-name>>')\\n\",\n    \"\\n\",\n    \"# to use the webservice\\n\",\n    \"# aci_service.run()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.7 Test Deployed Model\\n\",\n    \"\\n\",\n    \"Testing the deployed model means running the created webservice. <br>\\n\",\n    \"The deployed model can be tested by passing a list of sentence pairs. The output will be a score between 0 and 5, with 0 indicating no meaning overlap between the sentences and 5 meaning equivalence.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 29,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"sentences = [\\n\",\n    \"    [\\\"This is sentence1\\\", \\\"This is sentence1\\\"],\\n\",\n    \"    [\\\"A hungry cat.\\\", \\\"A sleeping cat\\\"],\\n\",\n    \"    [\\\"Its summer time \\\", \\\"Winter is coming\\\"],\\n\",\n    \"]\\n\",\n    \"data = {\\\"data\\\": sentences}\\n\",\n    \"data = json.dumps(data)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 30,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Time elapsed: 0.3863\\n\",\n      \"Number of samples predicted: 3\\n\",\n      \"[1.8922695576076576, 2.981616317499634, 1.895254173398724]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Set up a Timer to see how long the model takes to predict\\n\",\n    \"t = Timer()\\n\",\n    \"\\n\",\n    \"t.start()\\n\",\n    \"score = aci_service.run(input_data=data)\\n\",\n    \"t.stop()\\n\",\n    \"\\n\",\n    \"print(\\\"Time elapsed: {}\\\".format(t))\\n\",\n    \"\\n\",\n    \"result = json.loads(score)\\n\",\n    \"try:\\n\",\n    \"    output = 
result[\\\"result\\\"]\\n\",\n    \"    print(\\\"Number of samples predicted: {}\\\".format(len(output)))\\n\",\n    \"    print(output)\\n\",\n    \"except:\\n\",\n    \"    print(result[\\\"error\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Finally, we'll calculate the Pearson Correlation on the test set.\\n\",\n    \"\\n\",\n    \"**What is Pearson Correlation?**\\n\",\n    \"\\n\",\n    \"Our evaluation metric is Pearson correlation ($\\\\rho$) which is a measure of the linear correlation between two variables. The formula for calculating Pearson correlation is as follows:  \\n\",\n    \"\\n\",\n    \"$$\\\\rho_{X,Y} = \\\\frac{E[(X-\\\\mu_X)(Y-\\\\mu_Y)]}{\\\\sigma_X \\\\sigma_Y}$$\\n\",\n    \"\\n\",\n    \"This metric takes a value in [-1,1] where -1 represents a perfect negative correlation, 1 represents a perfect positive correlation, and 0 represents no correlation. We utilize the Pearson correlation metric as this is the main metric that [SentEval](http://nlpprogress.com/english/semantic_textual_similarity.html), a widely-used evaluation toolkit for evaluating sentence representations, uses for the STS Benchmark dataset.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 31,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"test_y = test[\\\"score\\\"].values.flatten()\\n\",\n    \"test_x = test.drop(\\\"score\\\", axis=1).values.tolist()\\n\",\n    \"\\n\",\n    \"data = {\\\"data\\\": test_x}\\n\",\n    \"data = json.dumps(data)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 32,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Time elapsed: 1.1874\\n\",\n      \"Number of samples predicted: 1379\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Set up a Timer to see how long the model takes to predict\\n\",\n    \"t = Timer()\\n\",\n    
\"\\n\",\n    \"t.start()\\n\",\n    \"score = aci_service.run(input_data=data)\\n\",\n    \"t.stop()\\n\",\n    \"\\n\",\n    \"print(\\\"Time elapsed: {}\\\".format(t))\\n\",\n    \"\\n\",\n    \"result = json.loads(score)\\n\",\n    \"try:\\n\",\n    \"    output = result[\\\"result\\\"]\\n\",\n    \"    print(\\\"Number of samples predicted: {}\\\".format(len(output)))\\n\",\n    \"except:\\n\",\n    \"    print(result[\\\"error\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 33,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# get Pearson Correlation\\n\",\n    \"pearson = pearsonr(output, test_y)[0]\\n\",\n    \"print(pearson)\\n\",\n    \"\\n\",\n    \"sb.glue(\\\"pearson_correlation\\\", pearson)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The goal of this notebook is to demonstrate how to use AutoML locally and then deploy the model to Azure Container Instance quickly. The model utilizes the built-in capabilities of AutoML to embed our sentences. The model performance on its own, without tweaking, is not very strong with this particular dataset. For a more advanced model, see [AutoML with Pipelines Deployment AKS](automl_with_pipelines_deployment_aks.ipynb) for much stronger performance on the same task. This notebook utilizes AzureML Pipelines to explicitly embed our sentences using the Google Universal Sentence Encoder (USE) model. For our dataset, the Google USE embeddings result in superior model performance.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 5. Clean up\\n\",\n    \"Throughout the notebook, we used a workspace and Azure container instances. To get a sense of the cost we incurred, we can refer to this [calculator](https://azure.microsoft.com/en-us/pricing/calculator/). 
We can also navigate to the [Cost Management + Billing](https://ms.portal.azure.com/#blade/Microsoft_Azure_Billing/ModernBillingMenuBlade/Overview) pane on the portal, click on our subscription ID, and click on the Cost Analysis tab to check our credit usage.\\n\",\n    \"<br><br>\\n\",\n    \"In order not to incur extra costs, let's delete the resources we no longer need.\\n\",\n    \"<br><br>\\n\",\n    \"Once we have verified that our web service works well on ACI, we can delete it. This helps reduce [costs](https://azure.microsoft.com/en-us/pricing/details/container-instances/), since the container group we were paying for no longer exists, and allows us to keep our workspace clean.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 34,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# aci_service.delete()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"At this point, the main resource we are paying for is the Standard Azure Container Registry (ACR), which contains our Docker image. Details on pricing are available [here](https://azure.microsoft.com/en-us/pricing/details/container-registry/).\\n\",\n    \"\\n\",\n    \"We may decide to use our Docker image in a separate ACI or even in an AKS deployment. In that case, we should keep it available in our workspace. However, if we no longer have a use for it, we can delete it.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 35,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# docker_image.delete()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"If our goal is to continue using our workspace, we should keep it available. 
On the contrary, if we plan on no longer using it and its associated resources, we can delete it.\\n\",\n    \"<br><br>\\n\",\n    \"Note: Deleting the workspace will delete all the experiments, outputs, models, Docker images, deployments, etc. that we created in that workspace\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 36,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# ws.delete(delete_dependent_resources=True)\\n\",\n    \"# This deletes our workspace, the container registry, the account storage, Application Insights and the key vault\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"As mentioned above, Azure Container Instances tend to be used to develop and test deployments. They are typically configured with CPUs, which usually suffice when the number of requests per second is not too high. When working with several instances, we can configure them further by specifically allocating CPU resources to each of them.\\n\",\n    \"\\n\",\n    \"For production requirements, i.e. when > 100 requests per second are expected, we recommend deploying models to Azure Kubernetes Service (AKS). It is a convenient infrastructure as it manages hosted Kubernetes environments, and makes it easy to deploy and manage containerized applications without container orchestration expertise. 
It also supports deployments with CPU clusters and deployments with GPU clusters. For more examples on deployment follow [MachineLearningNotebooks](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/deployment) github repository.\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"## Next Steps\\n\",\n    \"\\n\",\n    \"Check out [AutoML with Pipelines Deployment AKS](automl_with_pipelines_deployment_aks.ipynb) to see how to construct an AzureML Pipeline with an embedding step (using Google Universal Sentence Encoder model) and an AutoMLStep, increasing our Pearson correlation score. Also, this notebook demonstrates deployment using AKS versus ACI.\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.4\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/sentence_similarity/automl_with_pipelines_deployment_aks.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>\\n\",\n    \"\\n\",\n    \"<i>Licensed under the MIT License.</i>\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# AzureML Pipeline, AutoML, AKS Deployment for Sentence Similarity\"\n   ]\n  },\n    {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/sentence_similarity/automl_with_pipelines_deployment_aks.png)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This notebook builds off of the [AutoML Local Deployment ACI](automl_local_deployment_aci.ipynb) notebook and demonstrates how to use [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning-service/\\n\",\n    \") pipelines and Automated Machine Learning ([AutoML](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-automated-ml\\n\",\n    \")) to streamline the creation of a machine learning workflow for predicting sentence similarity. The pipeline contains two steps:   \\n\",\n    \"1. PythonScriptStep: embeds sentences using a popular sentence embedding model, Google Universal Sentence Encoder\\n\",\n    \"2. 
AutoMLStep: demonstrates how to use Automated Machine Learning (AutoML) to automate model selection for predicting sentence similarity (regression)\\n\",\n    \"\\n\",\n    \"After creating the pipeline, the notebook demonstrates the deployment of our sentence similarity model using Azure Kubernetes Service ([AKS](https://docs.microsoft.com/en-us/azure/aks/intro-kubernetes\\n\",\n    \")).\\n\",\n    \"\\n\",\n    \"This notebook showcases how to use the following AzureML features:  \\n\",\n    \"- AzureML Pipelines (PythonScriptStep and AutoMLStep)\\n\",\n    \"- Automated Machine Learning\\n\",\n    \"- AmlCompute\\n\",\n    \"- Datastore\\n\",\n    \"- Logging\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Table of Contents\\n\",\n    \"1. [Introduction](#1.-Introduction)  \\n\",\n    \"    * 1.1 [What are AzureML Pipelines?](#1.1-What-are-AzureML-Pipelines?)  \\n\",\n    \"    * 1.2 [What is Azure AutoML?](#1.2-What-is-Azure-AutoML?)  \\n\",\n    \"    * 1.3 [Modeling Problem](#1.3-Modeling-Problem)  \\n\",\n    \"2. [Data Preparation](#2.-Data-Preparation)  \\n\",\n    \"3. [AzureML Setup](#3.-AzureML-Setup)  \\n\",\n    \"    * 3.1 [Link to or create a `Workspace`](#3.1-Link-to-or-create-a-Workspace)  \\n\",\n    \"    * 3.2 [Set up an `Experiment` and Logging](#3.2-Set-up-an-Experiment-and-Logging)  \\n\",\n    \"    * 3.3 [Link `AmlCompute` compute target](#3.3-Link-AmlCompute-compute-target)  \\n\",\n    \"    * 3.4 [Upload data to `Datastore`](#3.4-Upload-data-to-Datastore)  \\n\",\n    \"4. 
[Create AzureML Pipeline](#4.-Create-AzureML-Pipeline)  \\n\",\n    \"    * 4.1 [Set up run configuration file](#4.1-Set-up-run-configuration-file)  \\n\",\n    \"    * 4.2 [PythonScriptStep](#4.2-PythonScriptStep)  \\n\",\n    \"        * 4.2.1 [Define python script to run](#4.2.1-Define-python-script-to-run)\\n\",\n    \"        * 4.2.2 [Create PipelineData object](#4.2.2-Create-PipelineData-object)\\n\",\n    \"        * 4.2.3 [Create PythonScriptStep](#4.2.3-Create-PythonScriptStep)\\n\",\n    \"    * 4.3 [AutoMLStep](#4.3-AutoMLStep)\\n\",\n    \"        * 4.3.1 [Define get_data script to load data](#4.3.1-Define-get_data-script-to-load-data)\\n\",\n    \"        * 4.3.2 [Create AutoMLConfig object](#4.3.2-Create-AutoMLConfig-object)\\n\",\n    \"        * 4.3.3 [Create AutoMLStep](#4.3.3-Create-AutoMLStep)    \\n\",\n    \"5. [Run Pipeline](#5.-Run-Pipeline)  \\n\",\n    \"6. [Deploy Sentence Similarity Model](#6.-Deploy-Sentence-Similarity-Model)\\n\",\n    \"    * 6.1 [Register/Retrieve AutoML and Google Universal Sentence Encoder Models for Deployment](#6.1-Register/Retrieve-AutoML-and-Google-Universal-Sentence-Encoder-Models-for-Deployment)  \\n\",\n    \"    * 6.2 [Create Scoring Script](#6.2-Create-Scoring-Script)\\n\",\n    \"    * 6.3 [Create a YAML File for the Environment](#6.3-Create-a-YAML-File-for-the-Environment)   \\n\",\n    \"    * 6.4 [Image Creation](#6.4-Image-Creation) \\n\",\n    \"    * 6.5 [Provision the AKS Cluster](#6.5-Provision-the-AKS-Cluster)   \\n\",\n    \"    * 6.6 [Deploy the image as a Web Service to Azure Kubernetes Service](#6.6-Deploy-the-image-as-a-Web-Service-to-Azure-Kubernetes-Service)  \\n\",\n    \"    * 6.7 [Test Deployed Model](#6.7-Test-Deployed-Webservice)  \\n\",\n    \"    \\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1. 
Introduction\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.1 What are AzureML Pipelines?\\n\",\n    \"\\n\",\n    \"[AzureML Pipelines](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines) define reusable machine learning workflows that can be used as a template for your machine learning scenarios. Pipelines allow you to optimize your workflow and spend time on machine learning rather than infrastructure. A Pipeline is defined by a series of steps; the following steps are available: AdlaStep, AutoMLStep, AzureBatchStep, DataTransferStep, DatabricksStep, EstimatorStep, HyperDriveStep, ModuleStep, MpiStep, and PythonScriptStep (see [here](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/?view=azure-ml-py) for details of each step). When the pipeline is run, cached results are used for all steps that have not changed, optimizing the run time. Data sources and intermediate data can be used across multiple steps in a pipeline, saving time and resources. Below we see an example of an AzureML pipeline.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![](https://nlpbp.blob.core.windows.net/images/pipelines.png)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.2 What is Azure AutoML?\\n\",\n    \"\\n\",\n    \"Automated machine learning ([AutoML](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-automated-ml)) is a capability of Microsoft's [Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/\\n\",\n    \"). The goal of AutoML is to improve the productivity of data scientists and democratize AI by allowing for the rapid development and deployment of machine learning models. To achieve this goal, AutoML automates the process of selecting a ML model and tuning the model. 
All the user is required to provide is a dataset (suitable for a classification, regression, or time-series forecasting problem) and a metric to optimize in choosing the model and hyperparameters. The user is also given the ability to set time and cost constraints for the model selection and tuning.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![](automl.png)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The AutoML model selection and tuning process can be easily tracked through the Azure portal or directly in python notebooks through the use of widgets. AutoML quickly selects a high quality machine learning model tailored for your prediction problem. In this notebook, we walk through the steps of preparing data, setting up an AutoML experiment, and evaluating the results of our best model. More information about running AutoML experiments in Python can be found [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train). \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.3 Modeling Problem\\n\",\n    \"\\n\",\n    \"The regression problem we will demonstrate is predicting sentence similarity scores on the STS Benchmark dataset. The [STS Benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#STS_benchmark_dataset_and_companion_dataset) contains a selection of English datasets that were used in Semantic Textual Similarity (STS) tasks 2012-2017. The dataset contains 8,628 sentence pairs with a human-labeled integer representing the sentences' similarity (ranging from 0, for no meaning overlap, to 5, meaning equivalence).\\n\",\n    \"\\n\",\n    \"For each sentence in the sentence pair, we will use Google's pretrained Universal Sentence Encoder (details provided below) to generate a $512$-dimensional embedding. 
Both embeddings in the sentence pair will be concatenated and the resulting $1024$-dimensional vector will be used as features in our regression problem. Our target variable is the sentence similarity score.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"C:\\\\Users\\\\cocochra\\\\AppData\\\\Local\\\\Continuum\\\\anaconda3\\\\envs\\\\nlp_gpu\\\\lib\\\\site-packages\\\\sklearn\\\\externals\\\\joblib\\\\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\\n\",\n      \"  warnings.warn(msg, category=DeprecationWarning)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Turning diagnostics collection on. 
\\n\",\n      \"System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\\n\",\n      \"Azure ML SDK Version: 1.0.48\\n\",\n      \"Pandas version: 0.24.2\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Set the environment path to find NLP\\n\",\n    \"import sys\\n\",\n    \"\\n\",\n    \"sys.path.append(\\\"../../\\\")\\n\",\n    \"import time\\n\",\n    \"import logging\\n\",\n    \"import csv\\n\",\n    \"import os\\n\",\n    \"import pandas as pd\\n\",\n    \"import shutil\\n\",\n    \"import numpy as np\\n\",\n    \"import sys\\n\",\n    \"from scipy.stats import pearsonr\\n\",\n    \"from scipy.spatial import distance\\n\",\n    \"from sklearn.externals import joblib\\n\",\n    \"import json\\n\",\n    \"\\n\",\n    \"# Import utils\\n\",\n    \"from utils_nlp.azureml import azureml_utils\\n\",\n    \"from utils_nlp.dataset import stsbenchmark\\n\",\n    \"from utils_nlp.dataset.preprocess import (\\n\",\n    \"    to_lowercase,\\n\",\n    \"    to_spacy_tokens,\\n\",\n    \"    rm_spacy_stopwords,\\n\",\n    \")\\n\",\n    \"from utils_nlp.common.timer import Timer\\n\",\n    \"\\n\",\n    \"# Google Universal Sentence Encoder loader\\n\",\n    \"import tensorflow_hub as hub\\n\",\n    \"\\n\",\n    \"# AzureML packages\\n\",\n    \"import azureml as aml\\n\",\n    \"import logging\\n\",\n    \"from azureml.telemetry import set_diagnostics_collection\\n\",\n    \"\\n\",\n    \"set_diagnostics_collection(send_diagnostics=True)\\n\",\n    \"from azureml.core import Datastore, Experiment, Workspace\\n\",\n    \"from azureml.core.compute import ComputeTarget, AmlCompute\\n\",\n    \"from azureml.core.runconfig import RunConfiguration\\n\",\n    \"from azureml.core.conda_dependencies import CondaDependencies\\n\",\n    \"from azureml.core.webservice import AksWebservice, Webservice\\n\",\n    \"from azureml.core.compute import AksCompute, ComputeTarget\\n\",\n    \"from azureml.core.image import 
ContainerImage\\n\",\n    \"from azureml.core.model import Model\\n\",\n    \"from azureml.train.automl import AutoMLStep, AutoMLStepRun, AutoMLConfig\\n\",\n    \"from azureml.pipeline.core import Pipeline, PipelineData, TrainingOutput\\n\",\n    \"from azureml.pipeline.steps import PythonScriptStep\\n\",\n    \"from azureml.data.data_reference import DataReference\\n\",\n    \"from azureml.widgets import RunDetails\\n\",\n    \"\\n\",\n    \"print(\\\"System version: {}\\\".format(sys.version))\\n\",\n    \"print(\\\"Azure ML SDK Version:\\\", aml.core.VERSION)\\n\",\n    \"print(\\\"Pandas version: {}\\\".format(pd.__version__))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"BASE_DATA_PATH = \\\"../../data\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"automl_settings = {\\n\",\n    \"    \\\"task\\\": 'regression',  # type of task: classification, regression or forecasting\\n\",\n    \"    \\\"iteration_timeout_minutes\\\": 15,  # How long each iteration can take before moving on\\n\",\n    \"    \\\"iterations\\\": 50,  # Number of algorithm options to try\\n\",\n    \"    \\\"primary_metric\\\": \\\"spearman_correlation\\\",  # Metric to optimize\\n\",\n    \"    \\\"preprocess\\\": True,  # Whether dataset preprocessing should be applied\\n\",\n    \"    \\\"verbosity\\\": logging.INFO,\\n\",\n    \"    \\\"blacklist_models\\\": ['XGBoostRegressor'] #this model is blacklisted due to installation issues\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"config_path = (\\n\",\n    \"    \\\"./.azureml\\\"\\n\",\n    \")  # Path to the directory containing config.json with azureml credentials\\n\",\n    \"\\n\",\n    \"# Azure resources\\n\",\n    \"subscription_id = 
\\\"YOUR_SUBSCRIPTION_ID\\\"\\n\",\n    \"resource_group = \\\"YOUR_RESOURCE_GROUP_NAME\\\"  \\n\",\n    \"workspace_name = \\\"YOUR_WORKSPACE_NAME\\\"  \\n\",\n    \"workspace_region = \\\"YOUR_WORKSPACE_REGION\\\" #Possible values eastus, eastus2 and so on.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 2. Data Preparation\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**STS Benchmark Dataset**\\n\",\n    \"\\n\",\n    \"As described above, the STS Benchmark dataset contains 8.6K sentence pairs along with a human-annotated score for how similar the two sentences are. We will load the training, development (validation), and test sets provided by STS Benchmark and preprocess the data (lowercase the text, drop irrelevant columns, and rename the remaining columns) using the utils contained in this repo. Each dataset will ultimately have three columns: _sentence1_ and _sentence2_ which contain the text of the sentences in the sentence pair, and _score_ which contains the human-annotated similarity score of the sentence pair.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████████████████████████████████████████████████████████████████████████████| 401/401 [00:02<00:00, 198KB/s]\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Data downloaded to ../../data\\\\raw\\\\stsbenchmark\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████████████████████████████████████████████████████████████████████████████| 401/401 [00:02<00:00, 174KB/s]\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n   
   \"Data downloaded to ../../data\\\\raw\\\\stsbenchmark\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████████████████████████████████████████████████████████████████████████████| 401/401 [00:02<00:00, 148KB/s]\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Data downloaded to ../../data\\\\raw\\\\stsbenchmark\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Load in the raw datasets as pandas dataframes\\n\",\n    \"train_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\\\"train\\\")\\n\",\n    \"dev_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\\\"dev\\\")\\n\",\n    \"test_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\\\"test\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Clean each dataset by lowercasing text, removing irrelevant columns,\\n\",\n    \"# and renaming the remaining columns\\n\",\n    \"train_clean = stsbenchmark.clean_sts(train_raw)\\n\",\n    \"dev_clean = stsbenchmark.clean_sts(dev_raw)\\n\",\n    \"test_clean = stsbenchmark.clean_sts(test_raw)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Convert all text to lowercase\\n\",\n    \"train = to_lowercase(train_clean)\\n\",\n    \"dev = to_lowercase(dev_clean)\\n\",\n    \"test = to_lowercase(test_clean)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Training set has 5749 sentences\\n\",\n      \"Development set has 1500 sentences\\n\",\n      \"Testing set has 1379 sentences\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    
\"print(\\\"Training set has {} sentences\\\".format(len(train)))\\n\",\n    \"print(\\\"Development set has {} sentences\\\".format(len(dev)))\\n\",\n    \"print(\\\"Testing set has {} sentences\\\".format(len(test)))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>score</th>\\n\",\n       \"      <th>sentence1</th>\\n\",\n       \"      <th>sentence2</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>5.00</td>\\n\",\n       \"      <td>a plane is taking off.</td>\\n\",\n       \"      <td>an air plane is taking off.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>3.80</td>\\n\",\n       \"      <td>a man is playing a large flute.</td>\\n\",\n       \"      <td>a man is playing a flute.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>3.80</td>\\n\",\n       \"      <td>a man is spreading shreded cheese on a pizza.</td>\\n\",\n       \"      <td>a man is spreading shredded cheese on an uncoo...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    
<tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>2.60</td>\\n\",\n       \"      <td>three men are playing chess.</td>\\n\",\n       \"      <td>two men are playing chess.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>4.25</td>\\n\",\n       \"      <td>a man is playing the cello.</td>\\n\",\n       \"      <td>a man seated is playing the cello.</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"   score                                      sentence1  \\\\\\n\",\n       \"0   5.00                         a plane is taking off.   \\n\",\n       \"1   3.80                a man is playing a large flute.   \\n\",\n       \"2   3.80  a man is spreading shreded cheese on a pizza.   \\n\",\n       \"3   2.60                   three men are playing chess.   \\n\",\n       \"4   4.25                    a man is playing the cello.   \\n\",\n       \"\\n\",\n       \"                                           sentence2  \\n\",\n       \"0                        an air plane is taking off.  \\n\",\n       \"1                          a man is playing a flute.  \\n\",\n       \"2  a man is spreading shredded cheese on an uncoo...  \\n\",\n       \"3                         two men are playing chess.  \\n\",\n       \"4                 a man seated is playing the cello.  
\"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"train.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Save the cleaned data\\n\",\n    \"if not os.path.isdir(\\\"data\\\"):\\n\",\n    \"    os.mkdir(\\\"data\\\")\\n\",\n    \"\\n\",\n    \"train.to_csv(\\\"data/train.csv\\\", index=False)\\n\",\n    \"test.to_csv(\\\"data/test.csv\\\", index=False)\\n\",\n    \"dev.to_csv(\\\"data/dev.csv\\\", index=False)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 3. AzureML Setup\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now, we set up the necessary components for running this as an AzureML experiment\\n\",\n    \"1. Create or link to an existing `Workspace`\\n\",\n    \"2. Set up an `Experiment` with `logging`\\n\",\n    \"3. Create or attach existing `AmlCompute`\\n\",\n    \"4. Upload our data to a `Datastore`\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3.1 Link to or create a Workspace\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The following cell looks to set up the connection to your [Azure Machine Learning service Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace). You can choose to connect to an existing workspace or create a new one. \\n\",\n    \"\\n\",\n    \"**To access an existing workspace:**\\n\",\n    \"1. If you have a `config.json` file, you do not need to provide the workspace information; you will only need to update the `config_path` variable that is defined above which contains the file.\\n\",\n    \"2. 
Otherwise, you will need to supply the following:\\n\",\n    \"    * The name of your workspace\\n\",\n    \"    * Your subscription id\\n\",\n    \"    * The resource group name\\n\",\n    \"\\n\",\n    \"**To create a new workspace:**\\n\",\n    \"\\n\",\n    \"Set the following information:\\n\",\n    \"* A name for your workspace\\n\",\n    \"* Your subscription id\\n\",\n    \"* The resource group name\\n\",\n    \"* [Azure region](https://azure.microsoft.com/en-us/global-infrastructure/regions/) to create the workspace in, such as `eastus2`. \\n\",\n    \"\\n\",\n    \"This will automatically create a new resource group for you in the region provided if a resource group with the name given does not already exist. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Performing interactive authentication. Please follow the instructions on the terminal.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING - Note, we have launched a browser for you to login. For old experience with device code, use \\\"az login --use-device-code\\\"\\n\",\n      \"WARNING - You have logged in. 
Now let us find all the subscriptions to which you have access...\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Interactive authentication successfully completed.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"ws = azureml_utils.get_or_create_workspace(\\n\",\n    \"    config_path=config_path,\\n\",\n    \"    subscription_id=subscription_id,\\n\",\n    \"    resource_group=resource_group,\\n\",\n    \"    workspace_name=workspace_name,\\n\",\n    \"    workspace_region=workspace_region,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\n\",\n    \"    \\\"Workspace name: \\\" + ws.name,\\n\",\n    \"    \\\"Azure region: \\\" + ws.location,\\n\",\n    \"    \\\"Subscription id: \\\" + ws.subscription_id,\\n\",\n    \"    \\\"Resource group: \\\" + ws.resource_group,\\n\",\n    \"    sep=\\\"\\\\n\\\",\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3.2 Set up an Experiment and Logging\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Make a folder for the project\\n\",\n    \"project_folder = \\\"./automl-sentence-similarity\\\"\\n\",\n    \"os.makedirs(project_folder, exist_ok=True)\\n\",\n    \"\\n\",\n    \"# Set up an experiment\\n\",\n    \"experiment_name = \\\"NLP-SS-googleUSE\\\"\\n\",\n    \"experiment = Experiment(ws, experiment_name)\\n\",\n    \"\\n\",\n    \"# Add logging to our experiment\\n\",\n    \"run = experiment.start_logging()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3.3 Link AmlCompute Compute Target\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"To use AzureML Pipelines we 
need to link a compute target as they can not be run locally. The different options include AmlCompute, Azure Databricks, Remote VMs, etc. All [compute options](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#supported-compute-targets) can be found in this table with details about whether the given options work with automated ML, pipelines, and GPU. For the following example, we will use an AmlCompute target because it supports AzureML Pipelines and GPU. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Found existing compute target.\\n\",\n      \"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-16T01:06:19.136000+00:00', 'errors': None, 'creationTime': '2019-07-09T16:20:30.625908+00:00', 'modifiedTime': '2019-07-09T16:20:46.601973+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# choose your cluster\\n\",\n    \"cluster_name = \\\"gpu-test\\\"\\n\",\n    \"\\n\",\n    \"try:\\n\",\n    \"    compute_target = ComputeTarget(workspace=ws, name=cluster_name)\\n\",\n    \"    print(\\\"Found existing compute target.\\\")\\n\",\n    \"except ComputeTargetException:\\n\",\n    \"    print(\\\"Creating a new compute target...\\\")\\n\",\n    \"    compute_config = AmlCompute.provisioning_configuration(\\n\",\n    \"        vm_size=\\\"STANDARD_NC6\\\", max_nodes=4\\n\",\n    \"    )\\n\",\n    \"\\n\",\n   
 \"    # create the cluster\\n\",\n    \"    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\\n\",\n    \"\\n\",\n    \"    compute_target.wait_for_completion(show_output=True)\\n\",\n    \"\\n\",\n    \"# use get_status() to get a detailed status for the current AmlCompute.\\n\",\n    \"print(compute_target.get_status().serialize())\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3.4 Upload data to Datastore\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This step uploads our local data to a `Datastore` so that the data is accessible from the remote compute target and creates a `DataReference` to point to the location of the data on the Datastore. A DataStore is backed either by a Azure File Storage (default option) or Azure Blob Storage ([how to decide between these options](https://docs.microsoft.com/en-us/azure/storage/common/storage-decide-blobs-files-disks)) and data is made accessible by mounting or copying data to the compute target. 
`ws.datastores` lists all options for datastores and `ds.account_name` gets the name of the datastore that can be used to find it in the Azure portal.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Uploading an estimated of 3 files\\n\",\n      \"Uploading ./data\\\\dev.csv\\n\",\n      \"Uploading ./data\\\\test.csv\\n\",\n      \"Uploading ./data\\\\train.csv\\n\",\n      \"Uploaded ./data\\\\dev.csv, 1 files out of an estimated total of 3\\n\",\n      \"Uploaded ./data\\\\test.csv, 2 files out of an estimated total of 3\\n\",\n      \"Uploaded ./data\\\\train.csv, 3 files out of an estimated total of 3\\n\",\n      \"Uploaded 3 files\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"$AZUREML_DATAREFERENCE_6a3eb209d2a04cc6b66a68fa25213980\"\n      ]\n     },\n     \"execution_count\": 14,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# Select a specific datastore or you can call ws.get_default_datastore()\\n\",\n    \"datastore_name = \\\"workspacefilestore\\\"\\n\",\n    \"ds = ws.datastores[datastore_name]\\n\",\n    \"\\n\",\n    \"# Upload files in data folder to the datastore\\n\",\n    \"ds.upload(\\n\",\n    \"    src_dir=\\\"./data\\\",\\n\",\n    \"    target_path=\\\"stsbenchmark_data\\\",\\n\",\n    \"    overwrite=True,\\n\",\n    \"    show_progress=True,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We also set up a `DataReference` object that points to the data we just uploaded into the stsbenchmark_data folder. 
DataReference objects point to data that is accessible from a datastore and will be used as an input into our pipeline.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"input_data = DataReference(\\n\",\n    \"    datastore=ds,\\n\",\n    \"    data_reference_name=\\\"stsbenchmark\\\",\\n\",\n    \"    path_on_datastore=\\\"stsbenchmark_data/\\\",\\n\",\n    \"    overwrite=False,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 4. Create AzureML Pipeline\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now we set up our pipeline which is made of two steps:  \\n\",\n    \"1. `PythonScriptStep`: takes each sentence pair from the data in the `Datastore` and concatenates the Google USE embeddings for each sentence into one vector. This step saves the embedding feature matrix back to our `Datastore` and uses a `PipelineData` object to represent this intermediate data.  \\n\",\n    \"2. 
`AutoMLStep`: takes the intermediate data produced by the previous step and passes it to an `AutoMLConfig` which performs the automatic model selection\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.1 Set up run configuration file\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"First we set up a `RunConfiguration` object which configures the execution environment for an experiment (sets up the conda dependencies, etc.)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {\n    \"format\": \"row\"\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"run config is ready\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# create a new RunConfig object\\n\",\n    \"conda_run_config = RunConfiguration(framework=\\\"python\\\")\\n\",\n    \"\\n\",\n    \"# Set compute target to AmlCompute\\n\",\n    \"conda_run_config.target = compute_target\\n\",\n    \"\\n\",\n    \"conda_run_config.environment.docker.enabled = True\\n\",\n    \"conda_run_config.environment.docker.base_image = aml.core.runconfig.DEFAULT_CPU_IMAGE\\n\",\n    \"\\n\",\n    \"# Specify our own conda dependencies for the execution environment\\n\",\n    \"conda_run_config.environment.python.user_managed_dependencies = False\\n\",\n    \"conda_run_config.environment.python.conda_dependencies = CondaDependencies.create(\\n\",\n    \"    pip_packages=[\\n\",\n    \"        \\\"azureml-sdk[automl]==1.0.48\\\",\\n\",\n    \"        \\\"azureml-dataprep==1.1.8\\\",\\n\",\n    \"        \\\"azureml-train-automl==1.0.48\\\",\\n\",\n    \"    ],\\n\",\n    \"    conda_packages=[\\n\",\n    \"        \\\"numpy\\\",\\n\",\n    \"        \\\"py-xgboost<=0.80\\\",\\n\",\n    \"        \\\"pandas\\\",\\n\",\n    \"        \\\"tensorflow\\\",\\n\",\n    \"        \\\"tensorflow-hub\\\",\\n\",\n    \"   
     \\\"scikit-learn\\\",\\n\",\n    \"    ],\\n\",\n    \"    pin_sdk_version=False,\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"print(\\\"run config is ready\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.2 PythonScriptStep\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`PythonScriptStep` is a step which runs a user-defined Python script ([documentation](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.python_script_step.pythonscriptstep?view=azure-ml-py) here). In this `PythonScriptStep`, we will convert our sentences into a numerical representation in order to use them in our machine learning model. We will embed both sentences using the Google Universal Sentence Encoder (provided by tensorflow-hub) and concatenate their representations into a $1024$-dimensional vector to use as features for AutoML.\\n\",\n    \"\\n\",\n    \"**Google Universal Sentence Encoder:**\\n\",\n    \"We'll use a popular sentence encoder called Google Universal Sentence Encoder (see [original paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46808.pdf)). Google provides two pretrained models based on different design goals: a Transformer model (targets high accuracy at the cost of greater model complexity) and a Deep Averaging Network model (DAN; targets efficient inference). Both models are trained on a variety of web sources (Wikipedia, news, question-answer pages, and discussion forums) and produce 512-dimensional embeddings. This notebook utilizes the Transformer-based encoding model which can be downloaded [here](https://tfhub.dev/google/universal-sentence-encoder-large/3) because of its better performance relative to the DAN model on the STS Benchmark dataset (see Table 2 in Google Research's [paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46808.pdf)). 
The Transformer model produces sentence embeddings using the \\\"encoding sub-graph of the transformer architecture\\\" (original architecture introduced [here](https://arxiv.org/abs/1706.03762)). \\\"This sub-graph uses attention to compute context aware representations of words in a sentence that take into account both the ordering and identity of all the other words. The context aware word representations are converted to a fixed length sentence encoding vector by computing the element-wise sum of the representations at each word position.\\\" The input to the model is lowercase PTB-tokenized strings and the model is designed to be useful for multiple different tasks by using multi-task learning. More details about the model can be found in the [paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46808.pdf) by Google Research.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 4.2.1 Define python script to run\\n\",\n    \"\\n\",\n    \"Define the script (called embed.py) that the `PythonScriptStep` will execute:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Writing ./automl-sentence-similarity/embed.py\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"%%writefile $project_folder/embed.py\\n\",\n    \"import argparse\\n\",\n    \"import os\\n\",\n    \"import azureml.core\\n\",\n    \"import pandas as pd\\n\",\n    \"import numpy as np\\n\",\n    \"import tensorflow as tf\\n\",\n    \"import tensorflow_hub as hub\\n\",\n    \"\\n\",\n    \"tf.logging.set_verbosity(tf.logging.ERROR)  # reduce logging output\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def google_encoder(dataset):\\n\",\n    \"    \\\"\\\"\\\" Function that embeds sentences using the Google Universal\\n\",\n    \"    Sentence Encoder pretrained 
model\\n\",\n    \"    \\n\",\n    \"    Parameters:\\n\",\n    \"    ----------\\n\",\n    \"    dataset: pandas dataframe with sentences and scores\\n\",\n    \"    \\n\",\n    \"    Returns:\\n\",\n    \"    -------\\n\",\n    \"    emb1: 512-dimensional representation of sentence1\\n\",\n    \"    emb2: 512-dimensional representation of sentence2\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    sts_input1 = tf.placeholder(tf.string, shape=(None))\\n\",\n    \"    sts_input2 = tf.placeholder(tf.string, shape=(None))\\n\",\n    \"\\n\",\n    \"    # Apply embedding model and normalize the input\\n\",\n    \"    sts_encode1 = tf.nn.l2_normalize(embedding_model(sts_input1), axis=1)\\n\",\n    \"    sts_encode2 = tf.nn.l2_normalize(embedding_model(sts_input2), axis=1)\\n\",\n    \"\\n\",\n    \"    with tf.Session() as session:\\n\",\n    \"        session.run(tf.global_variables_initializer())\\n\",\n    \"        session.run(tf.tables_initializer())\\n\",\n    \"        emb1, emb2 = session.run(\\n\",\n    \"            [sts_encode1, sts_encode2],\\n\",\n    \"            feed_dict={\\n\",\n    \"                sts_input1: dataset[\\\"sentence1\\\"],\\n\",\n    \"                sts_input2: dataset[\\\"sentence2\\\"],\\n\",\n    \"            },\\n\",\n    \"        )\\n\",\n    \"    return emb1, emb2\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def feature_engineering(dataset):\\n\",\n    \"    \\\"\\\"\\\"Extracts embedding features from the dataset and returns\\n\",\n    \"    features and target in a dataframe\\n\",\n    \"    \\n\",\n    \"    Parameters:\\n\",\n    \"    ----------\\n\",\n    \"    dataset: pandas dataframe with sentences and scores\\n\",\n    \"    \\n\",\n    \"    Returns:\\n\",\n    \"    -------\\n\",\n    \"    df: pandas dataframe with embedding features\\n\",\n    \"    scores: list of target variables\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    google_USE_emb1, google_USE_emb2 = google_encoder(dataset)\\n\",\n    \"    n_google = 
google_USE_emb1.shape[1]  # length of the embeddings\\n\",\n    \"    df = np.concatenate((google_USE_emb1, google_USE_emb2), axis=1)\\n\",\n    \"    names = [\\\"USEEmb1_\\\" + str(i) for i in range(n_google)] + [\\n\",\n    \"        \\\"USEEmb2_\\\" + str(i) for i in range(n_google)\\n\",\n    \"    ]\\n\",\n    \"    df = pd.DataFrame(df, columns=names)\\n\",\n    \"    return df, dataset[\\\"score\\\"]\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def write_output(df, path, name):\\n\",\n    \"    \\\"\\\"\\\"Write dataframes to correct path\\\"\\\"\\\"\\n\",\n    \"    os.makedirs(path, exist_ok=True)\\n\",\n    \"    print(\\\"%s created\\\" % path)\\n\",\n    \"    df.to_csv(path + \\\"/\\\" + name, index=False)\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# Parse arguments\\n\",\n    \"parser = argparse.ArgumentParser()\\n\",\n    \"parser.add_argument(\\\"--sentence_data\\\", type=str)\\n\",\n    \"parser.add_argument(\\\"--embedded_data\\\", type=str)\\n\",\n    \"args = parser.parse_args()\\n\",\n    \"\\n\",\n    \"# Import the Universal Sentence Encoder's TF Hub module\\n\",\n    \"module_url = \\\"https://tfhub.dev/google/universal-sentence-encoder-large/3\\\"\\n\",\n    \"embedding_model = hub.Module(module_url)\\n\",\n    \"\\n\",\n    \"# Read data\\n\",\n    \"train = pd.read_csv(args.sentence_data + \\\"/train.csv\\\")\\n\",\n    \"dev = pd.read_csv(args.sentence_data + \\\"/dev.csv\\\")\\n\",\n    \"\\n\",\n    \"# Get Google USE features\\n\",\n    \"training_data, training_scores = feature_engineering(train)\\n\",\n    \"validation_data, validation_scores = feature_engineering(dev)\\n\",\n    \"\\n\",\n    \"# Write out training data to Datastore\\n\",\n    \"write_output(training_data, args.embedded_data, \\\"X_train.csv\\\")\\n\",\n    \"write_output(\\n\",\n    \"    pd.DataFrame(training_scores, columns=[\\\"score\\\"]), args.embedded_data, \\\"y_train.csv\\\"\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"# Write out validation data to 
Datastore\\n\",\n    \"write_output(validation_data, args.embedded_data, \\\"X_dev.csv\\\")\\n\",\n    \"write_output(\\n\",\n    \"    pd.DataFrame(validation_scores, columns=[\\\"score\\\"]), args.embedded_data, \\\"y_dev.csv\\\"\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 4.2.2 Create PipelineData object\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`PipelineData` objects represent a piece of intermediate data in a pipeline. Generally they are produced by one step (as an output) and then consumed by the next step (as an input), introducing an implicit order between steps in a pipeline. We create a PipelineData object that can represent the data produced by our first pipeline step that will be consumed by our second pipeline step.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"embedded_data = PipelineData(\\\"embedded_data\\\", datastore=ds)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 4.2.3 Create PythonScriptStep\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This step defines the `PythonScriptStep`. We give the step a name, tell the step which python script to run (embed.py) and what directory that script is located in (source_directory). \\n\",\n    \"\\n\",\n    \"We also link the compute target and run configuration that we made previously. Our input is the `DataReference` object (input_data) where our raw sentence data was uploaded and our output is the `PipelineData` object (embedded_data) where the embedded data produced by this step will be stored. 
These are also passed in as arguments so that we have access to the correct data paths.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"embed_step = PythonScriptStep(\\n\",\n    \"    name=\\\"Embed\\\",\\n\",\n    \"    script_name=\\\"embed.py\\\",\\n\",\n    \"    arguments=[\\\"--embedded_data\\\", embedded_data, \\\"--sentence_data\\\", input_data],\\n\",\n    \"    inputs=[input_data],\\n\",\n    \"    outputs=[embedded_data],\\n\",\n    \"    compute_target=compute_target,\\n\",\n    \"    runconfig=conda_run_config,\\n\",\n    \"    source_directory=project_folder,\\n\",\n    \"    allow_reuse=True,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.3 AutoMLStep\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`AutoMLStep` creates an AutoML step in a pipeline (see [documentation](https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl.automlstep?view=azure-ml-py) and [basic example](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb)). When using AutoML on remote compute, rather than passing our data directly into the `AutoMLConfig` object as we did in the local example, we must define a get_data.py script with a get_data() function to pass as the data_script argument. This workflow can be used for both local and remote executions (see [details](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-auto-train-remote)). 
\\n\",\n    \"\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 4.3.1 Define get_data script to load data\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Define the get_data.py file and get_data() function that the `AutoMLStep` will execute to collect data. When AutoML is used with a remote compute, the data can not be passed directly as parameters. Rather, a get_data function must be defined to access the data (see [this resource](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-auto-train-remote) for further details). Note that we can directly access the path of the intermediate data (called embedded_data) through `os.environ['AZUREML_DATAREFERENCE_embedded_data']`. This is necessary because the AutoMLStep does not accept additional parameters like the PythonScriptStep does with `arguments`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 20,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Writing ./automl-sentence-similarity/get_data.py\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"%%writefile $project_folder/get_data.py\\n\",\n    \"\\n\",\n    \"import os\\n\",\n    \"import pandas as pd\\n\",\n    \"\\n\",\n    \"# get location of the embedded_data for future use\\n\",\n    \"EMBEDDED_DATA_REF = os.environ[\\\"AZUREML_DATAREFERENCE_embedded_data\\\"]\\n\",\n    \"\\n\",\n    \"def get_data():\\n\",\n    \"    \\\"\\\"\\\"Function needed to load data for use on remote AutoML experiments\\\"\\\"\\\"\\n\",\n    \"    X_train = pd.read_csv(EMBEDDED_DATA_REF + \\\"/X_train.csv\\\")\\n\",\n    \"    y_train = pd.read_csv(EMBEDDED_DATA_REF + \\\"/y_train.csv\\\")\\n\",\n    \"    X_dev = pd.read_csv(EMBEDDED_DATA_REF + \\\"/X_dev.csv\\\")\\n\",\n    \"    y_dev = pd.read_csv(EMBEDDED_DATA_REF + 
\\\"/y_dev.csv\\\")\\n\",\n    \"    return {\\\"X\\\": X_train.values, \\\"y\\\": y_train.values.flatten(), \\\"X_valid\\\": X_dev.values, \\\"y_valid\\\": y_dev.values.flatten()}\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 4.3.2 Create AutoMLConfig object\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now, we specify the parameters for the `AutoMLConfig` class:\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**task**  \\n\",\n    \"AutoML supports the following base learners for the regression task: Elastic Net, Light GBM, Gradient Boosting, Decision Tree, K-nearest Neighbors, LARS Lasso, Stochastic Gradient Descent, Random Forest, Extremely Randomized Trees, XGBoost, DNN Regressor, Linear Regression. In addition, AutoML also supports two kinds of ensemble methods: voting (weighted average of the output of multiple base learners) and stacking (training a second \\\"metalearner\\\" which uses the base algorithms' predictions to predict the target variable). Specific base learners can be included or excluded in the parameters for the AutoMLConfig class (whitelist_models and blacklist_models) and the voting/stacking ensemble options can be specified as well (enable_voting_ensemble and enable_stack_ensemble)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**preprocess**  \\n\",\n    \"AutoML also has advanced preprocessing methods, eliminating the need for users to perform this manually. Data is automatically scaled and normalized but an additional parameter in the AutoMLConfig class enables the use of more advanced techniques including imputation, generating additional features, transformations, word embeddings, etc. (full list found [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-create-portal-experiments#preprocess)). 
Note that algorithm-specific preprocessing will be applied even if preprocess=False. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**primary_metric**  \\n\",\n    \"The regression metrics available are the following: Spearman Correlation (spearman_correlation), Normalized RMSE (normalized_root_mean_squared_error), Normalized MAE (normalized_mean_absolute_error), and R2 score (r2_score) \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Constraints:**  \\n\",\n    \"There is a cost_mode parameter to set cost prediction modes (see options [here](https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl.automlconfig?view=azure-ml-py)). To set constraints on time there are multiple parameters including experiment_exit_score (target score to exit the experiment after achieving), experiment_timeout_minutes (maximum amount of time for all combined iterations), and iterations (total number of different algorithm and parameter combinations to try).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 21,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"automl_config = AutoMLConfig(\\n\",\n    \"    debug_log=\\\"automl_errors.log\\\",\\n\",\n    \"    path=project_folder,\\n\",\n    \"    compute_target=compute_target,\\n\",\n    \"    run_configuration=conda_run_config,\\n\",\n    \"    data_script=project_folder\\n\",\n    \"    + \\\"/get_data.py\\\",  # local path to script with get_data() function\\n\",\n    \"    **automl_settings #where the AutoML main settings are defined\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 4.3.3 Create AutoMLStep\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Finally, we create `PipelineData` objects for the model data (our outputs) and then create 
the `AutoMLStep`. The `AutoMLStep` requires an `AutoMLConfig` object and we pass our intermediate data (embedded_data) in as the inputs. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 22,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Create PipelineData objects for tracking AutoML metrics\\n\",\n    \"\\n\",\n    \"metrics_data = PipelineData(\\n\",\n    \"    name=\\\"metrics_data\\\",\\n\",\n    \"    datastore=ds,\\n\",\n    \"    pipeline_output_name=\\\"metrics_output\\\",\\n\",\n    \"    training_output=TrainingOutput(type=\\\"Metrics\\\"),\\n\",\n    \")\\n\",\n    \"model_data = PipelineData(\\n\",\n    \"    name=\\\"model_data\\\",\\n\",\n    \"    datastore=ds,\\n\",\n    \"    pipeline_output_name=\\\"best_model_output\\\",\\n\",\n    \"    training_output=TrainingOutput(type=\\\"Model\\\"),\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 23,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"automl_step = AutoMLStep(\\n\",\n    \"    name=\\\"AutoML\\\",\\n\",\n    \"    automl_config=automl_config,  # the AutoMLConfig object created previously\\n\",\n    \"    inputs=[\\n\",\n    \"        embedded_data\\n\",\n    \"    ],  # inputs is the PipelineData that was the output of the previous pipeline step\\n\",\n    \"    outputs=[\\n\",\n    \"        metrics_data,\\n\",\n    \"        model_data,\\n\",\n    \"    ],  # PipelineData objects to reference metric and model information\\n\",\n    \"    allow_reuse=True,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 5. Run Pipeline\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now we set up our pipeline which requires specifying our `Workspace` and the ordering of the steps that we created (steps parameter). 
We submit the pipeline and inspect the run details using a RunDetails widget. For remote runs, the execution of iterations is asynchronous.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 24,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING - 'auto_prepare_environment' is deprecated and unused. It will be removed in a future release.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"pipeline = Pipeline(\\n\",\n    \"    description=\\\"pipeline_embed_automl\\\",  # give a name for the pipeline\\n\",\n    \"    workspace=ws,\\n\",\n    \"    steps=[embed_step, automl_step],\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 25,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Created step Embed [d14b211c][a0500165-a3a6-4963-9d8a-7b3dc5981478], (This step will run and generate new outputs)\\n\",\n      \"Created step AutoML [b676f3ac][f37c2b71-e7be-486e-85cf-65c95d59d04f], (This step will run and generate new outputs)\\n\",\n      \"Using data reference stsbenchmark for StepId [35ea3ed1][e3340790-c54f-4147-8dd0-bcb80a9b7b46], (Consumers of this data are eligible to reuse prior runs.)\\n\",\n      \"Submitted pipeline run: 61516c51-af97-458b-a743-93fd3cbd7abf\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"pipeline_run = experiment.submit(pipeline)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 26,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Inspect the run details using the provided widget\\n\",\n    \"RunDetails(pipeline_run).show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![](https://nlpbp.blob.core.windows.net/images/pipelineWidget.PNG)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": 
{},\n   \"source\": [\n    \"Alternatively, block until the run has completed.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 27,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"pipeline_run.wait_for_completion(\\n\",\n    \"    show_output=True\\n\",\n    \")  # show console output while run is in progress\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Cancel the Run**\\n\",\n    \"\\n\",\n    \"Interrupting/Restarting the jupyter kernel will not properly cancel the run, which can lead to wasted compute resources. To avoid this, we recommend explicitly canceling a run with the following code:\\n\",\n    \"\\n\",\n    \"`pipeline_run.cancel()`\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 6. Deploy Sentence Similarity Model\\n\",\n    \"\\n\",\n    \"Deploying an Azure Machine Learning model as a web service creates a REST API. You can send data to this API and receive the prediction returned by the model.\\n\",\n    \"In general, you create a webservice by deploying a model as an image to a Compute Target.\\n\",\n    \"\\n\",\n    \"Some of the Compute Targets are: \\n\",\n    \"1. Azure Container Instance\\n\",\n    \"2. Azure Kubernetes Service\\n\",\n    \"3. Local web service\\n\",\n    \"\\n\",\n    \"The general workflow for deploying a model is as follows:\\n\",\n    \"1. Register a model\\n\",\n    \"2. Prepare to deploy\\n\",\n    \"3. Deploy the model to the compute target\\n\",\n    \"4. Test the deployed model (webservice)\\n\",\n    \"\\n\",\n    \"In this notebook we walk you through the process of creating a webservice running on Azure Kubernetes Service ([AKS](https://docs.microsoft.com/en-us/azure/aks/intro-kubernetes\\n\",\n    \")) by deploying the model as an image. AKS is good for high-scale production deployments. It provides fast response time and autoscaling of the deployed service. 
Cluster autoscaling is not supported through the Azure Machine Learning SDK. \\n\",\n    \"\\n\",\n    \"You can find more information on deploying and serving models [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where)\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 6.1 Register/Retrieve AutoML and Google Universal Sentence Encoder Models for Deployment\\n\",\n    \"\\n\",\n    \"Registering a model means registering one or more files that make up a model. The Machine Learning models are registered in your current Azure Machine Learning Workspace. The model can either come from Azure Machine Learning or another location, such as your local machine.\\n\",\n    \"\\n\",\n    \"See other ways to register a model [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where)\\n\",\n    \"\\n\",\n    \"Below we show how to register a new model and also how to retrieve and register an existing model.\\n\",\n    \"\\n\",\n    \"### Register a new automl model\\n\",\n    \"Register the best AutoML model based on the pipeline results or load the saved model\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 28,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING - Received unrecognized parameter: time_column_name None\\n\",\n      \"WARNING - Received unrecognized parameter: grain_column_names None\\n\",\n      \"WARNING - Received unrecognized parameter: drop_column_names None\\n\",\n      \"WARNING - Received unrecognized parameter: group None\\n\",\n      \"WARNING - Received unrecognized parameter: target_lags None\\n\",\n      \"WARNING - Received unrecognized parameter: target_rolling_window_size None\\n\",\n      \"WARNING - Received unrecognized parameter: max_horizon None\\n\",\n      
\"WARNING - Received unrecognized parameter: country_or_region None\\n\",\n      \"WARNING - Received unrecognized parameter: seasonality None\\n\",\n      \"WARNING - Received unrecognized parameter: use_stl None\\n\",\n      \"WARNING - Received unrecognized parameter: season_trend None\\n\",\n      \"WARNING - Received unrecognized parameter: season None\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Registering model 711e9373160c4a8best\\n\",\n      \"711e9373160c4a8best\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"automl_step_run = AutoMLStepRun(step_run=pipeline_run.find_step_run(\\\"AutoML\\\")[0])\\n\",\n    \"\\n\",\n    \"# to register the fitted_mode\\n\",\n    \"description = \\\"Pipeline AutoML Model\\\"\\n\",\n    \"tags = {\\\"area\\\": \\\"nlp\\\", \\\"type\\\": \\\"sentencesimilarity pipelines\\\"}\\n\",\n    \"model = automl_step_run.register_model(description=description, tags=tags)\\n\",\n    \"automl_model_name = automl_step_run.model_id\\n\",\n    \"print(\\n\",\n    \"    automl_step_run.model_id\\n\",\n    \")  # Use this id to deploy the model as a web service in Azure.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Retrieve existing model from Azure\\n\",\n    \"If you already have a best model then you can skip registering the model by just retrieving the latest version of model by providing its name\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 29,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Found model with name 711e9373160c4a8best\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"automl_model_name = \\\"711e9373160c4a8best\\\"  # best fit model registered in the workspace\\n\",\n    \"model = Model(ws, name=automl_model_name)\\n\",\n    \"print(\\\"Found model with name\\\", 
automl_model_name)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Register Google Universal Sentence Encoder Model\\n\",\n    \"Register the Google Universal Sentence Encoder model if not already registered in your workspace\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 30,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Registering model googleUSEmodel\\n\",\n      \"Registered googleUSEembeddings model\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# set location for where to download google tensorflow model\\n\",\n    \"os.environ[\\\"TFHUB_CACHE_DIR\\\"] = \\\"./googleUSE\\\"\\n\",\n    \"# download model\\n\",\n    \"hub.Module(\\\"https://tfhub.dev/google/universal-sentence-encoder-large/3\\\")\\n\",\n    \"# register model\\n\",\n    \"embedding_model = Model.register(\\n\",\n    \"    model_path=\\\"googleUSE\\\",\\n\",\n    \"    model_name=\\\"googleUSEmodel\\\",\\n\",\n    \"    tags={\\\"Model\\\": \\\"GoogleUSE\\\"},\\n\",\n    \"    description=\\\"Google Universal Sentence Embedding pretrained model\\\",\\n\",\n    \"    workspace=ws,\\n\",\n    \")\\n\",\n    \"print(\\\"Registered googleUSEembeddings model\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Retrieve existing Google USE model from Azure\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 31,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Found model with name googleUSEembeddings\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"embedding_model = Model(ws, name=\\\"googleUSEmodel\\\")\\n\",\n    \"print(\\\"Found model with name googleUSEembeddings\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": 
[\n    \"## 6.2 Create Scoring Script\\n\",\n    \"\\n\",\n    \"In this section we show an example of an entry script, which is called from the deployed webservice. `score.py` is our entry script. The script must contain:\\n\",\n    \"1. init() - This function loads the model in a global object.\\n\",\n    \"2. run() - This function is used for model prediction. The inputs and outputs to `run()` typically use JSON for serialization and deserialization. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 32,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Writing score.py\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"%%writefile score.py\\n\",\n    \"import pickle\\n\",\n    \"import json\\n\",\n    \"import numpy as np\\n\",\n    \"import azureml.train.automl\\n\",\n    \"from sklearn.externals import joblib\\n\",\n    \"from azureml.core.model import Model\\n\",\n    \"import pandas as pd\\n\",\n    \"import tensorflow as tf\\n\",\n    \"import tensorflow_hub as hub\\n\",\n    \"import os\\n\",\n    \"\\n\",\n    \"tf.logging.set_verbosity(tf.logging.ERROR)  # reduce logging output\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def google_encoder(dataset):\\n\",\n    \"    \\\"\\\"\\\" Function that embeds sentences using the Google Universal\\n\",\n    \"    Sentence Encoder pretrained model\\n\",\n    \"    \\n\",\n    \"    Parameters:\\n\",\n    \"    ----------\\n\",\n    \"    dataset: pandas dataframe with sentences and scores\\n\",\n    \"    \\n\",\n    \"    Returns:\\n\",\n    \"    -------\\n\",\n    \"    emb1: 512-dimensional representation of sentence1\\n\",\n    \"    emb2: 512-dimensional representation of sentence2\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    global embedding_model, sess\\n\",\n    \"    sts_input1 = tf.placeholder(tf.string, shape=(None))\\n\",\n    \"    sts_input2 = tf.placeholder(tf.string, shape=(None))\\n\",\n   
 \"\\n\",\n    \"    # Apply embedding model and normalize the input\\n\",\n    \"    sts_encode1 = tf.nn.l2_normalize(embedding_model(sts_input1), axis=1)\\n\",\n    \"    sts_encode2 = tf.nn.l2_normalize(embedding_model(sts_input2), axis=1)\\n\",\n    \"\\n\",\n    \"    sess.run(tf.global_variables_initializer())\\n\",\n    \"    sess.run(tf.tables_initializer())\\n\",\n    \"    emb1, emb2 = sess.run(\\n\",\n    \"        [sts_encode1, sts_encode2],\\n\",\n    \"        feed_dict={sts_input1: dataset[\\\"sentence1\\\"], sts_input2: dataset[\\\"sentence2\\\"]},\\n\",\n    \"    )\\n\",\n    \"    return emb1, emb2\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def feature_engineering(dataset):\\n\",\n    \"    \\\"\\\"\\\"Extracts embedding features from the dataset and returns\\n\",\n    \"    features and target in a dataframe\\n\",\n    \"    \\n\",\n    \"    Parameters:\\n\",\n    \"    ----------\\n\",\n    \"    dataset: pandas dataframe with sentences and scores\\n\",\n    \"    \\n\",\n    \"    Returns:\\n\",\n    \"    -------\\n\",\n    \"    df: pandas dataframe with embedding features\\n\",\n    \"    scores: list of target variables\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    google_USE_emb1, google_USE_emb2 = google_encoder(dataset)\\n\",\n    \"    return np.concatenate((google_USE_emb1, google_USE_emb2), axis=1)\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def init():\\n\",\n    \"    global model, googleUSE_dir_path\\n\",\n    \"    model_path = Model.get_model_path(\\n\",\n    \"        model_name=\\\"<<modelid>>\\\"\\n\",\n    \"    )  # this name is model.id of model that we want to deploy\\n\",\n    \"    # deserialize the model file back into a sklearn model\\n\",\n    \"    model = joblib.load(model_path)\\n\",\n    \"\\n\",\n    \"    # load the path for google USE embedding model\\n\",\n    \"    googleUSE_dir_path = Model.get_model_path(model_name=\\\"googleUSEmodel\\\")\\n\",\n    \"    os.environ[\\\"TFHUB_CACHE_DIR\\\"] = 
googleUSE_dir_path\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def run(rawdata):\\n\",\n    \"    global embedding_model, sess, googleUSE_dir_path, model\\n\",\n    \"    try:\\n\",\n    \"        # load data and convert to dataframe\\n\",\n    \"        data = json.loads(rawdata)[\\\"data\\\"]\\n\",\n    \"        data_df = pd.DataFrame(data, columns=[\\\"sentence1\\\", \\\"sentence2\\\"])\\n\",\n    \"\\n\",\n    \"        # begin a tensorflow session and load tensorhub module\\n\",\n    \"        sess = tf.Session()\\n\",\n    \"        embedding_model = hub.Module(\\n\",\n    \"            googleUSE_dir_path + \\\"/96e8f1d3d4d90ce86b2db128249eb8143a91db73\\\"\\n\",\n    \"        )\\n\",\n    \"\\n\",\n    \"        # Embed sentences using Google USE model\\n\",\n    \"        embedded_data = feature_engineering(data_df)\\n\",\n    \"        # Predict using AutoML saved model\\n\",\n    \"        result = model.predict(embedded_data)\\n\",\n    \"\\n\",\n    \"    except Exception as e:\\n\",\n    \"        result = str(e)\\n\",\n    \"        sess.close()\\n\",\n    \"        return json.dumps({\\\"error\\\": result})\\n\",\n    \"\\n\",\n    \"    sess.close()\\n\",\n    \"    return json.dumps({\\\"result\\\": result.tolist()})\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 33,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Substitute the actual model id in the script file.\\n\",\n    \"script_file_name = \\\"score.py\\\"\\n\",\n    \"\\n\",\n    \"with open(script_file_name, \\\"r\\\") as cefr:\\n\",\n    \"    content = cefr.read()\\n\",\n    \"\\n\",\n    \"with open(script_file_name, \\\"w\\\") as cefw:\\n\",\n    \"    cefw.write(content.replace(\\\"<<modelid>>\\\", automl_model_name))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 6.3 Create a YAML File for the Environment\\n\",\n    \"\\n\",\n    \"To ensure the fit results are consistent with the 
training results, the SDK dependency versions need to be the same as the environment that trains the model. The following cells create a file, pipeline_env.yml, which specifies the dependencies from the run.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 34,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"'pipeline_env.yml'\"\n      ]\n     },\n     \"execution_count\": 34,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"myenv = CondaDependencies.create(\\n\",\n    \"    conda_packages=[\\n\",\n    \"        \\\"numpy\\\",\\n\",\n    \"        \\\"scikit-learn\\\",\\n\",\n    \"        \\\"py-xgboost<=0.80\\\",\\n\",\n    \"        \\\"pandas\\\",\\n\",\n    \"        \\\"tensorflow\\\",\\n\",\n    \"        \\\"tensorflow-hub\\\",\\n\",\n    \"    ],\\n\",\n    \"    pip_packages=[\\\"azureml-sdk[automl]==1.0.48.*\\\"],\\n\",\n    \"    python_version=\\\"3.6.8\\\",\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"conda_env_file_name = \\\"pipeline_env.yml\\\"\\n\",\n    \"myenv.save_to_file(\\\".\\\", conda_env_file_name)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 6.4 Image Creation\\n\",\n    \"\\n\",\n    \"In this step we create a container image which is a wrapper containing the entry script, yaml file with package dependencies and the model. The created image is then deployed as a webservice in the next step. 
This step can take up to 10 minutes and even longer if the model is large.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 35,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Creating image\\n\",\n      \"Running..................................................................................\\n\",\n      \"Succeeded\\n\",\n      \"Image creation operation finished for image pipeline-automl-image:1, operation \\\"Succeeded\\\"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# trying to add dependencies\\n\",\n    \"image_config = ContainerImage.image_configuration(\\n\",\n    \"    execution_script=script_file_name,\\n\",\n    \"    runtime=\\\"python\\\",\\n\",\n    \"    conda_file=conda_env_file_name,\\n\",\n    \"    description=\\\"Image with aml pipeline model\\\",\\n\",\n    \"    tags={\\\"area\\\": \\\"nlp\\\", \\\"type\\\": \\\"sentencesimilarity pipeline\\\"},\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"image = ContainerImage.create(\\n\",\n    \"    name=\\\"pipeline-automl-image\\\",\\n\",\n    \"    # this is the model object\\n\",\n    \"    models=[model, embedding_model],  # add both embedding and autoML models\\n\",\n    \"    image_config=image_config,\\n\",\n    \"    workspace=ws,\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"image.wait_for_creation(show_output=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"If the above step fails, then use below command to see logs.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# image.get_logs()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 6.5 Provision the AKS Cluster\\n\",\n    \"\\n\",\n    \"**Time estimate:** Approximately 20 minutes.\\n\",\n    
\"\\n\",\n    \"Creating or attaching an AKS cluster is a one time process for your workspace. You can reuse this cluster for multiple deployments. If you delete the cluster or the resource group that contains it, you must create a new cluster the next time you need to deploy. You can have multiple AKS clusters attached to your workspace.\\n\",\n    \"\\n\",\n    \"**Note:** Check the Azure Portal to make sure that the AKS Cluster has been provisioned properly before moving forward with this notebook\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 36,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# create aks cluser\\n\",\n    \"\\n\",\n    \"# Use the default configuration (can also provide parameters to customize)\\n\",\n    \"prov_config = AksCompute.provisioning_configuration()\\n\",\n    \"\\n\",\n    \"# Create the cluster\\n\",\n    \"aks_target = ComputeTarget.create(\\n\",\n    \"    workspace=ws, name=\\\"nlp-aks-cluster\\\", provisioning_configuration=prov_config\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"\\n\",\n    \"## 6.6 Deploy the Image as a Web Service on Azure Kubernetes Service\\n\",\n    \"\\n\",\n    \"In the case of deployment on AKS, in addition to the Docker image, we need to define computational resources. This is typically a cluster of CPUs or a cluster of GPUs. 
If we already have a Kubernetes-managed cluster in our workspace, we can use it, otherwise, we can create a new one.\\n\",\n    \"\\n\",\n    \"In this notebook we will use the cluster in the above cell.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 37,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Set the web service configuration\\n\",\n    \"aks_config = AksWebservice.deploy_configuration()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We are now ready to deploy our web service. We will deploy from the Docker image. It contains our AutoML model as well as the Google Universal Sentence Encoder model and the conda environment needed for the scoring script to work properly. The parameters to pass to the Webservice.deploy_from_image() command are similar to those used for deployment on Azure Container Instance ([ACI](https://azure.microsoft.com/en-us/services/container-instances/\\n\",\n    \")). The only major difference is the compute target (aks_target), i.e. 
the CPU cluster we just spun up.\\n\",\n    \"\\n\",\n    \"**Note:** This deployment takes a few minutes to complete.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 38,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Creating service\\n\",\n      \"Running.........................\\n\",\n      \"SucceededAKS service creation operation finished, operation \\\"Succeeded\\\"\\n\",\n      \"Healthy\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# deploy image as web service\\n\",\n    \"aks_service_name = \\\"aks-pipelines-service\\\"\\n\",\n    \"\\n\",\n    \"aks_service = Webservice.deploy_from_image(\\n\",\n    \"    workspace=ws,\\n\",\n    \"    name=aks_service_name,\\n\",\n    \"    image=image,\\n\",\n    \"    deployment_config=aks_config,\\n\",\n    \"    deployment_target=aks_target,\\n\",\n    \")\\n\",\n    \"aks_service.wait_for_deployment(show_output=True)\\n\",\n    \"print(aks_service.state)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"If the above step fails then use below command to see logs\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# aks_service.get_logs()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 6.7 Test Deployed Webservice\\n\",\n    \"\\n\",\n    \"Testing the deployed model means running the created webservice. <br>\\n\",\n    \"The deployed model can be tested by passing a list of sentence pairs. The output will be a score between 0 and 5, with 0 indicating no meaning overlap between the sentences and 5 meaning equivalence.\\n\",\n    \"\\n\",\n    \"The run method expects input in json format. The Run() method retrieves API keys behind the scenes to make sure that the call is authenticated. 
The service has a timeout (default of ~30 seconds) which does not allow passing the large test dataset. To overcome this, you can batch data and send it to the service.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 39,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"sentences = [\\n\",\n    \"    [\\\"This is sentence1\\\", \\\"This is sentence1\\\"],\\n\",\n    \"    [\\\"A hungry cat.\\\", \\\"A sleeping cat\\\"],\\n\",\n    \"    [\\\"Its summer time \\\", \\\"Winter is coming\\\"],\\n\",\n    \"]\\n\",\n    \"data = {\\\"data\\\": sentences}\\n\",\n    \"data = json.dumps(data)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 40,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Time elapsed: 12.8143\\n\",\n      \"Number of samples predicted: 3\\n\",\n      \"[3.7827566108065453, 2.7329700382428097, 2.4850673912463717]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Set up a Timer to see how long the model takes to predict\\n\",\n    \"t = Timer()\\n\",\n    \"\\n\",\n    \"t.start()\\n\",\n    \"score = aks_service.run(input_data=data)\\n\",\n    \"t.stop()\\n\",\n    \"\\n\",\n    \"print(\\\"Time elapsed: {}\\\".format(t))\\n\",\n    \"\\n\",\n    \"result = json.loads(score)\\n\",\n    \"try:\\n\",\n    \"    output = result[\\\"result\\\"]\\n\",\n    \"    print(\\\"Number of samples predicted: {}\\\".format(len(output)))\\n\",\n    \"    print(output)\\n\",\n    \"except:\\n\",\n    \"    print(result[\\\"error\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Finally, we'll calculate the Pearson Correlation on the test set.\\n\",\n    \"\\n\",\n    \"**What is Pearson Correlation?**\\n\",\n    \"\\n\",\n    \"Our evaluation metric is Pearson correlation ($\\\\rho$) which is a measure of the linear correlation between two variables. 
The formula for calculating Pearson correlation is as follows:  \\n\",\n    \"\\n\",\n    \"$$\\\\rho_{X,Y} = \\\\frac{E[(X-\\\\mu_X)(Y-\\\\mu_Y)]}{\\\\sigma_X \\\\sigma_Y}$$\\n\",\n    \"\\n\",\n    \"This metric takes a value in [-1,1] where -1 represents a perfect negative correlation, 1 represents a perfect positive correlation, and 0 represents no correlation. We utilize the Pearson correlation metric as this is the main metric that [SentEval](http://nlpprogress.com/english/semantic_textual_similarity.html), a widely-used evaluation toolkit for evaluating sentence representations, uses for the STS Benchmark dataset.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 41,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# load test set sentences\\n\",\n    \"data = pd.read_csv(\\\"data/test.csv\\\")\\n\",\n    \"train_y = data[\\\"score\\\"].values.flatten()\\n\",\n    \"train_x = data.drop(\\\"score\\\", axis=1).values.tolist()\\n\",\n    \"data = {\\\"data\\\": train_x[:500]}\\n\",\n    \"data = json.dumps(data)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 42,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Time elapsed: 17.3619\\n\",\n      \"Number of sample predicted : 500\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Set up a Timer to see how long the model takes to predict\\n\",\n    \"t = Timer()\\n\",\n    \"\\n\",\n    \"t.start()\\n\",\n    \"score = aks_service.run(input_data=data)\\n\",\n    \"t.stop()\\n\",\n    \"\\n\",\n    \"print(\\\"Time elapsed: {}\\\".format(t))\\n\",\n    \"\\n\",\n    \"result = json.loads(score)\\n\",\n    \"\\n\",\n    \"try:\\n\",\n    \"    output = result[\\\"result\\\"]\\n\",\n    \"    print(\\\"Number of sample predicted : {}\\\".format(len(output)))\\n\",\n    \"except:\\n\",\n    \"    print(result[\\\"error\\\"])\"\n   
]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 43,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"0.8706075673971211\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# get Pearson Correlation\\n\",\n    \"print(pearsonr(output, train_y[:500])[0])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Conclusion\\n\",\n    \"\\n\",\n    \"This notebook demonstrated how to use AzureML Pipelines and AutoML to streamline the creation of a machine learning workflow for predicting sentence similarity. After creating the pipeline, the notebook demonstrated the deployment of our sentence similarity model using AKS. The model results reported in this notebook (using Google USE embeddings) are much stronger than the results from using AutoML with its built-in embedding capabilities (as in [AutoML Local Deployment ACI](automl_local_deployment_aci.ipynb)). \"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.4\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/sentence_similarity/baseline_deep_dive.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>\\n\",\n    \"\\n\",\n    \"<i>Licensed under the MIT License.</i>\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Estimating Baseline Performance\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### What is a baseline model? \\n\",\n    \"\\n\",\n    \"Producing a baseline model is crucial for evaluating your model's performance on any machine learning problem. A baseline model is a basic solution that serves as a point of reference for comparing other models to. The baseline model's performance gives us an indication of how much better our models can perform relative to a naive approach. \\n\",\n    \"\\n\",\n    \"Let's say we are building a sentence similarity model where our training set contains pairs of sentences and we want to predict how similiar these sentences are on a scale from 1-5. We could spend months producing a complex machine learning solution to this problem and ultimately get a mean squared error (MSE) of 0.3. But is this result good or bad? There is no way of knowing without comparing it with some baseline performance. For our baseline model, we could predict the mean sentence similarity of sentence pairs in our training set (called the _zero rule_) and get a MSE of 0.35. So our model is worse than the baseline which indicates that we may want to consider using different features, models, evaluation metrics, etc. 
It is crucial that the choice of baseline model be tailored to a data science problem based on business goals and the specific modeling task.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### What are good baselines for sentence similarity?\\n\",\n    \"\\n\",\n    \"For sentence similarity problems, we have two sub-tasks: 1) First, we need to produce a vector representation of each sentence in the sentence pair, known as an **embedding**. 2) Second, we need to compute the similarity between these two sentence embeddings.\\n\",\n    \"\\n\",\n    \"For producing representations of sentences, there are some common baseline approaches: \\n\",\n    \"1. Create word embeddings for each word in a sentence\\n\",\n    \"    1. word2vec word embeddings\\n\",\n    \"    2. GloVe word embeddings\\n\",\n    \"    3. fastText word embeddings\\n\",\n    \"    \\n\",\n    \"2. Create sentence embeddings\\n\",\n    \"    1. doc2vec document embeddings\\n\",\n    \"    2. TF-IDF embeddings \\n\",\n    \"\\n\",\n    \"Then we have to compare our embeddings to calculate sentence similarity:\\n\",\n    \"1. Word Embedding comparison\\n\",\n    \"    1. Cosine Similarity (first requires averaging the word embeddings of all words in each sentence)\\n\",\n    \"    2. Word Mover's Distance\\n\",\n    \"\\n\",\n    \"2. Sentence Embedding comparison\\n\",\n    \"    1. 
Cosine Similarity  \\n\",\n    \"    \\n\",\n    \"The different embedding models and similarity metrics are introduced below.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Table of Contents\\n\",\n    \"* [Data Loading and Preprocessing](#Data-Loading-and-Preprocessing)\\n\",\n    \"    * [Load STS Benchmark Dataset](#Load-STS-Benchmark-Dataset)\\n\",\n    \"    - [Preprocess / Tokenize](#Data-Preprocessing-/-Tokenization)\\n\",\n    \"    - [Document Frequency Calculation](#Document-Frequency-Calculation)\\n\",\n    \"* [Baseline Models](#Baseline-Models)\\n\",\n    \"    - [Baseline #1: word2vec and cosine similarity](#Baseline-#1:-Word2vec-Embeddings-with-Cosine-Similarity)\\n\",\n    \"    - [Baseline #2: word2vec and Word Mover's Distance](#Baseline-#2:-Word2vec-Embeddings-with-Word-Mover's-Distance)\\n\",\n    \"    - [Baseline #3: GloVe and cosine similarity](#Baseline-#3:-GloVe-Embeddings-with-Cosine-Similarity)\\n\",\n    \"    * [Baseline #4: GloVe and Word Mover's Distance](#Baseline-#4:-GloVe-Embeddings-with-Word-Mover's-Distance)\\n\",\n    \"    - [Baseline #5: fastText and cosine similarity](#Baseline-#5:-fastText-Embeddings-with-Cosine-Similarity)\\n\",\n    \"    - [Baseline #6: fastText and Word Mover's Distance](#Baseline-#6:-fastText-Embeddings-with-Word-Mover's-Distance)\\n\",\n    \"\\n\",\n    \"    * [Baseline #7: TF-IDF and cosine similarity](#Baseline-#7:-TF-IDF-Embeddings-with-Cosine-Similarity)\\n\",\n    \"    * [Baseline #8: Doc2vec and cosine similarity](#Baseline-#8:-Doc2vec-Embeddings-with-Cosine-Similarity)\\n\",\n    \"\\n\",\n    \"* [Comparison of Baseline Models](#Comparison-of-Baseline-Models)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Reference running time \\n\",\n    \"The table below provides reference running times for each section on CPU and GPU machines.  
\\n\",\n    \"\\n\",\n    \"|Notebook Section|4 **CPU**s, 14GB memory VM| 1 NVIDIA Tesla K80 GPU, 12GB GPU memory VM|\\n\",\n    \"|:---------------|:------------------------:|:------------------------------------------:|\\n\",\n    \"|Whole notebook| ~ 35 mintues| ~ 28 minutes|\\n\",\n    \"|Data Loading and Preprocessing| ~ 8 minutes| ~ 6 minutes|\\n\",\n    \"|Baseline #1| ~ 4 minutes| ~ 3 minutes|\\n\",\n    \"|Baseline #2| ~ 5 seconds| ~ 3 seconds|\\n\",\n    \"|Baseline #3| ~ 18 minutes| ~ 14 minutes|\\n\",\n    \"|Baseline #4| ~ 5 seconds| ~ 5 seconds|\\n\",\n    \"|Baseline #5| Memory error, please skip if error occurs| ~ 3 minutes|\\n\",\n    \"|Baseline #6| Memory error, please skip if error occurs| ~ 5 seconds|\\n\",\n    \"|Baseline #7| ~ 6 seconds| ~ 5 seconds|\\n\",\n    \"|Baseline #8| ~ 3 minutes| ~ 2 minutes|\\n\",\n    \"|Comparison of Baseline Models| ~ 1 second| ~ 2 seconds|\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\\n\",\n      \"Gensim version: 3.7.3\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"#Import Packages\\n\",\n    \"import sys\\n\",\n    \"# Set the environment path\\n\",\n    \"sys.path.append(\\\"../../\\\")  \\n\",\n    \"import os\\n\",\n    \"from collections import Counter\\n\",\n    \"import math\\n\",\n    \"import numpy as np\\n\",\n    \"from tempfile import TemporaryDirectory\\n\",\n    \"import scrapbook as sb\\n\",\n    \"import scipy\\n\",\n    \"from scipy.spatial import distance\\n\",\n    \"import gensim\\n\",\n    \"from gensim.models.doc2vec import LabeledSentence\\n\",\n    \"from gensim.models.doc2vec import TaggedDocument\\n\",\n    \"from gensim.models import Doc2Vec\\n\",\n    \"from sklearn.feature_extraction.text import 
TfidfVectorizer\\n\",\n    \"\\n\",\n    \"import matplotlib.pyplot as plt\\n\",\n    \"%matplotlib inline\\n\",\n    \"\\n\",\n    \"#Import utility functions\\n\",\n    \"from utils_nlp.dataset.preprocess import to_lowercase, to_spacy_tokens\\n\",\n    \"from utils_nlp.dataset import stsbenchmark\\n\",\n    \"from utils_nlp.dataset.preprocess import (\\n\",\n    \"    to_lowercase,\\n\",\n    \"    to_spacy_tokens,\\n\",\n    \"    rm_spacy_stopwords,\\n\",\n    \")\\n\",\n    \"from utils_nlp.models.pretrained_embeddings import word2vec\\n\",\n    \"from utils_nlp.models.pretrained_embeddings import glove\\n\",\n    \"from utils_nlp.models.pretrained_embeddings import fasttext\\n\",\n    \"\\n\",\n    \"print(\\\"System version: {}\\\".format(sys.version))\\n\",\n    \"print(\\\"Gensim version: {}\\\".format(gensim.__version__))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Set the path where you datasets are located\\n\",\n    \"tmp_dir = TemporaryDirectory()\\n\",\n    \"BASE_DATA_PATH = tmp_dir.name \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Data Loading and Preprocessing\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Load STS Benchmark Dataset\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Here we utilize the [STS Benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#STS_benchmark_dataset_and_companion_dataset) which contains a selection of English datasets that were used in Semantic Textual Similarity (STS) tasks 2012-2017. The datasets include text from image captions, news headlines, and user forums. 
The dataset contains 8,628 sentence pairs with a human-labeled integer representing the sentences' similarity (ranging from 0, for no meaning overlap, to 5, meaning equivalence).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████████████████████████████████████████████████████████████████████████████| 401/401 [00:01<00:00, 247KB/s]\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Data downloaded to C:\\\\Users\\\\cocochra\\\\AppData\\\\Local\\\\Temp\\\\tmpp2a0cw_t\\\\raw\\\\stsbenchmark\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████████████████████████████████████████████████████████████████████████████| 401/401 [00:01<00:00, 243KB/s]\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Data downloaded to C:\\\\Users\\\\cocochra\\\\AppData\\\\Local\\\\Temp\\\\tmpp2a0cw_t\\\\raw\\\\stsbenchmark\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Produce a pandas dataframe for the training and test sets\\n\",\n    \"train_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\\\"train\\\")\\n\",\n    \"test_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\\\"test\\\")\\n\",\n    \"\\n\",\n    \"# Clean the sts dataset\\n\",\n    \"sts_train = stsbenchmark.clean_sts(train_raw)\\n\",\n    \"sts_test = stsbenchmark.clean_sts(test_raw)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Training set has 5749 sentences\\n\",\n      \"Testing set has 1379 sentences\\n\"\n 
    ]\n    }\n   ],\n   \"source\": [\n    \"print(\\\"Training set has {} sentences\\\".format(len(sts_train)))\\n\",\n    \"print(\\\"Testing set has {} sentences\\\".format(len(sts_test)))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>score</th>\\n\",\n       \"      <th>sentence1</th>\\n\",\n       \"      <th>sentence2</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>2.500</td>\\n\",\n       \"      <td>A girl is styling her hair.</td>\\n\",\n       \"      <td>A girl is brushing her hair.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>3.600</td>\\n\",\n       \"      <td>A group of men play soccer on the beach.</td>\\n\",\n       \"      <td>A group of boys are playing soccer on the beach.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>5.000</td>\\n\",\n       \"      <td>One woman is measuring another woman's ankle.</td>\\n\",\n       \"      <td>A woman measures another woman's ankle.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    
<tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>4.200</td>\\n\",\n       \"      <td>A man is cutting up a cucumber.</td>\\n\",\n       \"      <td>A man is slicing a cucumber.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>1.500</td>\\n\",\n       \"      <td>A man is playing a harp.</td>\\n\",\n       \"      <td>A man is playing a keyboard.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>5</th>\\n\",\n       \"      <td>1.800</td>\\n\",\n       \"      <td>A woman is cutting onions.</td>\\n\",\n       \"      <td>A woman is cutting tofu.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>6</th>\\n\",\n       \"      <td>3.500</td>\\n\",\n       \"      <td>A man is riding an electric bicycle.</td>\\n\",\n       \"      <td>A man is riding a bicycle.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>7</th>\\n\",\n       \"      <td>2.200</td>\\n\",\n       \"      <td>A man is playing the drums.</td>\\n\",\n       \"      <td>A man is playing the guitar.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>8</th>\\n\",\n       \"      <td>2.200</td>\\n\",\n       \"      <td>A man is playing guitar.</td>\\n\",\n       \"      <td>A lady is playing the guitar.</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>9</th>\\n\",\n       \"      <td>1.714</td>\\n\",\n       \"      <td>A man is playing a guitar.</td>\\n\",\n       \"      <td>A man is playing a trumpet.</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"   score                                      sentence1  \\\\\\n\",\n       \"0  2.500                    A girl is styling her hair.   \\n\",\n       \"1  3.600       A group of men play soccer on the beach.   
\\n\",\n       \"2  5.000  One woman is measuring another woman's ankle.   \\n\",\n       \"3  4.200                A man is cutting up a cucumber.   \\n\",\n       \"4  1.500                       A man is playing a harp.   \\n\",\n       \"5  1.800                     A woman is cutting onions.   \\n\",\n       \"6  3.500           A man is riding an electric bicycle.   \\n\",\n       \"7  2.200                    A man is playing the drums.   \\n\",\n       \"8  2.200                       A man is playing guitar.   \\n\",\n       \"9  1.714                     A man is playing a guitar.   \\n\",\n       \"\\n\",\n       \"                                          sentence2  \\n\",\n       \"0                      A girl is brushing her hair.  \\n\",\n       \"1  A group of boys are playing soccer on the beach.  \\n\",\n       \"2           A woman measures another woman's ankle.  \\n\",\n       \"3                      A man is slicing a cucumber.  \\n\",\n       \"4                      A man is playing a keyboard.  \\n\",\n       \"5                          A woman is cutting tofu.  \\n\",\n       \"6                        A man is riding a bicycle.  \\n\",\n       \"7                      A man is playing the guitar.  \\n\",\n       \"8                     A lady is playing the guitar.  \\n\",\n       \"9                       A man is playing a trumpet.  \"\n      ]\n     },\n     \"execution_count\": 10,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"sts_test.head(10)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Data Preprocessing / Tokenization\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Our baseline models will expect that each sentence is represented by a list of **tokens**. Tokens are linguistic units like words, punctuation marks, numbers, etc. 
We'll use our util functions which utilize the spaCy package, a popular package for performing tokenization.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"It's also common to remove high-frequency words which do not help distinguish one sentence from another, so called **stop words**. For example, \\\"the\\\", \\\"and\\\", \\\"a\\\", etc. are typical stop words although each tokenization package may differ in the words they consider to be stop words. We'll tokenize our corpus with and without stop words so that we can compare our methods with and without stop words.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Training Set Preprocessing\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Convert all text to lowercase\\n\",\n    \"df_low = to_lowercase(sts_train)  \\n\",\n    \"# Tokenize text\\n\",\n    \"sts_tokenize = to_spacy_tokens(df_low) \\n\",\n    \"# Tokenize with removal of stopwords\\n\",\n    \"sts_train_stop = rm_spacy_stopwords(sts_tokenize) \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now each row in our dataframe contains:  \\n\",\n    \"- The similarity score of the sentence pair\\n\",\n    \"- The 2 original sentences from our datasets  \\n\",\n    \"- A column for each sentence's tokenization with stop words  \\n\",\n    \"- A column for each sentence's tokenization without stop words\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n  
     \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>score</th>\\n\",\n       \"      <th>sentence1</th>\\n\",\n       \"      <th>sentence2</th>\\n\",\n       \"      <th>sentence1_tokens</th>\\n\",\n       \"      <th>sentence2_tokens</th>\\n\",\n       \"      <th>sentence1_tokens_rm_stopwords</th>\\n\",\n       \"      <th>sentence2_tokens_rm_stopwords</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>5.00</td>\\n\",\n       \"      <td>a plane is taking off.</td>\\n\",\n       \"      <td>an air plane is taking off.</td>\\n\",\n       \"      <td>[a, plane, is, taking, off, .]</td>\\n\",\n       \"      <td>[an, air, plane, is, taking, off, .]</td>\\n\",\n       \"      <td>[plane, taking, .]</td>\\n\",\n       \"      <td>[air, plane, taking, .]</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>3.80</td>\\n\",\n       \"      <td>a man is playing a large flute.</td>\\n\",\n       \"      <td>a man is playing a flute.</td>\\n\",\n       \"      <td>[a, man, is, playing, a, large, flute, .]</td>\\n\",\n       \"      <td>[a, man, is, playing, a, flute, .]</td>\\n\",\n       \"      <td>[man, playing, large, flute, .]</td>\\n\",\n       \"      <td>[man, playing, flute, .]</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>3.80</td>\\n\",\n       \"      <td>a man is spreading shreded cheese on a pizza.</td>\\n\",\n       \"      <td>a man is spreading shredded 
cheese on an uncoo...</td>\\n\",\n       \"      <td>[a, man, is, spreading, shreded, cheese, on, a...</td>\\n\",\n       \"      <td>[a, man, is, spreading, shredded, cheese, on, ...</td>\\n\",\n       \"      <td>[man, spreading, shreded, cheese, pizza, .]</td>\\n\",\n       \"      <td>[man, spreading, shredded, cheese, uncooked, p...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>2.60</td>\\n\",\n       \"      <td>three men are playing chess.</td>\\n\",\n       \"      <td>two men are playing chess.</td>\\n\",\n       \"      <td>[three, men, are, playing, chess, .]</td>\\n\",\n       \"      <td>[two, men, are, playing, chess, .]</td>\\n\",\n       \"      <td>[men, playing, chess, .]</td>\\n\",\n       \"      <td>[men, playing, chess, .]</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>4.25</td>\\n\",\n       \"      <td>a man is playing the cello.</td>\\n\",\n       \"      <td>a man seated is playing the cello.</td>\\n\",\n       \"      <td>[a, man, is, playing, the, cello, .]</td>\\n\",\n       \"      <td>[a, man, seated, is, playing, the, cello, .]</td>\\n\",\n       \"      <td>[man, playing, cello, .]</td>\\n\",\n       \"      <td>[man, seated, playing, cello, .]</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"   score                                      sentence1  \\\\\\n\",\n       \"0   5.00                         a plane is taking off.   \\n\",\n       \"1   3.80                a man is playing a large flute.   \\n\",\n       \"2   3.80  a man is spreading shreded cheese on a pizza.   \\n\",\n       \"3   2.60                   three men are playing chess.   \\n\",\n       \"4   4.25                    a man is playing the cello.   
\\n\",\n       \"\\n\",\n       \"                                           sentence2  \\\\\\n\",\n       \"0                        an air plane is taking off.   \\n\",\n       \"1                          a man is playing a flute.   \\n\",\n       \"2  a man is spreading shredded cheese on an uncoo...   \\n\",\n       \"3                         two men are playing chess.   \\n\",\n       \"4                 a man seated is playing the cello.   \\n\",\n       \"\\n\",\n       \"                                    sentence1_tokens  \\\\\\n\",\n       \"0                     [a, plane, is, taking, off, .]   \\n\",\n       \"1          [a, man, is, playing, a, large, flute, .]   \\n\",\n       \"2  [a, man, is, spreading, shreded, cheese, on, a...   \\n\",\n       \"3               [three, men, are, playing, chess, .]   \\n\",\n       \"4               [a, man, is, playing, the, cello, .]   \\n\",\n       \"\\n\",\n       \"                                    sentence2_tokens  \\\\\\n\",\n       \"0               [an, air, plane, is, taking, off, .]   \\n\",\n       \"1                 [a, man, is, playing, a, flute, .]   \\n\",\n       \"2  [a, man, is, spreading, shredded, cheese, on, ...   \\n\",\n       \"3                 [two, men, are, playing, chess, .]   \\n\",\n       \"4       [a, man, seated, is, playing, the, cello, .]   \\n\",\n       \"\\n\",\n       \"                 sentence1_tokens_rm_stopwords  \\\\\\n\",\n       \"0                           [plane, taking, .]   \\n\",\n       \"1              [man, playing, large, flute, .]   \\n\",\n       \"2  [man, spreading, shreded, cheese, pizza, .]   \\n\",\n       \"3                     [men, playing, chess, .]   \\n\",\n       \"4                     [man, playing, cello, .]   \\n\",\n       \"\\n\",\n       \"                       sentence2_tokens_rm_stopwords  \\n\",\n       \"0                            [air, plane, taking, .]  
\\n\",\n       \"1                           [man, playing, flute, .]  \\n\",\n       \"2  [man, spreading, shredded, cheese, uncooked, p...  \\n\",\n       \"3                           [men, playing, chess, .]  \\n\",\n       \"4                   [man, seated, playing, cello, .]  \"\n      ]\n     },\n     \"execution_count\": 12,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"sts_train_stop.head(5)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Test Set Preprocessing\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Convert all text to lowercase\\n\",\n    \"df_low = to_lowercase(sts_test)\\n\",\n    \"# Tokenize text\\n\",\n    \"sts_tokenize = to_spacy_tokens(df_low)\\n\",\n    \"# Tokenize with removal of stopwords\\n\",\n    \"sts_test_stop = rm_spacy_stopwords(sts_tokenize)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Document Frequency Calculation\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Many baseline models we explore will require calculation of how frequently a word appears in our corpus. To calculate this, we iterate through the sentences in our training set and count the number of sentences that contain each word. There are other ways to produce this calculation, including pulling larger datasets from the web (like Wikipedia data) and calculating the frequencies on that data. Note that \\\"document\\\" refers to some larger chunk of multiple tokens/words. In our case, our documents will actually be individual sentences. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def get_document_frequency(df):\\n\",\n    \"    \\\"\\\"\\\"Iterate through all sentences in dataframe and create a dictionary \\n\",\n    \"    mapping tokens to the number of sentences in our corpus they appear in\\n\",\n    \"    \\n\",\n    \"    Args:\\n\",\n    \"        df (pandas dataframe): dataframe of sentence pairs with their similarity scores\\n\",\n    \"        \\n\",\n    \"    Returns:\\n\",\n    \"        document_frequency_dict (dictionary): mapping from tokens to number of sentences they appear in\\n\",\n    \"        n (int): number of sentences in the corpus\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    document_frequency_dict = {}\\n\",\n    \"    all_sentences =  df[[\\\"sentence1_tokens\\\", \\\"sentence2_tokens\\\"]]\\n\",\n    \"    sentences = all_sentences.values.flatten().tolist()\\n\",\n    \"    n = len(sentences)\\n\",\n    \"\\n\",\n    \"    for s in sentences:\\n\",\n    \"        for token in set(s):\\n\",\n    \"            document_frequency_dict[token] = document_frequency_dict.get(token, 0) + 1\\n\",\n    \"\\n\",\n    \"    return document_frequency_dict, n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Note that we need to calculate these values on our training set so that we don't \\\"peek at\\\" our test set until test time\\n\",\n    \"document_frequencies, num_documents = get_document_frequency(sts_train_stop)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"11498\"\n      ]\n     },\n     \"execution_count\": 16,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"num_documents\"\n   ]\n  },\n  
{\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Baseline Models\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"As we consider each of the baseline models, we'll save all model predictions in a dictionary and will evaluate the results at the end of this notebook.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"baselines = {}\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Baseline #1: Word2vec Embeddings with Cosine Similarity\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This baseline first constructs word embeddings using word2vec. Once we have a word embedding (vector) for each word in the sentence, we calculate an embedding for the full sentence by taking the (weighted) average of all the word embeddings. The weights will be calculated using TF-IDF. Lastly, in order to compare the two sentence embeddings we use the cosine similarity metric. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### What is Word2Vec?\\n\",\n    \"Word2vec is a predictive model for learning word embeddings from text (see [original research paper](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)). Word embeddings are learned such that words that share common contexts in the corpus will be close together in the vector space. There are two different model architectures that can be used to produce word2vec embeddings: continuous bag-of-words (CBOW) or continuous skip-gram. The former uses a window of surrounding words (the \\\"context\\\") to predict the current word and the latter uses the current word to predict the surrounding context words. 
See this [tutorial](https://www.guru99.com/word-embedding-word2vec.html#3) on word2vec for more detailed information about the model.\\n\",\n    \"\\n\",\n    \"For our purposes, we use pretrained word2vec word embeddings. These embeddings were trained on a Google News corpus and provide 300-dimensional embeddings (vectors) for 3 million English words. See this [link](https://code.google.com/archive/p/word2vec/) for the original location of the embeddings and see the code below to load these word embeddings.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|████████████████████████████████████████████████████████████████████████████| 1.61M/1.61M [01:08<00:00, 23.4kKB/s]\\n\",\n      \"C:\\\\Users\\\\cocochra\\\\AppData\\\\Local\\\\Continuum\\\\anaconda3\\\\envs\\\\nlp_gpu\\\\lib\\\\site-packages\\\\smart_open\\\\smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\\n\",\n      \"  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"word2vec_model = word2vec.load_pretrained_vectors(dir_path=BASE_DATA_PATH)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### What is TF-IDF?\\n\",\n    \"\\n\",\n    \"TF-IDF or term frequency-inverse document frequency is a weighting scheme intended to measure how important a word is to the document (or sentence in our case) within the broader corpus (our dataset). The weight \\\"increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus\\\" ([tutorial link](http://www.tfidf.com/)). 
When we're averaging together many different word vectors to get a sentence embedding, it makes sense to give stronger weights to words that are more distinct relative to the corpus and that have a high frequency in the sentence. The TF-IDF weights capture this intuition, with the weight increasing as term frequency increases and/or as the inverse document frequency increases.\\n\",\n    \"\\n\",\n    \"For a term $t$ in sentence $s$ in corpus $c$, then the TF-IDF weight is \\n\",\n    \"$$w_{t,s} = TF_{t,s} * \\\\log{\\\\frac{N}{df_t}}$$\\n\",\n    \"where:  \\n\",\n    \"$TF_{t,s}$ = the number of times term $t$ appears in sentence $s$  \\n\",\n    \"$df_t$ = the number of sentences containing term $t$  \\n\",\n    \"$N$ = the size of the corpus.  \\n\",\n    \"\\n\",\n    \"In these baselines, we calculate the TF-IDF weighted average of all the word embeddings. The code below implements this weighted average given a list of tokens and an embedding model.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def average_sentence_embedding(tokens, embedding_model):\\n\",\n    \"    \\\"\\\"\\\"Calculate TF-IDF weighted average embedding for a sentence\\n\",\n    \"    \\n\",\n    \"    Args:\\n\",\n    \"        tokens (list): list of tokens in a sentence\\n\",\n    \"        embedding_model: model to use for word embedding (word2vec, glove, fastText, etc.)\\n\",\n    \"    \\n\",\n    \"    Returns:\\n\",\n    \"        list: vector representing the sentence\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    # Throw away tokens that are not in the embedding model\\n\",\n    \"    tokens = [i for i in tokens if i in embedding_model]\\n\",\n    \"\\n\",\n    \"    if len(tokens) == 0:\\n\",\n    \"        return []\\n\",\n    \"\\n\",\n    \"    # We will weight by TF-IDF. 
The TF part is calculated by:\\n\",\n    \"    # (# of times term appears / total terms in sentence)\\n\",\n    \"    count = Counter(tokens)\\n\",\n    \"    token_list = list(count)\\n\",\n    \"    term_frequency = [count[i] / len(tokens) for i in token_list]\\n\",\n    \"\\n\",\n    \"    # Now for the IDF part: LOG(# documents / # documents with term in it)\\n\",\n    \"    inv_doc_frequency = [\\n\",\n    \"        math.log(num_documents / (document_frequencies.get(i, 0) + 1)) for i in count\\n\",\n    \"    ]\\n\",\n    \"\\n\",\n    \"    # Put the TF-IDF together and produce the weighted average of vector embeddings\\n\",\n    \"    word_embeddings = [embedding_model[token] for token in token_list]\\n\",\n    \"    weights = [term_frequency[i] * inv_doc_frequency[i] for i in range(len(token_list))]\\n\",\n    \"    return list(np.average(word_embeddings, weights=weights, axis=0))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### What is Cosine Similarity?\\n\",\n    \"\\n\",\n    \"Cosine similarity is a common similarity metric between vectors. Intuitively it measures the cosine of the angle between any two vectors. 
With vectors $a$ and $b$, the cosine similarity is: cosine similarity($a$,$b$) = $\\\\frac{\\\\vec{a} \\\\cdot \\\\vec{b} }{||\\\\vec{a}|| ||\\\\vec{b}||}$\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 20,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def calculate_cosine_similarity(embedding1, embedding2):\\n\",\n    \"    \\\"\\\"\\\"Calculate cosine similarity between two embedding vectors\\n\",\n    \"    \\n\",\n    \"    Args:\\n\",\n    \"        embedding1 (list): embedding for the first sentence\\n\",\n    \"        embedding2 (list): embedding for the second sentence\\n\",\n    \"    \\n\",\n    \"    Returns:\\n\",\n    \"        list: cosine similarity value between the two embeddings\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    # distance.cosine calculates cosine DISTANCE, so we need to\\n\",\n    \"    # return 1 - distance to get cosine similarity\\n\",\n    \"    cosine_similarity = 1 - distance.cosine(embedding1, embedding2)\\n\",\n    \"    return cosine_similarity\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Get Sentence Similarity Predictions for Test Set\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now, we calculate predictions for each sentence pair found in the test set.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 21,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def average_word_embedding_cosine_similarity(df, embedding_model, rm_stopwords=False):\\n\",\n    \"    \\\"\\\"\\\"Calculate the cosine similarity between TF-IDF weighted averaged embeddings\\n\",\n    \"    \\n\",\n    \"    Args:\\n\",\n    \"        df (pandas dataframe): dataframe as provided by the nlp_utils\\n\",\n    \"        embedding_model: word embedding model\\n\",\n    \"        rm_stopwords (bool): whether to remove stop words (True) or not (False)\\n\",\n 
   \"    \\n\",\n    \"    Returns:\\n\",\n    \"        list: predicted values for sentence similarity of test set examples\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    if rm_stopwords:\\n\",\n    \"        df['sentence1_embedding'] = df.apply(lambda x: average_sentence_embedding(x.sentence1_tokens_rm_stopwords, embedding_model), axis=1)\\n\",\n    \"        df['sentence2_embedding'] = df.apply(lambda x: average_sentence_embedding(x.sentence2_tokens_rm_stopwords, embedding_model), axis=1)\\n\",\n    \"    else:\\n\",\n    \"        df['sentence1_embedding'] = df.apply(lambda x: average_sentence_embedding(x.sentence1_tokens, embedding_model), axis=1)\\n\",\n    \"        df['sentence2_embedding'] = df.apply(lambda x: average_sentence_embedding(x.sentence2_tokens, embedding_model), axis=1)\\n\",\n    \"\\n\",\n    \"    df['predictions'] = df.apply(lambda x: calculate_cosine_similarity(x.sentence1_embedding, x.sentence2_embedding) if \\n\",\n    \"                                 (sum(x.sentence1_embedding) != 0 and sum(x.sentence2_embedding) != 0) else 0, axis=1)\\n\",\n    \"    \\n\",\n    \"    return df['predictions'].tolist()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 22,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Get predictions using average word2vec embeddings both with and without stop words\\n\",\n    \"baselines[\\\"Word2vec Cosine\\\"] = average_word_embedding_cosine_similarity(\\n\",\n    \"    sts_test_stop, word2vec_model, rm_stopwords=True\\n\",\n    \")\\n\",\n    \"baselines[\\\"Word2vec Cosine with Stop Words\\\"] = average_word_embedding_cosine_similarity(\\n\",\n    \"    sts_test_stop, word2vec_model, rm_stopwords=False\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Baseline #2: Word2vec Embeddings with Word Mover's Distance \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": 
[\n    \"This baseline first constructs word embeddings using word2vec (for an introduction to word2vec, see [Background on Word2Vec](#What-is-Word2Vec?)). Then all the word embeddings are used to calculate sentence similarity using the word mover's distance.  \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### What is Word Mover's Distance (WMD)?\\n\",\n    \"Word Mover's Distance (WMD) is a metric that \\\"adapts the earth mover’s distance to the space of documents: the distance between two texts is given by the total amount of “mass” needed to move the words from one side into the other, multiplied by the distance the words need to move.\\\" We'll utilize word2vec's implementation of word mover's distance. See this [blog](http://vene.ro/blog/word-movers-distance-in-python.html) for additional information about this similarity measure.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Get Sentence Similarity Predictions for Test Set\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now, we calculate predictions for each of the sentence pairs found in the test set.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 23,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def word_embedding_WMD(df, embedding_model, rm_stopwords=False):\\n\",\n    \"    \\\"\\\"\\\"Calculate Word Mover's Distance between two sentences using embeddings\\n\",\n    \"    \\n\",\n    \"    Args:\\n\",\n    \"        df (pandas dataframe): dataframe as provided by the nlp_utils\\n\",\n    \"        embedding_model (gensim model): word embedding model\\n\",\n    \"        rm_stopwords (bool): whether to remove stop words (True) or not (False)\\n\",\n    \"    \\n\",\n    \"    Returns:\\n\",\n    \"        list: predicted values for sentence similarity of test set examples\\n\",\n    \"    
\\\"\\\"\\\"\\n\",\n    \"    if rm_stopwords:\\n\",\n    \"        df['sentence1_cleaned'] = df.apply(lambda x: [i for i in x.sentence1_tokens_rm_stopwords if i in embedding_model], axis=1)\\n\",\n    \"        df['sentence2_cleaned'] = df.apply(lambda x: [i for i in x.sentence2_tokens_rm_stopwords if i in embedding_model], axis=1)\\n\",\n    \"    else:\\n\",\n    \"        df['sentence1_cleaned'] = df.apply(lambda x: [i for i in x.sentence1_tokens if i in embedding_model], axis=1)\\n\",\n    \"        df['sentence2_cleaned'] = df.apply(lambda x: [i for i in x.sentence2_tokens if i in embedding_model], axis=1)\\n\",\n    \"\\n\",\n    \"    # wmdistance takes the raw tokens and performs the word2vec embedding itself\\n\",\n    \"    df['predictions'] = df.apply(lambda x: -embedding_model.wmdistance(x.sentence1_cleaned, x.sentence2_cleaned) if \\n\",\n    \"                                 (len(x.sentence1_cleaned) != 0 and len(x.sentence2_cleaned) != 0) else 0, axis=1)\\n\",\n    \"    \\n\",\n    \"    return df['predictions'].tolist()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 24,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Get predictions using word2vec embeddings and WMD both with and without stop words\\n\",\n    \"baselines[\\\"Word2vec WMD\\\"] = word_embedding_WMD(sts_test_stop, word2vec_model, rm_stopwords=True)\\n\",\n    \"baselines[\\\"Word2vec WMD with Stop Words\\\"] = word_embedding_WMD(\\n\",\n    \"    sts_test_stop, word2vec_model, rm_stopwords=False\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Baseline #3: GloVe Embeddings with Cosine Similarity\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This baseline first constructs word embeddings using GloVE. 
Once we have a word embedding (vector) for each word in the sentence, we calculate an embedding for the full sentence by taking the (weighted) average of all the word embeddings. The weights will be calculated using TF-IDF. Lastly, in order to compare the two sentence embeddings we use the cosine similarity metric (for an introduction to the cosine similarity metric, see [Background on Cosine Similarity](#What-is-Cosine-Similarity?)). \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### What is GloVe?\\n\",\n    \"GloVe is an unsupervised algorithm for obtaining word embeddings created by the Stanford NLP group (see [original research paper](https://nlp.stanford.edu/pubs/glove.pdf)) Training occurs on word-word co-occurrence statistics with the objective of learning word embeddings such that the dot product of two words' embeddings is equal to the words' probability of co-occurrence. See this [tutorial](https://nlp.stanford.edu/projects/glove/) on GloVe for more detailed background on the model. For our purposes, we use pretrained GloVe word embeddings (glove.840B.300d.zip which can be found through the above link). These embeddings were trained on Common Crawl data and provide 300-dimensional embeddings (vectors) for 2.2 million English words. Below is the code to load in the GloVe embeddings.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 25,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|████████████████████████████████████████████████████████████████████████████| 2.13M/2.13M [01:58<00:00, 17.9kKB/s]\\n\",\n      \"C:\\\\Users\\\\cocochra\\\\AppData\\\\Local\\\\Continuum\\\\anaconda3\\\\envs\\\\nlp_gpu\\\\lib\\\\site-packages\\\\smart_open\\\\smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. 
See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\\n\",\n      \"  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"glove_model = glove.load_pretrained_vectors(dir_path=BASE_DATA_PATH)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 26,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Get predictions using GloVe embeddings and cosine similarity both with and without stop words\\n\",\n    \"baselines[\\\"GLoVe Cosine\\\"] = average_word_embedding_cosine_similarity(\\n\",\n    \"    sts_test_stop, glove_model, rm_stopwords=True\\n\",\n    \")\\n\",\n    \"baselines[\\\"GLoVe Cosine with Stop Words\\\"] = average_word_embedding_cosine_similarity(\\n\",\n    \"    sts_test_stop, glove_model, rm_stopwords=False\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Baseline #4: GloVe Embeddings with Word Mover's Distance\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This baseline first constructs word embeddings using GloVe (for an introduction on GloVe, see [Background on GloVe](#What-is-GloVe?)). Then all the word embeddings are used to calculate sentence similarity using the word mover's distance (for an introduction to WMD, see [Background on Word Mover's Distance](#What-is-Word-Mover's-Distance-(WMD)?)).  
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 27,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Get predictions using GloVe embeddings and WMD both with and without stop words\\n\",\n    \"baselines[\\\"GLoVe WMD\\\"] = word_embedding_WMD(sts_test_stop, glove_model, rm_stopwords=True)\\n\",\n    \"baselines[\\\"GLoVe WMD with Stop Words\\\"] = word_embedding_WMD(\\n\",\n    \"    sts_test_stop, glove_model, rm_stopwords=False\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Baseline #5: fastText Embeddings with Cosine Similarity\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This baseline first constructs word embeddings using fastText. Once we have a word embedding (vector) for each word in the sentence, we calculate an embedding for the full sentence by taking the (weighted) average of all the word embeddings. The weights will be calculated using TF-IDF. Lastly, in order to compare the two sentence embeddings we use the cosine similarity metric (for an introduction to the cosine similarity metric, see [Background on Cosine Similarity](#What-is-Cosine-Similarity?)). \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### What is fastText?\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"fastText is an unsupervised algorithm created by Facebook Research for efficiently learning word embeddings (see [original research paper](https://arxiv.org/pdf/1607.04606.pdf)). fastText is significantly different than word2vec or GloVe in that these two algorithms we saw earlier treat each word as the smallest possible unit to find an embedding for. Conversely, fastText assumes that words are formed by an n-gram of characters (i.e. 2-grams of the word \\\"language\\\" would be {la, an, ng, gu, ua, ag, ge}). 
The embedding for a word is then composed of the sum of these character n-grams. This has advantages when finding word embeddings for rare words and words not present in the dictionary, as these words can still be broken down into character n-grams. Typically, for smaller datasets, fastText performs better than word2vec or GloVe. See this [tutorial](https://fasttext.cc/docs/en/unsupervised-tutorial.html) on fastText for more detail. We will use the pretrained word embeddings for the English language (wiki.en.bin; these embeddings as well as embeddings for 156 other languages can be found [here](https://fasttext.cc/docs/en/english-vectors.html)). These are 300-dimensional embeddings (vectors) trained on Wikipedia data.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 28,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|████████████████████████████████████████████████████████████████████████████| 2.56M/2.56M [01:46<00:00, 24.0kKB/s]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"fastText_model = fasttext.load_pretrained_vectors(dest_path=BASE_DATA_PATH)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 29,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"C:\\\\Users\\\\cocochra\\\\AppData\\\\Local\\\\Continuum\\\\anaconda3\\\\envs\\\\nlp_gpu\\\\lib\\\\site-packages\\\\ipykernel_launcher.py:12: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\\n\",\n      \"  if sys.path[0] == '':\\n\",\n      \"C:\\\\Users\\\\cocochra\\\\AppData\\\\Local\\\\Continuum\\\\anaconda3\\\\envs\\\\nlp_gpu\\\\lib\\\\site-packages\\\\ipykernel_launcher.py:29: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\\n\"\n  
   ]\n    }\n   ],\n   \"source\": [\n    \"# Get predictions using fastText embeddings and cosine similarity both with and without stop words\\n\",\n    \"baselines[\\\"fastText Cosine\\\"] = average_word_embedding_cosine_similarity(\\n\",\n    \"    sts_test_stop, fastText_model, rm_stopwords=True\\n\",\n    \")\\n\",\n    \"baselines[\\\"fastText Cosine with Stop Words\\\"] = average_word_embedding_cosine_similarity(\\n\",\n    \"    sts_test_stop, fastText_model, rm_stopwords=False\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Baseline #6: fastText Embeddings with Word Mover's Distance\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 30,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"C:\\\\Users\\\\cocochra\\\\AppData\\\\Local\\\\Continuum\\\\anaconda3\\\\envs\\\\nlp_gpu\\\\lib\\\\site-packages\\\\ipykernel_launcher.py:13: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\\n\",\n      \"  del sys.path[0]\\n\",\n      \"C:\\\\Users\\\\cocochra\\\\AppData\\\\Local\\\\Continuum\\\\anaconda3\\\\envs\\\\nlp_gpu\\\\lib\\\\site-packages\\\\ipykernel_launcher.py:14: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\\n\",\n      \"  \\n\",\n      \"C:\\\\Users\\\\cocochra\\\\AppData\\\\Local\\\\Continuum\\\\anaconda3\\\\envs\\\\nlp_gpu\\\\lib\\\\site-packages\\\\ipykernel_launcher.py:21: DeprecationWarning: Call to deprecated `wmdistance` (Method will be removed in 4.0.0, use self.wv.wmdistance() instead).\\n\",\n      \"C:\\\\Users\\\\cocochra\\\\AppData\\\\Local\\\\Continuum\\\\anaconda3\\\\envs\\\\nlp_gpu\\\\lib\\\\site-packages\\\\ipykernel_launcher.py:16: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 
4.0.0, use self.wv.__contains__() instead).\\n\",\n      \"  app.launch_new_instance()\\n\",\n      \"C:\\\\Users\\\\cocochra\\\\AppData\\\\Local\\\\Continuum\\\\anaconda3\\\\envs\\\\nlp_gpu\\\\lib\\\\site-packages\\\\ipykernel_launcher.py:17: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Get predictions using fastText embeddings and WMD both with and without stop words\\n\",\n    \"baselines[\\\"fastText WMD\\\"] = word_embedding_WMD(sts_test_stop, fastText_model, rm_stopwords=True)\\n\",\n    \"baselines[\\\"fastText WMD with Stop Words\\\"] = word_embedding_WMD(\\n\",\n    \"    sts_test_stop, fastText_model, rm_stopwords=False\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Baseline #7: TF-IDF Embeddings with Cosine Similarity\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This baseline first constructs a document embedding based on bag-of-words with TF-IDF weighting (for an introduction to TF-IDF, see [Background on TF-IDF](#What-is-TF-IDF?)). Then we apply cosine similarity between the two embeddings in the sentence pair (for an introduction to the cosine similarity metric, see [Background on Cosine Similarity](#What-is-Cosine-Similarity?)).\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Bag-of-Words\\n\",\n    \"\\n\",\n    \"The most basic approach for document embeddings is called Bag-of-Words. This method first determines the vocabulary across the entire corpus and then, for each document, creates a vector containing the number of times each vocabulary word appeared in the given document. 
These vectors are obviously very sparse and typical bag-of-words implementations ignore terms whose document frequency is less than some threshold in order to reduce sparsity. We also often ignore stop words as they add little semantic information. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 33,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def tfidf_cosine_similarity(df, rm_stopwords=False):\\n\",\n    \"    \\\"\\\"\\\"Calculate cosine similarity between TF-IDF document embeddings\\n\",\n    \"    \\n\",\n    \"    Args:\\n\",\n    \"        df (pandas dataframe): dataframe as provided by the nlp_utils\\n\",\n    \"        rm_stopwords (bool): whether to remove stop words (True) or not (False)\\n\",\n    \"    \\n\",\n    \"    Returns:\\n\",\n    \"        list: predicted values for sentence similarity of test set examples\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    stop_word_param = \\\"english\\\" if rm_stopwords else None\\n\",\n    \"\\n\",\n    \"    tf = TfidfVectorizer(\\n\",\n    \"        input=\\\"content\\\",\\n\",\n    \"        analyzer=\\\"word\\\",\\n\",\n    \"        min_df=0,\\n\",\n    \"        stop_words=stop_word_param,\\n\",\n    \"        sublinear_tf=True,\\n\",\n    \"    )\\n\",\n    \"    all_sentences = df[[\\\"sentence1\\\", \\\"sentence2\\\"]]\\n\",\n    \"    corpus = np.concatenate([df[\\\"sentence1\\\"].values, df[\\\"sentence2\\\"].values])\\n\",\n    \"    tfidf_matrix = np.array(tf.fit_transform(corpus).todense())\\n\",\n    \"    num_samples = len(df.index)\\n\",\n    \"    \\n\",\n    \"    # calculate the cosine similarity between pairs of tfidf embeddings\\n\",\n    \"    # first pair at index 0 and n in tfidf_matrix, second pair at 1 and n+1, etc.\\n\",\n    \"    df[\\\"predictions\\\"] = df.apply(\\n\",\n    \"        lambda x: calculate_cosine_similarity(\\n\",\n    \"            tfidf_matrix[int(x.name), :], tfidf_matrix[num_samples + int(x.name), :]\\n\",\n    
\"        )\\n\",\n    \"        if (\\n\",\n    \"            sum(tfidf_matrix[int(x.name), :]) != 0\\n\",\n    \"            and sum(tfidf_matrix[num_samples + int(x.name), :]) != 0\\n\",\n    \"        )\\n\",\n    \"        else 0,\\n\",\n    \"        axis=1,\\n\",\n    \"    )\\n\",\n    \"    return df[\\\"predictions\\\"].tolist()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 34,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"baselines[\\\"TF-IDF Cosine\\\"] = tfidf_cosine_similarity(sts_test_stop, rm_stopwords=True)\\n\",\n    \"baselines[\\\"TF-IDF Cosine with Stop Words\\\"] = tfidf_cosine_similarity(\\n\",\n    \"    sts_test_stop, rm_stopwords=False\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Baseline #8: Doc2vec Embeddings with Cosine Similarity\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This baseline constructs document embeddings using doc2vec and then applies cosine similarity to measure each sentence pair's similarity (for an introduction to the cosine similarity metric, see [Background on Cosine Similarity](#What-is-Cosine-Similarity?)).\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### What is Doc2Vec?\\n\",\n    \"\\n\",\n    \"Doc2vec is an extension of word2vec which produces embeddings of a document. Note that \\\"document\\\" refers to some larger chunk of multiple tokens/words. In our case, our documents will actually be individual sentences. The algorithm not only exploits the idea of context words (like in word2vec), but also incorporates the context of the document. There are again two model architectures that parallel those of word2vec: Paragraph Vectors Distributed Memory (PV-DM) and Paragraph Vectors Distributed Bag-of-Words (PV-DBOW). 
PV-DM randomly samples consecutive words in a paragraph and predicts a center word by utilizing the context words and the paragraph id. PV-DBOW takes a paragraph id and uses it to predict words in the context. \\n\",\n    \"\\n\",\n    \"See [tutorial #1](https://kanoki.org/2019/03/07/sentence-similarity-in-python-using-doc2vec/) or [tutorial #2](https://gab41.lab41.org/doc2vec-to-assess-semantic-similarity-in-source-code-667acb3e62d7) for more information and an example of using Doc2vec for sentence similarity.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 35,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Doc2vec requires unique ids for each sentence, so we'll iterate \\n\",\n    \"# through our dataframe, adding a new ID column\\n\",\n    \"\\n\",\n    \"all_sentences =  sts_test_stop[[\\\"sentence1\\\", \\\"sentence2\\\"]]\\n\",\n    \"corpus = all_sentences.values.flatten().tolist()\\n\",\n    \"# Produce dictionary of sentence to id\\n\",\n    \"sentence_id = {sent: i for i, sent in enumerate(set(corpus))}\\n\",\n    \"\\n\",\n    \"def assign_id(row):\\n\",\n    \"    return sentence_id[row]\\n\",\n    \"\\n\",\n    \"sts_test_stop[\\\"qid1\\\"] = sts_test_stop[\\\"sentence1\\\"].apply(assign_id)\\n\",\n    \"sts_test_stop[\\\"qid2\\\"] = sts_test_stop[\\\"sentence2\\\"].apply(assign_id)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 36,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def doc2vec_cosine(df, rm_stopwords=False):\\n\",\n    \"    \\\"\\\"\\\"Calculate cosine similarity between each sentence pair using Doc2Vec embeddings\\n\",\n    \"    \\n\",\n    \"    Args:\\n\",\n    \"        df (pandas dataframe): dataframe as provided by the nlp_utils\\n\",\n    \"        rm_stopwords (bool): whether to remove stop words (True) or not (False)\\n\",\n    \"    \\n\",\n    \"    Returns:\\n\",\n    \"        list: predicted values for sentence similarity of 
test set examples\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    if rm_stopwords:\\n\",\n    \"        df[[\\\"sentence1_prepped\\\", \\\"sentence2_prepped\\\"]] = df[\\n\",\n    \"            [\\\"sentence1_tokens_rm_stopwords\\\", \\\"sentence2_tokens_rm_stopwords\\\"]\\n\",\n    \"        ]\\n\",\n    \"    else:\\n\",\n    \"        df[[\\\"sentence1_prepped\\\", \\\"sentence2_prepped\\\"]] = df[\\n\",\n    \"            [\\\"sentence1_tokens\\\", \\\"sentence2_tokens\\\"]\\n\",\n    \"        ]\\n\",\n    \"\\n\",\n    \"    # Doc2vec requires data as Tagged Documents with the tokenized sentence and the sentence id\\n\",\n    \"    df[\\\"labeled_questions1\\\"] = df.apply(\\n\",\n    \"        lambda x: TaggedDocument(x.sentence1_prepped, str(x.qid1)), axis=1\\n\",\n    \"    )\\n\",\n    \"    df[\\\"labeled_questions2\\\"] = df.apply(\\n\",\n    \"        lambda x: TaggedDocument(x.sentence2_prepped, str(x.qid2)), axis=1\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"    # Get all Tagged Documents\\n\",\n    \"    df_labeled_sentences = df[[\\\"labeled_questions1\\\", \\\"labeled_questions2\\\"]]\\n\",\n    \"    labeled_sentences = df_labeled_sentences.values.flatten().tolist()\\n\",\n    \"\\n\",\n    \"    # instantiate Doc2Vec model\\n\",\n    \"    model = Doc2Vec(\\n\",\n    \"        labeled_sentences, dm=1, min_count=1, window=5, vector_size=500, epochs=30\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"    # Train our model for 20 epochs\\n\",\n    \"    for epoch in range(20):\\n\",\n    \"        model.train(\\n\",\n    \"            labeled_sentences, epochs=model.epochs, total_examples=model.corpus_count\\n\",\n    \"        )\\n\",\n    \"\\n\",\n    \"    df[\\\"predictions\\\"] = df.apply(\\n\",\n    \"        lambda x: model.wv.n_similarity(x.sentence1_prepped, x.sentence2_prepped)\\n\",\n    \"        if (len(x.sentence1_prepped) != 0 and len(x.sentence2_prepped) != 0)\\n\",\n    \"        else 0,\\n\",\n    \"        axis=1,\\n\",\n    \"    
)\\n\",\n    \"\\n\",\n    \"    return df[\\\"predictions\\\"].tolist()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 37,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"baselines[\\\"Doc2vec Cosine\\\"] = doc2vec_cosine(sts_test_stop, rm_stopwords=True)\\n\",\n    \"baselines[\\\"Doc2vec Cosine with Stop Words\\\"] = doc2vec_cosine(sts_test_stop, rm_stopwords=False)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Comparison of Baseline Models\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Our evaluation metric is Pearson correlation ($\\\\rho$) which is a measure of the linear correlation between two variables. The formula for calculating Pearson correlation is as follows:  \\n\",\n    \"\\n\",\n    \"$$\\\\rho_{X,Y} = \\\\frac{E[(X-\\\\mu_X)(Y-\\\\mu_Y)]}{\\\\sigma_X \\\\sigma_Y}$$\\n\",\n    \"\\n\",\n    \"This metric takes a value in [-1,1] where -1 represents a perfect negative correlation, 1 represents a perfect positive correlation, and 0 represents no correlation. 
We utilize the Pearson correlation metric as this is the metric that [SentEval](http://nlpprogress.com/english/semantic_textual_similarity.html), a widely-used evaluation toolkit for evaluating sentence representations, uses for the STS Benchmark dataset.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 38,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def pearson_correlation(df, prediction):\\n\",\n    \"    \\\"\\\"\\\"Calculate the Pearson correlation between two vectors\\n\",\n    \"    \\n\",\n    \"    Args:\\n\",\n    \"        df (pandas dataframe): dataframe of sentences and their similarity scores\\n\",\n    \"        prediction (list): predicted similarity scores for each value in test set\\n\",\n    \"        \\n\",\n    \"    Returns:\\n\",\n    \"        float: pearson correlation value between the actual and predicted score lists\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    pearson_correlation = scipy.stats.pearsonr(prediction, list(df[\\\"score\\\"]))[0]\\n\",\n    \"    return pearson_correlation\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 39,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"{'Word2vec Cosine': 0.6476606845766778,\\n\",\n       \" 'Word2vec Cosine with Stop Words': 0.6683808069062863,\\n\",\n       \" 'Word2vec WMD': 0.6574175839579567,\\n\",\n       \" 'Word2vec WMD with Stop Words': 0.5689438215886101,\\n\",\n       \" 'GLoVe Cosine': 0.6688056947022161,\\n\",\n       \" 'GLoVe Cosine with Stop Words': 0.6049380247374541,\\n\",\n       \" 'GLoVe WMD': 0.6267300417407605,\\n\",\n       \" 'GLoVe WMD with Stop Words': 0.48470008225931194,\\n\",\n       \" 'fastText Cosine': 0.6707510007525627,\\n\",\n       \" 'fastText Cosine with Stop Words': 0.6771300330824099,\\n\",\n       \" 'fastText WMD': 0.6394958913339955,\\n\",\n       \" 'fastText WMD with Stop Words': 0.5177829727556036,\\n\",\n       \" 
'TF-IDF Cosine': 0.6749213786510483,\\n\",\n       \" 'TF-IDF Cosine with Stop Words': 0.7118087132257667,\\n\",\n       \" 'Doc2vec Cosine': 0.528387685928394,\\n\",\n       \" 'Doc2vec Cosine with Stop Words': 0.45572884639905675}\"\n      ]\n     },\n     \"execution_count\": 39,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# Get metrics on predictions from all models\\n\",\n    \"results = dict((model, pearson_correlation(sts_test_stop, baselines[model])) for model in baselines)\\n\",\n    \"results\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We investigate our 8 models with and without stop words (16 different results total). The results show that TF-IDF bag-of-words document embeddings combined with cosine similarity perform the best.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 40,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"image/png\": 
\"iVBORw0KGgoAAAANSUhEUgAAAg8AAAEWCAYAAADhFHRsAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nOzdd7xcVbn/8c+XgLRQpBpUiHSBQIAQIAYEwYYNBQVEpKiIXOGCgvITr2C74AXFAsLlqvSOYgGUDgkpBBJCEkqQEqkivQWQJM/vj/UMZ2cy55yZ03JO8n2/XueVmV3WXnvNwF6z9trPo4jAzMzMrFlLLOwKmJmZ2cDizoOZmZm1xJ0HMzMza4k7D2ZmZtYSdx7MzMysJe48mJmZWUvceTCzHiXpR5KekfTPhV2X/kDS+yT9XdIrknZf2PUBkHSApFsr71+RtO7CrFMrJIWk9ZvYbidJj/VFnRY37jyYLeYkzZL0Wl5AnpJ0lqTBXSzr3cA3gU0i4h09W9MB6wfAqRExOCL+WL+yrv2fl3RVtmOfybo91NPlSro5L/Rb1C3/Yy7fqaePaX3DnQczA/hERAwGtgK2Ab7bagGSlgTWAZ6NiH91cf9F0TrA3Z1sU2v/IcBTwK96vVZ9537gi7U3klYFtgOeXmg1sm5z58HM3hIRjwN/BTYDkLSSpN9KelLS43lLYlCuO0DSOEmnSHoOuBm4Dlgrf0Wfndt9UtLdkl7IX6LvrR0vf3V/W9I04FVJS+ayoyVNk/RqHn9NSX+V9LKk6yW9vVLGZZL+KelFSWMkbVpZd7ak0/LX/MuSbpO0XmX9ppKuk/Rcjrp8J5cvIekYSQ9KelbSpZJWaa/dJH1F0gNZzp8lrZXLHwTWBf6SbbJ0J+3/OnA5sEml7I9JulPSS5IelXR8Zd0yks7POr4g6XZJa3b22TWo/1u3AZpos40rbTZT0uc6OifgAmCvyrH3Aa4A/l0pc2lJP5f0RP79vNpW+X14MtcdVFf3pSWdLOmR/AzPkLRsO+f57WyLl7Puu3RSd2uHOw9m9pYcLt8NuDMXnQPMAdYHtgQ+BHy5ssu2wEPAGsAHgY8CT+Qw+AGSNgQuAo4AVgeuplxI31YpYx/gY8DKETEnl+2R5W0IfILSofkOsBrl/1uHV/b/K7BB1mEK5WJVtQ/wfeDtwAPAj/NcVwCuB/4GrJXneEPucziwO/D+XPc8cFo7bfYB4ATgc5SRg38AFwNExHrAI+TIQkS80aiMSlnLAXsBEyuLX6X8cl852+lraps7sT+wEvBuYFXgEOC1XNfZZ9eR9tpseUoH8UJKe+8D/LraYWvgCeCePD55LufWbXMsZTRiOLAFMJIc/ZL0EeAoyvdhA2DXun1/QvmeDM9zfSfwvfpKSNoI+DqwTUSsAHwYmNVBva0jEeE///lvMf6j/A/0FeAFyoXv18CywJrAG8CylW33AW7K1wcAj9SVtRPwWOX9fwGXVt4vATwO7FQ59kEN6rNv5f3vgdMr7w8D/tjOuawMBLBSvj8b+E1l/W7AfZVzubOdcu4Fdqm8HwK8CSzZYNvfAv9TeT84tx1aOZ9dm2z/OZSL7bAOtv85cEq+PggYD2xet00zn92tlXUBrN9Em+0FjK071v8Cx7VT15spHZYvUDqRGwH357rHKt+DB4HdKvt9GJiVr38HnFhZt2GtvoAonav1Kuu3Bx6u/z7m9v+idD6WWtj/3Q30v0X1HqOZtWb3iLi+ukDSMGAp4ElJtcVLAI9WNqu+bmQtSocEgIiYJ+lRyq/Djsp4qvL6tQbvB2cdB1F+FX+WMrIxL7dZDXgxX1ef+phd25fya/3Bduq9DnCFpHmVZXMpF+XH67ZdizLiAUBEvCLpWco5zmqn/Hq7R8T1eT6fAm6RtElE/FPStsCJlFtJbwOWBi7L/c7L87hY0srA+ZRf8evQ+WfXkfbabB1gW0kvVNYvmfXoyB+AnwLPtrPtfN
+TfL1WZd3kunU1qwPLAZMr5ylggdszEfGApCOA44FNJV0DfCMinuik7taAb1uYWXsepfx6XS0iVs6/FSOiOkTdWVreJygXHABU/g//bua/AHcnte/nKRfbXSnD90Nrh2pi30eB9TpY99HKea8cEctEmRNSr/4cl6fcQmi0bYciYm5E/IHSURmdiy8E/gy8OyJWAs4gzy8i3oyI70fEJsAo4OOU2wLNfHZd8ShwS127DI6Ir3VyXrMpt5e+RuPOw3xtCKydywCepHxnqutqnqF0Jjet1GelKJNPG9XjwogYnccKyi0P6wJ3HsysoYh4ErgW+KmkFXMS4XqS3t9CMZcCH5O0i6SlKI9xvkEZau8JK2R5z1J+gf53C/teCbxD0hE56W6F/JUP5QL9Y0nrAEhaXdKn2innQuBAScNzkt9/A7dFxKxWT0bFpyhzDe7NxSsAz0XE65JGUjpMte13ljQsRyxeotwumdtDn10jVwIbStpP0lL5t40qk2A78B3g/e20y0XAd7OdV6PMWTg/110KHCBpk5wTclxtp4iYB/wfcIqkNQAkvVPSh+sPIGkjSR/Iz+h1SqdjbrMnbvNz58HMOvJFylD5PZRJg5dT7v83JSJmUu53/4ryK/ETlMmD/+5wx+adSxnGfjzrOLHjzeer28uUSXifoAzT/x3YOVf/gvJr/1pJL2e527ZTzg2UuR2/p/xKXg/Yu8Xz+IukVygdgB8D+0dE7fHOQ4EfZD2+R7mY1ryD8pm8ROls3ELbRbdbn10j2WYfopzfE5R2+wnlVkpn+z4REbe2s/pHwB3ANGA65TbQj3K/v1LmedxImbx5Y92+387lEyW9RJkEu1GDYyxNuf3zTNZ7DUqHxrpAEd0ZMTQzM7PFjUcezMzMrCXuPJiZmVlL3HkwMzOzlrjzYGZmZi1xkChbLKy22moxdOjQhV0NM7MBZfLkyc9ExOr1y915sMXC0KFDueOOOxZ2NczMBhRJ/2i03LctzMzMrCXuPJiZmVlL3HkwMzOzlrjzYGZmZi1x58HMzMxa4s6DmZmZtcSdBzMzM2uJOw9mZmbWEgeJssXC9MdfZOgxVy3sapjZYmzWiR9b2FXoMR55MDMzs5a489DHJJ0i6YjK+2sk/aby/qeSvtGN8o+XdFS+PknSfZKmSbpC0srdq32nxz4qjzdD0l2SvtiFMg7pyn5mZtZ33Hnoe+OBUQCSlgBWAzatrB8FjGumIEmDOtnkOmCziNgcuB/4fy3XtkmSDgE+CIyMiM2AHQG1Wk5EnBER5/Z0/czMrOe489D3xpGdB0qnYQbwsqS3S1oaeC9wp4qT8lf8dEl7AUjaSdJNki4EpueyYyXNlHQ9sFHtQBFxbUTMybcTgXfl9rdJeqvDIulmSVtLWl7S7yTdLulOSZ/K9YMknZz1mCbpsAbn9R3g0Ih4KY/9YkSck/vvkuVNz/KXzuUnSronyzw5l1VHTm6W9BNJkyTdL2mHSn1OynpOk/TV7n0kZmbWCk+Y7GMR8YSkOZLWpnQiJgDvBLYHXgSmRcS/Je0BDAe2oIxO3C5pTBYzkjKi8LCkrYG9gS0pn+cUYHKDQx8EXJKvLwY+BxwnaQiwVkRMlvTfwI0RcVDe4piUHZIvAu8BtoyIOZJWqRYsaQVghYh4sP6gkpYBzgZ2iYj7JZ0LfC3//TSwcUREB7dUloyIkZJ2A44DdgW+BLwYEdtkR2ScpGsj4uG6Yx8MHAwwaMUFMsqamVkXeeRh4aiNPtQ6DxMq78fnNqOBiyJibkQ8BdwCbJPrJlUulDsAV0TE7PzV/+f6g0k6FpgDXJCLLgU+m68/B1yWrz8EHCNpKnAzsAywNuWCfUZtFCMinqs/BBDtnOtGwMMRcX++P4dyS+Ml4HXgN5I+A8xuZ/8/5L+TgaGVen4x63kbsCqwQf2OEXFmRIyIiBGDllupneLNzKxVHnlYOGrzHoZRbls8CnyTckH9XW7T0XyBV+vet3fhRtL+wMcpv/
wDICIel/SspM2BvYDasL+APSJiZl0ZHXUOiIiXJL0qad2IeKi+Cu3sM0fSSGAXysjJ14EPNNj0jfx3Lm3fVwGHRcQ17dXJzMx6j0ceFo5xlAv6czmy8BywMuXWxYTcZgywV97fX53ya31Sg7LGAJ+WtGzePvhEbYWkjwDfBj4ZEfW/7C8GvgWsFBHTc9k1wGHZWUDSlrn8WuAQSUvm8lVY0AnAaZJWzG1WzNsG9wFDJa2f2+0H3CJpcB77auAIyi2aZl1DufWxVB5rQ0nLt7C/mZl1g0ceFo7plHkMF9YtGxwRz+T7Kyidibsov/q/FRH/lLRxtaCImCLpEmAq8A9gbGX1qcDSwHXZH5gYEYfkusuBXwA/rGz/Q+DnwLTsQMyidHJ+A2yYy98E/i/LrjodGEyZm/Em8Cbw04h4XdKBwGXZ+bgdOANYBfhTzokQcGSnrdbmN5RbGFOynk8Du7ewv5mZdYNyJNtskTZixIi44447FnY1zMwGFEmTI2JE/XLftjAzM7OWuPNgZmZmLfGcB1ssODGWmQ0EAyV5lkcezMzMrCUddh7kJE6dldFnSZwkXS1p5fw7tLJ8J0lXNrH/dhmWeqqkeyUdX9l/VCe7N1vHKyTtXnk/U9J3K+9/nwGhulr+2ZL27G49zcysezobeXASpw70ZRKniNgtIl6gxIM4tLPtGzgHODgihgObUaJMAuxEW66N7qp+X1YFXqE8blqzPW0RNDtUiylhZmb9T2edBydx6oMkTpK+JenwfH2KpBsrdTk/X8+StBpwIrBejiCclEUMlnR5jqRckLEP6q0BPJnnOzci7pE0FDgEODLL20HSOpJuyLreoJKDo/ar/wxJY/P8Pt7gGNXvyyjgSmD1/H68B3gtY1UsI+msbOM7Je2cxzhA0mWS/gJcm/udmu1+VZ5Drc0W+DzMzKxvdPjrzkmc+iyJ0xhKeOpfAiOApVWiJ45m/qBPAMdkew7POu+U7bkp8ATlAv4+4Na6/U4BZkq6GfgbcE5EzJJ0BvBKRNQ6RH8Bzo2IcyQdlHWq3YoYCrwfWA+4SdL6EfF65RiTgc0kvY3yfbkFWJfSydyStlGq/wCIiGEqQa+ulbRhrtse2DwinlO5xbERJYz3msA9wO/yM+3085ATY5mZ9YpmJkw6iVPvJ3GaDGydHZs3KG08gtJe9Z2HRiZFxGMRMY8SaXJo/QYR8YMs81rg85QORCPb0xb58jzKZ1tzaUTMi4i/Aw8B9dEu3wDuBrYCtsvzbe/7cl7ucx8lMmat83Bd5TPbkbbv1RPAjbm8qc/DibHMzHpHM52H+iROEykXmOp8h55O4rRvNYkTUE3idHHlmHtExPD8Wzsi7qXjzgHZaXlV0rqNqtDOPnMoIyi/p/wKb+/C21ESp1o93xMR19aV/yYlFPSBlPYeC+xM+YV/b3vn0uC49ceuP48HI+J0SjKqLVTmJXQm2nnd6D2U+u9IGd15nvJ9qXUeeuT70sLnYWZmvaDZkQcncer9JE5jgKPy37GUuQhTa52oipeBFVo4Pnncj1XmQmxA6WS80KC88ZRbSwD7Mv/tj89KWkLSepTbEfNl30zjKFk678r30yijEGtTRiWgnOO+Wa8Nc12jssYAe+f3agilQ0U3Pw8zM+umZma0O4lT3yRxGgscC0yIiFclvU6DWxYR8aykcZJmAH8Fmo18tB9wiqTZlNtC+0bE3JzjcLnKhNPDgMMp8wqOzroeWCljJuWW1JrAIXXzHWrGUzoWJ2R950j6F/Bo3lYB+DVwhqTpWZcDIuINLTjP8wpKmu7plCdwbsnlK9D1z8PMzLrJibGsKZLOBq6MiMsXdl26womxzMxaJyfGMjMzs57gQDzWlIg4YGHXoTuc28LM+puBkseiEY88mJmZWUvceehHtAjmEpG0Rca4qL3fR9LsytMnwyRNy9c3S3qk8lQIkv4o6ZV8PVTSaypRKe9Viea5f2/U28zM2ufOQ/+yKOYSmQ6sk4/mQj
mH+ygRJ2vvq+f0AiVCJtmhGVJX3oMRsWVEvJfySOmR+YSMmZn1EXce+pdFLpdIPp55O7BtLtoaOI35c2BUk2VdTFucic/QFrVzARHxEPANyuOlZmbWR9x56EcyBHN9LpHbKDE0RpC5RCgX1VoukV2BkzKIEpTIi8dGxCaaP5fIZ2gLGV7vIErMCGjLJYIquUQoMShujIhtKMGaTspgVwfTlktkc9rCileNB0bl9vMo4cSrnYfqyMMNwI45crI3bTlO2jOFujDZNZIOlnSHpDvmzn6xk2LMzKxZ7jz0P4taLpHqOY0Ebs+kZOurRCMdnCMINXMpUS33ApaNiFntN1U5hfZWOLeFmVnv8KOa/U99LpFHKRk3XwJ+l9v0dC6RXaq5RCRVc4nUUojXconMrCujw1wiaSKlczOatpDmj1FGFsY32P5iSnTJ4zspF8qoSjP5P8zMrId45KH/WeRyiUTEy5RO0AGVc5hAyUvRqPMwlhLe+qIG694iaShwMvCrjrYzM7Oe5c5D/1PLJTKxbtmLdblEplFyidxI5hKpLygiplDmDEylZKCszyWyAiWXyFRJZ1TWXU4ZFbi0suyHwFKUnCEzaMsz8hvgkVx+FyXddyPjgKUj4tF8P4GSA2OBzkMUJ1fOt2q92qOaWb9fRcRZ7RzTzMx6gXNb2GLBuS3MzFrn3BZmZmbWI9x5MDMzs5b4aQtbLDgxltniaSAnn+rPPPJgZmZmLely58FJnPouiZOktSRdnq+HS9qtsu6tduqkjIMqIaRnVMJLHyBpra7Uq658SXpG0tvz/RBJIWl0ZZunJa3ajWPMkrRad+tqZmbd052RBydx6qMkThHxRETsmW+HA7t1tH09Se+ihJcenW24HeVRTyixF7rdecggU7VQ2lDa6k7aviMbAc9ExLNN1tm31MzM+qnudB6cxKmHkjhJujojOpL1/V6+/qGkL+coxgxJbwN+QAkQNbXWlsAmee4PSWqUJGoN4GXglazLKxHxsKQ9KTkzLsjylpW0S9Zherbh0lmXWZJ+kiMokySt3+A41e/EKOBnzN+ZGJ9lrSPphvwMblDJ5YGksyX9TNJNwE8krSrp2qzP/5KRNfPzvUrSXdkue2FmZn2my50HJ3ECei6J0xhgB0krUvJMvC+Xj6YS2Cnb83vAJRExPCJqx9sY+DClPY9T3mapuAt4CnhY0lmSPpHlXQ7cAewbEcMpYabPBvaKiGGUCbVfq5TzUkSMpASY+nmD83hrNCrr8kfg3fm+2nanAudWPoNfVsrYENg1Ir4JHAfcGhFbUvJyrJ3bfAR4IiK2iIjNgL81qIsTY5mZ9ZLuTph0EqeeSeI0lhJiejRwFTBY0nLA0PpcEu24KiLeyIiM/wLWrK6MiLmUC+6elNs+p0g6vkE5GwEPR8T9+f6crFfNRZV/t2dBk4Ats+O1VES8AjyUoxTV78T2wIX5+rw875rLsr7ksc/Pc7gKeD6XTwd2zZGQHSKiYc/AibHMzHpHdzsP9UmcJlIuDNVfmT2dxGnfahInoJrE6eLKMffIX+fDI2LtiLg3l/dGEqdfMX8o5/a0l8TpdspozQ6UUYg7ga8Ak5soE+CNyuu5NHgEN0M+T4qIEyjnskeDcjr6rGD+tlugHTNHxgOU0aEpuXgiZY7GGkB7HaFqWZ1+J7JzszWlE3FC7TaPmZn1jZ4YeXASp24mccrbEY9SRk8mZplHMX8uipqXKTkpmqbytMZWlUXDgX80KO8+YGhlPsN+lJGimr0q/06gsXGUtqq23X8CE2udPko71uaK7EsZvWlkTK5H0keB2pMcawGzI+J8Sptu1c7+ZmbWC7rbeXASp55L4jQWeCo7R2Mpk0IbdR5uokyQrE6Y7MxSwMkqj7tOpVz8/zPXnQ2ckcsFHAhcJmk6Zc5Hta2XlnRb7ntkO8caR2mrWudhSp5Lte0OBw5Ueex1v0pd6n2fMqdkCuVW1CO5fBgwKet8LPCjjk/fzMx6khNjWV
MkzQJGtNNJ6vecGMvMrHVyYiwzMzPrCQ7EY02JiKELuw7d4dwWZjYQDJRcHB55MDMzs5a48zCASVpT0oUZWXKypAmSPp3rdpJ0ZZPlHC/phLplw3OiZyv1OSonZc7I6I9fbGX/LOOQruxnZmZ9x52HASofQ/0jMCYi1o2IWoTOd3WhuItoewyzZm/aAjk1U59DgA8CIzPq4450HjdiARFxRkSc2+p+ZmbWd9x5GLg+APw7It56lDIi/hERC8SRqGqUuyKjWL4gadvKpp8jg25J+lCOakyRdJmkwQ2K/g5waEYHJSJejIhz2jtmLj9R0j2Z4+LkXFbNpnpzJZ/G/ZJ2yOWDVPKl3J77frVrTWhmZl3hzsPAtSltURybImkZ2s9dcREZuEnSdsCzEfF3lRTY36Xkm9iKkgvjG3XlrgCskKG8mzpmBuj6NLBp5rhoL1bDkplP4whKrguAL1FiiWxDiQb6FUnvaXBs57YwM+sF7jwsIiSdlvMMbu9gs45yV1wM7KmSXn1v2iJmbgdsAozLoEz7A+vUH572w363d8yXgNeB30j6DFAfObSmlql0MjA0X38I+GLW5zZgVWCD+h2d28LMrHf4Uc2B624q+Ski4j9ylKCjSEjtzkGIiEczENT7s9ztK/tcFxH7dLDvS5JelbRuXeKwdo8ZEXMkjQR2oXRWvk65FVOvlrejmrNDwGERcU17dTIzs97jkYeB60ZgGUnVlNnLdbJPZ7krLgJOAR6MiMdy2UTgfbV9JC0nacMGZZ8AnKaSVhxJK0o6uL1j5ryJlSLiasotieGdn/JbrqHc+lgqj7WhSiZPMzPrAx55GKAiIiTtTkmv/S3gaUpGym9XNttF0mOV95+lLXfFkpRsntXcFZcBvwAOqxznaUkHABfVJjpS5kDcz/xOBwYDt0t6E3gT+GlEvC6p0TFXAf6UcyJE+7kyGvkN5RbGlHzq5Glg9xb2NzOzbnBuC1ssOLeFmVnrnNvCzMzMeoQ7D2ZmZtYSz3mwxYITY5lZVwyURFV9zSMPZmZm1pI+6zw4iVP3SLpa0sr5d2hleVNtJ2k7SbdJmirpXknHV/Yf1UN1vCKfAKm9nynpu5X3v8+AUF0t/2xJe3a3nmZm1j190nlwEqfui4jdIuIFYGXg0M62b+Ac4OCIGA5sBlyay3cCeqTzAIyvlSVpVeAV2oJNka/HN1NQPtZpZmb9UF+NPDiJUwdJnCR9S9Lh+foUSTdW6nJ+vp6VESRPBNbLEYSTsojBki7PkZQLsrNWbw3gyTzfuRFxj6ShwCHAkVneDpLWkXRD1vUGSWvn8c+WdIaksXl+H29wjHG0dURGAVcCq6t4D/BaRPxT0jKSzso2vlPSznmMA/Iz+wtwbe53arb7VXkOtTZb4PMwM7O+0VedBydx6jiJ0xhgh3w9gtIZWAoYDYyt2/YYSgTI4RFxdC7bMo+5CbAu8L4GdTsFmJm3Fr4qaZmImEUJ2HRKljcWOBU4N8/zAuCXlTKGUsJXfww4I9urajKwmaS3UToPE4CZwHvz/bjc7j8Aso33Ac6plLU9sH9EfIDS5hsBw4Cv0Daq0dTnISfGMjPrFQtlwqScxKk+idNkYOvs2LxBueiOoHQo6jsPjUyKiMciYh4wtXLst0TED7LMa4HPA39rp6ztabsFdB6lA1NzaUTMi4i/Aw8BG9cd4w1Kzo2tKJ/FbXkuo/KvdstidJZNRNwH/AOohby+LiKey9c7AhflSMkTlJDc0OTn4cRYZma9o686D7ULClCSOFESIq3ewT4dJnECZtGWxOnSyj7X5a/o4RGxSUR8qW7fl4BXJa3b7DEjYg4wEvg9JQxyexfejpI41er0noi4tq78N/N8DqRcYMcCOwPrAc1MBH2j8rp67PrzeDAiTqe0/RY5L6Ez0c7rRu+h1H9HyujO85TcGLXOQ23koaP5Ja92dowWPg8zM+sFfdV5cBKnzpM4jQGOyn/HUuYiTI0F44e/DKzQwvHJ43
6sMhdiA0on44UG5Y0nbwkB+wK3VtZ9VtISktaj3B6Z2eBQ44CvAnfl+2mUUYi1KZ1IKOe4b9Zrw1zXqKwxwN45b2QIpUNFNz8PMzPrpj6Z0e4kTk0lcRoLHAtMiIhXJb1Og1sWEfGspHGSZgB/BZqNfLQfpf1nA3OAfSNibk5OvFzSpyhteTjwO0lHZ10PrJQxk9KBWxM4JCJeb3Cc8ZSOxQlZ3zmS/gU8mrdVAH5NmTMxPetyQES80WCe5xWUybbTKZ9hrfO4Al3/PMzMrJucGMuaIuls4MqIuHxh16UrnBjLzKx1cmIsMzMz6wkOxGNNiYgDFnYdusO5LcystyyO+S888mBmZmYtcedhEaV+kktE0hYZ46L2fh9JsytPnwyTNC1f3yzpkcpTIUj6o6RX8vVQSa9lVMp7VaJ57t9MPczMrOe487AIyotvf8klMh1YJwNgQYn3cB8lKmbt/bjK9i+QETIlrQwMqSvvwYjYMiLem/U4Mp+QMTOzPuLOw6Kp3+QSycczbwdq+28NnMb8OTCqybIupi3OxGdoi9q5gIh4iBJ+/PCOzsvMzHqWOw+Lpn6TSySNB0ZlcKx5wM3M33mojjzcAOwoaVAe85JOqj6FujDZlXNybgszs17gzsNiYCHnEoG2bJsjgdszKdn6klYHBucIQs1cSlTLvYBlM3lXh6fX3grntjAz6x1+VHPRdDcl5wdQconkKEFHUZI6zCUiaRZtuUS2r+xzXUTs00l9JlIyio6mJMoCeIzSERnfYPuLKdElj++kXChzJ5qavGlmZj3DIw+Lpn6VSyQiXgYeBQ6grfMwgZKXolHnYSwlvPVFDda9RdJQ4GSgw7kcZmbWs9x5WARlMq3dgfdLeljSJMptiAVyidT+KL/ga3k9plPmJtTnEtmUnCiZx3ma0iG4KB+3nEg78w8oty6WzoyoUDoP69Kg8xDFyRHxTINy1qs9qknJpvqriDiro/YwM7Oe5dwWtlhwbgszs9Y5t4WZmZn1CHcezMzMrCV+2sIWC06MZda7FsfkUIszjzyYmZlZS/pF58FJnDqt11qSLq+cz25153xUE2UclGGnp0maIelTufwASWt1pV515UvSM5Lenu+HSHcQ6GwAACAASURBVApJoyvbPC1p1W4cY1bGqzAzs4VooXcenMSpcxHxRETsmW+HA7t1tH09Se8CjgVGR8TmlMiQ03L1AUC3Ow/5eOhttAWQGgXcmf8iaSPgmYh4tsk6+5aamVk/tdA7DziJE5KulrR5vr5T0vfy9Q8lfTlHMWZIehvwA2AvSVMl1TpKm+RIyEOSGiWJWgN4GXgl6/JKRDwsaU9gBHBBlrdso3bNusyS9JMcQZlUCSZVVQtDXWurnzF/Z2J8lrWOpBtyFOQGSWvn8rMl/UzSTcBPJK0q6dqsz/+SUTAlLS/pKpWQ2zMq7WBmZn2gP3QenMQJxgA7SFoRmEOOZlDCOY+tbRQR/wa+B1wSEcMjona8jYEPU3JHHFe7zVJxF/AU8LCksyR9Isu7nNIO+0bEcCBov10BXoqIkcCpwM8bnMd42tpqJGVE6d35vtp2pwLn5ijIBcAvK2VsSPmMvgkcB9waEVsCfwbWzm0+AjwREVtExGbA3xrUxYmxzMx6SX/oPMxHi2cSp7FZ/9HAVcBgScsBQ3M0pTNXRcQbGZHxX8Ca1ZURMZdywd0TuB84RdLxDcrpqF2hrS0vom1EoWoSsGV2vJaKiFeAh3KUojpqsz1tt5LOy/OuuSzrSx77/DyHq4Dnc/l0YNccCdkhIhr2DJwYy8ysd/SH+8pO4lRulYwAHgKuA1YDvgJMbqJMgDcqr+fS4HPNOQmTgEmSrgPOalDndtu1Vkw7r2vHmC3pAeAg2kaTJlLmaKwBtNcRqpb1agfrase5X9LWWe4Jkq6NiB90UnczM+sh/WHkYbFP4pS3Ix6lzM+YmGUeReWWRcXLwAoNlnd07LUkbVVZNBz4R4PyOmvXvSr/TqCxcZ
S2qrbdfwIToy0W+nja5orsSxm9aWRMrkfSR4HakxxrAbMj4nxKm27Vzv5mZtYLFnrnwUmc3jIWeCoiZufrd9G483ATZYJkdcJkZ5YCTpZ0X96y2YtyQYcyx+GMXC46btelJd2W+x7ZzrHGUdqq1nmYkudSbbvDgQPzc9ivUpd636fMKZkCfAh4JJcPo4ygTKU8RfKjjk/fzMx6khNjWVPyVtCIdjpJ/Z4TY5mZtU5OjGVmZmY9oT9MmLQBICKGLuw6dIdzW5jZQDEQ8oR45MHMzMxa4s5DPyXpcJWcGBe0uN/Kkg7N18NyYuVUSc/lhNSpkq7vQn3WlbR3B+s3lvRXSX/Pel8saY0WjzFIUqNJomZm1o+489B/HQrsFhH7trjfyrkvETE9I1EOp0RoPDrf79qF+qxL2+OV85G0LHAl5WmSDTKXx/8BLSXBioi5EbFDF+pmZmZ9yJ2HfkjSGZSL9Z8lHSlppKTx+djneJUkU0jaNPNMTM08ERsAJ1IeEZ0q6aROjnNM7j9Nbfk0ts993yZpsKR7JL03y90519Xnz9iPktjs6tqCiLghIu5VyZdxjkqujCmSdszjDJN0e6Xu60paUtILuX5XlbwXf5A0U9K5lXpvI+kWlQysf5W0JmZm1mc8YbIfiohDJH0E2DkinlHJebFjRMyRtCvw35TomYcAv4iIC1SSZg0CjgE2y9GGdqmk9V6bkgRMwNWSRkXEeEl/oyTgejtwVnYCjgG+HhG7NyhuM9qPhnk4JfHZMEmb5nE2oIyOnBwRl6gk32oU3XIrSkjxfwETVXKV3An8Avhkts2+wA+Bgxuc48G15YNWXL2j5jAzsxa48zAwrASckxfdoAR9ghKI6ViVlNt/yARgzZb5IeCjlIsxwGBKUqrxlIRUk4GXmD8xVleMBk4CiIi7JT0BrJ/H+a6kdbLuD2jBNNwTI+JJgAwINRR4nRIA7Po810GU8OELiIgzgTMBlh6ygQOamJn1EN+2GBh+CNyUGSQ/ASwDEBEXAp8EXgOukfSBFsoU8KPanIiIWD8izs51q1FChK8ILN1EWXdT0pe3d5wFRMR5wKcpeTmuq93OqNMoZ4eAaZV6D4uIjzZRRzMz6yHuPAwMKwGP5+sDagslrQs8FBG/pEyI3Jzmc19cA3xJJQMmkt6lkpAMyq/1Yyhhvk/IZR2Vex4lvPhHKnXbTdImzJ+f4r3AEOABSetGxAMR8QtKJtHNm6gzwD3AOyWNzDLflrdDzMysj7jzMDD8DyV75DjKMH3NXsCMHNLfGDg3Ip6lpB2f0dGEyZzceDllLsF0Su6NwZIOAl6NiEuBH1OSib2fcntjkEq69MPryppNGRE5Mh/VvAf4AvA0JRHYsnmMC4AvZiKwz0u6O+u+Lpl6uzMR8QYltfjPJN2V9dq2mX3NzKxnOLeFLRac28LMrHXObWFmZmY9wp0HMzMza4kf1bTFghNjmdlA1F+TZHnkwczMzFrSY50HJ3LqXZI+LenofP0ZSRtX1t0qqbOIkoMknZZPYUzPsNTrSFoio0f2RB23lnRH5f1+kl6RNCjfbylpSjfKXz+fzjAzs4WoJ29bHAp8NCIebnG/WiKnX0fEdGA4gKSzgSsj4vIu1qeWyOni+hVqS+R0eC0fg6RdKImc/tXsASJiLtAniZwi4orK288A84D7Wiji85Tz2zwi5klamxJBcglKTIcTe6CadwHrS1ouH98cBdwPbAFMyffjmi1M0pIRMacH6mVmZj2oR0Ye5ERO3UrklOU8lK9XkzRP0qh8P0HSUElflvRzSTsAuwGnZF2GZjF7Z9vMrO1bZwjwZETMy/N9JCJeyHZaIcs6N4/5rRyhmCHpsFy2fsZlOC/b5tLshL0lL/RTgJG5aEvgdEqngfx3fJb3wTzmdEn/p5KbA0mPSfovlZgWn862myZpAiWXR63NFvg8GpyzmZn1gh7pPETEIcATlEROp1B+Ee8YEVsC36MkcoK2RE7DgRGUnATHAA
9mqOGj2zuG5k/kNBwYpZLIaQJQS+T0UzKRU5Z7U5b7y7rimkrkROlknJcXtloip+HANnm+9bYC/oOSzOm9krZTSfr0C2CPiNiaEgzph3XtNwd4KDtZo7NuO+TFeY2ImFXZdixwNXBknlttnSJiJHA0pc3rXQx8Jjt0J1ducxwDvJxlfVElcuO+lA7A9sChkmrRHzcBTsu2eR34aoPjjKd8NitQwkuPYf7OwzhJywG/yzYZRgmFXU1s9WpEvC8iLgPOBr4WEdszf4CsTj8PSQdLukPSHXNnv9igqmZm1hW9NWFyJeAySTOAUyiJjKAkcvqOpG8D60TEay2UWU3kNIWSXGnDXHcc8HFgGKUD0R2jKeGWiYi7KRelaiKnbwHvjojXG+w7MSKezNsZtURO76UtkdNUysX63Q32HQvsmH8nUG6HbAvc1mS9/5D/Ts7jziciHgE2Ao7NRTdJ2qlBOTsAv4+I2RHxMvBHSpsAPBwRE/P1+ZXlVeMonYTtgEkRMRPYSNI7gKWyHu8F/h4RD+Y+51LOu+YSKKMwwLIRUbvVcV5lm04/j4g4MyJGRMSIQcut1KCqZmbWFb3VeXAip9YTOY2lXLhHUOZjrEa5oI5p4nyqx64dt9E5vB4RV0fEUcBPgE812KyjtJz14UgbhSedQOn0vC9fA/wT+Cxt8x06S/35aifHaPbzMDOzXtCbIw9O5FQ0m8hpAvB+yi2TfwPTga9QOhX1mm2zt6g8CTEkXy9BGaX5R21CotrSYY+hzDVYVtJgSgejVof3SNomX+8D3Fp/nJxH8RTllk+t8zAROIKc70Bpkw0q8xS+ANzSoKxngNclbZ+L9q2cT1c/DzMz66be6jw4kVPbsZpK5JS3cJ6g7QI7ljKack+DYi+i3P6pTpjszDuAq/JW0nTK6M/pue63wDRJ50bEpCz/dspF//R8CgbKiM1XJE0Dlqd02hoZBwyKiCfz/QRKm43Pc50NfAn4Q7bzG8D/tVPWgcD/5oTJVyrLu/R5mJlZ9zkxljVF0vrA5TlBccBxYiwzs9bJibHMzMysJzi3hTUlIh4gA3gNRM5tYWZ9rb/mpegJHnkwMzOzlrjzMEBpAOUSkfQXSR+vvH9QlXwakv4k6ZMqUTpD0v6VddvksiPy/flZz7sk3a8SDXStVutrZmZd587DwHUosFtE7NvplvOr5RIhIqbX4k9QHp09Ot/v2oX61HKJNDKejDKpEpr7BUr0yprtaHvKZHpdOXtTcmZUHRkRW1Ce2JkO3ChpqS7U2czMusCdhwFIAy+XSC3qJPnvH4G1srwNgBcypgPAQ8CKKjk+BHyQEuNjARExLyJOBp6jRCA1M7M+4AmTA1BEHJIBrnaOiGckrUjJJTJH0q6UXCJ70JZL5AKV/ByDKMG0NuvskUvNn0tEwNUquUTGS6rlEnk7mUskb0N8PSJ2b1Dc7cDwDEQ1itIZ2ETShpQRiPpMm7+nxMa4lxKe+81OmmQKZRRivhmRkg4mc2YMWnH1ToowM7NmufOwaFgJOCd/xQdQG8KfABwr6V3AHyLi7+XHfFOquUQABlNyiYyn5BKZTEnp/bXOCoqI1yTNpDytsS0lfPkmlI7E9rTdsqi5hBIF9H5KwKrOwpi3F1L8TDKQ1dJDNnBAEzOzHuLbFouG/p5LBEoHYSdgmYh4iRK9clT+zTfyEBGP5/HfD9zcRNnDKaMUZmbWB9x5WDT091wiUDoIX6NtJONOSiKwd1BSuNf7L+DbETGvvQJVHAmsClzXxDmZmVkPcOdh0dCvc4mkcZRJnhOy/DeBZylpuxe4pRARt0bEn9up3imZJ6R2K+QDWZ6ZmfUB57awxYJzW5iZtc65LczMzKxHuPNgZmZmLfGjmrZYcGIss/5rUU4gtajyyIOZmZm1pE86D07i1P0kTpJ+LGnnfP0NScvk6yUlvdDE/kMkXZ11uUfSnztriy7U8ZuSTq68/21Go6y9P1LSz7pR/pcl/by79TQzs+7pq5
EHJ3HqZhKniDg2Im7Kt98gA0G14EfAVRGxRURsAnw3l3fUFq16q+3SMGBVSbXv2QIBodqTMRw8MmZm1g/1+v+c5SROQMdJnCSNknRpvt5D0quSlpK0vKS/5/LzJe2eQZHWAMZWR10knZijChMkrdGgCkOAxyr1mZYv52sLScvmCMl0SVMk7Zjlf1nSFZKukTRT0ncbHGMyJWfF0pJWoYSvnkEJRQ2VUNSSvqUSa2KGpMNy2fr5/gxKvoohedz7Jd1M6bjVznfv3PYuSTdhZmZ9ptcnTDqJ0wIaJXG6Hdg6X+8A3ANsRcknMbG6c0ScIumbwA4R8ULWcyXglog4Jm8LHETpFFSdClwoaQpwfbbFk5Q2fqstJH0b+HdEDJO0KaUtN8gyRgKbAf8Gbpd0ZURMrdTt35Jm5Lm8Pev+KDBK0stZ7pOSRgL7ZnmDgEmSbgFmUzoaB+b35l2USJNbUSJYjqm0x3HAThHxlKSVGzW0nBjLzKxXLIxh4ZWAy/IicwqwaS6fAHwnL17rRMRrLZRZTeI0BVifksQJykXm45Qh9J92VlAet5rEaVLWrZaHoVESp88B+1CSOHVmgSROGR3xkbxIjwB+DuxI6UiMbaLM1yLir/l6MjC0wTGuBtYDfku5QN8padUGZY2mJKUiIu4GnqC0J8A1EfF8RLxKGZEZ3WD/2sjNKEq7NWq7HYDfR8TsiHi5rqwHI+L2fL0dcENEPBsR/6ZEuawe51xJX6ad73FEnBkRIyJixKDlVmq0iZmZdcHC6Dw4iVPjJE5jgY9Rfn3fQLnAjqb82u7Mvyuv59LOiFJehC+IiC8AU2l88e8o7WZ9ONJG4Ulr8x62p7TbDMpoRbXtOjrGq00cA+ArlI7hUOAuSW/voEwzM+tBC2vkwUmcFjSGMhFyfET8M4+1XkQ0Ol6z7VI9/i6Sls3XKwLvAR5pUNYYyi0Fcn7IEOCBXPchlSdglgM+RePJj7WRh5WzszKPMun0Y7SNPIwBPp3zKwZnWY1GWCYCu0haJW9l7VlZt25ETKS0/fPAO5tvDTMz646FESTqf4BzJH0DuLGyfC/gC5LeBP4J/CAinpM0Lm9x/DUijm5UYERcLWljShInKBfEz0v6JJnEKecGTFBJ4jSBTOIE/DY7LFULJHGS9Cylc9MwiVMH53uKpO8Dy2Z57SVxmkC5UNdGGmZQnjZp5EzgekmPAh/p4NhV2wCnZvsuAZweEXfW5pfU2gL4FfC/Ksmw3gS+mHMZAG4FLqTc/jivOt+hJue1vAhMqyyeSJnfMD23mSTpIspcD7Iu0yWtX1fWY5J+lPs/AVSTU5wi6T2UUYxrI2JGk+1gZmbd5MRY1pScW7BZRByxsOvSFU6MZWbWOjkxlpmZmfUEjzzYYmHpIRvEkP0dnNLMBo7+kPPDIw9mZmbWI9x5GKAkraq2XB//lPR45X1UXk+VNLTB/mdL2jNf35xRI6dJuk/SqdXAS5LmNlHehiq5Mx5QyWNyqUp471bPqz6OhpmZ9TNOyT1ARcSzlJgRSDoeeCXDXyPplc6icjawb0TckU9fnAD8iRK7AkoQqnbLU0nSdRXwjYj4Sy7bGVgdeKqVSkTEqM63MjOzhckjDzafjOT4LWBtSVs0udvngQm1jkOWc1NEzJC0jKSzVHJl3Km2zKCNcpkg6ZX8d6ccEbk8R0MuUD4vKmlrSbdImqySa2NIT7aBmZl1zCMPi6ZlJdViMDwcEZ9uZeeImJtxHzamZAntrLzNKGGxG/mPLHNYxuK4ViVPSKNcJvW2pIQvf4ISe+N9km6jxKL4VEQ8LWkv4MeUfB7zcW4LM7Pe4c7DoqnD2wxNqoaQ7k55oykXeyLiPkn/oOQdmQAcm8mv/hARf2+w76SIeAwgOy9DKdEqNwOuy4GIQcCTjQ4cEWdSAmqx9JAN/FiRmVkP8W2LxUTeOpgq6eomth1ESSTWKAdHI3fTlhV0geIaLWwyl8kblde1nB0C7q7kMRkWER9qsK
+ZmfUSdx4WExFxYF5sd+toO0lLUSZMPhoR0zratuJCStrttx5KlvQRScOYP1fGhpTU6TPVOJdJM2YCq0vavlZfldThZmbWR9x5sJoLJE2j5NRYnpKsqimZxvzjwGGS/i7pHkrSs38Bv6bkzphOSV9+QES8QcllMiNvR2wMnNvksf5NSZD1k5yXMZWSiMvMzPqII0zaYsG5LczMWucIk2ZmZtYj3HkwMzOzlvhRTVssTH/8RYYec9XCroaZWZ/qreRaHnkwMzOzlvRZ58GJnLpH0iclHZOvd5e0SWXdzZIWmNBSt/8Skn4paUaGir5d0nty3Xd6qI5bVCJRImkfSbPz8U8kDcsnOrpa/lBJM3qirmZm1nV9dtvCiZy6JyL+TImHALA7cCVwTwtF7AWsBWweEfMysuOrue47wH/3QDWnA+tIWiEiXqY8QnkfJcz0pHw/rtnCJA2KiLk9UC8zM+tBA/62xaKQyEnSIEkPqVhZ0jxJO+a6sZLWl3RAjrCMokRmPCnrsl4W89ms3/2SdmhwzkOAJyNiXp7vYxHxvKQTydwVki7IY34jRyhmSDoilw3N8zonz/9yScvVfRbzgNuBbXPR1sBptMVhGAWMz/J2yfadLul3kpbO5bMkfU/SrXlOW0u6S9IEMk9GR5+HmZn1vv7SeahdvKZKuqLVnfPXaS2RUzPlNZXICdgHOCdHKmqJnIYDI4DHGuy7JXAEsAmwLiWR01KU3A57RsTWwO8oiZzq639/7jc667ZDXlDfFREPVLYdTxmBODojRj6Yq5aMiJF5/OMa1O1S4BPZJj+VtGWWdww5UhMR+0raGjiQ0gHYDvhKbVtgI+DMiNgceAk4tMFxxlOiTS4PzANuZv7Ow7hsz7OBvbKdlwS+Vinj9YgYHREXA2cBh0fE9nXH6fTzkHSwpDsk3TF39osNqmpmZl3RXzoPtYvX8FYzQFYskMipi+WNBs6DksgJqCZy+o6kbwPrZFTFepPyF/08SuTDoZQLbi2R01Tgu8C7Guw7Ftgx/07IemxD+SXfjD/kv5PzuPPJBFMbAf+PclG/QdIuDcoZDVwREa9GxCtZbm0k49GIqN12OD+3rTeO0kkYCdyenZv1Ja0ODI6Ih7IeD0fE/bnPOXneNZcASFoJWDkibsnl51W26fTziIgzI2JERIwYtNxKDapqZmZd0V86DwvQ4pfIaSzlIj0SuBpYGdiJkhuiGbVj147b6BzeiIi/RsTRlDkOuzfYrOH514ro5D3AREqnZzTlAg9lVGBv8pZFJ8eAtrkYaucYzX4eZmbWC/pt52ExTOR0G+UX+7yIeJ0ycvFVSqei3svACk0enzzuVpLWytdLZP3/kavfzHaEcv67S1oubz18ulKHtWvnQbmlc2v9cXKi5KOU3Ba1zsMEyu2UWufhPmCopPXz/X7ALdSJiBeAFyXVRjj2rZxPVz8PMzPrpn7beWjCIpXIKY/xKOWXO5QL9gqUJxjqXQwcnRMO12uwvpE1gL+oPOo4DZgDnJrrzgSmSbogIqZQ5iNMonRofhMRd+Z29wL7Z7uvApzezrHGAUtHxKP5fgJlDsj4PNfXKfMqLst2ngec0U5ZBwKn5YTJ6q2JLn0eZmbWfU6MZU1RiZVxZURstpCr0iVOjGVm1jo5MZaZmZn1BOe2sKZExCzKUyMDknNbmA0cvZWPwXqORx7MzMysJe489DNqy8txd0ZW/EY+HdGVsi5QyQEyI6M4LtX5Xl2TT5GcmBNQZ2T0x492oZwfSNq1N+poZmY9w52H/qcW4GpT4IPAbjSOGNmMCyhPIgwDlgW+3DNVbOiHlBDYm+Wkyk/Q4uOkABHxvYi4vqcrZ2ZmPcedh34sIv4FHAx8XUV7eTcGSTo5l0+TdFjuf3UkyqOX71LJrjlL82chfUDSmpJWl/R7lYybt0t6X64fXDnuNEl7VOupkuPiK8Bh+cgpEfFURFya6/fJfWdI+kmlzmerLcvnkbm8mj11lqTvS5qS22
ycy5fPkZTbsx2afkzXzMy6zxMm+7mIeChvW6wBfCGXDcsL6bUZyOpA4D3AlhExR9Iq1TLydsV+wH9mRs0/UYI/nSVpW2BWRDwl6ULglIi4VdLawDXAe4H/Al7MPBRIentdNdcHHomIl+rrn4GpfkKJ6Pl81nl3SkyLd9Ye/ax2Zuo8ExFbSToUOIoyenIscGNEHJT7TZJ0fUS8Wt1R0sGUzheDVly93TY2M7PWeORhYKiFc24v78auwBkRMSfXPVe3/6+BMRFRixR5CSXIEpSw0Zfk612BUzPw0p+BFSWtkMtPqxUWEc+3UPdtgJsj4ums3wWUPBYPAetK+pWkj1ASbTXSKGfHh4Bjsp43A8tQIoHOx7ktzMx6h0ce+rkMwzyXEv2yvZwQ7eaAkHQcsDol1HXNBNqSVe0O/CiXLwFsX59kSlK75acHKKGrV8jw1PV1W0CmA98C+DAlk+nngIMabNooZ4eAPSJiZgd1MjOzXuKRh34sL+5nAKfmvIWGeTeAa4FDJC2Z61bJf79MuTjvk5k+AciyrgB+BtwbEc/mqmuBr1eOP7yd5fPdtoiI2cBvgV9KeltuM0TSFyghrt8vaTWVBGb7ALdIWg1YIiJ+T7ktslULTXMNJbS48lhbdrK9mZn1IHce+p9la49qAtdTLtzfz3Xt5d34DfAIJT/FXcDnc/szgDWBCVnm9yrHuYQyh+KSyrLDgRE5KfIe4JBc/iPg7Tm58S5g5wb1/i7wNHCPSv6MPwJPR8STlDTgNwF3AVMi4k/AO4Gb89bD2blNs34ILJXnOyPfm5lZH3FuC1ssOLeFmVnr5NwWZmZm1hPceTAzM7OW+GkLWyw4MZZZ85yYyjrjkQczMzNrSbc6D3ISpz5L4iRpfP47VNLnK8sPkHRqE/t/PEM53yXpHklfzeW7S9qkh+p4Z+3xTklLSno1H9esrZ8sqZVHMuvLv1nSAhN3zMysb3V35MFJnPooiVNEjMqXQ2l7FLMp2RE7E/hERGwBbEmJzAglSFSPdB6A8UCtnltQYlCMyjosD6xLeVyzmTr7lpqZWT/VY7ctnMSp60mcJP1a0ifz9RWSfpevvyTpR/n6ldz8RGCHHPE5MpetJelvOZLyPw0+nhUo81uezfN9IyJmShoFfBI4KctbT9JwSROz7a5QBoTKX/0/lzQ+22Fkg+OMo63zMIoSZ6IWaGokJcbDXEmrSPpjHmOipM3zGMdLOlPStcC5kpaVdHFudwmlU9nu52FmZn2jR+c8RMRDWeYalJDDZDKlfYBzJC1D6WDUkjhtThlxeIvakjj9LaMi1pI4oUoSJ+AXlCRO2wB7UAIlQSWJU5Z/Y101m0ni9AHKRW8blSROw8kkTnk+Z7XTBM9ExFbA6ZQkTtCWxGkbSnClk/JXeNUYYId8/U7aRgJGA2Prtj0GGJsjPqfksuGUXBXDgL0kvbu6Q+a6+DPwD0kXSdpX0hIRMT6XH53lPQicC3w72246848kLZ8jIIcCv2tw/tWRh1F5Xm+o5McYRelcQAl6dWce4zt5zJqtgU9FxOeBrwGzc7sf57ra+Xb6eUg6WNIdku6YO/vFRpuYmVkX9MaESSdxaj2J01jKaMImwD3AU5KGANtTLsiduSEiXoyI13P/deo3iIgvA7tQRnWOosHFX9JKwMoRcUsuOody/jUXZVljKO09XybMiJgFvE3SOyi3oGYCtwPbUjoPtXOpfjduBFbNYwP8uZJbY0fg/NxuGjAtlzf1eTgxlplZ7+jR+8pyEqcuJXGKiMfz9sBHKL/WV8ljvNKgjo28UXldPXb9caYD0yWdBzwMHNBE2fMV0cl7KJ/XnsCTERGSJgLvo9y2mJjbNGrnWlmvtrO8bUHzn4eZmfWCHht5kJM4tafZJE4TgCMo7TaWMjpQf8sC4GVanNyZ80B2qiwaThkJmq+8iHgReF5S7RbKfsAtlf32yvJGU24NNboXMA44Ms+ndl5fBP4ZES/ksup3YyfK7Z5Gow
fV7TYDanMjuvN5mJlZN3W38+AkTp1rNonTWGDJiHgAmEIZfWjUeZgGzFF5KKDjQAAACWlJREFU5LLZiYICvqXyKOxUymd0QK67GDg6J3OuB+xPmZcxjdLJ+EGlnOdVHhk9A/hSO8caR3mqYgJAtukg5r/9cjz52VEmgO7fTlmnA4Nzu29RbrlA9z4PMzPrJifGsqZIuhk4KiIGZHYpJ8YyM2udnBjLzMzMeoID8VhTImKnhV0HMzPrHzzyYGZmZi1x58HMzMxa4s6DmZmZtcSdBzMzM2uJOw9mZmbWEncezMzMrCXuPJiZmVlLHGHSFguSXqbkVhkoVgOeWdiVaJHr3Ddc59430OoLvVfndSJi9fqFDhJli4uZjUKs9leS7hhI9QXXua+4zr1voNUX+r7Ovm1hZmZmLXHnwczMzFrizoMtLs5c2BVo0UCrL7jOfcV17n0Drb7Qx3X2hEkzMzNriUcezMzMrCXuPJiZmVlL3HmwRYakj0iaKekBScc0WL+0pEty/W2ShvZ9LReoU2d13lHSFElzJO25MOpYr4k6f0PSPZKmSbpB0joLo551deqszodImi5pqqRbJW2yMOpZV6cO61zZbk9JIWmhPlrYRBsfIOnpbOOpkr68MOpZV6dO21jS5/L7fLekC/u6jg3q01k7n1Jp4//f3v3HyFHWcRx/f9pC6A8EpGJEGg/wqFpS2vRqaUQFa6IGcsVYYxtIqBESFWwMxh9o1aLhDyQGo0Co1FoqxCpVoYpaf7SlarzSQulPora1lUYTbBWwtNByfPxjnqvrdvd2xrqze9vvK7lkd/aZvc/M7e0+88yz8/2jpGeaEsR2/MTPkP8BhgM7gfOAk4FNwJuq2nwUuDvdng18bwhk7gImAkuBWUNkP18GjEq3PzJE9vMrKm73Aj9v98yp3anAWqAP6GnnvMBc4I5W7tf/IXM3sBE4I90/q90zV7X/GLC4GVli5CF0ijcDO2zvsn0YWAbMrGozE7g33V4OzJCkEjNWa5jZ9m7bm4GXWxGwhjyZV9s+mO72AeeUnLFanszPVdwdDbR6Jnme1zPAl4GvAC+UGa6GvHnbSZ7M1wF32v4ngO2nS85Yreh+ngN8txlBovMQOsVrgacq7u9Ny2q2sf0S8CxwZinpasuTud0Uzfwh4GdNTdRYrsySrpe0k+zDeF5J2eppmFnSZGCc7Z+UGayOvK+L96XTWcsljSsnWl15Ml8AXCDpd5L6JL27tHS15f7/S6cLzwVWNSNIdB5Cp6g1glB99JinTZnaLU8euTNLuhroAW5raqLGcmW2faft84FPA/Obnmpwg2aWNAy4HfhEaYkGl2cf/xjosj0R+BX/GQVslTyZR5CduriU7Ch+kaTTm5xrMEXeM2YDy233NyNIdB5Cp9gLVB7JnAP8tV4bSSOA04B/lJKutjyZ202uzJLeCXwO6LX9YknZ6im6n5cBVzY1UWONMp8KXAiskbQbuBhY0cJJkw33se39Fa+Fe4ApJWWrJ+97xkO2j9j+M1lxve6S8tVS5LU8myadsoDoPITOsR7olnSupJPJ/nFWVLVZAVyTbs8CVjnNKmqRPJnbTcPMaTh9IVnHodXniCFf5soPhMuBP5WYr5ZBM9t+1vZY2122u8jmlvTa3tCauLn28Wsq7vYCT5aYr5Y8/38Pkk0ARtJYstMYu0pN+d9yvWdIGg+cAfy+WUGi8xA6QprDcAOwkuxN6fu2t0n6kqTe1OxbwJmSdgA3AnW//laGPJklTZW0F3g/sFDSttYlzr2fbwPGAA+kr4u1tEOUM/MN6at4T5C9Nq6p83SlyJm5beTMOy/t401kc0rmtiZtJmfmlcB+SduB1cAnbe9vTeJCr4s5wLJmHhzF5alDCCGEUEiMPIQQQgihkOg8hBBCCKGQ6DyEEEIIoZDoPIQQQgihkOg8hBBCCKGQ6DyEEIY0Sf3pK6FbJT0gaVSrM9UjaYykhZJ2pq8trpU0rUm/a4kaVGJNlS
7Prri/qB0qiob2F52HEMJQd8j2JNsXAoeBD+ddUdLw5sWqaRHZVU27bU8gu9bB2DwrKjOsatnx5p8LHO082L7W9vbjfM5wAojOQwihk/wGeD1ktTUkPZpGJRYOfNBKOpAuqrMOmC7pC5LWp5GLbw5UWpU0T9L2VMhpWVr2SkkPpmV9kiam5QskLZa0RtIuSccU1pJ0PjANmG/7ZYBUHfHh9PiNKcNWSR9Py7okPSnpLuBxYFyN/FMkPSLpMUkrq67kOPC7j9nGNCrRA9yf9tHIlL8nrTNH0pa0zq0Vz3VA0i2SNqV98Or/y18uDCnReQghdARl9UreA2yR9EbgA8BbbE8C+oGrUtPRwFbb02z/FrjD9tQ0cjESuCK1+wwwORVyGhjNuBnYmJZ9FlhaEeENwLvIyiZ/UdJJVREnAE/UKlQkaQrwQbLOxcXAdcou8w0wHlhqe7LtPZX5gXXAN4BZtqcAi4FbauyeY7bR9nJgA3BVGrk5VJHnbOBW4B3AJGCqpIF6H6OBPtsXAWvJylaHE0x0HkIIQ93IdFnpDcBfyC5DPoOs8NL69NgM4LzUvh/4QcX6l0laJ2kL2YflhLR8M9lR+dXAS2nZJcB3AGyvIrvc+WnpsYdtv2h7H/A0UOSI/BLgR7aft30A+CHw1vTYHtt9FW0r848nK5D1y7Sd88mKJVWrt431TAXW2P57uiTy/cDb0mOHgYEy4I8BXTm3MXSQEa0OEEIIx+lQGl04Kp16uNf2TTXavzBw9C/pFOAuoMf2U5IWAKekdpeTfWD2Ap+XNIHBSyJXVg/t59j3123ARZKGDZy2qIw8yPY9Xy9/Wm+b7en1Vm6wjXVXG+SxIxU1E2ptZzgBxMhDCKET/RqYJeksODpX4XU12g18iO6TNIas2ippYuI426uBTwGnkxX7Wks6/SHpUmCf7efyBLK9k2x05OaKeRXdkmam571S0ihJo4H3ks3faOQPwKskTU/Pd1Lq5DTcxuRfZOW9q60D3i5pbJorMgd4JM92hhND9BhDCB3H9nZJ84FfpI7AEeB6YE9Vu2ck3QNsAXaTlTwGGA7cl05JCLg9tV0AfFvSZuAgxatvXgt8Fdgh6SCwn6xS4+OSlgCPpnaLbG+U1NVgOw+niY9fT1lHAF8jG+VotI0AS4C7JR0Cples8zdJN5FVkhTwU9sPFdzW0MGiqmYIIYQQConTFiGEEEIoJDoPIYQQQigkOg8hhBBCKCQ6DyGEEEIoJDoPIYQQQigkOg8hhBBCKCQ6DyGEEEIo5N91/SlE9JWKHwAAAABJRU5ErkJggg==\\n\",\n      \"text/plain\": [\n       \"<Figure size 432x288 with 1 Axes>\"\n      ]\n     },\n     \"metadata\": {\n      \"needs_background\": \"light\"\n     },\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"fig, ax = plt.subplots()\\n\",\n    \"\\n\",\n    \"# Example data\\n\",\n    \"x = list(baselines.keys())\\n\",\n    \"x_pos = np.arange(len(x))\\n\",\n    \"y = list(results.values())\\n\",\n    \"\\n\",\n    \"ax.barh(x_pos, y, align='center')\\n\",\n    \"ax.set_yticks(x_pos)\\n\",\n    \"ax.set_yticklabels(x)\\n\",\n    \"ax.invert_yaxis() \\n\",\n    \"ax.set_xlabel('Perason Correlation')\\n\",\n    \"ax.set_title('Performance of 
Baseline Models')\\n\",\n    \"\\n\",\n    \"plt.show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 41,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Clean up data\\n\",\n    \"tmp_dir.cleanup()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 42,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/papermill.record+json\": {\n       \"results\": {\n        \"Doc2vec Cosine\": 0.528387685928394,\n        \"Doc2vec Cosine with Stop Words\": 0.45572884639905675,\n        \"GLoVe Cosine\": 0.6688056947022161,\n        \"GLoVe Cosine with Stop Words\": 0.6049380247374541,\n        \"GLoVe WMD\": 0.6267300417407605,\n        \"GLoVe WMD with Stop Words\": 0.48470008225931194,\n        \"TF-IDF Cosine\": 0.6749213786510483,\n        \"TF-IDF Cosine with Stop Words\": 0.7118087132257667,\n        \"Word2vec Cosine\": 0.6476606845766778,\n        \"Word2vec Cosine with Stop Words\": 0.6683808069062863,\n        \"Word2vec WMD\": 0.6574175839579567,\n        \"Word2vec WMD with Stop Words\": 0.5689438215886101,\n        \"fastText Cosine\": 0.6707510007525627,\n        \"fastText Cosine with Stop Words\": 0.6771300330824099,\n        \"fastText WMD\": 0.6394958913339955,\n        \"fastText WMD with Stop Words\": 0.5177829727556036\n       }\n      }\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"# Record results with scrapbook for tests\\n\",\n    \"sb.glue(\\\"results\\\", results)\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": 
\"ipython3\",\n   \"version\": \"3.6.5\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/sentence_similarity/bert_encoder.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Sentence Similarity with Pretrained BERT\\n\",\n    \"In this notebook, we use a pretrained [BERT model](https://arxiv.org/abs/1810.04805) as a sentence encoder to measure sentence similarity. We use a [feature extractor](../../utils_nlp/bert/extract_features.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/pytorch-pretrained-BERT) of Google's [BERT](https://github.com/google-research/bert). \\n\",\n    \"\\n\",\n    \"**Note: To learn how to do pre-training on your own, please reference the [AzureML-BERT repo](https://github.com/microsoft/AzureML-BERT) created by Microsoft.**\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 00 Global Settings\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import sys\\n\",\n    \"import os\\n\",\n    \"import torch\\n\",\n    \"import itertools\\n\",\n    \"import numpy as np\\n\",\n    \"import pandas as pd\\n\",\n    \"import scrapbook as sb\\n\",\n    \"from collections import OrderedDict\\n\",\n    \"\\n\",\n    \"sys.path.append(\\\"../../\\\")\\n\",\n    \"from utils_nlp.models.bert.common import Language, Tokenizer\\n\",\n    \"from utils_nlp.models.bert.sequence_encoding import BERTSentenceEncoder, PoolingStrategy\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# device config\\n\",\n    \"NUM_GPUS = 0\\n\",\n    \"\\n\",\n    \"# model config\\n\",\n    \"LANGUAGE = Language.ENGLISH\\n\",\n    \"TO_LOWER = True\\n\",\n    \"MAX_SEQ_LENGTH = 128\\n\",\n    \"LAYER_INDEX = -2\\n\",\n    \"POOLING_STRATEGY = PoolingStrategy.MEAN\\n\",\n    \"\\n\",\n    \"# path config\\n\",\n    
\"CACHE_DIR = \\\"./temp\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"if not os.path.exists(CACHE_DIR):\\n\",\n    \"    os.makedirs(CACHE_DIR, exist_ok=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 01 Define the Sentence Encoder with Pretrained BERT\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The `BERTSentenceEncoder` defaults to Pretrained BERT.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 407873900/407873900 [00:15<00:00, 26602678.27B/s]\\n\",\n      \"100%|██████████| 231508/231508 [00:00<00:00, 905295.88B/s]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"se = BERTSentenceEncoder(\\n\",\n    \"    language=LANGUAGE,\\n\",\n    \"    num_gpus=NUM_GPUS,\\n\",\n    \"    cache_dir=CACHE_DIR,\\n\",\n    \"    to_lower=TO_LOWER,\\n\",\n    \"    max_len=MAX_SEQ_LENGTH,\\n\",\n    \"    layer_index=LAYER_INDEX,\\n\",\n    \"    pooling_strategy=POOLING_STRATEGY,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 02 Compute the Sentence Encodings\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The `encode` method of the sentence encoder accepts a list of text to encode, as well as the layers we want to extract the embeddings from and the pooling strategy we want to use. The embedding size is 768. 
We can also return just the values column as a list of numpy arrays by setting the `as_numpy` parameter to True.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 2/2 [00:00<00:00, 2917.78it/s]\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>text_index</th>\\n\",\n       \"      <th>layer_index</th>\\n\",\n       \"      <th>values</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>-2</td>\\n\",\n       \"      <td>[0.038080588, 0.0926698, 0.0366186, -0.1218368...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>-2</td>\\n\",\n       \"      <td>[0.084241375, 0.099506006, -0.38437817, 0.2164...</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"   text_index  layer_index                                             values\\n\",\n       \"0           0           -2  
[0.038080588, 0.0926698, 0.0366186, -0.1218368...\\n\",\n       \"1           1           -2  [0.084241375, 0.099506006, -0.38437817, 0.2164...\"\n      ]\n     },\n     \"execution_count\": 5,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"result = se.encode(\\n\",\n    \"    [\\\"Coffee is good\\\", \\\"The moose is across the street\\\"],\\n\",\n    \"    as_numpy=False\\n\",\n    \")\\n\",\n    \"result\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": 768,\n       \"encoder\": \"json\",\n       \"name\": \"result\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"result\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"# for testing\\n\",\n    \"size_emb = len(result[\\\"values\\\"].iloc[0])\\n\",\n    \"sb.glue(\\\"size_emb\\\", size_emb)\\n\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/sentence_similarity/bert_senteval.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Parallel Experimentation with BERT on AzureML\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/sentence_similarity/bert_senteval.png)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"[SentEval](https://github.com/facebookresearch/SentEval) is a widely used benchmarking tool developed by Facebook Research for evaluating general-purpose sentence embeddings. It provides a simple interface for evaluating embeddings on up to 17 supported downstream tasks (such as sentiment classification, natural language inference, semantic similarity, etc.) \\n\",\n    \"\\n\",\n    \"Due to the fact that different BERT layers capture different information, and that the choice of pooling layer and pooling strategy for the encoding is highly dependent on the final finetuning task, we use SentEval to evaluate different combinations of these encoding parameters on the STSBenchmark dataset. In this notebook, we aim to show an example of\\n\",\n    \"* running SentEval experiments with BERT encodings\\n\",\n    \"* running parallel jobs on AzureML compute targets for faster experimentation (extracting sequence encodings from BERT with 110M parameters is computationally expensive, even without finetuning. 
Each experiment could take an hour or more, depending on the specs of the machine, so running multiple experiments sequentially can quickly add up) \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 00 Global Settings\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \\n\",\n      \"[GCC 7.3.0]\\n\",\n      \"AzureML version: 1.0.57\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"import os\\n\",\n    \"import sys\\n\",\n    \"import pickle\\n\",\n    \"import shutil\\n\",\n    \"import itertools\\n\",\n    \"import glob\\n\",\n    \"import numpy as np\\n\",\n    \"import pandas as pd\\n\",\n    \"import seaborn as sns\\n\",\n    \"import matplotlib.pyplot as plt\\n\",\n    \"import scrapbook as sb\\n\",\n    \"\\n\",\n    \"import azureml\\n\",\n    \"from azureml.core import Experiment\\n\",\n    \"from azureml.data.data_reference import DataReference\\n\",\n    \"from azureml.train.dnn import PyTorch\\n\",\n    \"from azureml.widgets import RunDetails\\n\",\n    \"\\n\",\n    \"sys.path.append(\\\"../../\\\")\\n\",\n    \"from utils_nlp.azureml.azureml_utils import get_or_create_workspace, get_or_create_amlcompute\\n\",\n    \"from utils_nlp.models.bert.common import Language, Tokenizer\\n\",\n    \"from utils_nlp.models.bert.sequence_encoding import BERTSentenceEncoder, PoolingStrategy\\n\",\n    \"from utils_nlp.eval.senteval import SentEvalConfig\\n\",\n    \"\\n\",\n    \"%matplotlib inline\\n\",\n    \"print(\\\"System version: {}\\\".format(sys.version))\\n\",\n    \"print(\\\"AzureML version: {}\\\".format(azureml.core.VERSION))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n 
  },\n   \"outputs\": [],\n   \"source\": [\n    \"# azureml config\\n\",\n    \"subscription_id = \\\"YOUR_SUBSCRIPTION_ID\\\"\\n\",\n    \"resource_group = \\\"YOUR_RESOURCE_GROUP_NAME\\\"  \\n\",\n    \"workspace_name = \\\"YOUR_WORKSPACE_NAME\\\"  \\n\",\n    \"workspace_region = \\\"YOUR_WORKSPACE_REGION\\\"\\n\",\n    \"\\n\",\n    \"# path config\\n\",\n    \"CACHE_DIR = \\\"./temp\\\"\\n\",\n    \"LOCAL_UTILS = \\\"../../utils_nlp\\\"\\n\",\n    \"LOCAL_SENTEVAL = \\\"../../utils_nlp/eval/SentEval\\\"\\n\",\n    \"\\n\",\n    \"EXPERIMENT_NAME = \\\"NLP-SS-bert\\\"\\n\",\n    \"CLUSTER_NAME = \\\"eval-gpu\\\"\\n\",\n    \"MAX_NODES = None # we scale the number of nodes in the cluster automatically\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"os.makedirs(CACHE_DIR, exist_ok=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We evaluate 768-dimensional encodings from BERT with each combination of 12 BERT layers and 2 pooling strategies (mean and max) for a total of 24 experiments. 
To run a smaller number of experiments or customize the pooling layers/strategies of interest, edit `EXP_PARAMS`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"MODEL_PARAMS = {\\n\",\n    \"    \\\"num_gpus\\\": 1,\\n\",\n    \"    \\\"language\\\": Language.ENGLISH,\\n\",\n    \"    \\\"to_lower\\\": True,\\n\",\n    \"    \\\"max_len\\\": 128,\\n\",\n    \"    \\\"cache_dir\\\": CACHE_DIR\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"SENTEVAL_PARAMS = {\\n\",\n    \"    \\\"usepytorch\\\": True, \\n\",\n    \"    \\\"batch_size\\\": 128,\\n\",\n    \"    \\\"transfer_tasks\\\": [\\\"STSBenchmark\\\"]\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"EXP_PARAMS = {\\n\",\n    \"    \\\"layer_index\\\": range(12),\\n\",\n    \"    \\\"pooling_strategy\\\": [PoolingStrategy.MEAN, PoolingStrategy.MAX],\\n\",\n    \"}\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 01 Set up AzureML resources\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We set up the following AzureML resources for this example:\\n\",\n    \"* A [Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-workspace), a centralized hub for all the artifacts you create when you use Azure Machine Learning service\\n\",\n    \"* An [Experiment](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.experiment.experiment?view=azure-ml-py), which acts as a container for trials or model runs\\n\",\n    \"* A [Datastore](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-access-data), a compute location-independent abstraction of data in Azure storage accounts\\n\",\n    \"\\n\",\n    \"The following cell looks to set up the connection to your [Azure Machine Learning service 
Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace). You can choose to connect to an existing workspace or create a new one. \\n\",\n    \"\\n\",\n    \"**To access an existing workspace:**\\n\",\n    \"1. If you have a `config.json` file, you do not need to provide the workspace information; you will only need to update the `config_path` variable that is defined above which contains the file.\\n\",\n    \"2. Otherwise, you will need to supply the following:\\n\",\n    \"    * The name of your workspace\\n\",\n    \"    * Your subscription id\\n\",\n    \"    * The resource group name\\n\",\n    \"\\n\",\n    \"**To create a new workspace:**\\n\",\n    \"\\n\",\n    \"Set the following information:\\n\",\n    \"* A name for your workspace\\n\",\n    \"* Your subscription id\\n\",\n    \"* The resource group name\\n\",\n    \"* [Azure region](https://azure.microsoft.com/en-us/global-infrastructure/regions/) to create the workspace in, such as `eastus2`. \\n\",\n    \"\\n\",\n    \"This will automatically create a new resource group for you in the region provided if a resource group with the name given does not already exist. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ws = get_or_create_workspace(\\n\",\n    \"    subscription_id=subscription_id,\\n\",\n    \"    resource_group=resource_group,\\n\",\n    \"    workspace_name=workspace_name,\\n\",\n    \"    workspace_region=workspace_region,\\n\",\n    \")\\n\",\n    \"exp = Experiment(workspace=ws, name=EXPERIMENT_NAME)\\n\",\n    \"ds = ws.get_default_datastore()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 02 Set up SentEval\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Run the bash script to download the data for auxiliary transfer tasks.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"data_path = os.path.join(LOCAL_SENTEVAL, \\\"data/downstream\\\")\\n\",\n    \"data_script = \\\"get_transfer_data.bash\\\"\\n\",\n    \"tokenizer_name = \\\"tokenizer.sed\\\"\\n\",\n    \"!cd $data_path && pwd  && chmod 777 $data_script && chmod 777 $tokenizer_name &&bash $data_script\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We upload the SentEval dependency to datastore. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"$AZUREML_DATAREFERENCE_29393ed34c4d4ab398be34e5f5952c2c\"\n      ]\n     },\n     \"execution_count\": 7,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"ds.upload(\\n\",\n    \"    src_dir=LOCAL_SENTEVAL,\\n\",\n    \"    target_path=os.path.join(EXPERIMENT_NAME, \\\"senteval\\\"),\\n\",\n    \"    overwrite=False,\\n\",\n    \"    show_progress=False,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 03 Define experiment configurations\\n\",\n    \"We define a set of static configurations, which entails model parameters that will stay consistent across all experiments, in `SentEvalConfig`. We also define the parameter space that will vary across the experiments. We serialize the configuration objects and upload them to our datastore to make them accessible to all experiments.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"sc = SentEvalConfig(\\n\",\n    \"    model_params=MODEL_PARAMS,\\n\",\n    \"    senteval_params=SENTEVAL_PARAMS,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"parameter_groups = list(itertools.product(*list(EXP_PARAMS.values())))\\n\",\n    \"if MAX_NODES is not None:\\n\",\n    \"    parameter_groups = parameter_groups[:MAX_NODES]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"$AZUREML_DATAREFERENCE_07723c6c94b7431b9f811c8be538d8b7\"\n      ]\n     },\n     \"execution_count\": 10,\n     \"metadata\": {},\n     
\"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"os.makedirs(os.path.join(CACHE_DIR, \\\"config\\\"), exist_ok=True)\\n\",\n    \"\\n\",\n    \"static_config = (\\n\",\n    \"    SentEvalConfig(model_params=MODEL_PARAMS, senteval_params=SENTEVAL_PARAMS),\\n\",\n    \"    os.path.join(CACHE_DIR, \\\"config\\\", \\\"static_config.pkl\\\"),\\n\",\n    \")\\n\",\n    \"exp_configs = [\\n\",\n    \"    (\\n\",\n    \"        dict(zip(EXP_PARAMS.keys(), p)),\\n\",\n    \"        os.path.join(CACHE_DIR, \\\"config\\\", \\\"exp_config_{0:03d}.pkl\\\".format(i)),\\n\",\n    \"    )\\n\",\n    \"    for i, p in enumerate(parameter_groups)\\n\",\n    \"]\\n\",\n    \"\\n\",\n    \"configs = [static_config] + exp_configs\\n\",\n    \"for config in configs:\\n\",\n    \"    pickle.dump(config[0], open(config[1], \\\"wb\\\"))\\n\",\n    \"\\n\",\n    \"ds.upload_files(\\n\",\n    \"    [c[1] for c in configs],\\n\",\n    \"    target_path=\\\"{}/config\\\".format(EXPERIMENT_NAME),\\n\",\n    \"    overwrite=True,\\n\",\n    \"    show_progress=False,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 04 Scale the compute target\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Scale the number of nodes in the compute target to the number of experiments we want to run.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Scaling compute target eval-gpu to 24 node(s)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"compute_target = get_or_create_amlcompute(\\n\",\n    \"    workspace=ws,\\n\",\n    \"    compute_name=CLUSTER_NAME,\\n\",\n    \"    vm_size=\\\"STANDARD_NC6\\\",\\n\",\n    \"    min_nodes=0,\\n\",\n    \"    max_nodes=len(parameter_groups),\\n\",\n    \"    
idle_seconds_before_scaledown=300,\\n\",\n    \"    verbose=False,\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"print(\\n\",\n    \"    \\\"Scaling compute target {0} to {1} node(s)\\\".format(\\n\",\n    \"        CLUSTER_NAME, len(parameter_groups)\\n\",\n    \"    )\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 05 Define the execution script\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Here we define the script to be executed for each experiment on the remote compute target. We deserialize the configuration objects from the datastore to specify the model parameters for the experiment, and run the SentEval evaluation engine with that model for the STSBenchmark transfer task.\\n\",\n    \"\\n\",\n    \"As specified in the SentEval repo, we implement the **batcher** function, which transforms a batch of text sentence into sentence embeddings.\\n\",\n    \"\\n\",\n    \"After running SentEval, we serialize the output.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"src_dir = os.path.join(CACHE_DIR, EXPERIMENT_NAME)\\n\",\n    \"os.makedirs(src_dir, exist_ok=True)\\n\",\n    \"if not os.path.exists(os.path.join(src_dir, \\\"utils_nlp\\\")):\\n\",\n    \"    shutil.copytree(\\n\",\n    \"        LOCAL_UTILS,\\n\",\n    \"        os.path.join(src_dir, \\\"utils_nlp\\\"),\\n\",\n    \"        ignore=shutil.ignore_patterns(\\\"__pycache__\\\", \\\"SentEval\\\"),\\n\",\n    \"    )\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Writing ./temp/NLP-SS-bert/run.py\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"%%writefile 
$src_dir/run.py\\n\",\n    \"import pickle\\n\",\n    \"import argparse\\n\",\n    \"import os\\n\",\n    \"import sys\\n\",\n    \"from utils_nlp.eval.senteval import SentEvalConfig\\n\",\n    \"from utils_nlp.models.bert.sequence_encoding import BERTSentenceEncoder\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def prepare_output(output_dir, config_file):\\n\",\n    \"    os.makedirs(output_dir, exist_ok=True)\\n\",\n    \"    out = os.path.join(\\n\",\n    \"        output_dir,\\n\",\n    \"        \\\"results_{}.pkl\\\".format(config_file.split(\\\"/\\\")[-1].split(\\\".\\\")[0][-3:]),\\n\",\n    \"    )\\n\",\n    \"    return out\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def batcher(params, batch):\\n\",\n    \"    sentences = [\\\" \\\".join(s).lower() for s in batch]\\n\",\n    \"    embeddings = params[\\\"model\\\"].encode(\\n\",\n    \"        sentences, batch_size=params[\\\"batch_size\\\"], as_numpy=True\\n\",\n    \"    )\\n\",\n    \"    return embeddings\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"if __name__ == \\\"__main__\\\":\\n\",\n    \"    parser = argparse.ArgumentParser()\\n\",\n    \"    parser.add_argument(\\\"--data_dir\\\", type=str, dest=\\\"data_dir\\\")\\n\",\n    \"    parser.add_argument(\\n\",\n    \"        \\\"--static_config\\\",\\n\",\n    \"        type=str,\\n\",\n    \"        dest=\\\"static_config\\\",\\n\",\n    \"        help=\\\"Filename of serialized static config object\\\",\\n\",\n    \"    )\\n\",\n    \"    parser.add_argument(\\n\",\n    \"        \\\"--exp_config\\\",\\n\",\n    \"        type=str,\\n\",\n    \"        dest=\\\"exp_config\\\",\\n\",\n    \"        help=\\\"Filename of serialized experiment config object\\\",\\n\",\n    \"    )\\n\",\n    \"    parser.add_argument(\\n\",\n    \"        \\\"--output_dir\\\",\\n\",\n    \"        type=str,\\n\",\n    \"        dest=\\\"output_dir\\\",\\n\",\n    \"        help=\\\"Directory to write serialized results to\\\",\\n\",\n    \"    )\\n\",\n    \"    args = parser.parse_args()\\n\",\n    
\"\\n\",\n    \"    # Import senteval\\n\",\n    \"    sys.path.insert(0, args.data_dir)\\n\",\n    \"    import senteval\\n\",\n    \"\\n\",\n    \"    # Deserialize configs\\n\",\n    \"    static_config = pickle.load(open(args.static_config, \\\"rb\\\"))\\n\",\n    \"    exp_config = pickle.load(open(args.exp_config, \\\"rb\\\"))\\n\",\n    \"\\n\",\n    \"    # Update senteval params for this experiment\\n\",\n    \"    params = static_config.senteval_params\\n\",\n    \"    params[\\\"model\\\"] = BERTSentenceEncoder(**static_config.model_params)\\n\",\n    \"    for k, v in exp_config.items():\\n\",\n    \"        setattr(params[\\\"model\\\"], k, v)\\n\",\n    \"    params[\\\"task_path\\\"] = \\\"{}/data\\\".format(args.data_dir)\\n\",\n    \"\\n\",\n    \"    # Run the senteval engine\\n\",\n    \"    se = senteval.engine.SE(params, batcher)\\n\",\n    \"    results = se.eval(params[\\\"transfer_tasks\\\"])\\n\",\n    \"\\n\",\n    \"    # Pickle the output\\n\",\n    \"    output_file = prepare_output(args.output_dir, args.exp_config)\\n\",\n    \"    print(\\\"Pickling to {}\\\".format(output_file))\\n\",\n    \"    pickle.dump(results, open(output_file, \\\"wb\\\"))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 06 Run the experiments in parallel\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We iterate through the experiment parameter combinations and submit each job to AmlCompute as a `PyTorch` estimator. 
Since we explicitly set `node_count=1` and `process_count_per_node=1` in the estimator, the jobs will run in parallel.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Scaling compute target eval-gpu to 24 node(s)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"runs = []\\n\",\n    \"for i in range(len(parameter_groups)):\\n\",\n    \"    est = PyTorch(\\n\",\n    \"        source_directory=src_dir,\\n\",\n    \"        script_params={\\n\",\n    \"            \\\"--data_dir\\\": ds.path(\\\"{}/senteval\\\".format(EXPERIMENT_NAME)).as_mount(),\\n\",\n    \"            \\\"--static_config\\\": ds.path(\\n\",\n    \"                \\\"{0}/{1}/{2}\\\".format(\\n\",\n    \"                    EXPERIMENT_NAME, \\\"config\\\", static_config[1].split(\\\"/\\\")[-1]\\n\",\n    \"                )\\n\",\n    \"            ).as_mount(),\\n\",\n    \"            \\\"--exp_config\\\": ds.path(\\n\",\n    \"                \\\"{0}/{1}/{2}\\\".format(\\n\",\n    \"                    EXPERIMENT_NAME, \\\"config\\\", exp_configs[i][1].split(\\\"/\\\")[-1]\\n\",\n    \"                )\\n\",\n    \"            ),\\n\",\n    \"            \\\"--output_dir\\\": \\\"./outputs\\\",\\n\",\n    \"        },\\n\",\n    \"        compute_target=compute_target,\\n\",\n    \"        entry_script=\\\"run.py\\\",\\n\",\n    \"        inputs=[\\n\",\n    \"            DataReference(\\n\",\n    \"                datastore=ds, path_on_datastore=\\\"outputs\\\"\\n\",\n    \"            ).as_upload(\\n\",\n    \"                path_on_compute=os.path.join(\\\"./outputs/results_{0:03d}.pkl\\\".format(i))\\n\",\n    \"            )\\n\",\n    \"        ],\\n\",\n    \"        node_count=1,\\n\",\n    \"        process_count_per_node=1,\\n\",\n    \"        use_gpu=True,\\n\",\n    \"        framework_version=\\\"1.1\\\",\\n\",\n  
  \"        conda_packages=[\\\"numpy\\\", \\\"pandas\\\"],\\n\",\n    \"        pip_packages=[\\n\",\n    \"            \\\"scikit-learn==0.20.3\\\",\\n\",\n    \"            \\\"azureml-sdk==1.0.53\\\",\\n\",\n    \"            \\\"pytorch-pretrained-bert>=0.6\\\",\\n\",\n    \"            \\\"cached-property==1.5.1\\\",\\n\",\n    \"        ],\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"    run = exp.submit(est)\\n\",\n    \"    runs.append(run)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Each run object is collected in `runs`, so we can monitor any run via a Jupyter widget for debugging.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"#RunDetails(runs[0]).show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Alternatively, block until the runs are complete.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"for run in runs:\\n\",\n    \"    run.wait_for_completion()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Finally, we pull down the serialized outputs of each experiment from the datastore and inspect the metrics for analysis.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"24\"\n      ]\n     },\n     \"execution_count\": 18,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"ds.download(\\n\",\n    \"    target_path=CACHE_DIR,\\n\",\n    \"    prefix=\\\"outputs\\\",\\n\",\n    \"    show_progress=False,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    
\"Here we aggregate the outputs from each SentEval experiment to plot the distribution of Pearson correlations reported across the different encodings. We can see that for the STS Benchmark downstream task, the first layer achieves the highest Pearson correlation on the test dataset. As suggested in [bert-as-a-service](https://github.com/hanxiao/bert-as-service), this can be interpreted as a representation that is closer to the original word embedding.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"results = [\\n\",\n    \"    pickle.load(open(f, \\\"rb\\\"))\\n\",\n    \"    for f in sorted(glob.glob(os.path.join(CACHE_DIR, \\\"outputs\\\", \\\"*.pkl\\\")))\\n\",\n    \"]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# For testing\\n\",\n    \"sb.glue(\\\"pearson\\\", results[0][\\\"STSBenchmark\\\"][\\\"pearson\\\"])\\n\",\n    \"sb.glue(\\\"mse\\\", results[0][\\\"STSBenchmark\\\"][\\\"mse\\\"])\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"Text(0.5, 1, 'Pearson correlations of BERT sequence encodings on STS Benchmark')\"\n      ]\n     },\n     \"execution_count\": 18,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    },\n    {\n     \"data\": {\n      \"image/png\": 
\"iVBORw0KGgoAAAANSUhEUgAAAiIAAACcCAYAAABC68AGAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nOydd3gVxfrHP++eEwglQCoplIRqAQERUUApKmJBvViwK157vVbAgoiC5VqwKzbEAnYERZoUlSJFEaVICem9kUZIzjnz+2M34bSEAwmG3N98nmef5+zOOzPfM/vu7rszs7uilEKj0Wg0Go2mMTAaW4BGo9FoNJr/v+hARKPRaDQaTaOhAxGNRqPRaDSNhg5ENBqNRqPRNBo6ENFoNBqNRtNo6EBEo9FoNBpNo6EDkf8niMhMEXmqHvlLRaRLQ2qqDyJym4hkW7rCG1uP5uhDROJFRImI3Vr/QUSua2xdGl+891Uj1D9MRNIao27NUR6IiEiSiOyzLjbZIvKBiLRubF3/64jIChG50X2bUqq1UiqxsTS5IyJBwIvASEtXvld69Umt1M133rDyVdu4+1b18pqVdr2IOK1txSLyh4icb6Wd5mZf5lVPqYh0+ifbQhM4SqlzlFIfNraOw0VE/i0i20WkxPLp70UkxAqwqv2vSkQq3dbfsvI+LCJ7rG1pIvJZHfW4HxuFVj0d/7l/qvn/xlEdiFiMVkq1Bk4EBgCPNmThjRWBHy7+9Da1/9AAtAeCgS0HsWtn+U5v4FTgDq/00VYgU73c6Za2xsrbDngDmCMi7ZRSP1fbA8e712MtKfX+dxqNFyIyFJgGXKGUCgGOBT6HmgCr2ic/AZ5z88dbrV6ga4AzLZuTgB8PUmX1eTcGyAZePTL/rPH5f3j+POpoCoEIAEqpdOAHoBeAiLQVkfdEJFNE0kXkKRGxWWldRWSZiOSLSJ6IfCIi7arLsiL+8SKyGSgTEbu1nm7dbfwtImdYts1FZLqIZFjLdBFpbqUNs+4u7heRHEvLuNr+g4iEWb06Gdadxly3tJtEZJeIFIjIPBGJdUtTInKHiOwEdtax7RgRWWKV8beIXFaLjlAR+U5Eci0d34lIByttKnAa8JpXL4ESkW5ubT/Lyp8sIo+KiGGlXS8iv4jI81bZe0TkHLe6rxeRRKud94jIVbVo9NvuItID+NsyKxKRZbW1dzVKqRxgCXDcwWz95HUBHwGtgO6Hmh+gDt8yRGSCiOy2fPVzEQlzy3eN1b75IvKI5bdnWmkeQ23i1bUsIrEi8pW1j/aIyN1uaZOtumZZmraIyElu6R1F5Gsrb361D1hpN4jINmvfLhKRznX871NEZLWIFInZqzTMLW2FiDwpIqssDYtFJMItfYhb3lQRud7aXpfv2Sy/yxORROA8Lz01PX0B+GmCiPxkaVsqIq+LyMdWWrCIfGy1TZGIrBeR9rW0wbFWvUVWO1/gljbTKvd7q55fRaRrLc05ADM4/h1AKVWglPpQKVVSW/t75V2klNpt5c1SSs0IIB9KqQrgS9yOHes4fF5EUsTsmXlLRFpYaXWeE0WkhYi8YO27vdY+aOFW5VVWuXki8ohbvski8oXV7iUi8qeI9BCRiVY9qSIy0s1+nOWnJWKeb25xS6vWOF5EsoAPvP+3iNwtIlvFOi9qjjBKqaN2AZIwo3iAjph3wE9a63OBtzEvEFHAOuAWK60bcBbQHIgEfgKme5W7ySqzBdATSAVirfR4oKv1ewqw1qojEljtpmEY4LBsgoBzgXIgtJb/8z3wGRBq2Q+1to8A8jB7fZpj3n385JZPYV5Iw4AW/rZZ7ZAKjAPsVll5wPGW/UzgKet3OHAx0BIIAb4A5rrVtwK40Uu7ArpZv2cB31p544EdwL+ttOuBKuAmwAbcBmQAYmksBnpatjHV+vy0VV3tHm/psdeS1yMdiAX+AG7w51t+8l8P/GL9tmH2pFQCUXXVU0tZdfnWf6
z/2MHa728Ds62044BS4HQr7UVMX6s+Hmr2p5svplm/DWAjMAloBnQBEoGzrfTJQAWmv9qAp4G1bv/3D+Ala38FA0OstIuAXZh343bM3snVtfzvOCDfqsPAPB7zgUg3H9sN9MD03xXAM1ZaJ6AEuALzOAkH+gbge7cC2zGP6zBguZcfrMDya+rwUyt9DfC81X5DMP32YyvtFmA+5vFjA/oDbfy0QZDVXg9b5Yyw/ldPt31YAJxstecnwJxa2vM0YB/wBDAYaF6LnYdfWNuutup5ELM3xHYI592WwIfALLf06cA8q41DrLZ4OpBzIvC6tR/irLYbhOnf8da+esfyhz7AfuBYL58922qrWcAe4BGrnpuAPW4azwO6Yp53hloaTvTS+KxVdws8j5/HgN+wfFUvR35pdAF1ijMPiFKgCEjG7CJvgdk1vx/romzZXgEsr6Wci4Dfvcp1vyh1A3KAM4Egr7y7gXPd1s8Gkqzfw6yTg90tPQc4xY+GGMCFnyAFeA+zO7V6vTXmSTLeWlfACK88HtuAscDPXjZvA49bv2fidYJys+sLFLqtr6CWQMQ6eewHjnNLuwVYYf2+HtjlltbSyhuNeWErwgyCWvjTEmC7xxNYIFJkLQozkGnjZuPuW9XLTW7/wWFtq7L28WV11FNXIFKXb20DzvDykSrME+0k3C5KVttVElggMhBI8aprIvCB9XsysNQt7Thgn/X7VCDX33/C7JH8t9u6gXmC7+zHdjzwkde2RcB1bj72qFva7cBCN63f+CnzYL63DLjVLW0kdQcitflpJ2v/t3RL/5gDgcgNlj+dcBAfPg3IAgy3bbOByW778F23tHOB7XWUdw7mRb8I03dfxCuo8PYLt+1XAUuBMsyAcEId9SRx4NhwYAZova00scro6mZ/KlYQQB3nRMtf9gF96jiWOrhtWwdc7uazS9zSRlsabdZ6iJW/XS3/aS5wj5vGSiDY6/hJt9r0F6BtXftWLw27NIWhmYuUUu2UUp2VUrcrpfYBnTGj4Eyry7MI86IbBSAiUSIyR8zu8GLMk0iEV7mp1T+UUrsw704nAzlW3uqhkVjMIKiaZGtbNflKKYfbejlmIOFNR6BAKVXoJ82jDqVUKebJIs6f3lq2dQYGVreH1SZXYZ5YPRCRliLyttU9WozZY9ROrKGtgxCBeXfn3SbuWrPc/ku59bO1UqoMM2C6FXPffS8ix9RSz8HaPRAilFLtMC8yq4CFXunVvlW9vOOWttbKG4p593faIdYNHNS3OgPfuO2vbYATM9COxdNHqy8ggdAZiPXyhYetcqvJcvtdDgSLOVbeEUj28mn3cl92K7MA88IUV4vtpV4ahmAGW7VpqD5uOmIGot4czPc82szLzh9+/dQqp8BtG17lfoQZVM0Rc9jwOXGbCO1GLJCqzOE9f3o9NFD7uaNa4w9KqdGYPREXYgZTN9Zm75X3E6XUmZhznm4FpojI2XVkucjy/+bAncBKEYnG7J1sCWx0268Lre3V1HZOjMDsYfO3b6upqz2y3X7vA/KUUk63dartReQcEVkr5jB1EWaQ534NyFXmsJM77YCbMXt39tahUdPANIVAxB+pmHdGEW4XkTZKqerJg09jRscnKKXaYHZNilcZymNFqU+VUkMwT6AKs9sOzLuBzm6mnaxth6M5TNzmqrjhUYeItMLsjk6vTa+fbanASq8La2ul1G1+8t2POWQw0Gqf06urrqOuavIw79q92yTdv7mXYKUWKaXOwrwgbcfsivVHQ7U7VvA6EzhV3OYhBJi3FPNu/RoR6XeY9dfmW6nAOV77LFiZ86EyMS/IgBk8YvpENWWYF4Rq3APOVMw7VPdyQ5RS5wYgNxXoJP4n8KViDn+6l9tCKbW6FtuPvGxbKaWeCVCDv7kSB/M9jzaz0g6HTMxj1b19a8pVSlUppZ5QSh2HObRwPnCtn3IygI5izWHxo/ewUEq5lFI/YvYA9TrEvFVKqS+AzYHkVUo5lVJfYw
bIQzD3wT7MIdXq/dpWmRNbD0Ye5vBKbfNgGgQx5/B9hTm01t4KqBbgeQ3wd44rxNyXH4jI4COpUeNJkwxElFKZwGLgBRFpI+akv65iziwHs5uuFHMyYxzm2GitiEhPERlhOXAF5oFWHWnPBh4VkUjrIjYJs4flcDT/ALwh5mTRIBGpDgA+BcaJSF9LwzTgV6VU0iFU8R3QQ8wJjkHWMkBEjvVjG2L9xyIxJ0c+7pWejTmvwN//cGLO1p8q5qODnYH7CKBNRKS9iFxgBVr7MfeRsxbzBml3q97mmE8NZBF4r0INynw8+F1Lw6HWXZdvvYXZjp0t20gRudBK+xI4X8xJm80wx9zdj9dNwLliToCOxux1qWYdUGxNxmsh5iTOXiIyIADJ6zAvxM+ISCsxJ2ZWn5TfAiaKyPGW3rYicmkt5XwMjBaRs636g61JgoFM/vsEOFNELhNzInm4iPQNwPc+B+4WkQ4iEgpMCKAuH5RSycAGYLKINBORUzGHArD+93AR6W31IBZjBkf+/PhXzIDxIet4HGaVM+dQNYnIhSJyuXXuEBE5GXPuw9oA8l4vIudZbWaIOSn3eEvfwfKK5ZOhwDard+cd4CURqe6BjjtI7wpQM/H7feBFMSdT20TkVOvYaEiaYfbk5AIO6/+OrDtLjcYVmD3J34jIwAbWpamFJhmIWFyL6XBbMSPZLznQ7fsE5mTNvZgTRL8+SFnNgWcwI/YszCGeh620pzBPSpuBPzEnMR3ui8GuwTxpbcccN/0PgHV38xhmFJ+Jecdw+aEUrMzZ8yOtfBnW/6iejOXNdMy5NnmYJzLvIYuXgUvEfJrgFT/578I8wSZijqd+inmCORgGZm9MBma3/lDM3gZ/NES7F4lIKWZgdSpwgVLK/U5ovni+A+SbOsqajnnhP+EQNdTlWy9jDvssFpESzH0xEEAptQVzkuynmD5RCLi/cOkjzEmlSZhBec17IawL9mjMuT97rLrfBdoeTKxb3m5AilXnWCvtG0yfmiPmkN5fmPMW/JWTijl88DDmBSEV84bgoOccZT4CfS6mrxRgBl19rOS6fO8dzCGTPzD95WDHfV1chekz+Zh+9xlm8Axm79OXmEHINmAlfoJkpVQlcAFmG+VhznG7Vim1/TD0FGJOyNxp1fsx8F+l1CcB5C3G3A8pmPM+ngNuU0r9Ukee+daxUwxMxZzbU/24/HjMSbhrLT9YitnDGggPYB7P6zH37bM08HXIOhfejRmYFgJXYh5ngeZfgjnpf56I9G9IbRr/iOd5WaPRHK2ISBLmZMulja3l/xtivgBsu1LKu/dQo9HUk6bcI6LRaDRHBGtYs6s1lDEKs3dn7sHyaTSaQ0e/UU6j0Wh8icYc2gnHHJ66TVkvE9NoNA2LHprRaDQajUbTaOihGY1Go9FoNI2GDkQ0Go1Go9E0Gkd8jkhVXuJRN/bjKjis92IdUVxrvmtsCT6onJzGluAXVRTId77+WaTjob709chTuXJzY0vwoWjb0TctbU6h3+/VNTqRtb1hpxFZbis/uNE/zG37j8776cFZX3q/RPOI4u9aGxTR5R/VcLgcfWcFjUaj0Wg0h4azqrEVHDY6ENFoNBqNpomjnP4+D9U00IGIRqPRaDRNnSYciBydg2sajUaj0WgCp2q/7xIAIjJKRP4WkV0i4vf7TNZ3n7aKyBYR+dRt+7Mi8pe1jHXbniAiv4rIThH5zPpeVq3oQESj0Wg0miaOcjp8loNhfbjxdczvIR0HXCEix3nZdAcmAoOtL9z/x9p+HuY33fpifiPrQRFpY2V7FnhJKdUd83s//65Lhw5ENBqNRqNp6jirfJeDczKwSymVaH2kcQ7m5wzcuQl4XSlVCKCUqn6c8jhgpVLKoZQqw/zY5CgREWAE5ochAT4ELqpLhA5ENBqNRqNp6jgqfZeDE4f5Zexq0qxt7vQAeojIKhFZa317CczA4xwRaSkiEcBwoCPmZxGKlFKOOsr0QE9W1Wg0Go
2miaP89ICIyM3AzW6bZiilZrib+CvKa90OdAeGAR2An0Wkl1JqsYgMAFYDucAawBFgmT4VaDQajUajacr46QGxgo4ZvsY1pGH2YlTTAfB+42casFYpVQXsEZG/MQOT9UqpqcBUAGsS604gD2gnInarV8RfmR7ooRmNRqPRaJo6hzdHZD3Q3XrKpRlwOTDPy2Yu5rAL1hBMDyBRRGwiEm5tPwE4AViszC/pLgcusfJfB3xblwjdI6LRaDQaTVPnMN4jopRyiMidwCLABryvlNoiIlOADUqpeVbaSBHZCjiBB5VS+SISjDlMA1AMXO02L2Q8MEdEngJ+B96rS4cORDQajUajaeKoAN8b4pNPqQXAAq9tk9x+K+A+a3G3qcB8csZfmYmYT+QEhA5ENBqNRqNp6jThN6vqQESj0Wg0mqaODkQ0Go1Go9E0GlUBvTfkqEQHIhqNRqPRNHV0j4hGo9FoNJpGQwciGo1Go9FoGg09NKPRaDQajabRcDobW8FhowMRjUaj0WiaOg49NKPRaDQajaaxcAT0SvejkqMuEPll7Qaemf4WTpeLi0eP4sZrLvNIf/blt1n322YAKvbvp6CwiDWLvjwyWjb+ybPvzMblUow56zT+fem5PjaLfl7Pm7O/RRB6JHTk2QfNDx2+NPMLflpv6rzl8tGMOi3gl8zVyard2Ty3ZDMupfhXn87cMKinr6atabz983YQ6BHVlmcuGmBqWvYXP+/KQinFKQlRPHTWCViv560XRnwvmp1xJYjg2PwzjnULfGxsPQcQNOhCQOHKSaXye/M7TEFDL8XW5QQQwZm0lapln9ZbD4Cte1+anTcODAPHhh+p+mmur02vU2l2xmWgFK6sZPZ//jJGwvE0O+/6A/8tIpb9n03HuW19vTWtSsrlvyu24XLBRb06cMPJXXxsFv+dyVtrd5n+FBnC0+f2AWD6T3/z855cFIqBncJ5aNixDbLvAOx9BtDi2jvBsFG5/Hv2z5vtYxN0yjCCL74OAGfybspfewr7cX1pcc0dNTZGbCfKX51C1YZV9dbUYtBJhD10OxgGpd/8wN4PPvOxaTnydNrdci2gqNyRSN7EpwGwRUcS8fj92NpHglLk3PUIjozsemtKGHoCZz5+DYbN4I85K1j75nwfm2POG8iQe8eglCJnWwrz736jJq1Z6xbc9OOz7Fi0gSWTZtVbD0DcsBM45QlT09+zV7D5dV9NCecPpN99Y0ApCralsOJOU9O45FkUbje/AF+ans/SG15sEE0nDO3HNY/fgGEzWDFnKfPf/MbHZuB5gxhz71iUUqRsS+KNu6cTHhfJf95+CMMwsAXZWDxzAcs+WdwgmgDaDe9LlyfHgc0g+5MfSX/N95wQfsGpdHrgMlBQtiWJHbe/DMCg9M8o25YCQGV6Htuue7bBdDUYukekYXA6nTz1wuu8M30a0VERjL3xHoYPGUjXhM41NuPvuaXm9ydffMu2nbuPkBYX0976hBlP3k/78FCuuO9Jhg3sS9dOsTU2yRnZvPfl98x6biJtWrciv6gYgJ/W/8G23Sl88cpkKqsc3DDxWYb0703rli3qp8mleHrRH7x1xWDat2nBVR8sZ2j3GLpGtjmgqaCU99fsYOa1p9OmRTMKyszX/m5Ky2dTWj5f3HgGAOM+WsmGlDwGdI6slyZEaHbW1ez//AVUSQHB10zCuXsTKv/AxxalXRRBA8+l4tNpsL8cWoYAYMR2xYjrRsVM823Cza+ciNGxJ67Uv+upyaDZ6H9T8cGTqOICgm97Gse2DajctAMm4dEEDf0X+95+FCrKoJXZhq49W6h47UHTqEVrWt73Ks5df9RPD+a+e2bZVt4cM4D2IcFc9ekahnaNomt46xqb5MIy3l+fyMyxp9AmOIiCcmvfZRSyKaOQz68ZDMC4z9eyMa2AkzqG11sXYtBi3D2UTXsQV34uIVPfomrjalzpyTUmRnQczS+8ktLJd6HKSpE27QBwbN1EycSbzGJahR
Ay/WOqNm+ovybDIGziXWTfOh5Hdh6xn7xG+co1VCWm1JjYO8XR9oYryLr+P7hKSjFC29WkRT41nqJ3P6Vi7W9Ii2BQdX6BPCDEEEY+eR1zrnqGkqwCrp83hZ1LN5K/84Cfh8a359Q7RvPRmCfYX1xOy/A2HmWcfv8lpPy6vd5a3DUNeuo6Fl75DGWZBVzw/RRSFm+kyE1Tm4T29LlzNN/96wkq95YT7KbJWVHJ3LMfaTA9piaD6568iWeueoKCrHymzHuOjUvXk7HzwLHXPj6G0XeM4YkxD1NeXEab8LYAFOUU8sSYiTgqHTRvGcwzi6fz25L1FOUU1l+YYdDl6RvZctkUKjML6LPwGQoWb2DfjgO6ghOi6XDXGDaPfhTn3jKCIg60lauikj/OfLD+Oo4gqgnPETmqvr7757YddOoQS8e4GIKCgjjnjKEs+3ltrfYLlq7k3DOHHREtf+1MpFNMFB2iIwkKsjPq9JNZ/uvvHjZfLfqJseeOoE3rVgCEtzMdd3dqJif16oHdZqNlcHN6JnRk1ca/6q8po4COoa3oENqKIJvB2cd1YMXOTA+brzclMbZ/F9q0aAZAWKvmAAhQ6XBR5XRR6XTicCrCrbT6YMR0QRXmoPbmgsuJY/uv2Lr19bCx9xlK1e/LzCAEoLykJk1sQWCzgy0IDDuqrLj+mjp0w1WQhSrMAacD5+ZV2I89yVPTSWfi+HWhGYQA+KnX3usUnDt+b5DZ6H9lFdGxXUs6tGtp7rue0azY7XmX/s2faVzWpxNtgoMACGvptu+cLqpcLiqdLhxOVZNWX2zdjsGVlYErJxOcDirXLCPopMEeNs1GnE/l4rmoslIAVHGRTzlBA4fi2LQOKg/vexfuNO/VE0dqBo70LHA4KFu0gpbDBnnYhIw5h5LP5uEqMTW5Ck1NQV06gc1GxdrfTK37KlAV9dcU07crhUnZ7E3NxVXlZOv8tXQ/q7+HTZ8rhrNx1lL2F5t+Xp5/wKfa94qnZUQbkn76s95aqons25XipGxKUkxNid+updNIT009rxzO1g+XUrnX1FSRX//jqy669u1GdlImuanZOKscrJ3/C/3P8uwNHn7FmSydtZDyYvPYK87fC4CzyoGj0ryrD2pmR4yG6fEDCOnXjYo9WexPyUFVOcidu4qwswd42LS/+kyyPliIc6+pqyrvyLZVg1NV5bsEgIiMEpG/RWSXiEyoxeYyEdkqIltE5FNr23AR2eS2VIjIRVbaTBHZ45bW11+51RxVPSI5uXlERx24Q28fFcGfW/zfHWdkZZOemcXA/n2OiJbs/CLaR4Qd0BIeyp879njYJKdnAXDtQ0/jdLm47YoLGNK/Nz3jO/DWnPlcc+FIKvZXsm7zdrp0jKW+5JRUEN3mQK9K+5AW/JnhebeQXGCemK+btRKXS3HraccyuGt7+nQIZ0DnCM585QdAMbZ/F7pEeN6xHQ7Suh2qpKBmXZUUYsR4DjlIaHsMwH7lRBCDqlXf4kr6C1fGbpyp22lx20sg4PhtGaogk/oibcJQe/MPaCouwOjY3cPGiIjBBQTf/KSpadkXOHdu8rCx9x5M1Srfru7DIad0P+1D3PZd62D+ytrrYZNcZJ4Ar5+zFpdS3HJqNwbHR9InNpSTOoZx1ozloGBs3050cetJqQ9GaASu/JyadVd+LvZux3rY2KI7ANB68qtgGFR8NRPHH55DVUGDhrP/+y8aRJMtKgJHVm7NuiM7j+a9j/Gsr7OpKXrmdMQwKHprFvtWbyCocwdcJaVEvvA49rhoKn79jcKX3wOXq16aQqJDKck84OclmQXE9uvqYROWEA3A1V9NQgyDX6Z/zZ6Vm0GEMx69ivn3vkn84OPrpcOdljGhlLlpKs8qINJLU1tL0/nfTEJsBr+9+DXpK8whY1vzIC74fgrK6WLz6/NJXrSx3ppCo8MpyDxw7BVk5tO1n+exF51gngsnfTUNwzD4evpnbF5p3uSFxYTzwAeP0D4+htnTPm
yY3hCgWUwYlRl5NeuVmfmEnOipq0UXU1fveU8hNoOU5z+naLl5TjCaN6PPomdRDidpr35DwcL6D9U2OIcxNCMiNuB14CwgDVgvIvOUUlvdbLoDE4HBSqlCEYkCUEotB/paNmHALsB9LO1BpVRA8yYCDkREZBAQ755HKdUwA5015fmr17/tD0tXMnLYEGw2W0NKqFOMtxan00VKRjbvTXuQ7LxCrp/wLF+/NoVBJ/bir51JXPvQ04S2DaHPMV2x2erf+eSvg9m7eZwuFykFpbx71WnklOxj3Ec/8eVNZ1BUXkliXgmL7xoFwK2zf2FjSh79O0XUU5W/HeSpVAwbhLZn/5znkJBQml8xgYoPHkNahGCExbDvrfsBaH7Z/RhJPXCl7TgCkrxaz7BhRMRQ8e5kpG04wTdNYd8r90GFeecoIe0wojvh3Fn/YZlAdTpdipSiMt659GRySiu44fNf+fKaIRRWVLKnoIxFNw4D4Nav17MxrYD+HcJ8yzxkDQfff9hsGNFxlD75H4ywSFo//golD41DlZuBk7QLw9axC47NDXRy9qfJe//ZbNg7xZF14/3YoyKJ/uBFMi65CWw2gvv1JuPyW3Fk5RD57KO0vmAkpXMX1leUH02eq4bdRlh8NJ+OnUpITBhXffEY742cwPH/Gszu5Zs8ApmGwVeTdzOJ3UabhGi+v3QqrWLCOP/rx/j6jAlUFpfz2cB7KM8uIqRTJOd89jAF21MpSc7xKbN+ivB1J7uN6PhYpo59jLCYcB77YioTRt5DeXE5BZn5PDzqPtpFhXLvOxNYt2ANxXl7/ZV6iML8tZXXecpuo0VCDH+NeZxmseH0nvskvw+7F2dxORv630pldiHNO0XR66vJlG9LoSK5/vOOGpSqw5ojcjKwy/paLiIyB7gQ2OpmcxPwulKqEEAp5c9JLgF+UEqVH46IgK6OIvIR8DwwBBhgLSfVYX+ziGwQkQ3vzvKd+FYb7aMiyMo5cCeUnZNHZIT/cfAflq7knLOGBVz2odI+IpTsvAMnjuz8QiLD2vnYDB/YlyC7nQ7RkcTHtSfFmhR389jz+eKVycx48n6Ugs6x7euvKSSYrOJ9BzSV7CMyJNjLpgXDesQQZEvCwWMAACAASURBVDOIa9eK+LAQUgrKWLYjgxPiwmjZzE7LZnYGd4lmc3r9T4yqtBAJOXBBlJBQVKln172rpADnzt/B5UTtzUMVZGGEtsfW/UScmYlQtR+q9uNM/BMj1ncC5yFr2luAtD3gN9ImDFXs+V9VcT7OretNTYU5qLwMjPCYmnRbr0E4tq4DV8OMu0a1bk52idu+K60g0mtoLKp1MMO6Rpn7rm1L4kNbkVJUzvJdOfSObntg38VH8mem7/DI4eAqyMUIj6pZN8IjcRXm+9g4NqwCpxNXbhbOzFQMq5cEIOiU4VSt/6XB3mPgzM7FHn2gZ9TePgJnbr6XTR77VqwBhxNHRhZVSWnYO8XhzM6j8u9d5rCO00X58tU0O7a7dxWHTElWASExB/w8JCaMkmzPu/WSzAJ2LtmIy+Fkb2ouBYmZhMZHE3diN0687ixu++Ulhj9yJb3GnMbQ8WPrrak8s4BWbppaRodRnuWpqSyzgJRFG1EOJ6WpuezdnUkbq5ekPNv0oZKUXDLXbCO8V2fqS0FWPmExB469sJhwCrM9j72CzHw2LlmH0+EkNzWHzMR0ouM9e4yLcgpJ35FKz5P9fmX+kKnMyKdZ7IGbrmYx4VR6tVVlRj4Fi9ajHE72p+Swb3cGLbqY54RKa1/vT8lh7+ottOqd0CC6GhSn03c5OHFAqtt6mrXNnR5ADxFZJSJrRWSUn3IuB7wv9lNFZLOIvCQidY4lB3qbfhJmt8ztSqm7rOXu2oyVUjOUUicppU668dorAqwCeh3Tg5S0DNIysqiqquKHH1cyfMgpPnZ7ktMoLimlb69j/ZTSMBzfPYHkjGzSsnKpqnKw8Kd1DDvZc5hr+Cn9WPenOXRUuLeE5I
xsOkRH4nS6KCo2h0h27EllR1Iqp/arf5fs8bGhpBSWkl5URpXTxaKtaQztHuNhM7xHLOuTzWCusHw/yQWldGjXkpg2LdmYkofDZc4T2ZiSR5eIkHprcmXuQULbI20jwLBhP2Ygzl2eQxzOnb9j62R1rbdojYRG4yrKRRXnY+vYE8QAw4atY09Ufv2HZlzpuzDCY5DQKLDZsZ0wGMd2z0mUzq3rMbpY+6RlCBIeg6vgwB2O/YTBOP74pd5aqjk+ui0pheWk7y03993fWQzrEuVhM7xbFOtTzZN24b5KkgvLiWvbguiQYDamFdbsu9/SCkgIa5ihGefu7RjRcRiR0WCz0+zUEVRtXO1hU7XhF+zH9wNAQtpgi+lgzimxaDZoBFWrf2wQPQD7t/yNvVMc9thosNtpdfYwyleu8bApX76K4AHmsKzRrg1BneNwpGWyf8vfGCGtMULNCZDBJ/elKjHZp45DJfOPRMISomnbMRIjyMZxo09h15LfPGx2LN5Ip1PNC2eL0NaEJURTlJLD/Hve5M1B/+HNIfeyfOqn/PX1z6x81vcpoEMl949E2iRE09rS1OXCU0jx0pS8aCMxg0xNzUNb06ZLNCXJOTRr2xKjmb1me/sBPSjakV5vTYl/7CI6IYbIjlHYguycMnoIvy3x7CnbuHgdx53aC4DWoSFEJ8SSk5JFWHQ4Qc3NuW0t27Si+0nHkLm7/poASjbtokWXGJp3ikKC7EReNJiCxZ668heuo+1gU5c9LIQWXWKoSM7G1rYVYrWVPSyENgOOodxtkuvRgnI4fRb3TgFrudkrWwBdotiB7sAw4ArgXRGpuSsXkRigN7DILc9E4BjMToswYHxd2gMdmvkLiAbqf5WoA7vdxsP33sYt9z2K0+nkX+ePpFuXzrz2ziyOP6YHw08zg5IFS1dwzplDG+zxRb9abDYevvUqbnv8JZwuFxedOYRuneN4/eO5HNc9nuED+zL4xF6s+X0LF93+KIZhcN+4S2nXpjX7K6u4fsIzALRq2YKn778JewMMIdkNgwkj+3DbnFW4XHBhn850i2zDGyu3clxMKMN6xDCoSxRr9mQz5u2lGIZw74hetGvZnDOPiWNdUi6XvvMjgjCoa5RPEHNYKBeVSz+m+SX3mY/K/vkLKj+DoMEX4cpKwrl7E66kv1AJxxM87ilQLqpWfg4VZTh3bMDofCzB46aAAmfSnzh3N8BQiMtF5fz3CL7+ERADx2/LUTlpBJ0xFlf6bpzbN+DcuQlbtz60uOcl037hR7DPDB6lXSTSLgJX0taDVBQ4dsNg/IjjuP3rDbiU4sLjO9A1IoQ3Vu/kuPZtGdY1ikGdI1iTnMeYD3/GJsJ/Tu9JuxbNOLN7NOtT87nsI/Ox2EHxEQztGnWQGgPE5WLfzFdoNfE5MAwqV/yAKy2J4EvG4djzN46Nq3H8sR577wGE/PcD0/6Tt1Cl5kQ+I6I9Rngkjm0NOITldFHwzGu0f/Np8/HdbxdRtTuZdrddx/6tO9i3cg37Vm8g+NT+xH71LrhcFL70Dq695iTogpdmEP32cyBC5badlHzl+zj5oaKcLhZP+pCxsx5CbAabP19J3s50TrvvYjI372HX0t/Ys3IzCaf35salz+Jyulg+bTYVRaX1rrsuTWse+5BRnzyEGAY7PltJ0Y50TnzgYvL+2EPKkt9IX7GZDqf3ZsyyZ1EuF+ufms3+olKi+ndn8LM3oFwuxDDY/Pp8j6dtDheX08WHk97loVmTMGwGKz//kfSdqVx83+Xs2byb35auZ/PK3+l9eh+eXfoyLqeL2dM+pLSolF5DunLlo9ehlDmSsmDGt6T9nXLwSgPB6SLx4Xc5fvajYDPImb2MfX+n0emhsZRu2k3B4g0ULd9Eu6F96PfTSyini6QpH+EoLCXkpJ50/e/N4FJgCGmvfuPxtM1Rg5/JqUqpGcCMOnKlAR3d1jsA3o6QBqxVSlUBe0Tkb8zApDqSuwz4xkqvrrc6VtgvIh
8AD9QlXbzHyfwaiVRPSlkH1ExBV0pdcLC8VXmJ9X92roFxFdT/gGtoXGu+a2wJPqic+o0XHylUUcnBjf5hpAEmIzc0lSs3N7YEH4q2HVXz4wGYU1j/YdMjQeRR+DTmctthTQE4oty2/6h6+LOGwVlfHrk7ZT+UPXaZz7W21ZOf16lBROzADuAMIB0zuLhSKbXFzWYUcIVS6joRiQB+B/oqpfKt9LXARGvyanWeGKVUppi9BS8BFUopv0/kQOA9IpMDtNNoNBqNRvMPoxyHHrkqpRwicifmsIoNeF8ptUVEpgAblFLzrLSRIrIVcGI+DVMdhMRj9qis9Cr6ExGJxBz62QTcWpeOgAIRpZR3JRqNRqPRaI4WDu+pGZRSC4AFXtsmuf1WwH3W4p03Cd/JrSilRhyKhkCfmjlFRNaLSKmIVIqIU0Sa2NteNBqNRqP530Q5XD5LUyHQoZnXMB/P+QLzCZprMSeraDQajUajaWwOY2jmaCHgmWNKqV0iYlNKOYEPRGT1QTNpNBqNRqM54qiq//1ApFxEmgGbROQ5zMd4Wx05WRqNRqPRaAKmCQ3FeBPoc0/XWLZ3AmWYs2QvPlKiNBqNRqPRBM7//BwRpVSyiLQAYpRSTxxhTRqNRqPRaA4BVdl0Ag9vAn1qZjTms8ALrfW+IjLvSArTaDQajUYTGMqhfJamQqBDM5Mxv9JXBKCU2oT5JV6NRqPRaDSNjKpUPktTIdDJqg6l1N4j+W0XjUaj0Wg0h4c6vPeZHRUE/NE7EbkSsIlId+BuQD++q9FoNBrNUUBTDkQCHZq5Czge84N3nwJ7gXuOlCiNRqPRaDSB46z0XZoKgQYix1mLHQgGLuTAJ4A1Go1Go9E0IsopPksgiMgoEflbRHaJiN8v5IrIZSKyVUS2iMinbts7ichiEdlmpcdb2xNE5FcR2Skin1nvIauVQIdmPgEeAP4Cmu4zQhqNRqPR/A/ichz6HE4RsQGvA2cBacB6EZmnlNrqZtMdmAgMVkoVikiUWxGzgKlKqSUi0poD8cGzwEtKqTki8hbwb+DN2nQE2iOSq5Sar5Tao5RKrl4C/bMajUaj0WiOHM4qw2cJgJOBXUqpRKVUJTAHc8TDnZuA15VShQBKqRwAETkOsCullljbS5VS5WI+1TIC+NLK/yFwUV0iAu0ReVxE3gV+xJwnglXx1wHm12g0Go1Gc4RwBTgU40UckOq2ngYM9LLpASAiqwAbMFkptdDaXiQiXwMJwFJgAhAKFClVM302zaqnVgINRMYBxwBBHOh6UYAORDQajUajaWT8BSIicjNws9umGUqpGe4mforyfgGJHegODAM6AD+LSC9r+2lAPyAF+Ay4HvD3stM6X2oSaCDSRynVO0BbjUaj0Wg0/yD+hmKsoGOGr3UNaZjfjqumA5Dhx2atUqoK2CMif2MGJmnA70qpRAARmQucArwPtBMRu9Ur4q9MDwKdI7LWGg/SaDQajUZzlOF0GT5LAKwHultPuTQDLse3R2MuMBxARCIwh2QSrbyhIhJp2Y0AtiqlFLAcuMTafh3wbV0iAg1EhgCbrEd8NovInyKyOcC8Go1Go9FojiAup/gsB8PqsbgTWARsAz5XSm0RkSkicoFltgjIF5GtmAHGg0qpfKWUE/Np2h9F5E/MYZ53rDzjgftEZBcQDrxXl45Ah2ZGBWin0Wg0Go3mH8bhsB1WPqXUAmCB17ZJbr8VcJ+1eOddApzgZ3si5hM5ARFQIKIf1dVoNBqN5ujF6Wq634ILtEfksCm7699HuopDZuOyqIMb/cPMCN5/cKN/mKqj9N11FznaNLYEH2YZ2xtbgg9RRrvGluBDoauisSX40MoobmwJfllVuquxJfjQ0h7c2BJ8mFOc3dgS/PJPf/rFpQMRjUaj0Wg0jUWV8/CGZo4GdCCi0Wg0Gk0Tx6l0j4hGo9FoNJpGwqECfQj26EMHIhqNRqPRNHEcukdEo9FoNBpNY+H0+7b2poEORDQajU
ajaeJU6UBEo9FoNBpNY+EUHYhoNBqNRqNpJPTQjEaj0Wg0mkajqgn3iDTd5300Go1Go9EA4BDfJRBEZJT1QdtdIjKhFpvLRGSriGwRkU+90tqISLqIvOa2bYVV5iZrqfN15rpHRKPRaDSaJs7hDM2IiA14HTgLSAPWi8g8pdRWN5vuwERgsFKq0E9Q8SSw0k/xVymlNgSiQ/eIaDQajUbTxKkS3yUATgZ2KaUSlVKVwBzgQi+bm4DXlVKFAEqpnOoEEekPtAcW10e7DkQ0Go1Go2niHObQTByQ6raeZm1zpwfQQ0RWichaERkFICIG8ALwYC1lf2ANyzwmUvcEFj00o9FoNBpNE8fp51IvIjcDN7ttmqGUmuFu4qco5bVuB7oDw4AOwM8i0gu4GliglEr1E2dcpZRKF5EQ4CvgGmBWbdp1IKLRaDQaTROnys82K+iY4SepmjSgo9t6ByDDj81apVQVsEdE/sYMTE4FThOR24HWQDMRKVVKTVBKpVv1l1iTW0+mjkBED81oNBqNRtPEOcyhmfVAdxFJEJFmwOXAPC+bucBwABGJwByqSVRKXaWU6qSUigceAGYppSaIiN2yQ0SCgPOBv+oSoXtENBqNRqNp4jgPI49SyiEidwKLABvwvlJqi4hMATYopeZZaSNFZKtVzYNKqfw6im0OLLKCEBuwFHinLh06ENFoNBqNpokT4FMyPiilFgALvLZNcvutgPuspbYyZgIzrd9lQP9D0aADEY1Go9FomjgOnzmmTQcdiGg0Go1G08Q53B6RowEdiGg0Go1G08Rx6h4RjUaj0Wg0jYUemqkn9j4DaHHtnWDYqFz+PfvnzfaxCTplGMEXXweAM3k35a89hf24vrS45o4aGyO2E+WvTqFqw6oG0RU2vA89nroesRlkfLKM5Fe/9bGJuuAUujxwKUopSrcms+W2VwEYkTGb0m0pAFSk57H52v82iKY+Q/tx/eM3YtgMls1Zwrdvfu1jc8p5g7n03stRSpG8LYlX736RiLhI7n97AoZhYAuysXDm9yz9ZFGDaOo39ERuePxGDJuNpXMW882bX/nYDDpvMGPvvQKlIGnbHqbf/QKRcZE89PZES5OdBTO/Y/EnCxtEU+ywExgw5RrEMNg1ewV/vT7fx6bz6IH0uW8MKEXh1hR+vvMNAK5OmUXRdvNlg2Xp+Swf92KDaBow7CRun3wrhs3GD7N/YM4bn/vYDD3/dK6992qUgsRtiUy76xmi4qKYPGMShs3Abrczd+a3fPfx9w2iCeCEof249vF/Y9gMls9Zynw/PjXwvEFcfO/lYPnU63e/RERcJPe+PR4xDOxBNhbNXMCPDeRT/Yf159bJt2LYDBbOXsgXb3zhY3Pa+adx9b1Xo5QicVsiz931HFFxUTw649Gatpo3cx4LPl7gp4ZDp+/QExln+fmPcxYz14+fn3reYC679wqw/Pzlu18gIi6SBy0/twfZ+aEB/Xz4GUOY8sxEbDYbn876ktemv+tjM/qiUTww4Q6UUmz5azt33PQQx/c+hmdemERISGucLicvP/82875pGE2njxjEY9MewGbY+Ozjb3j7lZk+NudeeBZ3P3QLSim2b9nBvbc8wrG9ejDlvw/TOqQVLqeLN156j+/n1uvN4R6cPXIYL744BZth8P4Hs3nuv6/72FxyyWgmPXYfSik2b97KNdfeCcD38z9m4MATWbVqPRf+67oG09SQVOlApB6IQYtx91A27UFc+bmETH2Lqo2rcaUn15gY0XE0v/BKSiffhSorRdq0A8CxdRMlE28yi2kVQsj0j6naHNA3dg6OIfR85gZ+v2wq+zPyGbDoafIWbaBsR3qNSYuEaOLvvogNoyfh2FtGUESbmjRnRSXrzhjfMFosxDC44clbmHrV4+Rn5fP0vP+yYek60nem1dhEx8dw0R0XM2nMBMqKy2gT3haAwpxCHhszHkelg+Ytg3l+8StsXLKOwpzCemkyDIObnryFJ66aRH5WPs
/Ne4H1S9eRtvPAW4Nj4mMYc8elPDxmPGXFZbR10zRxzEM4Kh0Etwxm+uJXWb9kHYU5BfXSJIYwcOp1LLniGcozCzh3wRRSF29k784D7+kJSWhP7ztHs/CiJ6jcW05wuOe++27kI/XS4I1hGNz11B2Mv3IiuZl5vP7dq6xespaUnSk1NnHxsVxxx1juGXMfpXtLaWe1U0FOAff8616qKqsIbhnMu0vfZs2SNeRn16+dwPSpcU/ezNNXTSY/K5+n5j3Hb3586sI7LuaJMRN9fOrxMRNqfOq5xS+zcck6ihrAp+546g4evvJh8jLzePm7l/l1ya8ebRUbH8vYO8Zy/5j7Kd1bWuNTBTkF3P+v+2va6q2lb7F2yVoK6tlWhmFw45O3MOWqSRRk5fPMvBfY4OXn0ZafP2r5eXU7FeUU8oibn7/YQH5uGAbTnn+UsRfdSGZGNj8s/4zFPyxnx9+7a2wSunTmrvtu4oKzr2Lv3mLCI8IA2Fe+j7tvnciexGTaR0eyaMWXrFi2iuK9JfXWNPnZ8Vx3ye1kZWTzzZKP+XHhSnbt2FNjE9+lI7feM47Lzh1H8d4SwiNCTU37KnjwjsdISkwlKjqCb3/8hJ+WraakuLRemqp1vfLyVEadewVpaZmsXbOA+d8tZtu2nTU23bolMP6hOzl96EUUFe0lMjK8Ju2FF9+iZcsW3HTj1fXWcqQ4nMd3jxYCeqGZiAT72RbREAJs3Y7BlZWBKycTnA4q1ywj6KTBHjbNRpxP5eK5qDLTIVVxkU85QQOH4ti0Dir3N4Qs2pzYjX17sqlIzkFVOcmeu5qIUQM8bOKuPoO0Dxbj2FsGQFVecYPUXRvd+nYnOymTnNRsnFUOVs//hQFnDfSwOeOKkSyetYCyYlNTcf5eAJxVDhyVDgCCmgVhGA0zs6lb3+5kJmWSnZqNo8rBL/N/5mQvTWdecTYLZ31fo2mvpcnhpsneLAgxGub9euH9ulKSlE1pSi6uKidJ366l49meT5N1v3I422cupXJvOQAV+Ud23/Xs25OMpAwyU7JwVDlYMW8Fg0ee6mFz7pXn8O2H8ynda/p5kVs7VVWa701s1iwIo4HaCXx9as38X+h/1skeNsOvOIvFs344qE9JA/lUj749yEjKIMtqq5XzVnLKyFM8bEZdOYr5bm21109bNaSmbn27k2W1k6PKwar5P/sce95+XnyE/bxf/94kJaaQkpxGVVUV3371A2efO8LD5qrrLmHmO5+yd6/p3/l5ZvCTuDuZPYnmzV52Vi55efmEh4fVW1OfE3uRvCeN1OR0qqocfPfNIs48Z5iHzdhrxvDx+5/XBD35eWbgmrQ7haREM7DLycojP7ewJkipLycP6Mfu3Uns2ZNCVVUVn3/+LReMPtvD5sZ/X8mbb86kqMjcb7m5B16VsWz5L5SU1D8gOpI4UT5LUyHQHpH1InKTUmotgIhcDDyN+Ya1emGERuDKr/mYH678XOzdjvWwsUV3AKD15FfBMKj4aiaOP9Z72AQNGs7+7327bw+X4OgwKjIOOOL+jHzanNjNw6Zl1xgA+s+fgtgMEv/7BQXL/zD/V/MgBiyahnK6SHp1Lnk/1L+nJiw6jPzMvJr1/Mx8uvXr7mETkxALwJSvnsYwDL6YPoc/Vv4OQHhMBOM/eJTo+Bg+njaz3r0hAOHR4V6a8ujer6eHTayladpXz2IYBp9Nn83vK3+r0fTIB5OIiY/hw2kf1PsuEaBldChlGQfKKc8sIKJfVw+bNl2iARg1dxJiM/jjha/JWLEZAFvzIM5dMAXldPHXa/NJXbSx3poiosPJycitWc/NzOOYfsd42HToYvr59K9fxGYzmPXSx6xfYfpNZEwkUz+cQmx8LDOmvtsgvSEAoV4+VZCZT7d+nod1tU89/tU0DMPgq+mfsdnyqbCYcB764FHax8fw6bQP690bAhARHUGuW1vlZebR08un4rqY3+V6/uvnsdlsfPzSx2xcYe6niJgIpnw4hZ
j4GN6b+l69e0MAwqLDyQvQz5+y/Pzz6bPZ5ObnD38wiej4GD5qID+PjmlPenpWzXpmRhb9+p/gYdO1WzwA3y78GJvNxgvPvM7yH3/xsOl7Ym+aBQWRtCeF+tI+JpLMjAOasjJy6NO/l4dNQtdOAHz+/fsYNhuvPPc2Py1b7WFzQr/jCWoWRPKeNBqC2LhoUtMO9IimpWdy8oB+Hjbdu3cB4KcVc7HZbEx58gUWLV7RIPX/E1ThamwJh02ggciVwPsisgKIBcKBEXXmCBS/H+XziuRsNozoOEqf/A9GWCStH3+FkofGocrNOw9pF4atYxccm9f7KashdXmZ2A1adInmt389QfPYMPp/+wS/Dn0AR3E5q068g8rsQoI7R3Hil49RtjWVfcnZ9ZPk7/tEXk1l2A2i42N4YuyjhMWE88QX03hg5D2UF5eRn5nHQ6P+Q2hUKA+8M5FfF6xmb97eemny+80k5SnKZrcRGx/DY2MfJjwmgqlfPM09I++q0XTfqLsJjQpjwjsPs2bBavbm+fZ4HZIif/vOp51stEmIZtElU2kVE8bZ3zzGvBETqCou56uT72FfdhGtO0Uy8vOHKdyeSmlyjm+Z9dbk1U42G3EJcdx/2YNExkTw0lcvcOOZt1BWXEZuZi43j7yN8PZhPPHuZH76/meK6tlO4N+nlJcuw24jOj6Gp8Y+RlhMOJO+mMr4kfdQXlxOQWY+E0bdS7uoUO63fKq4vj4V0CnBbKvxl40nIiaC5796nlvPvJWy4jLyMvO4feTthLUPY9K7k/jl+1/q3VaBtJPNbiMmPobHLT9/8ounudfNz++3/PyhI+jn3vfANpuNLl07c/H51xMT1565Cz5i+KALa3ojotpH8Orbz3DPbRN9/k9DafI9H9iJ79KRKy+8mejYKOZ89x7nDLm0Zggmsn0EL7z5JA/e8XiDaKpNl3fZdpudbt0SGHHmJXToEMOKZd/Qp9+Imt6ko52m1APiTUB9hEqpP4GpwK2Y75y/UylVa6gqIjeLyAYR2TBzl/f3czxxFeRihEcdEBQeiasw38fGsWEVOJ24crNwZqZiWL0kAEGnDKdq/S/gbLhRsorMfIJjD4wRNo8NZ3+W591eRUYBeQs3oBxOKlJyKd+dQYsuZi9JZbZpW5GcQ+HqrYT0jq+3pvysfMJjDoyIhceEU+h1t1eQmc+GJetwOpzkpuaQkZhBTHyMh01hTiFpO1I55uTjGkBTnpemCJ870PzMPNYt+RWnw0lOajbpienE+mgqIHVHCsc1gKayzAJaxR7oZm4ZE0Z5dqGPTerijSiHk9LUXIp3Z9Imwewl2ZdtXiBKU3LJWrONsF6d660pNzOPqNjImvXImAjys/N9bFYvXoPT4SQrNZvU3Wl0SPD8Ind+dgFJO5LpfbLnXebhUuDlU2G1+NRGN5/KTMwgOj7Ww6Yop5C0HSkN4lN5mXlEurVVhJ+2ysvMY43VVtmp2aTtTiPOq60KsgtI3pFMrwZoq/ysPCK8/Ny7nfIz81jv5ucZiel+jj3Tz49tgHbKzMgiLi66Zj0mNprszBwvm2wWLliGw+EgNTmd3buSSOhi+nPrkFZ8/PlbPPvUK/y2YXO99YDZAxITe0BTdGwU2Vm5XjbZLP1hJQ6Hg7SUDPbsSibe6iVp3boV785+mRenvcGmjX82iCaA9LRMOnY44LMd4mLIzPS8MUxLz2TevMU4HA6SklLZsWM33bslNJiGI01THpoJdI7Ie8B/gBOAccB8EbmjNnul1Ayl1ElKqZOu7xZbmxkAzt3bMaLjMCKjwWan2akjqNro2U1XteEX7Meb3WgS0gZbTAdzTolFs0EjqFr9YyB/JWBKft9Nyy7RBHeKRIJstL9oEHmLPIdXcn9YT+jg4wEICguhZZcY9iVnY2/bCmlmr9ne7uSelO2ofxfj7j92Ep0QQ2THKGxBdgaNHsKGJes8bNYv/pXjTzVPvCGhIcQkxJKdkk1YdDhBzZsB0KpNK3
qcdAwZu+sOEgNh1x87iUmIJapje+xBdoaMPo31S371sFm3+Fd6nXpCjabYxax8dQAAD6BJREFUhFiyUrIJjw6nmZumY046lvTd6T51HCr5mxIJSYimdcdIjCAb8ReeQuri3zxsUhduJHqQeTFo/n/t3X10VPWdx/H3Z0JQkSKIokKogAaVg4KIHhREfKx2VXQtLHp8WNsF3dVWVqsVDoLV6tFW21rKsUZBoAI+IAqiPIQI6KooBAjyYALlKUFaXAWEditk5rt/3Bs6JCHcwpA7ke/rnHsyM/nNzSc3NzO/+T3dFk1p1uFEdm7cQuNjmpAI/3ZHtGhKq3M7sr3s4DOVlpTSpl0bTgyPU59r+/Bh4YK9ynw4+0O6nt8FgGYtmpHXIY/NGzZz3InH0fjI4Dg1PaYpnbt3omJtZpqsq59T51/Ti+LCvVsWF83+mE7nnwn845zaUus5dQabM/D3Kyspo3W71pwQHquLrr2IBdWO1UezP6JL2rFq06FNrceqU4aOVfXzvGeE8zz9f6/6ef55Bo7T0sXLaX/KybQ9uQ25ubn0veEqZs2Yu1eZmW8X0fPCYMzPscc2p8MpJ7NxfTm5ubmMeWkkr708lelTMzPTCWDZkhW069CWvO+2Jje3EVdf/z2KZs7fq0zhO/Po0as7AC2ObU77U75L+fpN5OY24tnxT/PGK28zY9qcjGUCWLhoKaee2p527dqSm5tL//59eWv63jNypk2bSZ8+FwDQsmUL8vM7sDYD3VX1ZbdZja2hiNo1sxz4j3DN+XWSegCZmdOYSvF/Y3/H0UN+CYkEu+bNIFWxniN/cDuV60qpLP6QypKFNDrzXL7zqxeD8hP+gO0MmssSx51AouXxVK4qyUicKpZMUTpkDGe/PBRyEmyeNI+/llbQ4YF+fF2ylv+dVcxXc0to2ecserz3NJZKseaRCVRu3ckx3Tty+lMDsZShhFg/cupes20OVCqZYszw5xk6fgSJnBzmvTqHitXl9Lv3RtYuW0PxnIWUzF/CWb278vSckaSSKSY8Ppad23ZwZq8u3DLs9qCZVGJ6wVTKSzfs/4dGyPTC8OcYPv5hEjkJil6dQ/nqcgbcexN/WraGhXM+Ycn8xXTp3ZVn5vyeVDLFuDDTKb26ctuwH+7JNLXgTTZmIJMlU3wybByXTXwgmL77yny2l22iy09v4MuSdVQULubzectofdGZXDv3SSyZovjRSXyzdSfHd8+nxxM/xCyFlGD579/aa7bNwRynkQ+N4omXHg+mpL4ymw1lG7jtvlspW1bGR4ULWDhvEef07sboogJSqRQFjz3P19t20O3CfO58aGDVYeK15yaz7rP1B52pKtfY4c/z4PgRJHISzHu1iE2ry/lBeE4tnrOQZeE59cs5vyOVTDHx8XHs3LaDzr26cPOwf8fMkMTbBW9SXnrwL9ypZIpnH3qWX7z0C3Jycpj9ymw2lm3klvtuoWxZGR8XfkzxvGK69e7Gc0XPkUwlGf3YaHZs28GpF57KwIcG7sk05bkprM/Asao6z4eF5/m74f/ev4Xn+aI5n7A0PM9/E57nfwzP87PC87wq07QMnefJZJKh9z/GpNefJycnwcsvvUHZZ2u4f+jdlCxZwewZc5lb9D9cdMkFzF/wFslkkkeHP8XWrdu5of819LjgHFoc25z+N10PwOD/GsqKTz876Ew/f/BJxr42ikQiweSJ01hdupbBD97Jp0tXUjTzPd5790N6XdyDmR9MJpVM8sTDv2Xb1u307fd9zj3/bJq3OIYbBlwDwAM/HsGq5WUZOVb3DB7GO29PJCeRYOy4V1i5soyHR/yURcUlTJ9eyKzZ87j8sotYVjKXZDLJz4Y8yldfBS2p896dwmmnnUrTpk1Yv3YRg+64j9mF8/fzU+tXsgGPEVGm+uD2ZduNF2ddtaz43Vb7L1TPCo7MzGyfTMrWwU/XVTbbf6F6Nj7xxf4L1bNWiSZxR6hha+rvcUeo4ehEbtwRavXB9j
VxR6ihSaMaEyhjt+Hrgxt7d6hU7tpUr4uu9zu5b4332tc2TN1vBklXAs8QXCn3BTN7opYy/YGHCYYhlZjZTZJOBqaEz8sFRprZH8Ly5xBcBO8oggvq3WN1VDYitYhIyieYJdMJ2HMmmlmHKM93zjnn3KFTaf/8B0dJOcAo4HKggmCG7DQzW5lWJh8YAvQ0s62Sqj7JbwYuMLNvJDUFlofP/Rx4FhgELCCoiFwJzNhXjqgT2l8Md1xJMFh1PPDHyL+tc8455w6ZSqzGFsF5wBozW2tmu4CXgb7VygwERpnZVgAz2xJ+3WVmVU35RxDWJySdBDQzs4/CVpDxwHV1hYhaETnKzIoIunI2mNnDZGr6rnPOOecOSqUla2zpM1jDbVC1p7UBytPuV4SPpesIdJT0gaQFYVcOAJLaSloW7uPJsDWkTbifuva5l6iDVf8uKQGslnQ3sAnIvoEWzjnn3GGotum6ZlYAFNTxtAir9tAIyAf6AHnA+5I6m9k2MysHzpLUGnhT0uSI+9xL1BaRwUAT4CfAOcDNwK0Rn+ucc865QyhpqRpbBBVA27T7eUD1aYIVwFQz221m64BSgorJHmFLyArgwrB8Xtq3a9vnXqJWRIxgTMg0oDtBU83zEZ/rnHPOuUNotyVrbBEsBPIltZfUGBhA8D6f7k2CsaFV15jrCKyVlCfpqPDxFkBPoNTMNgM7JPVQsKTtrUDNS9enido1MwG4H/gUsnROp3POOXeYOpCVVM2sMhxuMYtgGu4YM1sh6RFgkZlNC793haSVBBf5vd/MvpR0OfC0JCPojnkqXIUd4D/5x/TdGdQxYwaiV0S+CAM555xzLstE7IqpwczeIZhim/7Y8LTbBtwbbullCglWW69tn4uAyNdWiFoRGSHpBaAI2LPylplNifqDnHPOOXdoVEbrislKUSsitwOnE6yeVlXtMoJV1ZxzzjkXowNtEckGUSsiXczszEOaxDnnnHMHpCFfaybqrJkFkg7+utXOOeecy7jdqWSNraGI2iLSC7hN0jqCMSIiGMNS60AV55xzztWfw6Fr5sr9F3HOOedcHL71FREz23CogzjnnHPuwFRaZdwRDljUFhHnnHPOZalvfYuIc84557JXMuUVEeecc87F5HBY0Mw555xzWaoht4hEXUfEOeecc1mqMpWssUUh6UpJpZLWSHpwH2X6S1opaYWkiWmPz5S0TdL0auXHSlonaWm4da0rg7eIOOeccw3cgQxWlZQDjAIuByqAhZKmmdnKtDL5wBCgp5ltldQqbRe/ApoAd9Sy+/vNbHKUHN4i4pxzzjVwyVSqxhbBecAaM1trZruAl4G+1coMBEaZ2VYAM9tS9Q0zKwJ2HGx2r4g455xzDVwylayxRdAGKE+7XxE+lq4j0FHSB5IWSIq6wOljkpZJ+o2kI+oqeMi7ZppPmqtM7UvSIDMrONj9XJqJMKFvc6ZMysZMkLlct2YiTCgbj5VniiYbM0F25vJMmbV716Ya77WSBgGD0h4qqPb71fb+bNXuNwLygT5AHvC+pM5mtq2OOEOAPwONgQLgZ8Aj+yrc0FpEBu2/SL3zTNFkYybIzlyeKRrPFF025vJMh5iZFZhZ97SteiWrAmibdj8P+LyWMlPNbLeZrQNKCSomdf3czRb4BniRoAtonxpaRcQ555xzmbEQyJfUXlJjYAAwrVqZN4GLASQdR9BVs7aunUo6Kfwq4DpgeV3lfdaMc845dxgys0pJdwOzgBxgjJmtkPQIsMjMpoXfu0LSSiBJMBvmSwBJ7wOnA00lVQA/MrNZwARJxxN0/SwF7qwrR0OriGRj351niiYbM0F25vJM0Xim6LIxl2fKAmb2DvBOtceGp9024N5wq/7cC/exz0v+mQwKfoZzzjnnXP3zMSLOOeeci02DqIhEWYK2vkkaI2mLpDoH4dQnSW0lzZW0KlyK954syHSkpE8klYSZfh53piqSciQtqb48cVwkrZf0abgk8qK481SR1F
zSZEmfhefW+THnOS1t6eilkr6WNDjOTGGu/w7P8eWSJkk6Mgsy3RPmWRHnMart9VLSsZIKJa0Ov7bIgkz9wmOVktS9PvMczrK+IpK2BO1VQCfgRkmd4k0FwFgg6sIu9aUSuM/MzgB6AHdlwbH6BrjEzLoAXYErJfWIOVOVe4BVcYeo5mIz62pm2fQi+Aww08xOB7oQ8zEzs9LwGHUFzgH+BrwRZyZJbYCfAN3NrDPBwL8BMWfqTLAq5nkEf7erw+W64zCWmq+XDwJFZpYPFIX34860HPhX4L16znJYy/qKCNGWoK13ZvYe8FXcOdKFc7cXh7d3ELxhVF8lr74zmZntDO/mhlvsA5Mk5QH/ArwQd5ZsJqkZ0BsYDWBmu/azkFF9uxT4k5ltiDsIweD/oyQ1Irj+RvX1GOrbGcACM/ubmVUC84Hr4wiyj9fLvsC48PY4gmmesWYys1VmVlqfOVzDqIhEWYLWVSOpHXA28HG8SfZ0gSwFtgCFZhZ7JuC3wANANl0724DZkorDFRGzQQfgC+DFsBvrBUlHxx0qzQBgUtwhzGwT8BSwEdgMbDez2fGmYjnQW1JLSU2A77P34lVxO8HMNkPwIQpotZ/y7luqIVREoixB69JIagq8Dgw2s6/jzmNmybAZPQ84L2wyjo2kq4EtZlYcZ45a9DSzbgTdkHdJ6h13IIJP+d2AZ83sbOCv1H8Teq3CBZiuBV7LgiwtCD7htwdaA0dLujnOTGa2CngSKARmAiUE3bfOZZWGUBGJsgStC0nKJaiETDCzKXHnSRc26c8j/rE1PYFrJa0n6Oq7RNJL8UYCM/s8/LqFYMxDncsi15MKoCKtFWsyQcUkG1wFLDazv8QdBLgMWGdmX5jZbmAKcEHMmTCz0WbWzcx6E3RDrI47U5q/pK3AeRJBi6k7DDWEikiUJWgde5bTHQ2sMrNfx50HQNLxkpqHt48ieMH+LM5MZjbEzPLMrB3B+fSumcX66VXS0ZK+U3UbuIL9LItcH8zsz0C5pNPChy4FVsYYKd2NZEG3TGgj0ENSk/D/8FKyYCC0pFbh1+8SDMLMluMFwev4beHt24CpMWZxMcr6lVX3tQRtzLGQNIngaoTHhUvbjjCz0fGmoidwC/BpOCYDYGi4cl5cTgLGhbOfEsCrZpYV02WzzAnAG8F7GI2AiWY2M95Ie/yYYMnmxgTXmLg95jyEYx4uB+6IOwuAmX0saTKwmKD7YwnZsUrn65JaAruBu8xsaxwhanu9BJ4AXpX0I4KKXL8syPQVMBI4Hnhb0lIz+1595joc+cqqzjnnnItNQ+iacc4559y3lFdEnHPOORcbr4g455xzLjZeEXHOOedcbLwi4pxzzrnYeEXEOeecc7HxiohzzjnnYuMVEeecc87F5v8BSdOXlkeqA20AAAAASUVORK5CYII=\\n\",\n      \"text/plain\": [\n       \"<Figure size 720x144 with 2 Axes>\"\n      ]\n     },\n     \"metadata\": {\n      \"needs_background\": \"light\"\n     },\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"if len(results) == 24:\\n\",\n    \"    df = pd.DataFrame(\\n\",\n    \"        np.reshape(\\n\",\n    \"            [r[\\\"STSBenchmark\\\"][\\\"pearson\\\"] for r in results],\\n\",\n    \"            (len(EXP_PARAMS[\\\"layer_index\\\"]), len(EXP_PARAMS[\\\"pooling_strategy\\\"])),\\n\",\n    \"        ).T,\\n\",\n    
\"        index=[s.value for s in EXP_PARAMS[\\\"pooling_strategy\\\"]],\\n\",\n    \"        columns=EXP_PARAMS[\\\"layer_index\\\"],\\n\",\n    \"    )\\n\",\n    \"    fig, ax = plt.subplots(figsize=(10, 2))\\n\",\n    \"\\n\",\n    \"    sns.heatmap(df, annot=True, fmt=\\\".2g\\\", ax=ax).set_title(\\n\",\n    \"        \\\"Pearson correlations of BERT sequence encodings on STS Benchmark\\\"\\n\",\n    \"    )\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/sentence_similarity/gensen_aml_deep_dive.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Copyright (c) Microsoft Corporation. All rights reserved.\\n\",\n    \"\\n\",\n    \"Licensed under the MIT License.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Training GenSen on AzureML with SNLI Dataset\\n\",\n    \"**GenSen: Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning** [\\\\[1\\\\]](#References)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Introduction\\n\",\n    \"GenSen is a technique to learn general purpose, fixed-length representations of sentences via multi-task training.  The model combines the benefits of diverse sentence representation learning objectives into a single multi-task framework. As described in the paper **Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning**, it is \\\"the first large-scale reusable sentence representation model obtained by combining a set of training objectives with the level of diversity explored here, i.e. multi-lingual NMT, natural language inference, constituency parsing and skip-thought vectors\\\" [\\\\[1\\\\]](#References). These representations are useful for transfer and low-resource learning. GenSen is trained on several data sources with multiple training objectives on over 100 milion sentences.\\n\",\n    \"\\n\",\n    \"GenSen yields the state-of-the-art results on multiple datasets, such as MRPC, SICK-R, SICK-E and STS, for sentence similarity. 
The reported results are as follows compared with other models [\\\\[3\\\\]](#References):\\n\",\n    \"\\n\",\n    \"| Model | MRPC | SICK-R | SICK-E | STS |\\n\",\n    \"| --- | --- | --- | --- | --- |\\n\",\n    \"| GenSen (Subramanian et al., 2018) | 78.6/84.4 | 0.888 | 87.8 | 78.9/78.6 |\\n\",\n    \"| [InferSent](https://arxiv.org/abs/1705.02364) (Conneau et al., 2017) | 76.2/83.1 | 0.884 | 86.3 | 75.8/75.5 |\\n\",\n    \"| [TF-KLD](https://www.aclweb.org/anthology/D13-1090) (Ji and Eisenstein, 2013) | 80.4/85.9 | - | - | - |\\n\",\n    \"\\n\",\n    \"This notebook serves as an introduction to an end-to-end NLP solution for sentence similarity by demonstrating how to train and tune GenSen on the AzureML platform. We show the advantages of AzureML when training large NLP models with GPU.\\n\",\n    \"\\n\",\n    \"For more information on **AzureML**, see these resources:\\n\",\n    \"* [Quickstart notebook](https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-create-workspace-with-python)\\n\",\n    \"* [Hyperdrive](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Background: Sequence-to-Sequence Learning\\n\",\n    \"![Sequence to sequence learning examples: machine translation (left) and constituent parsing (right)](https://nlpbp.blob.core.windows.net/images/seq2seq.png)**Sequence to sequence learning examples: machine translation (left) and constituent parsing (right)**\\n\",\n    \"\\n\",\n    \"The GenSen model is known to be most similar to that of Luong et al. (2015) [\\\\[4\\\\]](#References), who train a many-to-many **sequence-to-sequence** model on a diverse set of weakly related tasks that includes machine translation, constituency parsing, image captioning, sequence autoencoding, and intra-sentence skip-thoughts. 
\\n\",\n    \"\\n\",\n    \"Sequence-to-sequence learning, or seq2seq, aims to directly model the conditional probability $p(y|x)$ of mapping an input sequence, $x_1,...,x_n$, into an output sequence, $y_1,...,y_m$. This is done using an encoder-decoder framework. As illustrated in the above figure, the encoder computes a representation $s$ for each input sequence,  which the *decoder* uses to generate the output sequence. This decomposes the conditional probability as [\\\\[4\\\\]](#References):\\n\",\n    \"$$\\n\",\n    \"\\\\log p(y|x)=\\\\sum_{j=1}^{m} \\\\log p(y_j|y_{<j}, x, s)\\n\",\n    \"$$\\n\",\n    \"\\n\",\n    \"It is worth noting that the GenSen model deviates from Luong's seq2seq method in two key ways. First, Luong's model uses an attention mechanism, which prevents it from learning fixed-length vector representations; GenSen, in contrast, learns fixed-length sentence representations. Second, Luong's model optimizes for improvements on the same tasks on which it is trained, whereas GenSen optimizes for transferability to different tasks or domains. [\\\\[1\\\\]](#References)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Azure ML Compute vs. Local\\n\",\n    \"We did a comparative study to make it easier for you to choose between a GPU enabled Azure VM \\n\",\n    \"and Azure ML compute. The table below provides the cost vs performance trade-off for \\n\",\n    \"each of the choices. We can tell from the table below that distributed training on AzureML makes the model converge faster and reach a better training loss in a similar amount of training time.\\n\",\n    \"\\n\",\n    \"* The total time in the table stands for the training time + setup time.\\n\",\n    \"* Cost is the estimated cost of running the Azure ML Compute Job or the VM up-time.\\n\",\n    \"\\n\",\n    \"**Please note:** These were the estimated costs for running these notebooks as of July 1. 
Please \\n\",\n    \"look at the [Azure Pricing Calculator](https://azure.microsoft.com/en-us/pricing/calculator/) to see the most up-to-date pricing information. \\n\",\n    \"\\n\",\n    \"|---|Azure VM| AML 1 Node| AML 2 Nodes | AML 4 Nodes | AML 8 Nodes|\\n\",\n    \"|---|---|---|---|---|---|\\n\",\n    \"|Training Loss|4.91|4.81|4.78|4.77|4.58|\\n\",\n    \"|Total Time|1h 05m|1h 54m|1h 44m|1h 26m|1h 07m|\\n\",\n    \"|Cost|\\\\$1.12|\\\\$2.71|\\\\$4.68|\\\\$7.9|\\\\$12.1|\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Table of Contents\\n\",\n    \"0. [Global Settings](#0-Global-Settings)\\n\",\n    \"1. [Data Loading and Preprocessing](#1-Data-Loading-and-Preprocessing)    \\n\",\n    \"    * 1.1. [Load SNLI](#1.1-Load-SNLI)  \\n\",\n    \"    * 1.2. [Tokenize](#1.2-Tokenize)  \\n\",\n    \"    * 1.3. [Preprocess](#1.3-Preprocess)  \\n\",\n    \"    * 1.4. [Upload to Azure Blob Storage](#1.4-Upload-to-Azure-Blob-Storage)  \\n\",\n    \"2. [Train GenSen with Distributed Pytorch and Horovod on AzureML](#2-Train-GenSen-with-Distributed-Pytorch-and-Horovod-on-AzureML)  \\n\",\n    \"    * 2.1 [Create or Attach a Remote Compute Target](#2.1-Create-or-Attach-a-Remote-Compute-Target)  \\n\",\n    \"    * 2.2. [Prepare the Training Script](#2.2-Prepare-the-Training-Script)  \\n\",\n    \"    * 2.3. [Define the Estimator and Experiment](#2.3-Define-the-Estimator-and-Experiment)  \\n\",\n    \"        * 2.3.1 [Create a PyTorch Estimator](#2.3.1-Create-a-PyTorch-Estimator)\\n\",\n    \"        * 2.3.2 [Create the Experiment](#2.3.2-Create-the-Experiment)\\n\",\n    \"    * 2.4. [Submit the Training Job to the Compute Target](#2.4-Submit-the-Training-Job-to-the-Compute-Target)\\n\",\n    \"        * 2.4.1 [Monitor the Run](#2.4.1-Monitor-the-Run)\\n\",\n    \"        * 2.4.2 [Interpret the Training Results](#2.4.2-Interpret-the-Training-Results)\\n\",\n    \"3. 
[Tune Model Hyperparameters](#3-Tune-Model-Hyperparameters)\\n\",\n    \"    * 3.1 [Start a Hyperparameter Sweep](#3.1-Start-a-Hyperparameter-Sweep)\\n\",\n    \"    * 3.2 [Monitor HyperDrive Runs](#3.2-Monitor-HyperDrive-Runs)\\n\",\n    \"    * 3.3 [Find the Best Model](#3.3-Find-the-Best-Model)\\n\",\n    \"- [References](#References)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 0 Global Settings\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\\n\",\n      \"Azure ML SDK Version: 1.0.48\\n\",\n      \"Pandas version: 0.24.2\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"import sys\\n\",\n    \"import time\\n\",\n    \"import os\\n\",\n    \"import pandas as pd\\n\",\n    \"import shutil\\n\",\n    \"import papermill as pm\\n\",\n    \"import scrapbook as sb\\n\",\n    \"\\n\",\n    \"sys.path.append(\\\"../../\\\")\\n\",\n    \"from utils_nlp.dataset import snli, preprocess, Split\\n\",\n    \"from utils_nlp.azureml import azureml_utils\\n\",\n    \"from utils_nlp.models.gensen.preprocess_utils import gensen_preprocess\\n\",\n    \"\\n\",\n    \"import azureml as aml\\n\",\n    \"import azureml.train.hyperdrive as hd\\n\",\n    \"from azureml.telemetry import set_diagnostics_collection\\n\",\n    \"import azureml.data\\n\",\n    \"from azureml.data.azure_storage_datastore import AzureFileDatastore\\n\",\n    \"from azureml.core.compute import ComputeTarget, AmlCompute\\n\",\n    \"from azureml.core.compute_target import ComputeTargetException\\n\",\n    \"from azureml.core import Experiment, get_run\\n\",\n    \"from azureml.core.runconfig import MpiConfiguration\\n\",\n    \"from azureml.train.dnn import 
PyTorch\\n\",\n    \"from azureml.train.estimator import Estimator\\n\",\n    \"from azureml.train.hyperdrive import (\\n\",\n    \"    RandomParameterSampling,\\n\",\n    \"    BanditPolicy,\\n\",\n    \"    HyperDriveConfig,\\n\",\n    \"    uniform,\\n\",\n    \"    PrimaryMetricGoal,\\n\",\n    \")\\n\",\n    \"from azureml.widgets import RunDetails\\n\",\n    \"\\n\",\n    \"print(\\\"System version: {}\\\".format(sys.version))\\n\",\n    \"print(\\\"Azure ML SDK Version:\\\", aml.core.VERSION)\\n\",\n    \"print(\\\"Pandas version: {}\\\".format(pd.__version__))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# Model configuration\\n\",\n    \"NROWS = None\\n\",\n    \"CACHE_DIR = \\\"./temp\\\"\\n\",\n    \"MAX_EPOCH = 2 # by default is None\\n\",\n    \"ENTRY_SCRIPT = \\\"utils_nlp/gensen/gensen_train.py\\\"\\n\",\n    \"TRAIN_SCRIPT = \\\"gensen_train.py\\\"\\n\",\n    \"CONFIG_PATH = \\\"gensen_config.json\\\"\\n\",\n    \"EXPERIMENT_NAME = \\\"NLP-SS-GenSen-deepdive\\\"\\n\",\n    \"UTIL_NLP_PATH = \\\"../../utils_nlp\\\"\\n\",\n    \"MAX_TOTAL_RUNS = 8\\n\",\n    \"MAX_CONCURRENT_RUNS = 4\\n\",\n    \"\\n\",\n    \"# Azure resources\\n\",\n    \"subscription_id = \\\"YOUR_SUBSCRIPTION_ID\\\"\\n\",\n    \"resource_group = \\\"YOUR_RESOURCE_GROUP_NAME\\\"  \\n\",\n    \"workspace_name = \\\"YOUR_WORKSPACE_NAME\\\"  \\n\",\n    \"workspace_region = \\\"YOUR_WORKSPACE_REGION\\\" #Possible values eastus, eastus2 and so on.\\n\",\n    \"AZUREML_CONFIG_PATH = \\\"./.azureml\\\"\\n\",\n    \"AZUREML_VERBOSE = False\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"In this notebook we use the Azure Machine Learning Python SDK to facilitate remote training and computation. 
To get started, we must first initialize an AzureML [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace), a centralized resource for managing experiment runs, compute resources, datastores, and other machine learning artifacts on the cloud. Refer to the official [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook for more information about setting up the workspace.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"if os.path.exists(AZUREML_CONFIG_PATH):\\n\",\n    \"    ws = azureml_utils.get_or_create_workspace(config_path=AZUREML_CONFIG_PATH)\\n\",\n    \"else:\\n\",\n    \"    ws = azureml_utils.get_or_create_workspace(\\n\",\n    \"        subscription_id=subscription_id,\\n\",\n    \"        resource_group=resource_group,\\n\",\n    \"        workspace_name=workspace_name,\\n\",\n    \"        workspace_region=workspace_region,\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"if AZUREML_VERBOSE:\\n\",\n    \"    print(\\\"Workspace name: {}\\\".format(ws.name))\\n\",\n    \"    print(\\\"Azure region: {}\\\".format(ws.location))\\n\",\n    \"    print(\\\"Subscription id: {}\\\".format(ws.subscription_id))\\n\",\n    \"    print(\\\"Resource group: {}\\\".format(ws.resource_group))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Opt-in diagnostics for better experience, quality, and security of future releases.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Turning diagnostics collection on. 
\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"set_diagnostics_collection(send_diagnostics=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 1 Data Loading and Preprocessing\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We use the [SNLI](https://nlp.stanford.edu/projects/snli/) dataset in this example.\\n\",\n    \"\\n\",\n    \"Note: The dataset used in the original paper can be downloaded by running the bashfile [here](https://github.com/Maluuba/gensen/blob/master/get_data.sh). Training on the original datasets will reproduce the results in the [paper](https://arxiv.org/abs/1804.00079), but **will take about 20 hours of training time**. For the purposes of this example we use SNLI, a subset of the original dataset, as the only training dataset.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1.1 Load SNLI\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|████████████████████████████████████████████████████████████████████████████| 92.3k/92.3k [00:07<00:00, 11.6kKB/s]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"data_dir = os.path.join(CACHE_DIR, \\\"data\\\")\\n\",\n    \"train = snli.load_pandas_df(data_dir, file_split=Split.TRAIN, nrows=NROWS)\\n\",\n    \"dev = snli.load_pandas_df(data_dir, file_split=Split.DEV, nrows=NROWS)\\n\",\n    \"test = snli.load_pandas_df(data_dir, file_split=Split.TEST, nrows=NROWS)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \" 
       vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>gold_label</th>\\n\",\n       \"      <th>sentence1_binary_parse</th>\\n\",\n       \"      <th>sentence2_binary_parse</th>\\n\",\n       \"      <th>sentence1_parse</th>\\n\",\n       \"      <th>sentence2_parse</th>\\n\",\n       \"      <th>sentence1</th>\\n\",\n       \"      <th>sentence2</th>\\n\",\n       \"      <th>captionID</th>\\n\",\n       \"      <th>pairID</th>\\n\",\n       \"      <th>label1</th>\\n\",\n       \"      <th>label2</th>\\n\",\n       \"      <th>label3</th>\\n\",\n       \"      <th>label4</th>\\n\",\n       \"      <th>label5</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>neutral</td>\\n\",\n       \"      <td>( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...</td>\\n\",\n       \"      <td>( ( A person ) ( ( is ( ( training ( his horse...</td>\\n\",\n       \"      <td>(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...</td>\\n\",\n       \"      <td>(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...</td>\\n\",\n       \"      <td>A person on a horse jumps over a broken down a...</td>\\n\",\n       \"      <td>A person is training his horse for a competition.</td>\\n\",\n       \"      <td>3416050480.jpg#4</td>\\n\",\n       \"      <td>3416050480.jpg#4r1n</td>\\n\",\n       \"      <td>neutral</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      
<td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>contradiction</td>\\n\",\n       \"      <td>( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...</td>\\n\",\n       \"      <td>( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...</td>\\n\",\n       \"      <td>(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...</td>\\n\",\n       \"      <td>(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...</td>\\n\",\n       \"      <td>A person on a horse jumps over a broken down a...</td>\\n\",\n       \"      <td>A person is at a diner, ordering an omelette.</td>\\n\",\n       \"      <td>3416050480.jpg#4</td>\\n\",\n       \"      <td>3416050480.jpg#4r1c</td>\\n\",\n       \"      <td>contradiction</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>entailment</td>\\n\",\n       \"      <td>( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...</td>\\n\",\n       \"      <td>( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...</td>\\n\",\n       \"      <td>(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...</td>\\n\",\n       \"      <td>(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...</td>\\n\",\n       \"      <td>A person on a horse jumps over a broken down a...</td>\\n\",\n       \"      <td>A person is outdoors, on a horse.</td>\\n\",\n       \"      <td>3416050480.jpg#4</td>\\n\",\n       \"      <td>3416050480.jpg#4r1e</td>\\n\",\n       \"      <td>entailment</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>neutral</td>\\n\",\n       \"      <td>( Children ( ( ( smiling and ) 
waving ) ( at c...</td>\\n\",\n       \"      <td>( They ( are ( smiling ( at ( their parents ) ...</td>\\n\",\n       \"      <td>(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...</td>\\n\",\n       \"      <td>(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...</td>\\n\",\n       \"      <td>Children smiling and waving at camera</td>\\n\",\n       \"      <td>They are smiling at their parents</td>\\n\",\n       \"      <td>2267923837.jpg#2</td>\\n\",\n       \"      <td>2267923837.jpg#2r1n</td>\\n\",\n       \"      <td>neutral</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>entailment</td>\\n\",\n       \"      <td>( Children ( ( ( smiling and ) waving ) ( at c...</td>\\n\",\n       \"      <td>( There ( ( are children ) present ) )</td>\\n\",\n       \"      <td>(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...</td>\\n\",\n       \"      <td>(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...</td>\\n\",\n       \"      <td>Children smiling and waving at camera</td>\\n\",\n       \"      <td>There are children present</td>\\n\",\n       \"      <td>2267923837.jpg#2</td>\\n\",\n       \"      <td>2267923837.jpg#2r1e</td>\\n\",\n       \"      <td>entailment</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"      gold_label                             sentence1_binary_parse  \\\\\\n\",\n       \"0        neutral  ( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...   \\n\",\n       \"1  contradiction  ( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...   
\\n\",\n       \"2     entailment  ( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...   \\n\",\n       \"3        neutral  ( Children ( ( ( smiling and ) waving ) ( at c...   \\n\",\n       \"4     entailment  ( Children ( ( ( smiling and ) waving ) ( at c...   \\n\",\n       \"\\n\",\n       \"                              sentence2_binary_parse  \\\\\\n\",\n       \"0  ( ( A person ) ( ( is ( ( training ( his horse...   \\n\",\n       \"1  ( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...   \\n\",\n       \"2  ( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...   \\n\",\n       \"3  ( They ( are ( smiling ( at ( their parents ) ...   \\n\",\n       \"4             ( There ( ( are children ) present ) )   \\n\",\n       \"\\n\",\n       \"                                     sentence1_parse  \\\\\\n\",\n       \"0  (ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...   \\n\",\n       \"1  (ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...   \\n\",\n       \"2  (ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...   \\n\",\n       \"3  (ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...   \\n\",\n       \"4  (ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...   \\n\",\n       \"\\n\",\n       \"                                     sentence2_parse  \\\\\\n\",\n       \"0  (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...   \\n\",\n       \"1  (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...   \\n\",\n       \"2  (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...   \\n\",\n       \"3  (ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...   \\n\",\n       \"4  (ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...   \\n\",\n       \"\\n\",\n       \"                                           sentence1  \\\\\\n\",\n       \"0  A person on a horse jumps over a broken down a...   \\n\",\n       \"1  A person on a horse jumps over a broken down a...   \\n\",\n       \"2  A person on a horse jumps over a broken down a...   
\\n\",\n       \"3              Children smiling and waving at camera   \\n\",\n       \"4              Children smiling and waving at camera   \\n\",\n       \"\\n\",\n       \"                                           sentence2         captionID  \\\\\\n\",\n       \"0  A person is training his horse for a competition.  3416050480.jpg#4   \\n\",\n       \"1      A person is at a diner, ordering an omelette.  3416050480.jpg#4   \\n\",\n       \"2                  A person is outdoors, on a horse.  3416050480.jpg#4   \\n\",\n       \"3                  They are smiling at their parents  2267923837.jpg#2   \\n\",\n       \"4                         There are children present  2267923837.jpg#2   \\n\",\n       \"\\n\",\n       \"                pairID         label1 label2 label3 label4 label5  \\n\",\n       \"0  3416050480.jpg#4r1n        neutral    NaN    NaN    NaN    NaN  \\n\",\n       \"1  3416050480.jpg#4r1c  contradiction    NaN    NaN    NaN    NaN  \\n\",\n       \"2  3416050480.jpg#4r1e     entailment    NaN    NaN    NaN    NaN  \\n\",\n       \"3  2267923837.jpg#2r1n        neutral    NaN    NaN    NaN    NaN  \\n\",\n       \"4  2267923837.jpg#2r1e     entailment    NaN    NaN    NaN    NaN  \"\n      ]\n     },\n     \"execution_count\": 6,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"train.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1.2 Tokenize\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Here we clean the dataframes, do lowercase standardization, and tokenize the text using the [NLTK](https://www.nltk.org/) library.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def clean_and_tokenize(df):\\n\",\n    \"    df = snli.clean_cols(df)\\n\",\n    \"    df = 
snli.clean_rows(df)\\n\",\n    \"    df = preprocess.to_lowercase(df)\\n\",\n    \"    df = preprocess.to_nltk_tokens(df)\\n\",\n    \"    return df\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The `clean_and_tokenize` function is slow; running the following cell takes around 5 to 10 minutes.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"train = clean_and_tokenize(train)\\n\",\n    \"dev = clean_and_tokenize(dev)\\n\",\n    \"test = clean_and_tokenize(test)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>score</th>\\n\",\n       \"      <th>sentence1</th>\\n\",\n       \"      <th>sentence2</th>\\n\",\n       \"      <th>sentence1_tokens</th>\\n\",\n       \"      <th>sentence2_tokens</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>neutral</td>\\n\",\n       \"      <td>a person on a horse jumps over a broken down a...</td>\\n\",\n       \"      <td>a person is training his horse for a competition.</td>\\n\",\n 
      \"      <td>[a, person, on, a, horse, jumps, over, a, brok...</td>\\n\",\n       \"      <td>[a, person, is, training, his, horse, for, a, ...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>contradiction</td>\\n\",\n       \"      <td>a person on a horse jumps over a broken down a...</td>\\n\",\n       \"      <td>a person is at a diner, ordering an omelette.</td>\\n\",\n       \"      <td>[a, person, on, a, horse, jumps, over, a, brok...</td>\\n\",\n       \"      <td>[a, person, is, at, a, diner, ,, ordering, an,...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>entailment</td>\\n\",\n       \"      <td>a person on a horse jumps over a broken down a...</td>\\n\",\n       \"      <td>a person is outdoors, on a horse.</td>\\n\",\n       \"      <td>[a, person, on, a, horse, jumps, over, a, brok...</td>\\n\",\n       \"      <td>[a, person, is, outdoors, ,, on, a, horse, .]</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>neutral</td>\\n\",\n       \"      <td>children smiling and waving at camera</td>\\n\",\n       \"      <td>they are smiling at their parents</td>\\n\",\n       \"      <td>[children, smiling, and, waving, at, camera]</td>\\n\",\n       \"      <td>[they, are, smiling, at, their, parents]</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>entailment</td>\\n\",\n       \"      <td>children smiling and waving at camera</td>\\n\",\n       \"      <td>there are children present</td>\\n\",\n       \"      <td>[children, smiling, and, waving, at, camera]</td>\\n\",\n       \"      <td>[there, are, children, present]</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"           score      
                                    sentence1  \\\\\\n\",\n       \"0        neutral  a person on a horse jumps over a broken down a...   \\n\",\n       \"1  contradiction  a person on a horse jumps over a broken down a...   \\n\",\n       \"2     entailment  a person on a horse jumps over a broken down a...   \\n\",\n       \"3        neutral              children smiling and waving at camera   \\n\",\n       \"4     entailment              children smiling and waving at camera   \\n\",\n       \"\\n\",\n       \"                                           sentence2  \\\\\\n\",\n       \"0  a person is training his horse for a competition.   \\n\",\n       \"1      a person is at a diner, ordering an omelette.   \\n\",\n       \"2                  a person is outdoors, on a horse.   \\n\",\n       \"3                  they are smiling at their parents   \\n\",\n       \"4                         there are children present   \\n\",\n       \"\\n\",\n       \"                                    sentence1_tokens  \\\\\\n\",\n       \"0  [a, person, on, a, horse, jumps, over, a, brok...   \\n\",\n       \"1  [a, person, on, a, horse, jumps, over, a, brok...   \\n\",\n       \"2  [a, person, on, a, horse, jumps, over, a, brok...   \\n\",\n       \"3       [children, smiling, and, waving, at, camera]   \\n\",\n       \"4       [children, smiling, and, waving, at, camera]   \\n\",\n       \"\\n\",\n       \"                                    sentence2_tokens  \\n\",\n       \"0  [a, person, is, training, his, horse, for, a, ...  \\n\",\n       \"1  [a, person, is, at, a, diner, ,, ordering, an,...  \\n\",\n       \"2      [a, person, is, outdoors, ,, on, a, horse, .]  
\\n\",\n       \"3           [they, are, smiling, at, their, parents]  \\n\",\n       \"4                    [there, are, children, present]  \"\n      ]\n     },\n     \"execution_count\": 9,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"train.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1.3 Preprocess\\n\",\n    \"We format our data in a specific way in order for the Gensen model to be able to ingest it. We do this by\\n\",\n    \"* Saving the tokens for each split in a `snli_1.0_{split}.txt.clean` file, with the sentence pairs and scores tab-separated and the tokens separated by a single space. Since some of the samples have invalid scores (\\\"-\\\"), we filter those out and save them separately in a `snli_1.0_{split}.txt.clean.noblank` file.\\n\",\n    \"* Saving the tokenized sentence and labels separately, in the form `snli_1.0_{split}.txt.s1.tok` or `snli_1.0_{split}.txt.s2.tok` or `snli_1.0_{split}.txt.lab`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Writing input data to ./temp\\\\data\\\\clean/snli_1.0\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"preprocessed_data_dir = gensen_preprocess(train, dev, test, data_dir)\\n\",\n    \"print(\\\"Writing input data to {}\\\".format(preprocessed_data_dir))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1.4 Upload to Azure Blob Storage\\n\",\n    \"We upload the data from the local machine into the datastore so that it can be accessed for remote training. The datastore is a reference that points to a storage account, e.g. the Azure Blob Storage service. 
It can be attached to an AzureML workspace to facilitate data management operations such as uploading/downloading data or interacting with data from remote compute targets.\\n\",\n    \"\\n\",\n    \"**Note: If you already have the preprocessed files under `clean/snli_1.0/` in your default datastore, you DO NOT need to redo this section.**\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ds = ws.get_default_datastore()\\n\",\n    \"\\n\",\n    \"if AZUREML_VERBOSE:\\n\",\n    \"    print(\\\"Datastore type: {}\\\".format(ds.datastore_type))\\n\",\n    \"    print(\\\"Datastore account: {}\\\".format(ds.account_name))\\n\",\n    \"    print(\\\"Datastore container: {}\\\".format(ds.container_name))\\n\",\n    \"    print(\\\"Data reference: {}\\\".format(ds.as_mount()))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"_ = ds.upload(\\n\",\n    \"    src_dir=os.path.join(data_dir, \\\"clean/snli_1.0\\\"),\\n\",\n    \"    overwrite=False,\\n\",\n    \"    show_progress=AZUREML_VERBOSE,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 2 Train GenSen with Distributed Pytorch and Horovod on AzureML\\n\",\n    \"In this tutorial, we train a GenSen model with PyTorch on AML using distributed training across a GPU cluster.\\n\",\n    \"\\n\",\n    \"After creating the workspace and setting up the development environment, training a model in Azure Machine Learning involves the following steps:\\n\",\n    \"1. Creating a remote compute target\\n\",\n    \"2. Preparing the training data and uploading it to datastore (Note that this was done in Section 1.4)\\n\",\n    \"3. Preparing the training script\\n\",\n    \"4. Creating Estimator and Experiment objects\\n\",\n    \"5. 
Submitting the Estimator to an Experiment attached to the AzureML workspace\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2.1 Create or Attach a Remote Compute Target\\n\",\n    \"We create and attach a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training the model. Here we use the AzureML-managed compute target ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) as our remote training compute resource. Our cluster autoscales from 0 to 8 `STANDARD_NC6` GPU nodes.\\n\",\n    \"\\n\",\n    \"Creating and configuring the AmlCompute cluster takes approximately 5 minutes the first time around. Once a cluster with the given configuration is created, it does not need to be created again.\\n\",\n    \"\\n\",\n    \"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. 
Read more about the default limits and how to request more quota [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Found existing compute target gensen-aml\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"cluster_name = \\\"gensen-aml\\\"\\n\",\n    \"\\n\",\n    \"try:\\n\",\n    \"    compute_target = ComputeTarget(workspace=ws, name=cluster_name)\\n\",\n    \"    print(\\\"Found existing compute target {}\\\".format(cluster_name))\\n\",\n    \"except ComputeTargetException:\\n\",\n    \"    print(\\\"Creating a new compute target {}...\\\".format(cluster_name))\\n\",\n    \"    compute_config = AmlCompute.provisioning_configuration(\\n\",\n    \"        vm_size=\\\"STANDARD_NC6\\\", max_nodes=8\\n\",\n    \"    )\\n\",\n    \"    # create the cluster\\n\",\n    \"    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\\n\",\n    \"    compute_target.wait_for_completion(show_output=AZUREML_VERBOSE)\\n\",\n    \"\\n\",\n    \"if AZUREML_VERBOSE:\\n\",\n    \"    print(compute_target.get_status().serialize())\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2.2 Prepare the Training Script\\n\",\n    \"The training process involves the following steps:\\n\",\n    \"1. Create or load the dataset vocabulary\\n\",\n    \"2. Train on the training dataset for each batch epoch (batch size = 48 updates)\\n\",\n    \"3. Evaluate on the validation dataset for every 10 epochs\\n\",\n    \"4. Find the local minimum point on validation loss\\n\",\n    \"5. 
Save the best model and stop the training process\\n\",\n    \"\\n\",\n    \"In this section, we define the training script and move all necessary dependencies to `project_folder`, which will eventually be submitted to the remote compute target. Note that the size of the folder cannot exceed 300 MB, so large dependencies such as pre-trained embeddings must be accessed from the datastore. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"project_folder = os.path.join(CACHE_DIR, \\\"gensen\\\")\\n\",\n    \"os.makedirs(project_folder, exist_ok=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The script for distributed GenSen training is provided at `./gensen_train.py`.\\n\",\n    \"\\n\",\n    \"In this example, we use MLflow to log metrics. We also use the [AzureML-Mlflow](https://pypi.org/project/azureml-mlflow/) package to persist these metrics to the AzureML workspace. This is done with no change to the provided training script! 
Note that logging is done for loss *per minibatch*.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Copy the training script `gensen_train.py` and config file `gensen_config.json` into the project folder.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"utils_folder = os.path.join(project_folder, \\\"utils_nlp\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"_ = shutil.copytree(UTIL_NLP_PATH, utils_folder)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"_ = shutil.copy(TRAIN_SCRIPT, os.path.join(utils_folder, \\\"gensen\\\"))\\n\",\n    \"_ = shutil.copy(CONFIG_PATH, os.path.join(utils_folder, \\\"gensen\\\"))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2.3 Define the Estimator and Experiment\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.3.1 Create a PyTorch Estimator\\n\",\n    \"The Azure ML SDK's PyTorch Estimator allows us to submit PyTorch training jobs for both single-node and distributed runs. For more information on the PyTorch estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-pytorch).\\n\",\n    \"\\n\",\n    \"Note that `gensen_config.json` defines all the hyperparameters and paths when training GenSen model. The trained model will be saved in `models` to Azure Blob Storage. 
**Remember to clean the `models` folder in order to save new models.**\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 30,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"if MAX_EPOCH:\\n\",\n    \"    script_params = {\\n\",\n    \"        \\\"--config\\\": \\\"utils_nlp/gensen/gensen_config.json\\\",\\n\",\n    \"        \\\"--data_folder\\\": ws.get_default_datastore().as_mount(),\\n\",\n    \"        \\\"--max_epoch\\\": MAX_EPOCH,\\n\",\n    \"    }\\n\",\n    \"else:\\n\",\n    \"    script_params = {\\n\",\n    \"        \\\"--config\\\": \\\"utils_nlp/gensen/gensen_config.json\\\",\\n\",\n    \"        \\\"--data_folder\\\": ws.get_default_datastore().as_mount(),\\n\",\n    \"    }\\n\",\n    \"\\n\",\n    \"estimator = PyTorch(\\n\",\n    \"    source_directory=project_folder,\\n\",\n    \"    script_params=script_params,\\n\",\n    \"    compute_target=compute_target,\\n\",\n    \"    entry_script=ENTRY_SCRIPT,\\n\",\n    \"    node_count=2,\\n\",\n    \"    process_count_per_node=1,\\n\",\n    \"    distributed_training=MpiConfiguration(),\\n\",\n    \"    use_gpu=True,\\n\",\n    \"    framework_version=\\\"1.1\\\",\\n\",\n    \"    conda_packages=[\\\"scikit-learn=0.20.3\\\", \\\"h5py\\\", \\\"nltk\\\"],\\n\",\n    \"    pip_packages=[\\\"azureml-mlflow>=1.0.43.1\\\", \\\"numpy>=1.16.0\\\"],\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This Estimator specifies that the training script will run on `2` nodes, with one worker per node. In order to execute a distributed run using GPU, we must define `use_gpu` and `distributed_training` to use MPI/Horovod. PyTorch, Horovod, and other necessary dependencies are installed automatically. 
If the training script makes use of packages that are not already defined in `.azureml/conda_dependencies.yml`, we must explicitly tell the estimator to install them via the constructor's `pip_packages` or `conda_packages` parameters.\\n\",\n    \"\\n\",\n    \"Note that if the estimator is being created for the first time, this step will take longer to run because the conda dependencies found under `.azureml/conda_dependencies.yml` must be installed from scratch. After the first run, it will use the existing conda environment and run the code directly. \\n\",\n    \"\\n\",\n    \"The training time will take around **2 hours** if you use the default value `max_epoch=None`, which means the training will stop if the local minimum loss has been found. User can specify the number of epochs for training.\\n\",\n    \"\\n\",\n    \"**Requirements:**\\n\",\n    \"- python=3.6.2\\n\",\n    \"- numpy=1.15.1\\n\",\n    \"- numpy-base=1.15.1\\n\",\n    \"- pip=10.0.1\\n\",\n    \"- python=3.6.6\\n\",\n    \"- python-dateutil=2.7.3\\n\",\n    \"- scikit-learn=0.20.3\\n\",\n    \"- azureml-defaults\\n\",\n    \"- h5py\\n\",\n    \"- nltk\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.3.2 Create the Experiment\\n\",\n    \"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in the AzureML workspace for this tutorial.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 31,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"experiment_name = EXPERIMENT_NAME\\n\",\n    \"experiment = Experiment(ws, name=experiment_name)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2.4 Submit the Training Job to the Compute Target\\n\",\n    \"We can run the experiment by simply submitting the Estimator object to the compute target. 
Note that this call is asynchronous.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 32,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"run = experiment.submit(estimator)\\n\",\n    \"if AZUREML_VERBOSE:\\n\",\n    \"    print(run)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.4.1 Monitor the Run\\n\",\n    \"We can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes. The widget automatically plots and visualizes the loss metric that we logged to the AzureML workspace.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 33,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/vnd.jupyter.widget-view+json\": {\n       \"model_id\": \"717d4d49af584b1ab32ed2755007db94\",\n       \"version_major\": 2,\n       \"version_minor\": 0\n      },\n      \"text/plain\": [\n       \"_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"RunDetails(run).show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 35,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"_ = run.wait_for_completion(show_output=AZUREML_VERBOSE) # Block until the script has completed training.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.4.2 Interpret the Training Results\\n\",\n    \"The following chart shows the model validation loss with different node configurations on AmlCompute. 
We find that the minimum validation loss decreases as the number of nodes increases; that is, the performance scales with the number of nodes in the cluster.\\n\",\n    \"\\n\",\n    \"| Standard_NC6 | AML_1node | AML_2nodes | AML_4nodes | AML_8nodes |\\n\",\n    \"| --- | --- | --- | --- | --- |\\n\",\n    \"| min_val_loss | 4.81 | 4.78 | 4.77 | 4.58 |\\n\",\n    \"\\n\",\n    \"We also observe common tradeoffs associated with distributed training. We make use of [Horovod](https://github.com/horovod/horovod), a distributed training tool for many popular deep learning frameworks that enables parallelization of work across the nodes in the cluster. Distributed training decreases the time it takes for the model to converge in theory, but the model may also take more time in communicating with each node. Note that the communication time will eventually become negligible when training on larger and larger datasets, but being aware of this tradeoff is helpful for choosing the node configuration when training on smaller datasets.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 3 Tune Model Hyperparameters\\n\",\n    \"Now that we've seen how to do a simple PyTorch training run using the SDK, let's see if we can further improve the accuracy of our model. We can optimize our model's hyperparameters using Azure Machine Learning's hyperparameter tuning capabilities.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3.1 Start a Hyperparameter Sweep\\n\",\n    \"First, we define the hyperparameter space to sweep over. Since the training script uses a learning rate schedule to decay the learning rate every several epochs, we can tune the initial learning rate parameter. 
In this example we will use random sampling to try different configuration sets of hyperparameters to minimize our primary metric, the best validation loss.\\n\",\n    \"\\n\",\n    \"Then, we specify the early termination policy to use to early terminate poorly performing runs. Here we use the `BanditPolicy`, which terminates any run that doesn't fall within the slack factor of our primary evaluation metric. In this tutorial, we will apply this policy every epoch (since we report the validation loss metric every epoch and `evaluation_interval=1`). Note that we explicitly define `delay_evaluation` such that the first policy evaluation does not occur until after the 10th epoch.\\n\",\n    \"\\n\",\n    \"Refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-tune-hyperparameters#specify-an-early-termination-policy) for more information on the BanditPolicy and other policies available.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 36,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"param_sampling = RandomParameterSampling({\\\"learning_rate\\\": uniform(0.0001, 0.001)})\\n\",\n    \"\\n\",\n    \"early_termination_policy = BanditPolicy(\\n\",\n    \"    slack_factor=0.15, evaluation_interval=1, delay_evaluation=10\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"hyperdrive_config = HyperDriveConfig(\\n\",\n    \"    estimator=estimator,\\n\",\n    \"    hyperparameter_sampling=param_sampling,\\n\",\n    \"    policy=early_termination_policy,\\n\",\n    \"    primary_metric_name=\\\"min_val_loss\\\",\\n\",\n    \"    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,\\n\",\n    \"    max_total_runs=MAX_TOTAL_RUNS,\\n\",\n    \"    max_concurrent_runs=MAX_CONCURRENT_RUNS,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Finally, launch the hyperparameter tuning job.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 37,\n 
  \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"hyperdrive_run = experiment.submit(hyperdrive_config) # Start the HyperDrive run\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3.2 Monitor HyperDrive Runs\\n\",\n    \"We can monitor the progress of the runs with a Jupyter widget, or again block until the run has completed. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 40,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/vnd.jupyter.widget-view+json\": {\n       \"model_id\": \"ace260d4bb8d43549ff0c2944bf9a1be\",\n       \"version_major\": 2,\n       \"version_minor\": 0\n      },\n      \"text/plain\": [\n       \"_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO',…\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"RunDetails(hyperdrive_run).show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 39,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"_ = hyperdrive_run.wait_for_completion(show_output=AZUREML_VERBOSE) # Block until complete\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 3.2.1 Interpret the Tuning Results\\n\",\n    \"\\n\",\n    \"The chart below shows 4 different threads running in parallel with different learning rates. The number of total runs is 8. We pick the best learning rate by minimizing the validation loss. 
The HyperDrive run automatically shows the tracking charts (example in the following) to facilitate visualization of the tuning process.\\n\",\n    \"\\n\",\n    \"![Tuning](https://nlpbp.blob.core.windows.net/images/gensen_tune1.PNG)\\n\",\n    \"![Tuning](https://nlpbp.blob.core.windows.net/images/gensen_tune2.PNG)\\n\",\n    \"\\n\",\n    \"**From the results in section [2.4.2 Interpret the Training Results](#2.4.2-Interpret-the-Training-Results), the best validation loss for 1 node is 4.81, but with tuning we can easily achieve better performance around 4.65.**\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3.3 Find the Best Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Once all the runs complete, we can find the run that produced the model with the lowest loss.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 41,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Best Run:\\n\",\n      \"  Validation loss: 6.23771 \\n\",\n      \"  Learning rate: 0.00066 \\n\",\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"best_run = hyperdrive_run.get_best_run_by_primary_metric()\\n\",\n    \"best_run_metrics = best_run.get_metrics()\\n\",\n    \"print(\\n\",\n    \"    \\\"Best Run:\\\\n  Validation loss: {0:.5f} \\\\n  Learning rate: {1:.5f} \\\\n\\\".format(\\n\",\n    \"        best_run_metrics[\\\"min_val_loss\\\"], best_run_metrics[\\\"learning_rate\\\"]\\n\",\n    \"    )\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 42,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": 6.237707614898682,\n       \"encoder\": \"json\",\n       \"name\": \"min_val_loss\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      
\"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"min_val_loss\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": 0.000660701847879559,\n       \"encoder\": \"json\",\n       \"name\": \"learning_rate\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"learning_rate\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"# Persist properties of the run so we can access the logged metrics later\\n\",\n    \"sb.glue(\\\"min_val_loss\\\", best_run_metrics['min_val_loss'])\\n\",\n    \"sb.glue(\\\"learning_rate\\\", best_run_metrics['learning_rate'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## References\\n\",\n    \"\\n\",\n    \"1. Subramanian, Sandeep and Trischler, Adam and Bengio, Yoshua and Pal, Christopher J, [*Learning general purpose distributed sentence representations via large scale multi-task learning*](https://arxiv.org/abs/1804.00079), ICLR, 2018.\\n\",\n    \"2. A. Conneau, D. Kiela, [*SentEval: An Evaluation Toolkit for Universal Sentence Representations*](https://arxiv.org/abs/1803.05449).\\n\",\n    \"3. Semantic textual similarity. url: http://nlpprogress.com/english/semantic_textual_similarity.html\\n\",\n    \"4. Minh-Thang Luong, Quoc V Le, Ilya Sutskever, Oriol Vinyals, and Lukasz Kaiser. [*Multi-task sequence to sequence learning*](https://arxiv.org/abs/1511.06114), 2015.\\n\",\n    \"5. Bryan McCann, James Bradbury, Caiming Xiong, and Richard Socher. [*Learned in translation: Contextualized word vectors*](https://arxiv.org/abs/1708.00107), 2017. 
\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"authors\": [\n   {\n    \"name\": \"minxia\"\n   }\n  ],\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.5.5\"\n  },\n  \"msauthor\": \"minxia\"\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/sentence_similarity/gensen_config.json",
    "content": "{\n  \"training\": {\n    \"optimizer\": \"adam\",\n    \"clip_c\": 1,\n    \"lrate\": 0.0001,\n    \"batch_size\": 48,\n    \"n_gpus\": 1,\n    \"stop_patience\": 2\n  },\n  \"management\": {\n    \"monitor_loss\": 480,\n    \"print_samples\": 12800,\n    \"checkpoint_freq\": 480000,\n    \"eval_freq\": 9600\n  },\n  \"data\": {\"paths\": [\n        {\n            \"train_src\": \"snli_1.0_train.txt.s1.tok\",\n            \"train_trg\": \"snli_1.0_train.txt.s2.tok\",\n            \"val_src\": \"snli_1.0_dev.txt.s1.tok\",\n            \"val_trg\": \"snli_1.0_dev.txt.s2.tok\",\n            \"taskname\": \"snli\"\n        }\n    ],\n        \"max_src_length\": 90,\n        \"max_trg_length\": 90,\n        \"task\": \"multi-seq2seq-nli\",\n        \"save_dir\": \"models/\",\n        \"nli_train\": \"snli_1.0_train.txt.clean.noblank\",\n        \"nli_dev\": \"snli_1.0_dev.txt.clean.noblank\",\n        \"nli_test\": \"snli_1.0_test.txt.clean.noblank\"\n\t},\n    \"model\": {\n    \t\"dim_src\": 2048,\n    \t\"dim_trg\": 2048,\n    \t\"dim_word_src\": 512,\n    \t\"dim_word_trg\": 512,\n    \t\"n_words_src\": 80000,\n    \t\"n_words_trg\": 30000,\n    \t\"n_layers_src\": 1,\n    \t\"bidirectional\": true,\n        \"layernorm\": false,\n        \"dropout\": 0.8\n    }\n}\n"
  },
  {
    "path": "examples/sentence_similarity/gensen_local.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"\\n\",\n    \"Copyright (c) Microsoft Corporation. All rights reserved.\\n\",\n    \"\\n\",\n    \"Licensed under the MIT License.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# GenSen with Pytorch\\n\",\n    \"In this tutorial, you will train a GenSen model for the sentence similarity task. We use the [SNLI](https://nlp.stanford.edu/projects/snli/) dataset in this example. For a more detailed walkthrough about data processing jump to [SNLI Data Prep](../01-prep-data/snli.ipynb). A quickstart version of this notebook can be found [here](../00-quick-start/)\\n\",\n    \"\\n\",\n    \"## Notes:\\n\",\n    \"The model training part of this notebook can only run on a GPU machine. The running time shown in the notebook is on a Standard_NC6 Azure VM with 1 NVIDIA Tesla K80 GPU and 12 GB GPU memory. See the [README](README.md) for more details of the running time.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Overview\\n\",\n    \"\\n\",\n    \"### What is GenSen?\\n\",\n    \"\\n\",\n    \"GenSen is a technique to learn general purpose, fixed-length representations of sentences via multi-task training. GenSen model combines the benefits of diverse sentence-representation learning objectives into a single multi-task framework. \\\"This is the first large-scale reusable sentence representation model obtained by combining a set of training objectives with the level of diversity explored here, i.e. multi-lingual NMT, natural language inference, constituency parsing and skip-thought vectors.\\\" [\\\\[1\\\\]](#References) These representations are useful for transfer and low-resource learning. 
GenSen is trained on several data sources with multiple training objectives on over 100 million sentences.\\n\",\n    \"\\n\",\n    \"The GenSen model is most similar to that of Luong et al. (2015) [\\\\[4\\\\]](#References), who train a many-to-many **sequence-to-sequence** model on a diverse set of weakly related tasks that includes machine translation, constituency parsing, image captioning, sequence autoencoding, and intra-sentence skip-thoughts. However, there are two key differences. \\\"First, like McCann et al. (2017) [\\\\[5\\\\]](#References), their use of an attention mechanism prevents learning a fixed-length vector representation for a sentence. Second, their work aims for improvements on the same tasks on which the model is trained, as opposed to learning re-usable sentence representations that transfer elsewhere.\\\" [\\\\[1\\\\]](#References)\\n\",\n    \"\\n\",\n    \"### Why GenSen?\\n\",\n    \"\\n\",\n    \"The GenSen model achieves state-of-the-art results on multiple datasets, such as MRPC, SICK-R, SICK-E and STS, for sentence similarity. The reported results are as follows compared with other models [\\\\[3\\\\]](#References):\\n\",\n    \"\\n\",\n    \"| Model | MRPC | SICK-R | SICK-E | STS |\\n\",\n    \"| --- | --- | --- | --- | --- |\\n\",\n    \"| GenSen (Subramanian et al., 2018) | 78.6/84.4 | 0.888 | 87.8 | 78.9/78.6 |\\n\",\n    \"| [InferSent](https://arxiv.org/abs/1705.02364) (Conneau et al., 2017) | 76.2/83.1 | 0.884 | 86.3 | 75.8/75.5 |\\n\",\n    \"| [TF-KLD](https://www.aclweb.org/anthology/D13-1090) (Ji and Eisenstein, 2013) | 80.4/85.9 | - | - | - |\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Outline\\n\",\n    \"This notebook is organized as follows:\\n\",\n    \"\\n\",\n    \"1. Data preparation and inspection.\\n\",\n    \"2. 
Model training and prediction.\\n\",\n    \"\\n\",\n    \"For a more detailed deep dive of the Gensen model checkout the [Gensen Deep Dive Notebook](gensen_aml_deep_dive.ipynb)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 0. Global Settings\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {\n    \"pycharm\": {\n     \"name\": \"#%%\\n\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \\n\",\n      \"[GCC 7.3.0]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"import sys\\n\",\n    \"sys.path.append(\\\"../..\\\")\\n\",\n    \"\\n\",\n    \"import os\\n\",\n    \"import papermill as pm\\n\",\n    \"import scrapbook as sb\\n\",\n    \"\\n\",\n    \"from utils_nlp.dataset.preprocess import to_lowercase, to_nltk_tokens\\n\",\n    \"from utils_nlp.dataset import snli, preprocess\\n\",\n    \"from utils_nlp.models.pretrained_embeddings.glove import download_and_extract\\n\",\n    \"from utils_nlp.dataset import Split\\n\",\n    \"from examples.sentence_similarity.gensen_wrapper import GenSenClassifier\\n\",\n    \"\\n\",\n    \"print(\\\"System version: {}\\\".format(sys.version))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"max_epoch = None\\n\",\n    \"config_filepath = 'gensen_config.json'\\n\",\n    \"base_data_path = '../../data'\\n\",\n    \"nrows = None\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1. 
Data Preparation and inspection\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The [SNLI](https://nlp.stanford.edu/projects/snli/) corpus (version 1.0) is a collection of 570k human-written English sentence pairs manually labeled for balanced classification with the labels entailment, contradiction, and neutral, supporting the task of natural language inference (NLI), also known as recognizing textual entailment (RTE). \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.1 Load the dataset\\n\",\n    \"\\n\",\n    \"We provide a function load_pandas_df which does the following:\\n\",\n    \"\\n\",\n    \"* Downloads the SNLI zipfile at the specified directory location\\n\",\n    \"* Extracts the file based on the specified split\\n\",\n    \"* Loads the split as a pandas dataframe. The zipfile contains the following files:\\n\",\n    \"    * snli_1.0_dev.txt\\n\",\n    \"    * snli_1.0_train.txt\\n\",\n    \"    * snli_1.0_test.txt\\n\",\n    \"    * snli_1.0_dev.jsonl\\n\",\n    \"    * snli_1.0_train.jsonl\\n\",\n    \"    * snli_1.0_test.jsonl\\n\",\n    \"    \\n\",\n    \"The loader defaults to reading from the .txt file; however, you can change this to .jsonl by setting the optional file_type parameter when calling the function.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {\n    \"pycharm\": {\n     \"name\": \"#%%\\n\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n     
  \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>gold_label</th>\\n\",\n       \"      <th>sentence1_binary_parse</th>\\n\",\n       \"      <th>sentence2_binary_parse</th>\\n\",\n       \"      <th>sentence1_parse</th>\\n\",\n       \"      <th>sentence2_parse</th>\\n\",\n       \"      <th>sentence1</th>\\n\",\n       \"      <th>sentence2</th>\\n\",\n       \"      <th>captionID</th>\\n\",\n       \"      <th>pairID</th>\\n\",\n       \"      <th>label1</th>\\n\",\n       \"      <th>label2</th>\\n\",\n       \"      <th>label3</th>\\n\",\n       \"      <th>label4</th>\\n\",\n       \"      <th>label5</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>neutral</td>\\n\",\n       \"      <td>( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...</td>\\n\",\n       \"      <td>( ( A person ) ( ( is ( ( training ( his horse...</td>\\n\",\n       \"      <td>(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...</td>\\n\",\n       \"      <td>(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...</td>\\n\",\n       \"      <td>A person on a horse jumps over a broken down a...</td>\\n\",\n       \"      <td>A person is training his horse for a competition.</td>\\n\",\n       \"      <td>3416050480.jpg#4</td>\\n\",\n       \"      <td>3416050480.jpg#4r1n</td>\\n\",\n       \"      <td>neutral</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>contradiction</td>\\n\",\n       \"      <td>( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...</td>\\n\",\n       \"      <td>( ( A person 
) ( ( ( ( is ( at ( a diner ) ) )...</td>\\n\",\n       \"      <td>(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...</td>\\n\",\n       \"      <td>(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...</td>\\n\",\n       \"      <td>A person on a horse jumps over a broken down a...</td>\\n\",\n       \"      <td>A person is at a diner, ordering an omelette.</td>\\n\",\n       \"      <td>3416050480.jpg#4</td>\\n\",\n       \"      <td>3416050480.jpg#4r1c</td>\\n\",\n       \"      <td>contradiction</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>entailment</td>\\n\",\n       \"      <td>( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...</td>\\n\",\n       \"      <td>( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...</td>\\n\",\n       \"      <td>(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...</td>\\n\",\n       \"      <td>(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...</td>\\n\",\n       \"      <td>A person on a horse jumps over a broken down a...</td>\\n\",\n       \"      <td>A person is outdoors, on a horse.</td>\\n\",\n       \"      <td>3416050480.jpg#4</td>\\n\",\n       \"      <td>3416050480.jpg#4r1e</td>\\n\",\n       \"      <td>entailment</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>neutral</td>\\n\",\n       \"      <td>( Children ( ( ( smiling and ) waving ) ( at c...</td>\\n\",\n       \"      <td>( They ( are ( smiling ( at ( their parents ) ...</td>\\n\",\n       \"      <td>(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...</td>\\n\",\n       \"      <td>(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...</td>\\n\",\n       \"      
<td>Children smiling and waving at camera</td>\\n\",\n       \"      <td>They are smiling at their parents</td>\\n\",\n       \"      <td>2267923837.jpg#2</td>\\n\",\n       \"      <td>2267923837.jpg#2r1n</td>\\n\",\n       \"      <td>neutral</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>entailment</td>\\n\",\n       \"      <td>( Children ( ( ( smiling and ) waving ) ( at c...</td>\\n\",\n       \"      <td>( There ( ( are children ) present ) )</td>\\n\",\n       \"      <td>(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...</td>\\n\",\n       \"      <td>(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...</td>\\n\",\n       \"      <td>Children smiling and waving at camera</td>\\n\",\n       \"      <td>There are children present</td>\\n\",\n       \"      <td>2267923837.jpg#2</td>\\n\",\n       \"      <td>2267923837.jpg#2r1e</td>\\n\",\n       \"      <td>entailment</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"      gold_label                             sentence1_binary_parse  \\\\\\n\",\n       \"0        neutral  ( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...   \\n\",\n       \"1  contradiction  ( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...   \\n\",\n       \"2     entailment  ( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...   \\n\",\n       \"3        neutral  ( Children ( ( ( smiling and ) waving ) ( at c...   \\n\",\n       \"4     entailment  ( Children ( ( ( smiling and ) waving ) ( at c...   
\\n\",\n       \"\\n\",\n       \"                              sentence2_binary_parse  \\\\\\n\",\n       \"0  ( ( A person ) ( ( is ( ( training ( his horse...   \\n\",\n       \"1  ( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...   \\n\",\n       \"2  ( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...   \\n\",\n       \"3  ( They ( are ( smiling ( at ( their parents ) ...   \\n\",\n       \"4             ( There ( ( are children ) present ) )   \\n\",\n       \"\\n\",\n       \"                                     sentence1_parse  \\\\\\n\",\n       \"0  (ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...   \\n\",\n       \"1  (ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...   \\n\",\n       \"2  (ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...   \\n\",\n       \"3  (ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...   \\n\",\n       \"4  (ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...   \\n\",\n       \"\\n\",\n       \"                                     sentence2_parse  \\\\\\n\",\n       \"0  (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...   \\n\",\n       \"1  (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...   \\n\",\n       \"2  (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...   \\n\",\n       \"3  (ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...   \\n\",\n       \"4  (ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...   \\n\",\n       \"\\n\",\n       \"                                           sentence1  \\\\\\n\",\n       \"0  A person on a horse jumps over a broken down a...   \\n\",\n       \"1  A person on a horse jumps over a broken down a...   \\n\",\n       \"2  A person on a horse jumps over a broken down a...   \\n\",\n       \"3              Children smiling and waving at camera   \\n\",\n       \"4              Children smiling and waving at camera   \\n\",\n       \"\\n\",\n       \"                                           sentence2         captionID  \\\\\\n\",\n       \"0  A person is training his horse for a competition.  
3416050480.jpg#4   \\n\",\n       \"1      A person is at a diner, ordering an omelette.  3416050480.jpg#4   \\n\",\n       \"2                  A person is outdoors, on a horse.  3416050480.jpg#4   \\n\",\n       \"3                  They are smiling at their parents  2267923837.jpg#2   \\n\",\n       \"4                         There are children present  2267923837.jpg#2   \\n\",\n       \"\\n\",\n       \"                pairID         label1 label2 label3 label4 label5  \\n\",\n       \"0  3416050480.jpg#4r1n        neutral    NaN    NaN    NaN    NaN  \\n\",\n       \"1  3416050480.jpg#4r1c  contradiction    NaN    NaN    NaN    NaN  \\n\",\n       \"2  3416050480.jpg#4r1e     entailment    NaN    NaN    NaN    NaN  \\n\",\n       \"3  2267923837.jpg#2r1n        neutral    NaN    NaN    NaN    NaN  \\n\",\n       \"4  2267923837.jpg#2r1e     entailment    NaN    NaN    NaN    NaN  \"\n      ]\n     },\n     \"execution_count\": 3,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"train = snli.load_pandas_df(base_data_path, file_split=Split.TRAIN, nrows=nrows)\\n\",\n    \"dev = snli.load_pandas_df(base_data_path, file_split=Split.DEV, nrows=nrows)\\n\",\n    \"test = snli.load_pandas_df(base_data_path, file_split=Split.TEST, nrows=nrows)\\n\",\n    \"\\n\",\n    \"train.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.2 Tokenize\\n\",\n    \"\\n\",\n    \"We have loaded the dataset into pandas.DataFrame, we now convert sentences to tokens. We also clean the data before tokenizing. 
This includes dropping unnecessary columns and renaming the relevant columns as score, sentence1, and sentence2.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {\n    \"pycharm\": {\n     \"name\": \"#%%\\n\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"def clean_and_tokenize(df):\\n\",\n    \"    df = snli.clean_cols(df)\\n\",\n    \"    df = snli.clean_rows(df)\\n\",\n    \"    df = preprocess.to_lowercase(df)\\n\",\n    \"    df = preprocess.to_nltk_tokens(df)\\n\",\n    \"    return df\\n\",\n    \"\\n\",\n    \"train = clean_and_tokenize(train)\\n\",\n    \"dev = clean_and_tokenize(dev)\\n\",\n    \"test = clean_and_tokenize(test)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Once we have the clean pandas dataframes, we do lowercase standardization and tokenization. We use the [NLTK](https://www.nltk.org/) library for tokenization.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {\n    \"pycharm\": {\n     \"name\": \"#%%\\n\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>score</th>\\n\",\n       \"      <th>sentence1</th>\\n\",\n       \"      <th>sentence2</th>\\n\",\n       \"      
<th>sentence1_tokens</th>\\n\",\n       \"      <th>sentence2_tokens</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>neutral</td>\\n\",\n       \"      <td>two women are embracing while holding to go pa...</td>\\n\",\n       \"      <td>the sisters are hugging goodbye while holding ...</td>\\n\",\n       \"      <td>[two, women, are, embracing, while, holding, t...</td>\\n\",\n       \"      <td>[the, sisters, are, hugging, goodbye, while, h...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>entailment</td>\\n\",\n       \"      <td>two women are embracing while holding to go pa...</td>\\n\",\n       \"      <td>two woman are holding packages.</td>\\n\",\n       \"      <td>[two, women, are, embracing, while, holding, t...</td>\\n\",\n       \"      <td>[two, woman, are, holding, packages, .]</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>contradiction</td>\\n\",\n       \"      <td>two women are embracing while holding to go pa...</td>\\n\",\n       \"      <td>the men are fighting outside a deli.</td>\\n\",\n       \"      <td>[two, women, are, embracing, while, holding, t...</td>\\n\",\n       \"      <td>[the, men, are, fighting, outside, a, deli, .]</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>entailment</td>\\n\",\n       \"      <td>two young children in blue jerseys, one with t...</td>\\n\",\n       \"      <td>two kids in numbered jerseys wash their hands.</td>\\n\",\n       \"      <td>[two, young, children, in, blue, jerseys, ,, o...</td>\\n\",\n       \"      <td>[two, kids, in, numbered, jerseys, wash, their...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      
<td>neutral</td>\\n\",\n       \"      <td>two young children in blue jerseys, one with t...</td>\\n\",\n       \"      <td>two kids at a ballgame wash their hands.</td>\\n\",\n       \"      <td>[two, young, children, in, blue, jerseys, ,, o...</td>\\n\",\n       \"      <td>[two, kids, at, a, ballgame, wash, their, hand...</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"           score                                          sentence1  \\\\\\n\",\n       \"0        neutral  two women are embracing while holding to go pa...   \\n\",\n       \"1     entailment  two women are embracing while holding to go pa...   \\n\",\n       \"2  contradiction  two women are embracing while holding to go pa...   \\n\",\n       \"3     entailment  two young children in blue jerseys, one with t...   \\n\",\n       \"4        neutral  two young children in blue jerseys, one with t...   \\n\",\n       \"\\n\",\n       \"                                           sentence2  \\\\\\n\",\n       \"0  the sisters are hugging goodbye while holding ...   \\n\",\n       \"1                    two woman are holding packages.   \\n\",\n       \"2               the men are fighting outside a deli.   \\n\",\n       \"3     two kids in numbered jerseys wash their hands.   \\n\",\n       \"4           two kids at a ballgame wash their hands.   \\n\",\n       \"\\n\",\n       \"                                    sentence1_tokens  \\\\\\n\",\n       \"0  [two, women, are, embracing, while, holding, t...   \\n\",\n       \"1  [two, women, are, embracing, while, holding, t...   \\n\",\n       \"2  [two, women, are, embracing, while, holding, t...   \\n\",\n       \"3  [two, young, children, in, blue, jerseys, ,, o...   \\n\",\n       \"4  [two, young, children, in, blue, jerseys, ,, o...   
\\n\",\n       \"\\n\",\n       \"                                    sentence2_tokens  \\n\",\n       \"0  [the, sisters, are, hugging, goodbye, while, h...  \\n\",\n       \"1            [two, woman, are, holding, packages, .]  \\n\",\n       \"2     [the, men, are, fighting, outside, a, deli, .]  \\n\",\n       \"3  [two, kids, in, numbered, jerseys, wash, their...  \\n\",\n       \"4  [two, kids, at, a, ballgame, wash, their, hand...  \"\n      ]\n     },\n     \"execution_count\": 5,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"dev.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"##  2. Model application, performance and analysis of the results\\n\",\n    \"The model has been implemented as a GenSen class with the specifics hidden inside the fit() method, so that no explicit call is needed. The algorithm operates in three different steps:\\n\",\n    \"\\n\",\n    \"**Model initialization**: This is where we tell our class how to train the model. The main parameters to specify are:\\n\",\n    \"1. config file which contains information about the number of training epochs, the minibatch size etc.\\n\",\n    \"2. cache_dir which is the folder where all the data will be saved.\\n\",\n    \"3. learning rate for the model\\n\",\n    \"4. path to the pretrained embedding vectors.\\n\",\n    \"\\n\",\n    \"**Model fit**: This is where we train the model on the data. The method takes three arguments: the training, dev and test set pandas dataframes. Note that the model is trained only on the training set, the test set is used to display the test set accuracy of the trained model, that in turn is an estimation of the generalization capabilities of the algorithm. 
It is generally useful to look at these quantities to have a first idea of the optimization behaviour.\\n\",\n    \"\\n\",\n    \"**Model prediction**: This is where we generate the similarity for a pair of sentences. Once the model has been trained and we are satisfied with its overall accuracy we use the saved model to show the similarity between two provided sentences. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.0 Download pretrained vectors\\n\",\n    \"In this example we use GloVe for pretrained embedding vectors.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {\n    \"pycharm\": {\n     \"name\": \"#%%\\n\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Vector file already exists. No changes made.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"pretrained_embedding_path = download_and_extract(base_data_path)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.1 Initialize Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {\n    \"pycharm\": {\n     \"name\": \"#%%\\n\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"clf = GenSenClassifier(config_file = config_filepath, \\n\",\n    \"                       pretrained_embedding_path = pretrained_embedding_path,\\n\",\n    \"                       learning_rate = 0.0001, \\n\",\n    \"                       cache_dir=base_data_path,\\n\",\n    \"                      max_epoch=max_epoch)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.2 Train Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {\n    \"pycharm\": {\n     \"name\": \"#%%\\n\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": 
\"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/modules/rnn.py:46: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.8 and num_layers=1\\n\",\n      \"  \\\"num_layers={}\\\".format(dropout, num_layers))\\n\",\n      \"../../examples/sentence_similarity/gensen_train.py:431: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.\\n\",\n      \"  torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)\\n\",\n      \"../../utils_nlp/models/gensen/utils.py:364: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\\n\",\n      \"  Variable(torch.LongTensor(sorted_src_lens), volatile=True)\\n\",\n      \"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/functional.py:1332: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\\n\",\n      \"  warnings.warn(\\\"nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\\\")\\n\",\n      \"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/functional.py:1320: UserWarning: nn.functional.tanh is deprecated. Use torch.tanh instead.\\n\",\n      \"  warnings.warn(\\\"nn.functional.tanh is deprecated. Use torch.tanh instead.\\\")\\n\",\n      \"../../examples/sentence_similarity/gensen_train.py:523: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.\\n\",\n      \"  torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)\\n\",\n      \"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/horovod/torch/__init__.py:163: UserWarning: optimizer.step(synchronize=True) called after optimizer.synchronize(). This can cause training slowdown. 
You may want to consider using optimizer.step(synchronize=False) if you use optimizer.synchronize() in your code.\\n\",\n      \"  warnings.warn(\\\"optimizer.step(synchronize=True) called after \\\"\\n\",\n      \"../../examples/sentence_similarity/gensen_train.py:243: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\\n\",\n      \"  f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\\n\",\n      \"../../examples/sentence_similarity/gensen_train.py:262: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\\n\",\n      \"  f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"CPU times: user 1h 19min 28s, sys: 22min 1s, total: 1h 41min 30s\\n\",\n      \"Wall time: 1h 41min 22s\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"%%time\\n\",\n    \"clf.fit(train, dev, test)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.3 Predict\\n\",\n    \"\\n\",\n    \"In the predict method we perform Pearson's Correlation computation [\\\\[2\\\\]](#References) on the outputs of the model. The predictions of the model can be further improved by hyperparameter tuning which we walk through in the other example [here](gensen_aml_deep_dive.ipynb). 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {\n    \"pycharm\": {\n     \"name\": \"#%%\\n\"\n    }\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"******** Similarity Score for sentences **************\\n\",\n      \"          0         1\\n\",\n      \"0  1.000000  0.966793\\n\",\n      \"1  0.966793  1.000000\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"sentences = [\\n\",\n    \"        'The sky is blue and beautiful',\\n\",\n    \"        'Love this blue and beautiful sky!'\\n\",\n    \"    ]\\n\",\n    \"\\n\",\n    \"results = clf.predict(sentences)\\n\",\n    \"print(\\\"******** Similarity Score for sentences **************\\\")\\n\",\n    \"print(results)\\n\",\n    \"\\n\",\n    \"# Record results with scrapbook for tests\\n\",\n    \"sb.glue(\\\"results\\\", results.to_dict())\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## References\\n\",\n    \"\\n\",\n    \"1. Subramanian, Sandeep and Trischler, Adam and Bengio, Yoshua and Pal, Christopher J, [*Learning general purpose distributed sentence representations via large scale multi-task learning*](https://arxiv.org/abs/1804.00079), ICLR, 2018.\\n\",\n    \"2. Pearson's Correlation Coefficient. url: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient\\n\",\n    \"3. Semantic textual similarity. url: http://nlpprogress.com/english/semantic_textual_similarity.html\\n\",\n    \"4. Minh-Thang Luong, Quoc V Le, Ilya Sutskever, Oriol Vinyals, and Lukasz Kaiser. [*Multi-task sequence to sequence learning*](https://arxiv.org/abs/1511.06114), 2015.\\n\",\n    \"5. Bryan McCann, James Bradbury, Caiming Xiong, and Richard Socher. [*Learned in translation: Contextualized word vectors*](https://arxiv.org/abs/1708.00107), 2017. 
\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"nlp_gpu\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  },\n  \"pycharm\": {\n   \"stem_cell\": {\n    \"cell_type\": \"raw\",\n    \"metadata\": {\n     \"collapsed\": false\n    },\n    \"source\": []\n   }\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/sentence_similarity/gensen_train.py",
    "content": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\nThe GenSen training process follows the steps:\n1. Create or load the dataset vocabulary\n2. Train on the training dataset for each batch epoch (batch size = 48 updates)\n3. Evaluate on the validation dataset for every 10 epoches\n4. Find the local minimum point on validation loss\n5. Save the best model and stop the training process\n\nAzureML provides AI Compute to train the model and track the performance.\nThis training process is based on GPU only.\n\n\"\"\"\nimport argparse\nimport json\nimport logging\nimport os\nimport time\n\nimport horovod.torch as hvd\nimport mlflow\nimport numpy as np\nimport torch\nimport torch.backends.cudnn as cudnn\nimport torch.nn as nn\nimport torch.nn.functional as f\nimport torch.optim as optim\n\nfrom utils_nlp.models.gensen.multi_task_model import MultitaskModel\nfrom utils_nlp.models.gensen.utils import (\n    BufferedDataIterator,\n    NLIIterator,\n    compute_validation_loss,\n)\n\ncudnn.benchmark = True\nlogger = logging.getLogger(__name__)\n\nhvd.init()\nif torch.cuda.is_available():\n    # Horovod: pin GPU to local rank.\n    torch.cuda.set_device(hvd.local_rank())\n\n\ndef metric_average(value, name):\n    \"\"\"\n    Sync the validation loss with nodes.\n    :param value:\n    :param name:\n    :return:\n    \"\"\"\n    tensor = torch.tensor(value)\n    avg_tensor = hvd.allreduce(tensor, name=name)\n    return avg_tensor.item()\n\n\ndef setup_horovod(model, learning_rate):\n    \"\"\" Setup for Horovod usage.\n\n    Args:\n        model(MultitaskModel): The MultitaskModel object.\n        learning_rate(float): Learning rate for the model.\n\n    Returns: hvd.DistributedOptimizer: Optimizer to use for computing\n    gradients and applying updates.\n\n    \"\"\"\n    # Horovod: scale learning rate by the number of GPUs.\n    optimizer = 
optim.Adam(model.parameters(), lr=learning_rate * hvd.size())\n\n    # Horovod: broadcast parameters & optimizer state.\n    hvd.broadcast_parameters(model.state_dict(), root_rank=0)\n    hvd.broadcast_optimizer_state(optimizer, root_rank=0)\n\n    # Horovod: (optional) compression algorithm.\n    compression = hvd.Compression.fp16\n\n    # Horovod: wrap optimizer with DistributedOptimizer.\n    optimizer = hvd.DistributedOptimizer(\n        optimizer,\n        named_parameters=model.named_parameters(),\n        compression=compression,\n    )\n\n    return optimizer\n\n\ndef setup_logging(config):\n    logging.basicConfig(\n        level=logging.INFO,\n        format=\"%(asctime)s - %(levelname)s - %(message)s\",\n        filename=\"log/%s\" % (config[\"data\"][\"task\"]),\n        filemode=\"w\",\n    )\n\n    console = logging.StreamHandler()\n    console.setLevel(logging.INFO)\n    formatter = logging.Formatter(\"%(asctime)s - %(levelname)s - %(message)s\")\n    console.setFormatter(formatter)\n    logging.getLogger(\"\").addHandler(console)\n\n\ndef log_config(config):\n    logging.info(\"Model Parameters : \")\n    logging.info(\"Task : %s \" % (config[\"data\"][\"task\"]))\n    logging.info(\n        \"Source Word Embedding Dim  : %s\" % (config[\"model\"][\"dim_word_src\"])\n    )\n    logging.info(\n        \"Target Word Embedding Dim  : %s\" % (config[\"model\"][\"dim_word_trg\"])\n    )\n    logging.info(\"Source RNN Hidden Dim  : %s\" % (config[\"model\"][\"dim_src\"]))\n    logging.info(\"Target RNN Hidden Dim  : %s\" % (config[\"model\"][\"dim_trg\"]))\n    logging.info(\n        \"Source RNN Bidirectional  : %s\" % (config[\"model\"][\"bidirectional\"])\n    )\n    logging.info(\"Batch Size : %d \" % (config[\"training\"][\"batch_size\"]))\n    logging.info(\"Optimizer : %s \" % (config[\"training\"][\"optimizer\"]))\n    logging.info(\"Learning Rate : %f \" % (config[\"training\"][\"lrate\"]))\n\n\ndef evaluate(\n    config,\n    train_iterator,\n   
 model,\n    loss_criterion,\n    monitor_epoch,\n    min_val_loss,\n    min_val_loss_epoch,\n    save_dir,\n    starting_time,\n    model_state,\n    max_epoch,\n):\n    \"\"\" Function to validate the model.\n\n    Args:\n        max_epoch(int): Limit training to specified number of epochs.\n        model_state(dict): Saved model weights.\n        config(dict): Config object.\n        train_iterator(BufferedDataIterator): BufferedDataIterator object.\n        model(MultitaskModel): The MultitaskModel object.\n        loss_criterion(nn.CrossEntropyLoss): Cross entropy loss.\n        monitor_epoch(int): Current epoch count.\n        min_val_loss(float): Minimum validation loss\n        min_val_loss_epoch(int): Epoch where the minimum validation\n            loss was seen.\n        save_dir(str): Directory path to save the model dictionary.\n        starting_time(time.Time): Starting time of the training.\n\n    Returns:\n        bool: Whether to continue training or not.\n    \"\"\"\n\n    break_flag = 0\n\n    for task_idx, task in enumerate(train_iterator.tasknames):\n        if \"skipthought\" in task:\n            continue\n        validation_loss = compute_validation_loss(\n            config,\n            model,\n            train_iterator,\n            loss_criterion,\n            task_idx,\n            lowercase=True,\n        )\n        validation_loss = metric_average(validation_loss, \"val_loss\")\n        logging.info(\"%s Validation Loss : %.3f\" % (task, validation_loss))\n\n        # Horovod: print output only on first rank.\n        if hvd.rank() == 0:\n            # log the best val accuracy to AML run\n            logging.info(\n                \"Best Validation Loss: {}\".format(np.float(validation_loss))\n            )\n\n        # If the validation loss is small enough, and it starts to go up.\n        # Should stop training.\n        # Small is defined by the number of epochs it lasts.\n        if validation_loss < min_val_loss:\n            
min_val_loss = validation_loss\n            min_val_loss_epoch = monitor_epoch\n            model_state = model.state_dict()\n\n        logging.info(\n            \"Monitor epoch: %d Validation Loss:  %.3f Min Validation Epoch: \"\n            \"%d Loss : %.3f \"\n            % (\n                monitor_epoch,\n                validation_loss,\n                min_val_loss_epoch,\n                min_val_loss,\n            )\n        )\n        if (monitor_epoch - min_val_loss_epoch) > config[\"training\"][\n            \"stop_patience\"\n        ] or (max_epoch is not None and monitor_epoch >= max_epoch):\n            logging.info(\"Saving model ...\")\n            # Save the name with validation loss.\n            torch.save(\n                model_state,\n                open(os.path.join(save_dir, \"best_model.model\"), \"wb\"),\n            )\n            # Let the training end.\n            break_flag = 1\n            break\n    if break_flag == 1:\n        logging.info(\"##### Training stopped at ##### %f\" % min_val_loss)\n        logging.info(\n            \"##### Training Time ##### %f seconds\"\n            % (time.time() - starting_time)\n        )\n        return True, min_val_loss_epoch, min_val_loss, model_state\n    else:\n        return False, min_val_loss_epoch, min_val_loss, model_state\n\n\ndef evaluate_nli(nli_iterator, model, batch_size, n_gpus):\n    \"\"\"\n\n    Args:\n        nli_iterator(NLIIterator): NLIIterator object.\n        model(MultitaskModel): Multitask model object.\n        batch_size(int): Batch size.\n        n_gpus(int): Number of gpus\n\n    \"\"\"\n    n_correct = 0.0\n    n_wrong = 0.0\n    for j in range(0, len(nli_iterator.dev_lines), batch_size * n_gpus):\n        minibatch = nli_iterator.get_parallel_minibatch(\n            j, batch_size * n_gpus, \"dev\"\n        )\n        class_logits = model(\n            minibatch, -1, return_hidden=False, paired_trg=None\n        )\n        class_preds = (\n            
f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\n        )\n        labels = minibatch[\"labels\"].data.cpu().numpy()\n        for pred, label in zip(class_preds, labels):\n            if pred == label:\n                n_correct += 1.0\n            else:\n                n_wrong += 1.0\n    logging.info(\"NLI Dev Acc : %.5f\" % (n_correct / (n_correct + n_wrong)))\n    n_correct = 0.0\n    n_wrong = 0.0\n    for j in range(0, len(nli_iterator.test_lines), batch_size * n_gpus):\n        minibatch = nli_iterator.get_parallel_minibatch(\n            j, batch_size * n_gpus, \"test\"\n        )\n        class_logits = model(\n            minibatch, -1, return_hidden=False, paired_trg=None\n        )\n        class_preds = (\n            f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\n        )\n        labels = minibatch[\"labels\"].data.cpu().numpy()\n        for pred, label in zip(class_preds, labels):\n            if pred == label:\n                n_correct += 1.0\n            else:\n                n_wrong += 1.0\n    logging.info(\"NLI Test Acc : %.5f\" % (n_correct / (n_correct + n_wrong)))\n    logging.info(\"******************************************************\")\n\n\ndef train(config, data_folder, learning_rate=0.0001, max_epoch=None):\n    \"\"\" Train the Gensen model.\n\n    Args:\n        max_epoch(int): Limit training to specified number of epochs.\n        config(dict): Loaded json file as a python object.\n        data_folder(str): Path to the folder containing the data.\n        learning_rate(float): Learning rate for the model.\n    \"\"\"\n    owd = os.getcwd()\n    os.chdir(data_folder)\n\n    try:\n        with mlflow.start_run():\n            save_dir = config[\"data\"][\"save_dir\"]\n            if not os.path.exists(\"./log\"):\n                os.makedirs(\"./log\")\n\n            os.makedirs(save_dir, exist_ok=True)\n\n            setup_logging(config)\n\n            batch_size = config[\"training\"][\"batch_size\"]\n 
           src_vocab_size = config[\"model\"][\"n_words_src\"]\n            trg_vocab_size = config[\"model\"][\"n_words_trg\"]\n            max_len_src = config[\"data\"][\"max_src_length\"]\n            max_len_trg = config[\"data\"][\"max_trg_length\"]\n            model_state = {}\n\n            train_src = [item[\"train_src\"] for item in config[\"data\"][\"paths\"]]\n            train_trg = [item[\"train_trg\"] for item in config[\"data\"][\"paths\"]]\n            tasknames = [item[\"taskname\"] for item in config[\"data\"][\"paths\"]]\n\n            # Keep track of indicies to train forward and backward jointly\n            if (\n                \"skipthought_next\" in tasknames\n                and \"skipthought_previous\" in tasknames\n            ):\n                skipthought_idx = tasknames.index(\"skipthought_next\")\n                skipthought_backward_idx = tasknames.index(\n                    \"skipthought_previous\"\n                )\n                paired_tasks = {\n                    skipthought_idx: skipthought_backward_idx,\n                    skipthought_backward_idx: skipthought_idx,\n                }\n            else:\n                paired_tasks = None\n                skipthought_idx = None\n                skipthought_backward_idx = None\n\n            train_iterator = BufferedDataIterator(\n                train_src,\n                train_trg,\n                src_vocab_size,\n                trg_vocab_size,\n                tasknames,\n                save_dir,\n                buffer_size=1e6,\n                lowercase=True,\n                seed=(hvd.rank() + 1) * 12345,\n            )\n\n            nli_iterator = NLIIterator(\n                train=config[\"data\"][\"nli_train\"],\n                dev=config[\"data\"][\"nli_dev\"],\n                test=config[\"data\"][\"nli_test\"],\n                vocab_size=-1,\n                vocab=os.path.join(save_dir, \"src_vocab.pkl\"),\n                seed=(hvd.rank() + 1) * 
12345,\n            )\n\n            src_vocab_size = len(train_iterator.src[0][\"word2id\"])\n            trg_vocab_size = len(train_iterator.trg[0][\"word2id\"])\n\n            # Logging set up.\n            logging.info(\"Finished creating iterator ...\")\n            log_config(config)\n            logging.info(\n                \"Found %d words in source : \"\n                % (len(train_iterator.src[0][\"id2word\"]))\n            )\n            for idx, taskname in enumerate(tasknames):\n                logging.info(\n                    \"Found %d target words in task %s \"\n                    % (len(train_iterator.trg[idx][\"id2word\"]), taskname)\n                )\n            logging.info(\"Found %d words in src \" % src_vocab_size)\n            logging.info(\"Found %d words in trg \" % trg_vocab_size)\n\n            weight_mask = torch.ones(trg_vocab_size).cuda()\n            weight_mask[train_iterator.trg[0][\"word2id\"][\"<pad>\"]] = 0\n            loss_criterion = nn.CrossEntropyLoss(weight=weight_mask).cuda()\n            nli_criterion = nn.CrossEntropyLoss().cuda()\n\n            model = MultitaskModel(\n                src_emb_dim=config[\"model\"][\"dim_word_src\"],\n                trg_emb_dim=config[\"model\"][\"dim_word_trg\"],\n                src_vocab_size=src_vocab_size,\n                trg_vocab_size=trg_vocab_size,\n                src_hidden_dim=config[\"model\"][\"dim_src\"],\n                trg_hidden_dim=config[\"model\"][\"dim_trg\"],\n                bidirectional=config[\"model\"][\"bidirectional\"],\n                pad_token_src=train_iterator.src[0][\"word2id\"][\"<pad>\"],\n                pad_token_trg=train_iterator.trg[0][\"word2id\"][\"<pad>\"],\n                nlayers_src=config[\"model\"][\"n_layers_src\"],\n                dropout=config[\"model\"][\"dropout\"],\n                num_tasks=len(train_iterator.src),\n                paired_tasks=paired_tasks,\n            ).cuda()\n\n            optimizer = 
setup_horovod(model, learning_rate=learning_rate)\n            logging.info(model)\n\n            n_gpus = config[\"training\"][\"n_gpus\"]\n            model = torch.nn.DataParallel(model, device_ids=range(n_gpus))\n\n            task_losses = [[] for _ in tasknames]\n            task_idxs = [0 for _ in tasknames]\n            nli_losses = []\n            updates = 0\n            nli_ctr = 0\n            nli_epoch = 0\n            monitor_epoch = 0\n            nli_mbatch_ctr = 0\n            mbatch_times = []\n            min_val_loss = 10000000\n            min_val_loss_epoch = -1\n            rng_num_tasks = (\n                len(tasknames) - 1 if paired_tasks else len(tasknames)\n            )\n            logging.info(\"OS Environ: \\n {} \\n\\n\".format(os.environ))\n            mlflow.log_param(\"learning_rate\", learning_rate)\n            logging.info(\"Commencing Training ...\")\n            start = time.time()\n            while True:\n                batch_start_time = time.time()\n                # Train NLI once every 10 minibatches of other tasks\n                if nli_ctr % 10 == 0:\n                    minibatch = nli_iterator.get_parallel_minibatch(\n                        nli_mbatch_ctr, batch_size * n_gpus\n                    )\n                    optimizer.zero_grad()\n                    class_logits = model(\n                        minibatch, -1, return_hidden=False, paired_trg=None\n                    )\n\n                    loss = nli_criterion(\n                        class_logits.contiguous().view(\n                            -1, class_logits.size(1)\n                        ),\n                        minibatch[\"labels\"].contiguous().view(-1),\n                    )\n\n                    # nli_losses.append(loss.data[0])\n                    nli_losses.append(loss.item())\n                    loss.backward()\n                    torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)\n                    optimizer.step()\n\n 
                   nli_mbatch_ctr += batch_size * n_gpus\n                    if nli_mbatch_ctr >= len(nli_iterator.train_lines):\n                        nli_mbatch_ctr = 0\n                        nli_epoch += 1\n                else:\n                    # Sample a random task\n                    task_idx = np.random.randint(low=0, high=rng_num_tasks)\n\n                    # Get a minibatch corresponding to the sampled task\n                    minibatch = train_iterator.get_parallel_minibatch(\n                        task_idx,\n                        task_idxs[task_idx],\n                        batch_size * n_gpus,\n                        max_len_src,\n                        max_len_trg,\n                    )\n\n                    \"\"\"Increment pointer into task and if current buffer is\n                    exhausted, fetch new buffer. \"\"\"\n                    task_idxs[task_idx] += batch_size * n_gpus\n                    if task_idxs[task_idx] >= train_iterator.buffer_size:\n                        train_iterator.fetch_buffer(task_idx)\n                        task_idxs[task_idx] = 0\n\n                    if task_idx == skipthought_idx:\n                        minibatch_back = train_iterator.get_parallel_minibatch(\n                            skipthought_backward_idx,\n                            task_idxs[skipthought_backward_idx],\n                            batch_size * n_gpus,\n                            max_len_src,\n                            max_len_trg,\n                        )\n                        task_idxs[skipthought_backward_idx] += (\n                            batch_size * n_gpus\n                        )\n                        if (\n                            task_idxs[skipthought_backward_idx]\n                            >= train_iterator.buffer_size\n                        ):\n                            train_iterator.fetch_buffer(\n                                skipthought_backward_idx\n                    
        )\n                            task_idxs[skipthought_backward_idx] = 0\n\n                        optimizer.zero_grad()\n                        decoder_logit, decoder_logit_2 = model(\n                            minibatch,\n                            task_idx,\n                            paired_trg=minibatch_back[\"input_trg\"],\n                        )\n\n                        loss_f = loss_criterion(\n                            decoder_logit.contiguous().view(\n                                -1, decoder_logit.size(2)\n                            ),\n                            minibatch[\"output_trg\"].contiguous().view(-1),\n                        )\n\n                        loss_b = loss_criterion(\n                            decoder_logit_2.contiguous().view(\n                                -1, decoder_logit_2.size(2)\n                            ),\n                            minibatch_back[\"output_trg\"].contiguous().view(-1),\n                        )\n\n                        task_losses[task_idx].append(loss_f.data[0])\n                        task_losses[skipthought_backward_idx].append(\n                            loss_b.data[0]\n                        )\n                        loss = loss_f + loss_b\n\n                    else:\n                        optimizer.zero_grad()\n                        decoder_logit = model(minibatch, task_idx)\n\n                        loss = loss_criterion(\n                            decoder_logit.contiguous().view(\n                                -1, decoder_logit.size(2)\n                            ),\n                            minibatch[\"output_trg\"].contiguous().view(-1),\n                        )\n\n                        task_losses[task_idx].append(loss.item())\n\n                    loss.backward()\n                    # For distributed optimizer need to sync before gradient\n                    # clipping.\n                    optimizer.synchronize()\n\n                    
torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)\n                    optimizer.step()\n\n                end = time.time()\n                mbatch_times.append(end - batch_start_time)\n\n                # Validations\n                if (\n                    updates % config[\"management\"][\"monitor_loss\"] == 0\n                    and updates != 0\n                ):\n                    monitor_epoch += 1\n                    for idx, task in enumerate(tasknames):\n                        logging.info(\n                            \"Seq2Seq Examples Processed : %d %s Loss : %.5f Num %s \"\n                            \"minibatches : %d\"\n                            % (\n                                updates,\n                                task,\n                                np.mean(task_losses[idx]),\n                                task,\n                                len(task_losses[idx]),\n                            )\n                        )\n                        mlflow.log_metric(\n                            \"validation_loss\",\n                            np.mean(task_losses[idx]),\n                            step=monitor_epoch,\n                        )\n\n                    logging.info(\n                        \"Round: %d NLI Epoch : %d NLI Examples Processed : %d NLI \"\n                        \"Loss : %.5f \"\n                        % (\n                            nli_ctr,\n                            nli_epoch,\n                            nli_mbatch_ctr,\n                            np.mean(nli_losses),\n                        )\n                    )\n                    mlflow.log_metric(\n                        \"nli_loss\", np.mean(nli_losses), step=nli_epoch\n                    )\n\n                    logging.info(\n                        \"Average time per minibatch : %.5f\"\n                        % (np.mean(mbatch_times))\n                    )\n                    mlflow.log_metric(\n                  
      \"minibatch_avg_duration\", np.mean(mbatch_times)\n                    )\n\n                    task_losses = [[] for _ in tasknames]\n                    mbatch_times = []\n                    nli_losses = []\n\n                    # For validate and break if done.\n                    logging.info(\"############################\")\n                    logging.info(\"##### Evaluating model #####\")\n                    logging.info(\"############################\")\n                    training_complete, min_val_loss_epoch, min_val_loss, model_state = evaluate(\n                        config=config,\n                        train_iterator=train_iterator,\n                        model=model,\n                        loss_criterion=loss_criterion,\n                        monitor_epoch=monitor_epoch,\n                        min_val_loss=min_val_loss,\n                        min_val_loss_epoch=min_val_loss_epoch,\n                        save_dir=save_dir,\n                        starting_time=start,\n                        model_state=model_state,\n                        max_epoch=max_epoch,\n                    )\n                    if training_complete:\n                        mlflow.log_metric(\"min_val_loss\", float(min_val_loss))\n                        mlflow.log_metric(\"learning_rate\", learning_rate)\n                        break\n\n                    logging.info(\"Evaluating on NLI\")\n                    evaluate_nli(\n                        nli_iterator=nli_iterator,\n                        model=model,\n                        n_gpus=n_gpus,\n                        batch_size=batch_size,\n                    )\n\n                updates += batch_size * n_gpus\n                nli_ctr += 1\n                logging.info(\"Updates: %d\" % updates)\n    finally:\n        os.chdir(owd)\n\n\ndef read_config(json_file):\n    \"\"\"Read JSON config.\"\"\"\n    json_object = json.load(open(json_file, \"r\", encoding=\"utf-8\"))\n    return 
json_object\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--config\", help=\"path to json config\", required=True)\n    parser.add_argument(\"--data_folder\", type=str, help=\"data folder\")\n    # Add learning rate to tune model.\n    parser.add_argument(\n        \"--learning_rate\", type=float, default=0.0001, help=\"learning rate\"\n    )\n    parser.add_argument(\n        \"--max_epoch\",\n        type=int,\n        default=None,\n        help=\"Limit training to specified number of epochs.\",\n    )\n\n    args = parser.parse_args()\n    data_path = args.data_folder\n    lr = args.learning_rate\n\n    config_file_path = args.config\n    max_epoch = args.max_epoch\n    config_obj = read_config(config_file_path)\n    train(config_obj, data_path, lr, max_epoch)\n"
  },
  {
    "path": "examples/sentence_similarity/gensen_wrapper.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\nimport json\nimport os\n\nfrom examples.sentence_similarity.gensen_train import train\nfrom utils_nlp.eval.classification import compute_correlation_coefficients\nfrom utils_nlp.models.gensen.create_gensen_model import (\n    create_multiseq2seq_model,\n)\nfrom utils_nlp.models.gensen.gensen import GenSenSingle\nfrom utils_nlp.models.gensen.preprocess_utils import gensen_preprocess\n\n\nclass GenSenClassifier:\n    \"\"\" GenSen Classifier that trains a model on several NLP tasks.\n\n    learning_rate (str): The learning rate for the model.\n\n    config_file (str) : Configuration file that is used to train the model. This\n    specifies the batch size, directories to load and save the model.\n\n    cache_dir (str) : Location of GenSen's data directory.\n\n    \"\"\"\n\n    def __init__(\n        self,\n        config_file,\n        pretrained_embedding_path,\n        learning_rate=0.0001,\n        cache_dir=\".\",\n        max_epoch=None,\n    ):\n        self.learning_rate = learning_rate\n        self.config_file = config_file\n        self.cache_dir = cache_dir\n        self.pretrained_embedding_path = pretrained_embedding_path\n        self.model_name = \"gensen_multiseq2seq\"\n        self.max_epoch = max_epoch\n\n        self._validate_params()\n\n    def _validate_params(self):\n        \"\"\"Validate input params.\"\"\"\n\n        if not isinstance(self.learning_rate, float) or (\n            self.learning_rate <= 0.0\n        ):\n            raise ValueError(\n                \"Learning rate must be of type float and greater than 0\"\n            )\n\n        assert os.path.isfile(self.pretrained_embedding_path)\n\n        try:\n            f = open(self.config_file)\n            self.config = self._read_config(self.config_file)\n            f.close()\n        except FileNotFoundError:\n            raise FileNotFoundError(\"Provided config file 
does not exist!\")\n\n    def _get_gensen_tokens(self, train_df=None, dev_df=None, test_df=None):\n        \"\"\"\n\n        Args:\n            train_df(pd.Dataframe): A dataframe containing tokenized sentences from\n        the training set.\n            dev_df(pd.Dataframe): A dataframe containing tokenized\n        sentences from the validation set.\n            test_df(pd.Dataframe): A dataframe containing tokenized sentences from the\n        test set.\n\n        Returns:\n            str: Path to the folder containing all preprocessed token files.\n\n        \"\"\"\n        return gensen_preprocess(train_df, dev_df, test_df, self.cache_dir)\n\n    @staticmethod\n    def _read_config(config_file):\n        \"\"\" Read JSON config.\n\n        Args:\n            config_file: Path to the config file.\n\n        Returns\n            dict: The loaded json file as python object\n\n        \"\"\"\n        json_object = json.load(open(config_file, \"r\", encoding=\"utf-8\"))\n        return json_object\n\n    def _create_multiseq2seq_model(self):\n        \"\"\" Method that creates a GenSen model from a MultiSeq2Seq model.\"\"\"\n\n        create_multiseq2seq_model(\n            save_folder=os.path.join(\n                self.cache_dir, self.config[\"data\"][\"save_dir\"]\n            ),\n            save_name=self.model_name,\n            trained_model_folder=os.path.join(\n                self.cache_dir, self.config[\"data\"][\"save_dir\"]\n            ),\n        )\n\n    def fit(self, train_df, dev_df, test_df):\n\n        \"\"\" Method to train the Gensen model.\n\n        Args:\n            train_df: A dataframe containing tokenized sentences from the training set.\n            dev_df: A dataframe containing tokenized sentences from the validation set.\n            test_df: A dataframe containing tokenized sentences from the test set.\n        \"\"\"\n\n        self.cache_dir = self._get_gensen_tokens(train_df, dev_df, test_df)\n\n        train(\n            
data_folder=os.path.abspath(self.cache_dir),\n            config=self.config,\n            learning_rate=self.learning_rate,\n            max_epoch=self.max_epoch,\n        )\n\n        self._create_multiseq2seq_model()\n\n    def predict(self, sentences):\n\n        \"\"\"\n\n        Method to predict the model on the test dataset. This uses SentEval utils.\n\n        Args:\n            sentences(list) : List of sentences.\n\n        Returns\n            pd.Dataframe: A pairwise cosine similarity for the sentences provided based on their\n            gensen vector representations.\n\n        \"\"\"\n\n        # self.cache_dir = os.path.join(self.cache_dir, \"clean/snli_1.0\")\n        # self._create_multiseq2seq_model()\n\n        gensen_model = GenSenSingle(\n            model_folder=os.path.join(\n                self.cache_dir, self.config[\"data\"][\"save_dir\"]\n            ),\n            filename_prefix=self.model_name,\n            pretrained_emb=self.pretrained_embedding_path,\n        )\n\n        reps_h, reps_h_t = gensen_model.get_representation(\n            sentences, pool=\"last\", return_numpy=True, tokenize=True\n        )\n\n        return compute_correlation_coefficients(reps_h_t)\n"
  },
  {
    "path": "examples/sentiment_analysis/absa/README.md",
    "content": "# Aspect Based Sentiment Analysis\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for training [Aspect Based Sentiment Analysis Models using Intel's NLP Architect](http://nlp_architect.nervanasys.com/absa.html)\n models with the Azure Machine Learning service.\n\n# What is Aspect Based Sentiment Analysis?\n\nAspect based sentiment analysis (ABSA) is an advanced sentiment analysis technique that identifies and provides corresponding sentiment scores to the aspects of a given text. ABSA is a powerful tool for getting actionable insight from your text data.\n\nFor example, consider the following restaurant review:\n\n```\nThe ambiance is charming. Uncharacteristically, the service was DREADFUL.When we wanted to pay our bill at the end of the evening, our waitress was nowhere to be found...\n```\n\nWhile traditional sentiment analysis models such as [Azure Text Analytics](https://azure.microsoft.com/en-us/services/cognitive-services/text-analytics/?WT.mc_id=absa-notebook-abornst) will correctly classify the sentiment of this review as negative, an aspect based model will provide more granular insight by highlighting the fact that while the **service** and **waitress** provided a negative experience, the restaurant's **ambiance** was indeed positive.\n\n## Summary\n\n|Notebook|Environment|Description|Dataset|\n|---|---|---|---|\n|[Aspect based sentiment analysis](absa.ipynb)|Local| A notebook for training and deploying [Aspect Based Sentiment Analysis Models using Intel's NLP Architect](http://nlp_architect.nervanasys.com/absa.html) |\n"
  },
  {
    "path": "examples/sentiment_analysis/absa/absa.ipynb",
    "content": "{\"cells\":[{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Copyright (c) Microsoft Corporation. All rights reserved.\\n\",\"\\n\",\"Licensed under the MIT License.\\n\",\"\\n\",\"### Intel NLP-Architect ABSA on AzureML \\n\",\"\\n\",\"This notebook contains an end-to-end walkthrough of using Azure Machine Learning Service to train, finetune and test [Aspect Based Sentiment Analysis Models using Intel's NLP Architect](http://nlp_architect.nervanasys.com/absa.html)\\n\",\"\\n\",\"### Prerequisites\\n\",\"\\n\",\"* Understand the architecture and terms introduced by Azure Machine Learning (AML)\\n\",\"* Have working Jupyter Notebook Environment. You can:\\n\",\"    - Install Python environment locally, as described below in **Local Installation**\\n\",\"    - Use [Azure Notebooks](https://docs.microsoft.com/ru-ru/azure/notebooks/azure-notebooks-overview/?wt.mc_id=absa-notebook-abornst). In this case you should upload the `absa.ipynb` file to a new Azure Notebooks project, or just clone the [GitHub Repo](https://github.com/microsoft/ignite-learning-paths/tree/master/aiml/aiml40).\\n\",\"* Azure Machine Learning Workspace in your Azure Subscription\\n\",\"\\n\",\"#### Local Installation\\n\",\"\\n\",\"Install the Python SDK: make sure to install notebook, and contrib:\\n\",\"\\n\",\"```shell\\n\",\"conda create -n azureml -y Python=3.6\\n\",\"source activate azureml\\n\",\"pip install --upgrade azureml-sdk[notebooks,contrib] \\n\",\"conda install ipywidgets\\n\",\"jupyter nbextension install --py --user azureml.widgets\\n\",\"jupyter nbextension enable azureml.widgets --user --py\\n\",\"```\\n\",\"\\n\",\"You will need to restart jupyter after this Detailed instructions are [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-create-workspace-with-python/?WT.mc_id=absa-notebook-abornst)\\n\",\"\\n\",\"If you need a free trial account to get started you can get one 
[here](https://azure.microsoft.com/en-us/offers/ms-azr-0044p/?WT.mc_id=absa-notebook-abornst)\\n\",\"\\n\",\"#### Creating Azure ML Workspace\\n\",\"\\n\",\"Azure ML Workspace can be created by using one of the following ways:\\n\",\"* Manually through [Azure Portal](http://portal.azure.com/?WT.mc_id=absa-notebook-abornst) - [here is the complete walkthrough](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace/?wt.mc_id=absa-notebook-abornst)\\n\",\"* Using [Azure CLI](https://docs.microsoft.com/ru-ru/cli/azure/?view=azure-cli-latest&wt.mc_id=absa-notebook-abornst), using the following commands:\\n\",\"\\n\",\"```shell\\n\",\"az extension add -n azure-cli-ml\\n\",\"az group create -n absa -l westus2\\n\",\"az ml workspace create -w absa_space -g absa\\n\",\"```\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Initialize workspace\\n\",\"\\n\",\"To access an Azure ML Workspace, you will need to import the AML library and the following information:\\n\",\"* A name for your workspace (in our example - `absa_space`)\\n\",\"* Your subscription id (can be obtained by running `az account list`)\\n\",\"* The resource group name (in our case `absa`)\\n\",\"\\n\",\"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace/?WT.mc_id=absa-notebook-abornst) object from the existing workspace you created in the Prerequisites step or create a new one. 
\"]},{\"cell_type\":\"code\",\"execution_count\":20,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core import Workspace\\n\",\"\\n\",\"#subscription_id = ''\\n\",\"#resource_group  = 'absa'\\n\",\"#workspace_name  = 'absa_space'\\n\",\"#ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)\\n\",\"#ws.write_config()\\n\",\"\\n\",\"try:\\n\",\"    ws = Workspace.from_config()\\n\",\"    print(ws.name, ws.location, ws.resource_group, ws.location, sep='\\\\t')\\n\",\"    print('Library configuration succeeded')\\n\",\"except:\\n\",\"    print('Workspace not found')\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Compute\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"There are two computer option run once(preview) and persistent compute for this demo we will use persistent compute to learn more about run once compute check out the [docs](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute?WT.mc_id=absa-notebook-abornst).\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core.compute import ComputeTarget, AmlCompute\\n\",\"from azureml.core.compute_target import ComputeTargetException\\n\",\"\\n\",\"# Choose a name for your CPU cluster\\n\",\"cluster_name = \\\"absa-cluster\\\"\\n\",\"\\n\",\"# Verify that cluster does not exist already\\n\",\"try:\\n\",\"    cluster = ComputeTarget(workspace=ws, name=cluster_name)\\n\",\"    print('Found existing cluster, use it.')\\n\",\"except ComputeTargetException:\\n\",\"    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D3_V2',\\n\",\"                                                           vm_priority='lowpriority',\\n\",\"                                                           min_nodes=1,\\n\",\"                                                         
  max_nodes=4)\\n\",\"    cluster = ComputeTarget.create(ws, cluster_name, compute_config)\\n\",\"\\n\",\"cluster.wait_for_completion(show_output=True)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Upload Data\\n\",\"\\n\",\"The dataset we are using comes from the [womens ecommerce clothing reviews dataset](https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews/) and is in the open domain, this can be replaced with any csv file with rows of text as the absa model is unsupervised. \\n\",\"\\n\",\"The documentation for uploading data can be found [here](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.azure_storage_datastore.azureblobdatastore/?WT.mc_id=absa-notebook-abornst) for now we will us the ds.upload command. \"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{\"scrolled\":true},\"outputs\":[],\"source\":[\"!wget -O 'dataset/glove.840B.300d.zip' 'http://nlp.stanford.edu/data/glove.840B.300d.zip'\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"import os                            \\n\",\"lib_root = os.path.dirname(os.path.abspath(\\\"__file__\\\"))\\n\",\"ds = ws.get_default_datastore()\\n\",\"ds.upload('./dataset', target_path='clothing_data', overwrite=True, show_progress=True)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Now the the glove file is uploaded to our datastore we can remove it from our local directory.\"]},{\"cell_type\":\"code\",\"execution_count\":31,\"metadata\":{},\"outputs\":[],\"source\":[\"!rm 'dataset/glove.840B.300d.zip'\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Train File\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"%%writefile train.py\\n\",\"import argparse\\n\",\"import json\\n\",\"import os \\n\",\"from pathlib import Path\\n\",\"from nltk import flatten\\n\",\"from azureml.core 
import Run\\n\",\"from sklearn.metrics import f1_score\\n\",\"from azureml.core.model import Model\\n\",\"\\n\",\"# Load NLP Architect\\n\",\"from nlp_architect.models.absa.train.train import TrainSentiment\\n\",\"from nlp_architect.models.absa.inference.inference import SentimentInference\\n\",\"\\n\",\"# Inputs\\n\",\"parser = argparse.ArgumentParser(description='ABSA Train')\\n\",\"parser.add_argument('--data_folder', type=str, dest='data_folder', help='data folder mounting point')\\n\",\"parser.add_argument('--asp_thresh', type=int, default=3)\\n\",\"parser.add_argument('--op_thresh', type=int, default=2)\\n\",\"parser.add_argument('--max_iter', type=int, default=3)\\n\",\"\\n\",\"args = parser.parse_args()\\n\",\"\\n\",\"# Download ABSA dependencies including spacy parser and glove embeddings \\n\",\"from spacy.cli.download import download as spacy_download\\n\",\"from nlp_architect.utils.io import uncompress_file\\n\",\"from nlp_architect.models.absa import TRAIN_OUT\\n\",\"\\n\",\"spacy_download('en')\\n\",\"GLOVE_ZIP = os.path.join(args.data_folder, \\n\",\"                                 'clothing_data/glove.840B.300d.zip')\\n\",\"EMBEDDING_PATH = TRAIN_OUT / 'word_emb_unzipped' / 'glove.840B.300d.txt'\\n\",\"\\n\",\"\\n\",\"uncompress_file(GLOVE_ZIP, Path(EMBEDDING_PATH).parent)\\n\",\"\\n\",\"clothing_train = os.path.join(args.data_folder, \\n\",\"                                 'clothing_data/clothing_absa_train_small.csv')\\n\",\"\\n\",\"os.makedirs('outputs', exist_ok=True)\\n\",\"\\n\",\"train = TrainSentiment(asp_thresh=args.asp_thresh,\\n\",\"                       op_thresh=args.op_thresh, \\n\",\"                       max_iter=args.max_iter)\\n\",\"\\n\",\"opinion_lex, aspect_lex = train.run(data=clothing_train,\\n\",\"                                    out_dir = './outputs')\\n\",\"\\n\",\"# Evaluation \\n\",\"# Although ABSA is an unsupervised method it can be metriced with a small sample of labeled data\\n\",\"def doc2IO(doc):\\n\",\"    
\\\"\\\"\\\"\\n\",\"    Converts ABSA doc to IO span format for evaluation\\n\",\"    \\\"\\\"\\\"\\n\",\"    index = 0\\n\",\"    aspect_indexes = []\\n\",\"    doc_json = json.loads(doc.json())\\n\",\"    tokens = doc_json[\\\"_doc_text\\\"].split()\\n\",\"    io = [[t,'O'] for t in tokens]\\n\",\"    for t_index, token in enumerate(tokens):\\n\",\"        for s in doc_json[\\\"_sentences\\\"]:\\n\",\"            for ev in s[\\\"_events\\\"]:\\n\",\"                for e in ev:\\n\",\"                    if e[\\\"_type\\\"] == \\\"ASPECT\\\":\\n\",\"                        if e[\\\"_start\\\"] == index and all(aspect[0] != t_index for aspect in aspect_indexes):\\n\",\"                            io[t_index][1] = \\\"{}-{}\\\".format(e[\\\"_text\\\"], e[\\\"_polarity\\\"])\\n\",\"        index += len(token) + 1\\n\",\"    \\n\",\"    return io\\n\",\"\\n\",\"inference = SentimentInference('./outputs/train_out/generated_aspect_lex.csv', \\n\",\"                               './outputs/train_out/generated_opinion_lex_reranked.csv')\\n\",\"\\n\",\"clothing_val = os.path.join(args.data_folder, \\n\",\"                                 'clothing_data/clothing-absa-validation.json')\\n\",\"\\n\",\"with open(clothing_val) as json_file:\\n\",\"    val = json.load(json_file)\\n\",\"\\n\",\"predictions = []\\n\",\"for doc in val[\\\"data\\\"]:\\n\",\"    doc_raw = \\\" \\\".join([token[0] for token in doc])\\n\",\"    sentiment_doc = inference.run(doc=doc_raw)\\n\",\"    predictions.append(doc2IO(sentiment_doc))\\n\",\"    \\n\",\"y_pred = flatten(predictions)[1::2]\\n\",\"y_true = flatten(val['data'])[1::2]\\n\",\"\\n\",\"from sklearn.metrics import f1_score\\n\",\"\\n\",\"# Log metrics\\n\",\"run = Run.get_context()\\n\",\"run.log('Aspect Lexicon Size', len(aspect_lex))\\n\",\"run.log('Opinion Lexicon Size', len(opinion_lex))\\n\",\"run.log('f1_weighted', float(f1_score(y_true, y_pred, 
average='weighted')))\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Create An Experiment\\n\",\"\\n\",\"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment/?WT.mc_id=absa-notebook-abornst) to track all the runs in your workspace for this ABSA tutorial. \"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"PIP_PACKAGES = ['nlp-architect',\\n\",\"                'spacy==2.1.8']\"]},{\"cell_type\":\"code\",\"execution_count\":23,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core import Experiment\\n\",\"experiment_name = 'absa'\\n\",\"exp = Experiment(workspace=ws, name=experiment_name)\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.train.estimator import Estimator\\n\",\"\\n\",\"script_params = {\\n\",\"    '--data_folder': ds,\\n\",\"}\\n\",\"\\n\",\"nlp_est = Estimator(source_directory='.',\\n\",\"                   script_params=script_params,\\n\",\"                   compute_target=cluster,\\n\",\"                   environment_variables = {'NLP_ARCHITECT_BE':'CPU'},\\n\",\"                   entry_script='train.py',\\n\",\"                   pip_packages=PIP_PACKAGES\\n\",\")\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"To create a run we just submit our experiment as follows.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"run = exp.submit(nlp_est)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Note: If you accidentally run the following cell more than once you can cancel a run with the run.cancel() command.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"# run.cancel()\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"You can 
load any previous run using its run id\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"run.id\"]},{\"cell_type\":\"code\",\"execution_count\":24,\"metadata\":{},\"outputs\":[],\"source\":[\"run = [r for r in exp.get_runs() if r.id == 'absa_1570979110_e5021352'][0]\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Let's visualize our run:\"]},{\"cell_type\":\"code\",\"execution_count\":25,\"metadata\":{\"scrolled\":true},\"outputs\":[],\"source\":[\"from azureml.widgets import RunDetails\\n\",\"\\n\",\"RunDetails(run).show()\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Fine-Tuning NLP Architect with AzureML HyperDrive\\n\",\"Although ABSA is an unsupervised method, its hyperparameters, such as the aspect and opinion word thresholds, can be fine-tuned if provided with a small sample of labeled data.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.train.hyperdrive import *\\n\",\"import math\\n\",\"\\n\",\"param_sampling = RandomParameterSampling({\\n\",\"         '--asp_thresh': choice(range(2,5)),\\n\",\"         '--op_thresh': choice(range(2,5)), \\n\",\"         '--max_iter': choice(range(2,5))\\n\",\"    })\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Early Termination Policy\\n\",\"First we will define an early termination policy. [Median stopping](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.medianstoppingpolicy?WT.mc_id=absa-notebook-abornst) is an early termination policy based on running averages of primary metrics reported by the runs. This policy computes running averages across all training runs and terminates runs whose performance is worse than the median of the running averages. 
\\n\",\"\\n\",\"This policy takes the following configuration parameters:\\n\",\"\\n\",\"- evaluation_interval: the frequency for applying the policy (optional parameter).\\n\",\"- delay_evaluation: delays the first policy evaluation for a specified number of intervals (optional parameter).\\n\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"early_termination_policy = MedianStoppingPolicy(evaluation_interval=1, delay_evaluation=0)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Refer [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters#specify-early-termination-policy?WT.mc_id=absa-notebook-abornst) for more information on the Median stopping policy and other policies available.\\n\",\"\\n\",\"Now that we've defined our early termination policy we can define our Hyper Drive configuration to maximize our Model's weighted F1 score. Hyper Drive can optimize any metric as long as it's logged by the training script. 
\\n\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"hd_config = HyperDriveConfig(estimator=nlp_est,\\n\",\"                            hyperparameter_sampling=param_sampling,\\n\",\"                            policy=early_termination_policy,\\n\",\"                            primary_metric_name='f1_weighted',\\n\",\"                            primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,\\n\",\"                            max_total_runs=16,\\n\",\"                            max_concurrent_runs=4)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Finally, launch the hyperparameter tuning job.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"experiment = Experiment(workspace=ws, name='absa_hyperdrive')\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"hyperdrive_run = experiment.submit(hd_config)\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"hyperdrive_run.id\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"hyperdrive_run = [r for r in experiment.get_runs() if r.id == 'absa_hyperdrive_1571092544235933'][0]\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Monitor HyperDrive runs\\n\",\"We can monitor the progress of the runs with the following Jupyter widget. 
\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{\"scrolled\":false},\"outputs\":[],\"source\":[\"from azureml.widgets import RunDetails\\n\",\"\\n\",\"RunDetails(hyperdrive_run).show()\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"hyperdrive_run.cancel()\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Find and register the best model\\n\",\"Once all the runs complete, we can find the run that produced the model with the highest evaluation (METRIC TBD).\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"best_run = hyperdrive_run.get_best_run_by_primary_metric()\\n\",\"best_run_metrics = best_run.get_metrics()\\n\",\"print(best_run)\\n\",\"print('Best Run is:\\\\n  F1: {0:.5f}'.format(\\n\",\"        best_run_metrics['f1_weighted']\\n\",\"     ))\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Register Model Outputs\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"aspect_lex = run.register_model(model_name='c_aspect_lex', model_path='outputs/train_out/generated_aspect_lex.csv')\\n\",\"opinion_lex = run.register_model(model_name='c_opinion_lex', model_path='outputs/train_out/generated_opinion_lex_reranked.csv')\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Test Locally\\n\",\"\\n\",\"### Install Local PIP Dependencies\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"!pip install git+https://github.com/NervanaSystems/nlp-architect.git@absa\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"!pip install spacy==2.0.18\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Load Model From 
AzureML\"]},{\"cell_type\":\"code\",\"execution_count\":26,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core.model import Model\\n\",\"from nlp_architect.models.absa.inference.inference import SentimentInference\\n\",\"c_aspect_lex = Model._get_model_path_remote('c_aspect_lex', 1, ws)\\n\",\"c_opinion_lex = Model._get_model_path_remote('c_opinion_lex', 1, ws)   \\n\",\"inference = SentimentInference(c_aspect_lex, c_opinion_lex)\\n\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Run Model On Sample Data \"]},{\"cell_type\":\"code\",\"execution_count\":27,\"metadata\":{},\"outputs\":[],\"source\":[\"docs = [\\\"Loved the sweater but hated the pants\\\",\\n\",\"       \\\"Really great outfit, but the shirt is the wrong size\\\",\\n\",\"       \\\"I absolutely love this jacket! i wear it almost everyday. works as a cardigan or a jacket. my favorite retailer purchase so far\\\"]\\n\",\"\\n\",\"sentiment_docs = []\\n\",\"\\n\",\"for doc_raw in docs:\\n\",\"    sentiment_doc = inference.run(doc=doc_raw)\\n\",\"    sentiment_docs.append(sentiment_doc)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Visualize Model Results\"]},{\"cell_type\":\"code\",\"execution_count\":28,\"metadata\":{},\"outputs\":[],\"source\":[\"import spacy\\n\",\"from spacy import displacy\\n\",\"from nlp_architect.models.absa.inference.data_types import TermType\\n\",\"ents = []\\n\",\"for doc in sentiment_docs:    \\n\",\"    if doc:\\n\",\"        doc_viz = {'text':doc._doc_text, 'ents':[]}\\n\",\"        for s in doc._sentences:\\n\",\"            for ev in s._events:\\n\",\"                for e in ev:\\n\",\"                    if e._type == TermType.ASPECT:\\n\",\"                        ent = {'start': e._start, 'end': e._start + e._len,\\n\",\"                               'label':str(e._polarity.value), \\n\",\"                               'text':str(e._text)}\\n\",\"                        if 
all(kown_e['start'] != ent['start'] for kown_e in ents):\\n\",\"                            ents.append(ent)\\n\",\"                            doc_viz['ents'].append(ent)\\n\",\"        doc_viz['ents'].sort(key=lambda m: m[\\\"start\\\"])\\n\",\"        displacy.render(doc_viz, style=\\\"ent\\\", options={'colors':{'POS':'#7CFC00', 'NEG':'#FF0000'}}, manual=True)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"# Package Model For Deployment\\n\",\"\\n\",\"## Create scoring script\\n\",\"Create the scoring script, called score.py, used by the web service call to show how to use the model.\\n\",\"\\n\",\"You must include two required functions into the scoring script:\\n\",\"\\n\",\"The init() function, which typically loads the model into a global object. This function is run only once when the Docker container is started.\\n\",\"\\n\",\"The run(input_data) function uses the model to predict a value based on the input data. Inputs and outputs to the run typically use JSON for serialization and de-serialization, but other formats are supported.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{\"scrolled\":false},\"outputs\":[],\"source\":[\"%%writefile score.py\\n\",\"from azureml.core.model import Model\\n\",\"from nlp_architect.models.absa.inference.inference import SentimentInference\\n\",\"from spacy.cli.download import download as spacy_download\\n\",\"\\n\",\"\\n\",\"def init():\\n\",\"    \\\"\\\"\\\"\\n\",\"    Set up the ABSA model for Inference  \\n\",\"    \\\"\\\"\\\"\\n\",\"    global SentInference\\n\",\"    spacy_download('en')\\n\",\"    aspect_lex = Model.get_model_path('c_aspect_lex')\\n\",\"    opinion_lex = Model.get_model_path('c_opinion_lex') \\n\",\"    SentInference = SentimentInference(aspect_lex, opinion_lex)\\n\",\"\\n\",\"def run(raw_data):\\n\",\"    \\\"\\\"\\\"\\n\",\"    Evaluate the model and return JSON string\\n\",\"    \\\"\\\"\\\"\\n\",\"    sentiment_doc = 
SentInference.run(doc=raw_data)\\n\",\"    return sentiment_doc.json()\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Create configuration files\\n\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Create Environment File\\n\",\"Create an environment file, called myenv.yml, that specifies all of the script's package dependencies. This file is used to ensure that all of those dependencies are installed in the Docker image. This model needs nlp-architect and the azureml-sdk. \"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core.conda_dependencies import CondaDependencies \\n\",\"\\n\",\"pip = [\\\"azureml-defaults\\\", \\\"azureml-monitoring\\\", \\n\",\"       \\\"git+https://github.com/NervanaSystems/nlp-architect.git@absa\\\", \\n\",\"       \\\"spacy==2.0.18\\\"]\\n\",\"\\n\",\"myenv = CondaDependencies.create(pip_packages=pip)\\n\",\"\\n\",\"with open(\\\"myenv.yml\\\",\\\"w\\\") as f:\\n\",\"    f.write(myenv.serialize_to_string())\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Create Environment Config\\n\",\"Create an Environment configuration file and specify the environment and environment variables required for the application\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core import Environment\\n\",\"deploy_env = Environment.from_conda_specification('absa_env', \\\"myenv.yml\\\")\\n\",\"deploy_env.environment_variables={'NLP_ARCHITECT_BE': 'CPU'}\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Inference Config \\n\",\"Create an inference configuration that receives the deployment environment and the entry script\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core.model import InferenceConfig\\n\",\"inference_config = 
InferenceConfig(environment=deploy_env,\\n\",\"                                   entry_script=\\\"score.py\\\")\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Package Model and Pull \\n\",\"Create an inference configuration that receives the deployment environment and the entry script\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{\"scrolled\":true},\"outputs\":[],\"source\":[\"package = Model.package(ws, [aspect_lex, opinion_lex], inference_config)\\n\",\"package.wait_for_creation(show_output=True)\\n\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"package.pull()\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Next Steps\\n\",\"\\n\",\"We have now gone through all the steps for production training of a custom open source model using the AzureML Service. Check out AIML50 to learn how to deploy models and manage re-training pipelines.\"]}],\"nbformat\":4,\"nbformat_minor\":2,\"metadata\":{\"language_info\":{\"name\":\"python\",\"codemirror_mode\":{\"name\":\"ipython\",\"version\":3}},\"orig_nbformat\":2,\"file_extension\":\".py\",\"mimetype\":\"text/x-python\",\"name\":\"python\",\"npconvert_exporter\":\"python\",\"pygments_lexer\":\"ipython3\",\"version\":3}}\n"
  },
  {
    "path": "examples/sentiment_analysis/absa/absa_azureml.ipynb",
    "content": "{\"cells\":[{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Copyright (c) Microsoft Corporation. All rights reserved.\\n\",\"\\n\",\"Licensed under the MIT License.\\n\",\"\\n\",\"### Intel NLP-Architect ABSA on AzureML \\n\",\"\\n\",\"This notebook contains an end-to-end walkthrough of using Azure Machine Learning Service to train, finetune and test [Aspect Based Sentiment Analysis Models using Intel's NLP Architect](http://nlp_architect.nervanasys.com/absa.html)\\n\",\"\\n\",\"### Prerequisites\\n\",\"\\n\",\"* Understand the architecture and terms introduced by Azure Machine Learning (AML)\\n\",\"* Have working Jupyter Notebook Environment. You can:\\n\",\"    - Install Python environment locally, as described below in **Local Installation**\\n\",\"    - Use [Azure Notebooks](https://docs.microsoft.com/ru-ru/azure/notebooks/azure-notebooks-overview/?wt.mc_id=absa-notebook-abornst). In this case you should upload the `absa.ipynb` file to a new Azure Notebooks project, or just clone the [GitHub Repo](https://github.com/microsoft/ignite-learning-paths/tree/master/aiml/aiml40).\\n\",\"* Azure Machine Learning Workspace in your Azure Subscription\\n\",\"\\n\",\"#### Local Installation\\n\",\"\\n\",\"Install the Python SDK: make sure to install notebook, and contrib:\\n\",\"\\n\",\"```shell\\n\",\"conda create -n azureml -y Python=3.6\\n\",\"source activate azureml\\n\",\"pip install --upgrade azureml-sdk[notebooks,contrib] \\n\",\"conda install ipywidgets\\n\",\"jupyter nbextension install --py --user azureml.widgets\\n\",\"jupyter nbextension enable azureml.widgets --user --py\\n\",\"```\\n\",\"\\n\",\"You will need to restart jupyter after this Detailed instructions are [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-create-workspace-with-python/?WT.mc_id=absa-notebook-abornst)\\n\",\"\\n\",\"If you need a free trial account to get started you can get one 
[here](https://azure.microsoft.com/en-us/offers/ms-azr-0044p/?WT.mc_id=absa-notebook-abornst)\\n\",\"\\n\",\"#### Creating Azure ML Workspace\\n\",\"\\n\",\"Azure ML Workspace can be created by using one of the following ways:\\n\",\"* Manually through [Azure Portal](http://portal.azure.com/?WT.mc_id=absa-notebook-abornst) - [here is the complete walkthrough](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace/?wt.mc_id=absa-notebook-abornst)\\n\",\"* Using [Azure CLI](https://docs.microsoft.com/ru-ru/cli/azure/?view=azure-cli-latest&wt.mc_id=absa-notebook-abornst), using the following commands:\\n\",\"\\n\",\"```shell\\n\",\"az extension add -n azure-cli-ml\\n\",\"az group create -n absa -l westus2\\n\",\"az ml workspace create -w absa_space -g absa\\n\",\"```\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Initialize workspace\\n\",\"\\n\",\"To access an Azure ML Workspace, you will need to import the AML library and the following information:\\n\",\"* A name for your workspace (in our example - `absa_space`)\\n\",\"* Your subscription id (can be obtained by running `az account list`)\\n\",\"* The resource group name (in our case `absa`)\\n\",\"\\n\",\"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace/?WT.mc_id=absa-notebook-abornst) object from the existing workspace you created in the Prerequisites step or create a new one. 
\"]},{\"cell_type\":\"code\",\"execution_count\":20,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core import Workspace\\n\",\"\\n\",\"#subscription_id = ''\\n\",\"#resource_group  = 'absa'\\n\",\"#workspace_name  = 'absa_space'\\n\",\"#ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)\\n\",\"#ws.write_config()\\n\",\"\\n\",\"try:\\n\",\"    ws = Workspace.from_config()\\n\",\"    print(ws.name, ws.location, ws.resource_group, ws.location, sep='\\\\t')\\n\",\"    print('Library configuration succeeded')\\n\",\"except:\\n\",\"    print('Workspace not found')\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Compute\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"There are two computer option run once(preview) and persistent compute for this demo we will use persistent compute to learn more about run once compute check out the [docs](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute?WT.mc_id=absa-notebook-abornst).\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core.compute import ComputeTarget, AmlCompute\\n\",\"from azureml.core.compute_target import ComputeTargetException\\n\",\"\\n\",\"# Choose a name for your CPU cluster\\n\",\"cluster_name = \\\"absa-cluster\\\"\\n\",\"\\n\",\"# Verify that cluster does not exist already\\n\",\"try:\\n\",\"    cluster = ComputeTarget(workspace=ws, name=cluster_name)\\n\",\"    print('Found existing cluster, use it.')\\n\",\"except ComputeTargetException:\\n\",\"    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D3_V2',\\n\",\"                                                           vm_priority='lowpriority',\\n\",\"                                                           min_nodes=1,\\n\",\"                                                         
  max_nodes=4)\\n\",\"    cluster = ComputeTarget.create(ws, cluster_name, compute_config)\\n\",\"\\n\",\"cluster.wait_for_completion(show_output=True)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Upload Data\\n\",\"\\n\",\"The dataset we are using comes from the [womens ecommerce clothing reviews dataset](https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews/) and is in the open domain, this can be replaced with any csv file with rows of text as the absa model is unsupervised. \\n\",\"\\n\",\"The documentation for uploading data can be found [here](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.azure_storage_datastore.azureblobdatastore/?WT.mc_id=absa-notebook-abornst) for now we will us the ds.upload command. \"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{\"scrolled\":true},\"outputs\":[],\"source\":[\"!wget -O 'dataset/glove.840B.300d.zip' 'http://nlp.stanford.edu/data/glove.840B.300d.zip'\\n\",\"# save 'dataset/clothing_absa_train.csv'\\n\",\"# save 'dataset/clothing-absa-validation.json'\\n\",\"# save 'dataset/clothing_absa_train_small.csv'\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"import os                            \\n\",\"lib_root = os.path.dirname(os.path.abspath(\\\"__file__\\\"))\\n\",\"ds = ws.get_default_datastore()\\n\",\"ds.upload('./dataset', target_path='clothing_data', overwrite=True, show_progress=True)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Now the the glove file is uploaded to our datastore we can remove it from our local directory.\"]},{\"cell_type\":\"code\",\"execution_count\":31,\"metadata\":{},\"outputs\":[],\"source\":[\"!rm 'dataset/glove.840B.300d.zip'\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Train File\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"%%writefile 
train.py\\n\",\"import argparse\\n\",\"import json\\n\",\"import os \\n\",\"from pathlib import Path\\n\",\"from nltk import flatten\\n\",\"from azureml.core import Run\\n\",\"from sklearn.metrics import f1_score\\n\",\"from azureml.core.model import Model\\n\",\"\\n\",\"# Load NLP Architect\\n\",\"from nlp_architect.models.absa.train.train import TrainSentiment\\n\",\"from nlp_architect.models.absa.inference.inference import SentimentInference\\n\",\"\\n\",\"# Inputs\\n\",\"parser = argparse.ArgumentParser(description='ABSA Train')\\n\",\"parser.add_argument('--data_folder', type=str, dest='data_folder', help='data folder mounting point')\\n\",\"parser.add_argument('--asp_thresh', type=int, default=3)\\n\",\"parser.add_argument('--op_thresh', type=int, default=2)\\n\",\"parser.add_argument('--max_iter', type=int, default=3)\\n\",\"\\n\",\"args = parser.parse_args()\\n\",\"\\n\",\"# Download ABSA dependencies including spacy parser and glove embeddings \\n\",\"from spacy.cli.download import download as spacy_download\\n\",\"from nlp_architect.utils.io import uncompress_file\\n\",\"from nlp_architect.models.absa import TRAIN_OUT\\n\",\"\\n\",\"spacy_download('en')\\n\",\"GLOVE_ZIP = os.path.join(args.data_folder, \\n\",\"                                 'clothing_data/glove.840B.300d.zip')\\n\",\"EMBEDDING_PATH = TRAIN_OUT / 'word_emb_unzipped' / 'glove.840B.300d.txt'\\n\",\"\\n\",\"\\n\",\"uncompress_file(GLOVE_ZIP, Path(EMBEDDING_PATH).parent)\\n\",\"\\n\",\"clothing_train = os.path.join(args.data_folder, \\n\",\"                                 'clothing_data/clothing_absa_train_small.csv')\\n\",\"\\n\",\"os.makedirs('outputs', exist_ok=True)\\n\",\"\\n\",\"train = TrainSentiment(asp_thresh=args.asp_thresh,\\n\",\"                       op_thresh=args.op_thresh, \\n\",\"                       max_iter=args.max_iter)\\n\",\"\\n\",\"opinion_lex, aspect_lex = train.run(data=clothing_train,\\n\",\"                                    out_dir = 
'./outputs')\\n\",\"\\n\",\"# Evaluation \\n\",\"# Although ABSA is an unsupervised method it can be metriced with a small sample of labeled data\\n\",\"def doc2IO(doc):\\n\",\"    \\\"\\\"\\\"\\n\",\"    Converts ABSA doc to IO span format for evaluation\\n\",\"    \\\"\\\"\\\"\\n\",\"    index = 0\\n\",\"    aspect_indexes = []\\n\",\"    doc_json = json.loads(doc.json())\\n\",\"    tokens = doc_json[\\\"_doc_text\\\"].split()\\n\",\"    io = [[t,'O'] for t in tokens]\\n\",\"    for t_index, token in enumerate(tokens):\\n\",\"        for s in doc_json[\\\"_sentences\\\"]:\\n\",\"            for ev in s[\\\"_events\\\"]:\\n\",\"                for e in ev:\\n\",\"                    if e[\\\"_type\\\"] == \\\"ASPECT\\\":\\n\",\"                        if e[\\\"_start\\\"] == index and all(aspect[0] != t_index for aspect in aspect_indexes):\\n\",\"                            io[t_index][1] = \\\"{}-{}\\\".format(e[\\\"_text\\\"], e[\\\"_polarity\\\"])\\n\",\"        index += len(token) + 1\\n\",\"    \\n\",\"    return io\\n\",\"\\n\",\"inference = SentimentInference('./outputs/train_out/generated_aspect_lex.csv', \\n\",\"                               './outputs/train_out/generated_opinion_lex_reranked.csv')\\n\",\"\\n\",\"clothing_val = os.path.join(args.data_folder, \\n\",\"                                 'clothing_data/clothing-absa-validation.json')\\n\",\"\\n\",\"with open(clothing_val) as json_file:\\n\",\"    val = json.load(json_file)\\n\",\"\\n\",\"predictions = []\\n\",\"for doc in val[\\\"data\\\"]:\\n\",\"    doc_raw = \\\" \\\".join([token[0] for token in doc])\\n\",\"    sentiment_doc = inference.run(doc=doc_raw)\\n\",\"    predictions.append(doc2IO(sentiment_doc))\\n\",\"    \\n\",\"y_pred = flatten(predictions)[1::2]\\n\",\"y_true = flatten(val['data'])[1::2]\\n\",\"\\n\",\"from sklearn.metrics import f1_score\\n\",\"\\n\",\"# Log metrics\\n\",\"run = Run.get_context()\\n\",\"run.log('Aspect Lexicon Size', len(aspect_lex))\\n\",\"run.log('Opinion 
Lexicon Size', len(opinion_lex))\\n\",\"run.log('f1_weighted', float(f1_score(y_true, y_pred, average='weighted')))\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Create An Experiment\\n\",\"\\n\",\"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment/?WT.mc_id=absa-notebook-abornst) to track all the runs in your workspace for this ABSA tutorial. \"]},{\"cell_type\":\"code\",\"execution_count\":23,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core import Experiment\\n\",\"experiment_name = 'absa'\\n\",\"exp = Experiment(workspace=ws, name=experiment_name)\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.train.estimator import Estimator\\n\",\"\\n\",\"script_params = {\\n\",\"    '--data_folder': ds,\\n\",\"}\\n\",\"\\n\",\"nlp_est = Estimator(source_directory='.',\\n\",\"                   script_params=script_params,\\n\",\"                   compute_target=cluster,\\n\",\"                   environment_variables = {'NLP_ARCHITECT_BE':'CPU'},\\n\",\"                   entry_script='train.py',\\n\",\"                   pip_packages=['git+https://github.com/NervanaSystems/nlp-architect.git@absa',\\n\",\"                                 'spacy==2.1.8']\\n\",\")\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"To create a run we just submit our experiment as follows.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"run = exp.submit(nlp_est)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Note: If you accidentally run the following cell more than once you can cancel a run with the run.cancel() command.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"# 
run.cancel()\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"You can load any previous run using its run id.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"run.id\"]},{\"cell_type\":\"code\",\"execution_count\":24,\"metadata\":{},\"outputs\":[],\"source\":[\"run = [r for r in exp.get_runs() if r.id == 'put_run_id_here'][0]\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Let's visualize our run:\"]},{\"cell_type\":\"code\",\"execution_count\":25,\"metadata\":{\"scrolled\":true},\"outputs\":[],\"source\":[\"from azureml.widgets import RunDetails\\n\",\"\\n\",\"RunDetails(run).show()\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Fine-Tuning NLP Architect with AzureML HyperDrive\\n\",\"Although ABSA is an unsupervised method, its hyperparameters, such as the aspect and opinion word thresholds, can be fine-tuned if provided with a small sample of labeled data.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.train.hyperdrive import *\\n\",\"import math\\n\",\"\\n\",\"param_sampling = RandomParameterSampling({\\n\",\"         '--asp_thresh': choice(range(2,5)),\\n\",\"         '--op_thresh': choice(range(2,5)), \\n\",\"         '--max_iter': choice(range(2,5))\\n\",\"    })\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Early Termination Policy\\n\",\"First we will define an early termination policy. [Median stopping](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.medianstoppingpolicy?WT.mc_id=absa-notebook-abornst) is an early termination policy based on running averages of primary metrics reported by the runs. This policy computes running averages across all training runs and terminates runs whose performance is worse than the median of the running averages. 
\\n\",\"\\n\",\"This policy takes the following configuration parameters:\\n\",\"\\n\",\"- evaluation_interval: the frequency for applying the policy (optional parameter).\\n\",\"- delay_evaluation: delays the first policy evaluation for a specified number of intervals (optional parameter).\\n\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"early_termination_policy = MedianStoppingPolicy(evaluation_interval=1, delay_evaluation=0)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Refer [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters#specify-early-termination-policy?WT.mc_id=absa-notebook-abornst) for more information on the Median stopping policy and other policies available.\\n\",\"\\n\",\"Now that we've defined our early termination policy we can define our HyperDrive configuration to maximize our model's weighted F1 score. HyperDrive can optimize any metric as long as it's logged by the training script. 
\\n\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"hd_config = HyperDriveConfig(estimator=nlp_est,\\n\",\"                            hyperparameter_sampling=param_sampling,\\n\",\"                            policy=early_termination_policy,\\n\",\"                            primary_metric_name='f1_weighted',\\n\",\"                            primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,\\n\",\"                            max_total_runs=16,\\n\",\"                            max_concurrent_runs=4)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Finally, launch the hyperparameter tuning job.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"experiment = Experiment(workspace=ws, name='absa_hyperdrive')\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"hyperdrive_run = experiment.submit(hd_config)\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"hyperdrive_run.id\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"hyperdrive_run = [r for r in experiment.get_runs() if r.id == 'absa_hyperdrive_1571092544235933'][0]\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Monitor HyperDrive runs\\n\",\"We can monitor the progress of the runs with the following Jupyter widget. 
\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{\"scrolled\":false},\"outputs\":[],\"source\":[\"from azureml.widgets import RunDetails\\n\",\"\\n\",\"RunDetails(hyperdrive_run).show()\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"hyperdrive_run.cancel()\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Find and register the best model\\n\",\"Once all the runs complete, we can find the run that produced the model with the highest evaluation (METRIC TBD).\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"best_run = hyperdrive_run.get_best_run_by_primary_metric()\\n\",\"best_run_metrics = best_run.get_metrics()\\n\",\"print(best_run)\\n\",\"print('Best Run is:\\\\n  F1: {0:.5f}'.format(\\n\",\"        best_run_metrics['f1_weighted']\\n\",\"     ))\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Register Model Outputs\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"aspect_lex = run.register_model(model_name='c_aspect_lex', model_path='outputs/train_out/generated_aspect_lex.csv')\\n\",\"opinion_lex = run.register_model(model_name='c_opinion_lex', model_path='outputs/train_out/generated_opinion_lex_reranked.csv')\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Test Locally\\n\",\"\\n\",\"### Install Local PIP Dependencies\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"!pip install git+https://github.com/NervanaSystems/nlp-architect.git@absa\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"!pip install spacy==2.0.18\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Load Model From 
AzureML\"]},{\"cell_type\":\"code\",\"execution_count\":26,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core.model import Model\\n\",\"from nlp_architect.models.absa.inference.inference import SentimentInference\\n\",\"c_aspect_lex = Model._get_model_path_remote('c_aspect_lex', 1, ws)\\n\",\"c_opinion_lex = Model._get_model_path_remote('c_opinion_lex', 1, ws)   \\n\",\"inference = SentimentInference(c_aspect_lex, c_opinion_lex)\\n\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Run Model On Sample Data \"]},{\"cell_type\":\"code\",\"execution_count\":27,\"metadata\":{},\"outputs\":[],\"source\":[\"docs = [\\\"Loved the sweater but hated the pants\\\",\\n\",\"       \\\"Really great outfit, but the shirt is the wrong size\\\",\\n\",\"       \\\"I absolutely love this jacket! i wear it almost everyday. works as a cardigan or a jacket. my favorite retailer purchase so far\\\"]\\n\",\"\\n\",\"sentiment_docs = []\\n\",\"\\n\",\"for doc_raw in docs:\\n\",\"    sentiment_doc = inference.run(doc=doc_raw)\\n\",\"    sentiment_docs.append(sentiment_doc)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Visualize Model Results\"]},{\"cell_type\":\"code\",\"execution_count\":28,\"metadata\":{},\"outputs\":[],\"source\":[\"import spacy\\n\",\"from spacy import displacy\\n\",\"from nlp_architect.models.absa.inference.data_types import TermType\\n\",\"ents = []\\n\",\"for doc in sentiment_docs:    \\n\",\"    if doc:\\n\",\"        doc_viz = {'text':doc._doc_text, 'ents':[]}\\n\",\"        for s in doc._sentences:\\n\",\"            for ev in s._events:\\n\",\"                for e in ev:\\n\",\"                    if e._type == TermType.ASPECT:\\n\",\"                        ent = {'start': e._start, 'end': e._start + e._len,\\n\",\"                               'label':str(e._polarity.value), \\n\",\"                               'text':str(e._text)}\\n\",\"                        if 
all(kown_e['start'] != ent['start'] for kown_e in ents):\\n\",\"                            ents.append(ent)\\n\",\"                            doc_viz['ents'].append(ent)\\n\",\"        doc_viz['ents'].sort(key=lambda m: m[\\\"start\\\"])\\n\",\"        displacy.render(doc_viz, style=\\\"ent\\\", options={'colors':{'POS':'#7CFC00', 'NEG':'#FF0000'}}, manual=True)\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"# Package Model For Deployment\\n\",\"\\n\",\"## Create scoring script\\n\",\"Create the scoring script, called score.py, used by the web service call to show how to use the model.\\n\",\"\\n\",\"You must include two required functions into the scoring script:\\n\",\"\\n\",\"The init() function, which typically loads the model into a global object. This function is run only once when the Docker container is started.\\n\",\"\\n\",\"The run(input_data) function uses the model to predict a value based on the input data. Inputs and outputs to the run typically use JSON for serialization and de-serialization, but other formats are supported.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{\"scrolled\":false},\"outputs\":[],\"source\":[\"%%writefile score.py\\n\",\"from azureml.core.model import Model\\n\",\"from nlp_architect.models.absa.inference.inference import SentimentInference\\n\",\"from spacy.cli.download import download as spacy_download\\n\",\"\\n\",\"\\n\",\"def init():\\n\",\"    \\\"\\\"\\\"\\n\",\"    Set up the ABSA model for Inference  \\n\",\"    \\\"\\\"\\\"\\n\",\"    global SentInference\\n\",\"    spacy_download('en')\\n\",\"    aspect_lex = Model.get_model_path('c_aspect_lex')\\n\",\"    opinion_lex = Model.get_model_path('c_opinion_lex') \\n\",\"    SentInference = SentimentInference(aspect_lex, opinion_lex)\\n\",\"\\n\",\"def run(raw_data):\\n\",\"    \\\"\\\"\\\"\\n\",\"    Evaluate the model and return JSON string\\n\",\"    \\\"\\\"\\\"\\n\",\"    sentiment_doc = 
SentInference.run(doc=raw_data)\\n\",\"    return sentiment_doc.json()\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Create configuration files\\n\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Create Environment File\\n\",\"Create an environment file, called myenv.yml, that specifies all of the script's package dependencies. This file is used to ensure that all of those dependencies are installed in the Docker image. This model needs nlp-architect and the azureml-sdk. \"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core.conda_dependencies import CondaDependencies \\n\",\"\\n\",\"pip = [\\\"azureml-defaults\\\", \\\"azureml-monitoring\\\", \\n\",\"       \\\"git+https://github.com/NervanaSystems/nlp-architect.git@absa\\\", \\n\",\"       \\\"spacy==2.0.18\\\"]\\n\",\"\\n\",\"myenv = CondaDependencies.create(pip_packages=pip)\\n\",\"\\n\",\"with open(\\\"myenv.yml\\\",\\\"w\\\") as f:\\n\",\"    f.write(myenv.serialize_to_string())\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Create Environment Config\\n\",\"Create an environment configuration and specify the environment and the environment variables required for the application.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core import Environment\\n\",\"deploy_env = Environment.from_conda_specification('absa_env', \\\"myenv.yml\\\")\\n\",\"deploy_env.environment_variables={'NLP_ARCHITECT_BE': 'CPU'}\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Inference Config \\n\",\"Create an inference configuration that receives the deployment environment and the entry script.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"from azureml.core.model import InferenceConfig\\n\",\"inference_config = 
InferenceConfig(environment=deploy_env,\\n\",\"                                   entry_script=\\\"score.py\\\")\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"### Package Model and Pull \\n\",\"Package the registered aspect and opinion lexicon models together with the inference configuration, wait for the package to be created, and then pull it to the local machine.\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{\"scrolled\":true},\"outputs\":[],\"source\":[\"package = Model.package(ws, [aspect_lex, opinion_lex], inference_config)\\n\",\"package.wait_for_creation(show_output=True)\\n\"]},{\"cell_type\":\"code\",\"execution_count\":null,\"metadata\":{},\"outputs\":[],\"source\":[\"package.pull()\"]},{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"## Next Steps\\n\",\"\\n\",\"We have now gone through all the steps for production training of a custom open source model using the AzureML Service. Check out AIML50 to learn how to deploy models and manage re-training pipelines.\"]}],\"nbformat\":4,\"nbformat_minor\":2,\"metadata\":{\"language_info\":{\"name\":\"python\",\"codemirror_mode\":{\"name\":\"ipython\",\"version\":3}},\"orig_nbformat\":2,\"file_extension\":\".py\",\"mimetype\":\"text/x-python\",\"name\":\"python\",\"npconvert_exporter\":\"python\",\"pygments_lexer\":\"ipython3\",\"version\":3}}\n"
  },
  {
    "path": "examples/sentiment_analysis/absa/dataset/data.md",
    "content": "# About the Dataset\n\nReview data for this demo is sourced from the text reviews of [Women's E-Commerce Clothing Review](https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews/) dataset. The dataset has a CC0: Public Domain License and has been reformatted to build validation and training sets for both standard sentiment analysis and ABSA models.  \n"
  },
  {
    "path": "examples/text_classification/README.md",
    "content": "# Text Classification\nThis folder contains examples and best practices, written in Jupyter notebooks, for building text classification models. We use the\nutility scripts in the [utils_nlp](../../utils_nlp) folder to speed up data preprocessing and model building for text classification.  \nThe models can be used in a wide variety of applications, such as\nsentiment analysis, document indexing in digital libraries, hate speech detection, and general-purpose categorization in medical, academic, legal, and many other domains. \nCurrently, we focus on fine-tuning pre-trained BERT and XLNet models. We plan to continue adding state-of-the-art models as they come up and welcome community\ncontributions.\n\n## What is Text Classification?\nText classification is a supervised learning method of learning and predicting the category or the\nclass of a document given its text content. The state-of-the-art methods are based on neural\nnetworks of different architectures as well as pre-trained language models or word embeddings.\n\n\n## Summary\n\nThe following summarizes each notebook for Text Classification. Each notebook provides more details and guiding in principles on building state of the art models.\n\n|Notebook|Environment|Description|Dataset|\n|---|---|---|---|\n|[BERT for text classification on AzureML](tc_bert_azureml.ipynb) |Azure ML|A notebook which walks through fine-tuning and evaluating pre-trained BERT model on a distributed setup with AzureML. 
|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)|\n|[Text Classification of MultiNLI Sentences using Multiple Transformer Models](tc_mnli_transformers.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a number of pre-trained transformer models|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)|\n|[Text Classification of Multi Language Datasets using Transformer Model](tc_multi_languages_transformers.ipynb)|Local|A notebook which walks through fine-tuning and evaluating a pre-trained transformer model for multiple datasets in different languages|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) <br> [BBC Hindi News](https://github.com/NirantK/hindi2vec/releases/tag/bbc-hindi-v0.1) <br> [DAC](https://data.mendeley.com/datasets/v524p5dhpj/2)|\n"
  },
  {
    "path": "examples/text_classification/tc_bert_azureml.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"*Copyright (c) Microsoft Corporation. All rights reserved.*\\n\",\n    \"\\n\",\n    \"*Licensed under the MIT License.*\\n\",\n    \"\\n\",\n    \"# Text Classification of MultiNLI Sentences using BERT with Azure ML Pipelines\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/nlp/examples/text_classification/tc_bert_azureml.png)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 0. Introduction\\n\",\n    \"\\n\",\n    \"In this notebook, we fine-tune and evaluate a pretrained [BERT](https://arxiv.org/abs/1810.04805) model on a subset of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset using [AzureML](https://azure.microsoft.com/en-us/services/machine-learning-service/) Pipelines.\\n\",\n    \"\\n\",\n    \"We use a [distributed sequence classifier](../../utils_nlp/bert/sequence_classification_distributed.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/pytorch-pretrained-BERT) of Google's [BERT](https://github.com/google-research/bert).\\n\",\n    \"\\n\",\n    \"The notebooks acts as a template to,\\n\",\n    \"1. Process a massive dataset in parallel by dividing the dataset into chunks using [DASK](https://dask.org/) .\\n\",\n    \"2. Perform distributed training on AzureML compute on these processed chunks.\\n\",\n    \"\\n\",\n    \"We create an [AzureML Pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines) for the two steps mentioned above. 
With this pipeline, the notebook can be scheduled regularly to fine tune BERT with new data and get a model which can be further deployed on [Azure Container Instance](https://docs.microsoft.com/en-us/azure/container-service/).\\n\",\n    \"\\n\",\n    \"AzureML Pipeline define reusable machine learning workflows that can be used as a template for your machine learning scenarios. Pipelines allow you to optimize your workflow and spend time on machine learning rather than infrastructure. If you are new to the concept of pipelines, [this would be a good place to get started](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines).\\n\",\n    \"\\n\",\n    \"**Note: To learn how to do pre-training on your own, please reference the [AzureML-BERT repo](https://github.com/microsoft/AzureML-BERT) created by Microsoft.**\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 29,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\\n\",\n      \"Azure ML SDK Version: 1.0.48\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"import sys\\n\",\n    \"sys.path.append(\\\"../../\\\")\\n\",\n    \"import os\\n\",\n    \"import json\\n\",\n    \"import random\\n\",\n    \"import shutil\\n\",\n    \"import pandas as pd\\n\",\n    \"\\n\",\n    \"from utils_nlp.azureml import azureml_utils\\n\",\n    \"from utils_nlp.dataset.multinli import get_generator\\n\",\n    \"\\n\",\n    \"from sklearn.preprocessing import LabelEncoder\\n\",\n    \"import azureml.core\\n\",\n    \"from azureml.core import Datastore, Experiment,  get_run\\n\",\n    \"from azureml.core.conda_dependencies import CondaDependencies\\n\",\n    \"from azureml.core.runconfig import RunConfiguration\\n\",\n    \"from azureml.core.compute import 
ComputeTarget,  AmlCompute\\n\",\n    \"from azureml.exceptions import ComputeTargetException\\n\",\n    \"from azureml.data.data_reference import DataReference\\n\",\n    \"from azureml.pipeline.steps import PythonScriptStep\\n\",\n    \"from azureml.pipeline.core import Pipeline, PipelineData\\n\",\n    \"from azureml.widgets import RunDetails\\n\",\n    \"from azureml.train.dnn import PyTorch\\n\",\n    \"from azureml.core.runconfig import MpiConfiguration\\n\",\n    \"from azureml.pipeline.steps import EstimatorStep\\n\",\n    \"\\n\",\n    \"print(\\\"System version: {}\\\".format(sys.version))\\n\",\n    \"print(\\\"Azure ML SDK Version:\\\", azureml.core.VERSION)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Let's define a few variables before we get started, these variables define the folder where the data would reside, the batch size and the number of epochs we are training for. \\n\",\n    \"We also define the variables for AzureML workspace, which you can use to create a new workspace. 
You can ignore these variables if you have `config.json` in `.azureml` directory.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 30,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"LABEL_COL = \\\"genre\\\"\\n\",\n    \"DATA_FOLDER = \\\"../../data/temp\\\"\\n\",\n    \"TRAIN_FOLDER = \\\"../../data/temp/train\\\"\\n\",\n    \"TEST_FOLDER = \\\"../../data/temp/test\\\"\\n\",\n    \"ENCODED_LABEL_COL = \\\"label\\\"\\n\",\n    \"NUM_PARTITIONS = None\\n\",\n    \"LABELS = ['telephone', 'government', 'travel', 'slate', 'fiction']\\n\",\n    \"PROJECT_FOLDER = \\\"../../\\\"\\n\",\n    \"NODE_COUNT = 4\\n\",\n    \"\\n\",\n    \"config_path = (\\n\",\n    \"    \\\"./.azureml\\\"\\n\",\n    \")  # Path to the directory containing config.json with azureml credentials\\n\",\n    \"\\n\",\n    \"# Azure resources\\n\",\n    \"subscription_id = \\\"YOUR_SUBSCRIPTION_ID\\\"\\n\",\n    \"resource_group = \\\"YOUR_RESOURCE_GROUP_NAME\\\"  \\n\",\n    \"workspace_name = \\\"YOUR_WORKSPACE_NAME\\\"  \\n\",\n    \"workspace_region = \\\"YOUR_WORKSPACE_REGION\\\" #Possible values eastus, eastus2 and so on.\\n\",\n    \"cluster_name = \\\"pipelines-tc-12\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"In this example we will use AzureML pipelines to execute training pipelines. Each preprocessing step is included as a step in the pipeline. For a more detailed walkthrough of what pipelines are with a getting started guidelines check this [notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-getting-started.ipynb). 
We start by doing some AzureML related setup below.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 0.1 Initialize a workspace\\n\",\n    \"\\n\",\n    \"The following cell looks to set up the connection to your [Azure Machine Learning service Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace). You can choose to connect to an existing workspace or create a new one. \\n\",\n    \"\\n\",\n    \"**To access an existing workspace:**\\n\",\n    \"1. If you have a `config.json` file, you do not need to provide the workspace information; you will only need to update the `config_path` variable that is defined above which contains the file.\\n\",\n    \"2. Otherwise, you will need to supply the following:\\n\",\n    \"    * The name of your workspace\\n\",\n    \"    * Your subscription id\\n\",\n    \"    * The resource group name\\n\",\n    \"\\n\",\n    \"**To create a new workspace:**\\n\",\n    \"\\n\",\n    \"Set the following information:\\n\",\n    \"* A name for your workspace\\n\",\n    \"* Your subscription id\\n\",\n    \"* The resource group name\\n\",\n    \"* [Azure region](https://azure.microsoft.com/en-us/global-infrastructure/regions/) to create the workspace in, such as `eastus2`. \\n\",\n    \"\\n\",\n    \"This will automatically create a new resource group for you in the region provided if a resource group with the name given does not already exist. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 31,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"ws = azureml_utils.get_or_create_workspace(\\n\",\n    \"    config_path=config_path,\\n\",\n    \"    subscription_id=subscription_id,\\n\",\n    \"    resource_group=resource_group,\\n\",\n    \"    workspace_name=workspace_name,\\n\",\n    \"    workspace_region=workspace_region,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 0.2 Create a compute target\\n\",\n    \"We create and attach a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training the model. Here we use the AzureML-managed compute target ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) as our remote training compute resource. Our cluster autoscales from 0 to 8 `STANDARD_NC12` GPU nodes.\\n\",\n    \"\\n\",\n    \"Creating and configuring the AmlCompute cluster takes approximately 5 minutes the first time around. Once a cluster with the given configuration is created, it does not need to be created again.\\n\",\n    \"\\n\",\n    \"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. 
Read more about the default limits and how to request more quota [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 32,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Found existing compute target.\\n\",\n      \"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-08-11T08:53:18.284000+00:00', 'errors': None, 'creationTime': '2019-07-25T04:16:20.598768+00:00', 'modifiedTime': '2019-08-05T06:40:12.292030+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 10, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC12'}\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"try:\\n\",\n    \"    compute_target = ComputeTarget(workspace=ws, name=cluster_name)\\n\",\n    \"    print(\\\"Found existing compute target.\\\")\\n\",\n    \"except ComputeTargetException:\\n\",\n    \"    print(\\\"Creating a new compute target...\\\")\\n\",\n    \"    compute_config = AmlCompute.provisioning_configuration(\\n\",\n    \"        vm_size=\\\"STANDARD_NC12\\\", max_nodes=8\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"    # create the cluster\\n\",\n    \"    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\\n\",\n    \"\\n\",\n    \"    compute_target.wait_for_completion(show_output=True)\\n\",\n    \"\\n\",\n    \"# use get_status() to get a detailed status for the current AmlCompute.\\n\",\n    \"print(compute_target.get_status().serialize())\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n  
 \"source\": [\n    \"## 1. Preprocessing\\n\",\n    \"\\n\",\n    \"The pipeline is defined by a series of steps, the first being a PythonScriptStep which utilizes [DASK](https://dask.org/) to load dataframes in partitions allowing us to load and preprocess different sets of data in parallel.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.1 Read Dataset\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 33,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"train_batches = get_generator(DATA_FOLDER, \\\"train\\\", num_batches=NUM_PARTITIONS, batch_size=10e6)\\n\",\n    \"test_batches = get_generator(DATA_FOLDER, \\\"dev_matched\\\", num_batches=NUM_PARTITIONS, batch_size=10e6)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1.2 Preprocess and Tokenize\\n\",\n    \"\\n\",\n    \"In the classification task, we use the first sentence only as the text input, and the corresponding genre as the label. Select the examples corresponding to one of the entailment labels (*neutral* in this case) to avoid duplicate rows, as the sentences are not unique, whereas the sentence pairs are.\\n\",\n    \"\\n\",\n    \"Once filtered, we encode the labels. 
To do this, fit a label encoder with the known labels in a MNLI dataset.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 34,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"os.makedirs(TRAIN_FOLDER, exist_ok=True)\\n\",\n    \"os.makedirs(TEST_FOLDER, exist_ok=True)\\n\",\n    \"\\n\",\n    \"labels = LABELS\\n\",\n    \"label_encoder = LabelEncoder()\\n\",\n    \"label_encoder.fit(labels)\\n\",\n    \"\\n\",\n    \"num_train_partitions = 0\\n\",\n    \"for batch in train_batches:\\n\",\n    \"    batch = batch[batch[\\\"gold_label\\\"]==\\\"neutral\\\"]\\n\",\n    \"    batch[ENCODED_LABEL_COL] = label_encoder.transform(batch[LABEL_COL])\\n\",\n    \"    batch.to_csv(TRAIN_FOLDER+\\\"/batch{}.csv\\\".format(str(num_train_partitions)))\\n\",\n    \"    num_train_partitions += 1\\n\",\n    \"    \\n\",\n    \"num_test_partitions = 0\\n\",\n    \"for batch in test_batches:\\n\",\n    \"    batch = batch[batch[\\\"gold_label\\\"]==\\\"neutral\\\"]\\n\",\n    \"    batch[ENCODED_LABEL_COL] = label_encoder.transform(batch[LABEL_COL])\\n\",\n    \"    batch.to_csv(TEST_FOLDER+\\\"/batch{}.csv\\\".format(str(num_test_partitions)))\\n\",\n    \"    num_test_partitions += 1\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Once we have the partitions of data ready they are uploaded to the datastore.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 35,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"$AZUREML_DATAREFERENCE_9609849b541244d396d06017b5729edb\"\n      ]\n     },\n     \"execution_count\": 35,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"ds = ws.get_default_datastore()\\n\",\n    \"ds.upload(src_dir=TRAIN_FOLDER, target_path=\\\"mnli_data/train\\\", overwrite=True, show_progress=False)\\n\",\n    
\"ds.upload(src_dir=TEST_FOLDER, target_path=\\\"mnli_data/test\\\", overwrite=True, show_progress=False)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 36,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"shutil.rmtree(TRAIN_FOLDER)\\n\",\n    \"shutil.rmtree(TEST_FOLDER)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We can now parallely operate on each batch to tokenize the data and preprocess the tokens. To do this, we create a PythonScript step below.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 37,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Writing preprocess.py\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"%%writefile preprocess.py\\n\",\n    \"# Copyright (c) Microsoft Corporation. All rights reserved.\\n\",\n    \"# Licensed under the MIT License.\\n\",\n    \"import argparse\\n\",\n    \"import logging\\n\",\n    \"import os\\n\",\n    \"\\n\",\n    \"import pandas as pd\\n\",\n    \"\\n\",\n    \"from utils_nlp.models.bert.common import Language, Tokenizer\\n\",\n    \"\\n\",\n    \"LABEL_COL = \\\"genre\\\"\\n\",\n    \"TEXT_COL = \\\"sentence1\\\"\\n\",\n    \"LANGUAGE = Language.ENGLISH\\n\",\n    \"TO_LOWER = True\\n\",\n    \"MAX_LEN = 150\\n\",\n    \"\\n\",\n    \"logger = logging.getLogger(__name__)\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def tokenize(df):\\n\",\n    \"    \\\"\\\"\\\"Tokenize the text documents and convert them to lists of tokens using the BERT tokenizer.\\n\",\n    \"    Args:\\n\",\n    \"        df(pd.Dataframe): Dataframe with training or test samples\\n\",\n    \"\\n\",\n    \"    Returns:\\n\",\n    \"\\n\",\n    \"        list: List of lists of tokens for train set.\\n\",\n    \"\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    tokenizer = Tokenizer(\\n\",\n    \"        
LANGUAGE, to_lower=TO_LOWER)\\n\",\n    \"    tokens = tokenizer.tokenize(list(df[TEXT_COL]))\\n\",\n    \"\\n\",\n    \"    return tokens\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def preprocess(tokens):\\n\",\n    \"    \\\"\\\"\\\" Preprocess method that does the following,\\n\",\n    \"            Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary\\n\",\n    \"            Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence\\n\",\n    \"            Pad or truncate the token lists to the specified max length\\n\",\n    \"            Return mask lists that indicate paddings' positions\\n\",\n    \"            Return token type id lists that indicate which sentence the tokens belong to (not needed\\n\",\n    \"            for one-sequence classification)\\n\",\n    \"\\n\",\n    \"    Args:\\n\",\n    \"        tokens(pd.Dataframe): Dataframe with tokens for train set.\\n\",\n    \"\\n\",\n    \"    Returns:\\n\",\n    \"        list: List of lists of tokens for train or test set with special tokens added.\\n\",\n    \"        list: Input mask.\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    tokenizer = Tokenizer(\\n\",\n    \"        LANGUAGE, to_lower=TO_LOWER)\\n\",\n    \"    tokens, mask, _ = tokenizer.preprocess_classification_tokens(\\n\",\n    \"        tokens, MAX_LEN\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"    return tokens, mask\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"parser = argparse.ArgumentParser()\\n\",\n    \"parser.add_argument(\\\"--input_data\\\", type=str, help=\\\"input data\\\")\\n\",\n    \"parser.add_argument(\\\"--output_data\\\", type=str, help=\\\"Path to the output file.\\\")\\n\",\n    \"\\n\",\n    \"args = parser.parse_args()\\n\",\n    \"input_data = args.input_data\\n\",\n    \"output_data = args.output_data\\n\",\n    \"output_dir = os.path.dirname(os.path.abspath(output_data))\\n\",\n    \"\\n\",\n    \"if output_dir is not None:\\n\",\n    \"    
os.makedirs(output_dir, exist_ok=True)\\n\",\n    \"    logger.info(\\\"%s created\\\" % output_dir)\\n\",\n    \"\\n\",\n    \"df = pd.read_csv(args.input_data)\\n\",\n    \"tokens_array = tokenize(df)\\n\",\n    \"tokens_array, mask_array = preprocess(tokens_array)\\n\",\n    \"\\n\",\n    \"df['tokens'] = tokens_array\\n\",\n    \"df['mask'] = mask_array\\n\",\n    \"\\n\",\n    \"# Filter columns\\n\",\n    \"cols = ['tokens', 'mask', 'label']\\n\",\n    \"df = df[cols]\\n\",\n    \"df.to_csv(output_data, header=False, index=False)\\n\",\n    \"logger.info(\\\"Completed\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 38,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"'../../utils_nlp/models/bert/preprocess.py'\"\n      ]\n     },\n     \"execution_count\": 38,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"preprocess_file = os.path.join(PROJECT_FOLDER,'utils_nlp/models/bert/preprocess.py')\\n\",\n    \"shutil.move('preprocess.py',preprocess_file)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Create a conda environment for the steps below.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 39,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"conda_dependencies = CondaDependencies.create(\\n\",\n    \"    conda_packages=[\\n\",\n    \"        \\\"numpy\\\",\\n\",\n    \"        \\\"scikit-learn\\\",\\n\",\n    \"        \\\"pandas\\\",\\n\",\n    \"    ],\\n\",\n    \"    pip_packages=[\\\"azureml-sdk==1.0.43.*\\\", \\n\",\n    \"                  \\\"torch==1.1\\\", \\n\",\n    \"                  \\\"tqdm==4.31.1\\\",\\n\",\n    \"                 \\\"pytorch-pretrained-bert>=0.6\\\"],\\n\",\n    \"    python_version=\\\"3.6.8\\\",\\n\",\n    \")\\n\",\n    \"run_config = 
RunConfiguration(conda_dependencies=conda_dependencies)\\n\",\n    \"run_config.environment.docker.enabled = True\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then create the list of steps that use the preprocess.py created above. We use the output of these steps as input to training in the next section.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 40,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"processed_train_files = []\\n\",\n    \"processed_test_files = []\\n\",\n    \"ds = ws.get_default_datastore()\\n\",\n    \"\\n\",\n    \"for i in range(num_train_partitions):\\n\",\n    \"        input_data = DataReference(datastore=ds, \\n\",\n    \"                                   data_reference_name='train_batch_{}'.format(str(i)), \\n\",\n    \"                                   path_on_datastore='mnli_data/train/batch{}.csv'.format(str(i)),\\n\",\n    \"                                   overwrite=False)\\n\",\n    \"\\n\",\n    \"        output_data = PipelineData(name=\\\"train{}\\\".format(str(i)), datastore=ds,\\n\",\n    \"                       output_path_on_compute='mnli_data/processed_train/batch{}.csv'.format(str(i)))\\n\",\n    \"\\n\",\n    \"        step = PythonScriptStep(\\n\",\n    \"            name='preprocess_step_train_{}'.format(str(i)),\\n\",\n    \"            arguments=[\\\"--input_data\\\", input_data, \\\"--output_data\\\", output_data],\\n\",\n    \"            script_name= 'utils_nlp/models/bert/preprocess.py',\\n\",\n    \"            inputs=[input_data],\\n\",\n    \"            outputs=[output_data],\\n\",\n    \"            source_directory=PROJECT_FOLDER,\\n\",\n    \"            compute_target=compute_target,\\n\",\n    \"            runconfig=run_config,\\n\",\n    \"            allow_reuse=False,\\n\",\n    \"        )\\n\",\n    \"        \\n\",\n    \"        processed_train_files.append(output_data)         \\n\",\n    \"   
         \\n\",\n    \"for i in range(num_test_partitions):\\n\",\n    \"            input_data = DataReference(datastore=ds, \\n\",\n    \"                                       data_reference_name='test_batch_{}'.format(str(i)), \\n\",\n    \"                                       path_on_datastore='mnli_data/test/batch{}.csv'.format(str(i)),\\n\",\n    \"                                       overwrite=False)\\n\",\n    \"        \\n\",\n    \"            output_data = PipelineData(name=\\\"test{}\\\".format(str(i)), datastore=ds,\\n\",\n    \"                        output_path_on_compute='mnli_data/processed_test/batch{}.csv'.format(str(i)))\\n\",\n    \"            \\n\",\n    \"            step = PythonScriptStep(\\n\",\n    \"                name='preprocess_step_test_{}'.format(str(i)),\\n\",\n    \"                arguments=[\\\"--input_data\\\", input_data, \\\"--output_data\\\", output_data],\\n\",\n    \"                script_name= 'utils_nlp/models/bert/preprocess.py',\\n\",\n    \"                inputs=[input_data],\\n\",\n    \"                outputs=[output_data],\\n\",\n    \"                source_directory=PROJECT_FOLDER,\\n\",\n    \"                compute_target=compute_target,\\n\",\n    \"                runconfig=run_config,\\n\",\n    \"                allow_reuse=False,\\n\",\n    \"            )\\n\",\n    \"            \\n\",\n    \"            processed_test_files.append(output_data)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2. Train and Score\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Once the data is processed and available on datastore, we  train the classifier using the training examples. This involves fine-tuning the BERT Transformer and learning a linear classification layer on top of that. 
After training is complete we score the performance of the model on the test dataset.\\n\",\n    \"\\n\",\n    \"The training is distributed and uses AzureML's support for distributed training using MPI with Horovod. \\n\",\n    \"\\n\",\n    \"**Please note** that training requires a GPU enabled cluster in AzureML Compute. We suggest using NC12. If you would like to change the GPU configuration, please change the `NUM_GPUS` variable accordingly.\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.1 Setup training script\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 41,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Writing train.py\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"%%writefile train.py\\n\",\n    \"# Copyright (c) Microsoft Corporation. All rights reserved.\\n\",\n    \"# Licensed under the MIT License.\\n\",\n    \"\\n\",\n    \"import argparse\\n\",\n    \"import json\\n\",\n    \"import logging\\n\",\n    \"import os\\n\",\n    \"import torch\\n\",\n    \"\\n\",\n    \"from sklearn.metrics import classification_report\\n\",\n    \"\\n\",\n    \"from utils_nlp.common.timer import Timer\\n\",\n    \"from utils_nlp.models.bert.common import Language, get_dataset_multiple_files\\n\",\n    \"from utils_nlp.models.bert.sequence_classification_distributed import (\\n\",\n    \"    BERTSequenceClassifier,\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"BATCH_SIZE = 32\\n\",\n    \"NUM_GPUS = 2\\n\",\n    \"NUM_EPOCHS = 1\\n\",\n    \"LABELS = [\\\"telephone\\\", \\\"government\\\", \\\"travel\\\", \\\"slate\\\", \\\"fiction\\\"]\\n\",\n    \"OUTPUT_DIR = \\\"./outputs/\\\"\\n\",\n    \"\\n\",\n    \"logger = logging.getLogger(__name__)\\n\",\n    \"\\n\",\n    \"parser = argparse.ArgumentParser()\\n\",\n    \"parser.add_argument(\\n\",\n    \"    \\\"--train_files\\\",\\n\",\n    \"    
nargs=\\\"+\\\",\\n\",\n    \"    default=[],\\n\",\n    \"    help=\\\"List of file paths to all the files in train dataset.\\\",\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"parser.add_argument(\\n\",\n    \"    \\\"--test_files\\\",\\n\",\n    \"    nargs=\\\"+\\\",\\n\",\n    \"    default=[],\\n\",\n    \"    help=\\\"List of file paths to all the files in test dataset.\\\",\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"args = parser.parse_args()\\n\",\n    \"train_files = [file.strip() for file in args.train_files]\\n\",\n    \"test_files = [file.strip() for file in args.test_files]\\n\",\n    \"\\n\",\n    \"# Handle square brackets from train list\\n\",\n    \"train_files[0] = train_files[0][1:]\\n\",\n    \"train_files[len(train_files) - 1] = train_files[len(train_files) - 1][:-1]\\n\",\n    \"train_dataset = get_dataset_multiple_files(train_files)\\n\",\n    \"\\n\",\n    \"# Handle square brackets from test list\\n\",\n    \"test_files[0] = test_files[0][1:]\\n\",\n    \"test_files[len(test_files) - 1] = test_files[len(test_files) - 1][:-1]\\n\",\n    \"test_dataset = get_dataset_multiple_files(test_files)\\n\",\n    \"\\n\",\n    \"# Train\\n\",\n    \"classifier = BERTSequenceClassifier(\\n\",\n    \"    language=Language.ENGLISH, num_labels=len(LABELS), use_distributed=True\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"# Create data loaders.\\n\",\n    \"kwargs = (\\n\",\n    \"    {\\\"num_workers\\\": NUM_GPUS, \\\"pin_memory\\\": True} if torch.cuda.is_available() else {}\\n\",\n    \")\\n\",\n    \"train_data_loader = classifier.create_data_loader(\\n\",\n    \"    train_dataset, batch_size=BATCH_SIZE, **kwargs\\n\",\n    \")\\n\",\n    \"test_data_loader = classifier.create_data_loader(\\n\",\n    \"    test_dataset, batch_size=BATCH_SIZE, mode=\\\"test\\\", **kwargs\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"# Create optimizer\\n\",\n    \"num_examples = len(train_dataset)\\n\",\n    \"num_batches = int(num_examples / BATCH_SIZE)\\n\",\n    
\"num_train_optimization_steps = num_batches * NUM_EPOCHS\\n\",\n    \"optimizer = classifier.create_optimizer(num_train_optimization_steps)\\n\",\n    \"\\n\",\n    \"with Timer() as t:\\n\",\n    \"    for epoch in range(1, NUM_EPOCHS + 1):\\n\",\n    \"        train_data_loader.sampler.set_epoch(epoch)\\n\",\n    \"        classifier.fit(\\n\",\n    \"            train_data_loader,\\n\",\n    \"            epoch=epoch,\\n\",\n    \"            bert_optimizer=optimizer,\\n\",\n    \"            num_gpus=NUM_GPUS,\\n\",\n    \"            num_epochs=NUM_EPOCHS,\\n\",\n    \"        )\\n\",\n    \"\\n\",\n    \"# Predict\\n\",\n    \"preds, labels_test = classifier.predict(test_data_loader, num_gpus=NUM_GPUS)\\n\",\n    \"\\n\",\n    \"# Evaluate\\n\",\n    \"results = classification_report(\\n\",\n    \"    labels_test, preds, target_names=LABELS, output_dict=True\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"# Write out results.\\n\",\n    \"classifier.save_model()\\n\",\n    \"result_file = os.path.join(OUTPUT_DIR, \\\"results.json\\\")\\n\",\n    \"with open(result_file, \\\"w+\\\") as fp:\\n\",\n    \"    json.dump(results, fp)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 42,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"'../../utils_nlp/models/bert/train.py'\"\n      ]\n     },\n     \"execution_count\": 42,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"train_file = os.path.join(PROJECT_FOLDER,'utils_nlp/models/bert/train.py')\\n\",\n    \"shutil.move('train.py',train_file)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.2 Create a Pytorch Estimator\\n\",\n    \"\\n\",\n    \"We create a Pytorch Estimator using AzureML SDK and additonally define an EstimatorStep to run it on AzureML pipelines.\\n\",\n    \"\\n\",\n    \"The Azure ML SDK's PyTorch Estimator allows us 
to submit PyTorch training jobs for both single-node and distributed runs. For more information on the PyTorch estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-pytorch).\\n\",\n    \"\\n\",\n    \"This Estimator specifies that the training script will run on 4 nodes, with 2 workers per node. In order to execute a distributed run using GPU, we must define `use_gpu` and `distributed_backend` to use MPI/Horovod. PyTorch, Horovod, and other necessary dependencies are installed automatically.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 43,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING - framework_version is not specified, defaulting to version 1.1.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"estimator = PyTorch(source_directory=PROJECT_FOLDER,\\n\",\n    \"                    compute_target=compute_target,\\n\",\n    \"                    entry_script='utils_nlp/models/bert/train.py',\\n\",\n    \"                    node_count= NODE_COUNT,\\n\",\n    \"                    distributed_training= MpiConfiguration(),\\n\",\n    \"                    use_gpu=True,\\n\",\n    \"                    conda_packages=['scikit-learn=0.20.3', 'numpy>=1.16.0', 'pandas'],\\n\",\n    \"                    pip_packages=[\\\"tqdm==4.31.1\\\",\\\"pytorch-pretrained-bert>=0.6\\\"]\\n\",\n    \"                   )\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 44,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"inputs = processed_train_files + processed_test_files\\n\",\n    \"\\n\",\n    \"est_step = EstimatorStep(name=\\\"Estimator-Train\\\", \\n\",\n    \"                         estimator=estimator, \\n\",\n    \"                         estimator_entry_script_arguments=[\\n\",\n    \"                             '--train_files',  
str(processed_train_files),\\n\",\n    \"                             '--test_files', str(processed_test_files)],\\n\",\n    \"                         inputs = inputs,\\n\",\n    \"                         runconfig_pipeline_params=None, \\n\",\n    \"                         compute_target=compute_target)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.3 Submit the pipeline\\n\",\n    \"\\n\",\n    \"The model is fine tuned on AML Compute and takes **45 minutes** to train. The total time to run the pipeline will be around **1h 30 minutes** if you use the default value `max_epoch=1`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"pipeline = Pipeline(workspace=ws, steps=[est_step])\\n\",\n    \"experiment = Experiment(ws, 'NLP-TC-BERT-distributed')\\n\",\n    \"pipeline_run = experiment.submit(pipeline)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 46,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/vnd.jupyter.widget-view+json\": {\n       \"model_id\": \"060659321062486694c0acbb0184eeed\",\n       \"version_major\": 2,\n       \"version_minor\": 0\n      },\n      \"text/plain\": [\n       \"_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"RunDetails(pipeline_run).show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 47,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#If you would like to cancel the job for any reasons uncomment the code below.\\n\",\n    \"#pipeline_run.cancel()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 
null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"#wait for the run to complete before continuing in the notebook\\n\",\n    \"pipeline_run.wait_for_completion()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2.4 Download and analyze results\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 49,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Downloading file outputs/results.json to ./outputs\\\\results.json...\\n\",\n      \"Downloading file outputs/bert-large-uncased to ./outputs\\\\bert-large-uncased...\\n\",\n      \"Downloading file outputs/bert_config.json to ./outputs\\\\bert_config.json...\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"step_run = pipeline_run.find_step_run(\\\"Estimator-Train\\\")[0]\\n\",\n    \"file_names = ['outputs/results.json', 'outputs/bert-large-uncased', 'outputs/bert_config.json' ]\\n\",\n    \"azureml_utils.get_output_files(step_run, './outputs', file_names=file_names)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 50,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"              f1-score  precision    recall  support\\n\",\n      \"telephone     0.904130   0.843191  0.974563    629.0\\n\",\n      \"government    0.955857   0.972366  0.939900    599.0\\n\",\n      \"travel        0.839966   0.935849  0.761905    651.0\\n\",\n      \"slate         0.986411   0.974724  0.998382    618.0\\n\",\n      \"fiction       0.938871   0.918712  0.959936    624.0\\n\",\n      \"micro avg     0.925344   0.925344  0.925344   3121.0\\n\",\n      \"macro avg     0.925047   0.928968  0.926937   3121.0\\n\",\n      \"weighted avg  0.923913   0.928455  0.925344   3121.0\\n\"\n     
]\n    }\n   ],\n   \"source\": [\n    \"with open('outputs/results.json', 'r') as handle:\\n\",\n    \"    parsed = json.load(handle)\\n\",\n    \"    print(pd.DataFrame.from_dict(parsed).transpose())\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"From the table above we can notice the performance of the model trained on a distributed setup in AzureML Compute. From our comparison to fine tuning the same model on MNLI dataset on a `STANDARD_NC12` machine [here](tc_mnli_bert.ipynb) we notice a gain of 20% in the model training time with no drop in performance for AzureML Compute. We present the comparison of the weighted average of the metrics along with the training time below:\\n\",\n    \"\\n\",\n    \"| Training Setup | F1-Score | Precision | Recall | Training Time |\\n\",\n    \"| --- | --- | --- | --- | --- |\\n\",\n    \"|Standard NC12 | 0.93 |0.93 |0.93 | 58 min |\\n\",\n    \"|AzureML Compute*|0.934| 0.934 | 0.934| 46 min |\\n\",\n    \"\\n\",\n    \"* AzureML Compute - The setup used 4 nodes with `STANDARD_NC12` machines.\\n\",\n    \"\\n\",\n    \"We also observe common tradeoffs associated with distributed training. We make use of [Horovod](https://github.com/horovod/horovod), a distributed training tool for many popular deep learning frameworks that enables parallelization of work across the nodes in the cluster. Distributed training decreases the time it takes for the model to converge in theory, but the model may also take more time in communicating with each node. Note that the communication time will eventually become negligible when training on larger and larger datasets, but being aware of this tradeoff is helpful for choosing the node configuration when training on smaller datasets. 
We expect the gains of using AzureML to increase with increased dataset size.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Finally clean up any intermediate files we created.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 51,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"os.remove(train_file)\\n\",\n    \"os.remove(preprocess_file)\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"Python nlp_cpu\",\n   \"language\": \"python\",\n   \"name\": \"ame\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/text_classification/tc_mnli_mtdnn.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"*Copyright (c) Microsoft Corporation. All rights reserved.*\\n\",\n    \"\\n\",\n    \"*Licensed under the MIT License.*\\n\",\n    \"\\n\",\n    \"# The Microsoft Toolkit of Multi-Task Deep Neural Networks for Natural Language Understanding\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Summary\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"MT-DNN is an open-source natural language understanding (NLU) toolkit that makes it easy for researchers and developers to train customized deep learning models. Built upon PyTorch and Transformers, MT-DNN is designed to facilitate rapid\\n\",\n    \"customization for a broad spectrum of NLU tasks, using a variety of objectives (classification, regression, structured prediction) and text encoders (e.g., RNNs, BERT, RoBERTa, UniLM). A unique feature of MT-DNN is its built-in support for robust and transferable learning using the adversarial multi-task learning paradigm. To enable efficient production deployment, MT-DNN supports multitask knowledge distillation, which can substantially compress a deep neural model without significant performance drop. We demonstrate the effectiveness of MT-DNN on a wide range of NLU applications across general and biomedical domains. The pip installable package and pretrained models will be publicly available at https://github.com/microsoft/mt-dnn.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Design\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"MT-DNN is designed for modularity, flexibility, and ease of use. 
These modules are built upon PyTorch (Paszke et al., 2019) and Transformers (Wolf\\n\",\n    \"et al., 2019), allowing the use of the SOTA pretrained models, e.g., BERT (Devlin et al., 2019), RoBERTa (Liu et al., 2019c) and UniLM (Dong\\n\",\n    \"et al., 2019). The unique attribute of this package is a flexible interface for adversarial multi-task fine-tuning and knowledge distillation, so that researchers and developers can build large SOTA NLU models and then compress them to small ones\\n\",\n    \"for online deployment. The overall workflow and system architecture are shown in the two figures below, respectively.\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"![Workflow Design](https://nlpbp.blob.core.windows.net/images/mt-dnn2.JPG)\\n\",\n    \"\\n\",\n    \"The above figure shows the workflow of MT-DNN: train a neural language model on a large amount of unlabeled raw text\\n\",\n    \"to obtain general contextual representations; then finetune the learned contextual representation on downstream tasks, e.g. GLUE (Wang et al., 2018); lastly, distill this large model to a lighter one for online deployment. In the latter two phases, we can leverage powerful multi-task learning and adversarial training to further improve performance.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Architecture\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"![overall_arch](https://nlpbp.blob.core.windows.net/images/mt-dnn.png)\\n\",\n    \"The figure above shows the overall system architecture. The lower layers are shared across all tasks while the top layers are task-specific. The input X (either a sentence or a set of sentences) is first represented as a sequence of embedding\\n\",\n    \"vectors, one for each word, in l1. 
Then the encoder, e.g., a Transformer or recurrent neural network (LSTM) model,\\n\",\n    \"captures the contextual information for each word and generates the shared contextual embedding vectors in l2.\\n\",\n    \"Finally, for each task, additional task-specific layers generate task-specific representations, followed by operations\\n\",\n    \"necessary for classification, similarity scoring, or relevance ranking. In case of adversarial training, we perturb\\n\",\n    \"embeddings from the lexicon encoder and then add an extra loss term during the training. Note that for the\\n\",\n    \"inference phase, it does not require perturbations.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Introduction\\n\",\n    \"In this notebook, we fine-tune and evaluate MT-DNN models on a subset of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset.  \\n\",\n    \"\\n\",\n    \"### Running Time\\n\",\n    \"\\n\",\n    \"This is a __computationally intensive__ notebook that runs on the entire MNLI dataset for matched and mismatched datasets for training, development and test.  \\n\",\n    \"\\n\",\n    \"The table below provides some reference running time on a GPU machine.  
\\n\",\n    \"\\n\",\n    \"|Dataset|MULTI_GPU_ON|Machine Configurations|Running time|\\n\",\n    \"|:------|:---------|:----------------------|:------------|\\n\",\n    \"|MultiNLI|True|4 NVIDIA Tesla K80 GPUs, 24GB GPU memory| ~ 20 hours |\\n\",\n    \"\\n\",\n    \"If you run into `CUDA out-of-memory error` or the jupyter kernel dies constantly, try reducing the `BATCH_SIZE` and `MAX_SEQ_LEN` in `MTDNNConfig`, but note that model performance may be compromised.\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"\\n\",\n    \"### Text Classification of MultiNLI Sentences using MT-DNN\\n\",\n    \"\\n\",\n    \"This notebook utilizes the pip installable package that implements the Multi-Task Deep Neural Network Toolkit (MTDNN) for Natural Language Understanding. It's recommended to run this notebook on GPU machines as it's very computationally intensive.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"%load_ext autoreload\\n\",\n    \"%autoreload 2\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 45,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import json\\n\",\n    \"import os\\n\",\n    \"import shutil\\n\",\n    \"import sys\\n\",\n    \"from tempfile import TemporaryDirectory\\n\",\n    \"\\n\",\n    \"import pandas as pd\\n\",\n    \"import torch\\n\",\n    \"\\n\",\n    \"from mtdnn.common.types import EncoderModelType\\n\",\n    \"from mtdnn.configuration_mtdnn import MTDNNConfig\\n\",\n    \"from mtdnn.data_builder_mtdnn import MTDNNDataBuilder\\n\",\n    \"from mtdnn.modeling_mtdnn import MTDNNModel\\n\",\n    \"from mtdnn.process_mtdnn import MTDNNDataProcess\\n\",\n    \"from mtdnn.tasks.config import MTDNNTaskDefs\\n\",\n    \"from mtdnn.tokenizer_mtdnn import MTDNNTokenizer\\n\",\n    \"from utils_nlp.dataset.multinli import download_tsv_files_and_extract\"\n  
 ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Define Configuration, Tasks and Model Objects\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Define Configuration, Tasks and Model Objects\\n\",\n    \"ROOT_DIR = TemporaryDirectory().name\\n\",\n    \"OUTPUT_DIR = os.path.join(ROOT_DIR, 'checkpoint')\\n\",\n    \"os.makedirs(OUTPUT_DIR) if not os.path.exists(OUTPUT_DIR) else OUTPUT_DIR\\n\",\n    \"\\n\",\n    \"LOG_DIR = os.path.join(ROOT_DIR, 'tensorboard_logdir')\\n\",\n    \"os.makedirs(LOG_DIR) if not os.path.exists(LOG_DIR) else LOG_DIR\\n\",\n    \"\\n\",\n    \"DATA_DIR = os.path.join(ROOT_DIR, 'data')\\n\",\n    \"os.makedirs(DATA_DIR) if not os.path.exists(DATA_DIR) else DATA_DIR\\n\",\n    \"\\n\",\n    \"DATA_SOURCE_DIR = os.path.join(DATA_DIR, \\\"MNLI\\\")\\n\",\n    \"\\n\",\n    \"# Training parameters\\n\",\n    \"BATCH_SIZE = 16\\n\",\n    \"MULTI_GPU_ON = True\\n\",\n    \"MAX_SEQ_LEN = 128\\n\",\n    \"NUM_EPOCHS = 5\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Exploring the location for our data to be downloaded, model to be checkpointed and logs to be dumped\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"/tmp/tmpd9ok4aeo/data\\n\",\n      \"/tmp/tmpd9ok4aeo/checkpoint\\n\",\n      \"/tmp/tmpd9ok4aeo/tensorboard_logdir\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(DATA_DIR)\\n\",\n    \"print(OUTPUT_DIR)\\n\",\n    \"print(LOG_DIR)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Read Dataset\\n\",\n    \"We start by loading a subset of the data. 
The following function also downloads and extracts the files, if they don't exist in the data folder.\\n\",\n    \"\\n\",\n    \"The MultiNLI dataset is mainly used for natural language inference (NLI) tasks, where the inputs are sentence pairs and the labels are entailment indicators\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 20,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 305k/305k [00:05<00:00, 54.1kKB/s] \\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Downloaded file to:  /tmp/tmpd9ok4aeo/data/MNLI\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"download_tsv_files_and_extract(DATA_DIR)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Define a Configuration Object \\n\",\n    \"\\n\",\n    \"Create a model configuration object, `MTDNNConfig`, with the necessary parameters to initialize the MT-DNN model. Initialization without any parameters will default to a similar configuration that initializes a BERT model. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 21,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"config = MTDNNConfig(batch_size=BATCH_SIZE, \\n\",\n    \"                     max_seq_len=MAX_SEQ_LEN, \\n\",\n    \"                     multi_gpu_on=MULTI_GPU_ON)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"\\n\",\n    \"### Create Task Definition Object  \\n\",\n    \"\\n\",\n    \"Define the task parameters to train for and initialize an `MTDNNTaskDefs` object. Create a task parameter dictionary. Definition can be a single or multiple tasks to train.  
`MTDNNTaskDefs` can take a python dict, yaml or json file with task(s) definition.\\n\",\n    \"\\n\",\n    \"The data source directory is the path of data downloaded and extracted above using `download_tsv_files_and_extract` which is the `MNLI` dir under the `DATA_DIR` temporary directory.    \\n\",\n    \"\\n\",\n    \"The data source has options that are set to drive each task pre-processing; `data_process_opts`\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 22,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"06/26/2020 08:07:58 - mtdnn.tasks.config - INFO - Mapping Task attributes\\n\",\n      \"06/26/2020 08:07:58 - mtdnn.tasks.config - INFO - Configured task definitions - ['mnli']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"tasks_params = {\\n\",\n    \"    \\\"mnli\\\": {\\n\",\n    \"        \\\"data_format\\\": \\\"PremiseAndOneHypothesis\\\",\\n\",\n    \"        \\\"encoder_type\\\": \\\"BERT\\\",\\n\",\n    \"        \\\"dropout_p\\\": 0.3,\\n\",\n    \"        \\\"enable_san\\\": True,\\n\",\n    \"        \\\"labels\\\": [\\\"contradiction\\\", \\\"neutral\\\", \\\"entailment\\\"],\\n\",\n    \"        \\\"metric_meta\\\": [\\\"ACC\\\"],\\n\",\n    \"        \\\"loss\\\": \\\"CeCriterion\\\",\\n\",\n    \"        \\\"kd_loss\\\": \\\"MseCriterion\\\",\\n\",\n    \"        \\\"n_class\\\": 3,\\n\",\n    \"        \\\"split_names\\\": [\\n\",\n    \"            \\\"train\\\",\\n\",\n    \"            \\\"dev_matched\\\",\\n\",\n    \"            \\\"dev_mismatched\\\",\\n\",\n    \"            \\\"test_matched\\\",\\n\",\n    \"            \\\"test_mismatched\\\",\\n\",\n    \"        ],\\n\",\n    \"        \\\"data_source_dir\\\": DATA_SOURCE_DIR,\\n\",\n    \"        \\\"data_process_opts\\\": {\\\"header\\\": True, \\\"is_train\\\": True, \\\"multi_snli\\\": False,},\\n\",\n    \"        \\\"task_type\\\": 
\\\"Classification\\\",\\n\",\n    \"    },\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"# Define the tasks\\n\",\n    \"task_defs = MTDNNTaskDefs(tasks_params)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"\\n\",\n    \"### Create the MTDNN Data Tokenizer Object  \\n\",\n    \"\\n\",\n    \"Create a data tokenizing object, `MTDNNTokenizer`. Based on the model initial checkpoint, it wraps around the model's Huggingface transformers library to encode the data to MT-DNN format. This becomes the input to the data building stage.  \\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 23,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"tokenizer = MTDNNTokenizer(do_lower_case=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Testing out the Tokenizer encode function on a sample text\\n\",\n    \"`tokenizer.encode(\\\"What NLP toolkit do you recommend\\\", \\\"MT-DNN is a fantastic toolkit\\\")`\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 24,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"([101, 2054, 17953, 2361, 6994, 23615, 2079, 2017, 16755, 102, 11047, 1011, 1040, 10695, 2003, 1037, 10392, 6994, 23615, 102], None, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(tokenizer.encode(\\\"What NLP toolkit do you recommend\\\", \\\"MT-DNN is a fantastic toolkit\\\"))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Data Preprocessing\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Create the Data Builder Object  \\n\",\n    \"\\n\",\n    \"Create a data preprocessing object, `MTDNNDataBuilder`. 
This class is responsible for converting the data into the MT-DNN format depending on the task.  \\n\",\n    \" \\n\",\n    \"\\n\",\n    \"Define a data builder that handles the creating of each task's vectorized data utilizing the model tokenizer. This will build out the vectorized data needed for creating the training, test and development PyTorch dataloaders\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 25,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"06/26/2020 08:08:01 - mtdnn.data_builder_mtdnn - INFO - Sucessfully loaded and built 392702 samples for mnli at /tmp/tmpd9ok4aeo/data/canonical_data/mnli_train.tsv\\n\",\n      \"06/26/2020 08:08:01 - mtdnn.data_builder_mtdnn - INFO - Sucessfully loaded and built 9815 samples for mnli at /tmp/tmpd9ok4aeo/data/canonical_data/mnli_dev_matched.tsv\\n\",\n      \"06/26/2020 08:08:01 - mtdnn.data_builder_mtdnn - INFO - Sucessfully loaded and built 9832 samples for mnli at /tmp/tmpd9ok4aeo/data/canonical_data/mnli_dev_mismatched.tsv\\n\",\n      \"06/26/2020 08:08:01 - mtdnn.data_builder_mtdnn - INFO - Sucessfully loaded and built 9796 samples for mnli at /tmp/tmpd9ok4aeo/data/canonical_data/mnli_test_matched.tsv\\n\",\n      \"06/26/2020 08:08:01 - mtdnn.data_builder_mtdnn - INFO - Sucessfully loaded and built 9847 samples for mnli at /tmp/tmpd9ok4aeo/data/canonical_data/mnli_test_mismatched.tsv\\n\",\n      \"mnli_train\\n\",\n      \"06/26/2020 08:08:01 - mtdnn.data_builder_mtdnn - INFO - Building Data For 'MNLI TRAIN' Task\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Building Data For Premise and One Hypothesis: 392702it [05:19, 1228.85it/s]\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"06/26/2020 08:13:22 - 
mtdnn.data_builder_mtdnn - INFO - Saving data to /tmp/tmpd9ok4aeo/data/canonical_data/bert_base_uncased/mnli_train.json\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"Saving Data For PremiseAndOneHypothesis: 100%|██████████| 392702/392702 [00:05<00:00, 70762.79it/s]\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"mnli_dev_matched\\n\",\n      \"06/26/2020 08:13:28 - mtdnn.data_builder_mtdnn - INFO - Building Data For 'MNLI DEV MATCHED' Task\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"Building Data For Premise and One Hypothesis: 9815it [00:09, 1017.29it/s]\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"06/26/2020 08:13:38 - mtdnn.data_builder_mtdnn - INFO - Saving data to /tmp/tmpd9ok4aeo/data/canonical_data/bert_base_uncased/mnli_dev_matched.json\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"Saving Data For PremiseAndOneHypothesis: 100%|██████████| 9815/9815 [00:00<00:00, 66741.94it/s]\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"mnli_dev_mismatched\\n\",\n      \"06/26/2020 08:13:38 - mtdnn.data_builder_mtdnn - INFO - Building Data For 'MNLI DEV MISMATCHED' Task\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"Building Data For Premise and One Hypothesis: 9832it [00:08, 1207.60it/s]\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"06/26/2020 08:13:46 - mtdnn.data_builder_mtdnn - INFO - Saving data to 
/tmp/tmpd9ok4aeo/data/canonical_data/bert_base_uncased/mnli_dev_mismatched.json\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"Saving Data For PremiseAndOneHypothesis: 100%|██████████| 9832/9832 [00:00<00:00, 72382.99it/s]\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"mnli_test_matched\\n\",\n      \"06/26/2020 08:13:46 - mtdnn.data_builder_mtdnn - INFO - Building Data For 'MNLI TEST MATCHED' Task\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"Building Data For Premise and One Hypothesis: 9796it [00:07, 1243.12it/s]\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"06/26/2020 08:13:54 - mtdnn.data_builder_mtdnn - INFO - Saving data to /tmp/tmpd9ok4aeo/data/canonical_data/bert_base_uncased/mnli_test_matched.json\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"Saving Data For PremiseAndOneHypothesis: 100%|██████████| 9796/9796 [00:00<00:00, 73680.61it/s]\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"mnli_test_mismatched\\n\",\n      \"06/26/2020 08:13:54 - mtdnn.data_builder_mtdnn - INFO - Building Data For 'MNLI TEST MISMATCHED' Task\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"Building Data For Premise and One Hypothesis: 9847it [00:08, 1195.65it/s]\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"06/26/2020 08:14:02 - mtdnn.data_builder_mtdnn - INFO - Saving data to /tmp/tmpd9ok4aeo/data/canonical_data/bert_base_uncased/mnli_test_mismatched.json\\n\"\n     
]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"Saving Data For PremiseAndOneHypothesis: 100%|██████████| 9847/9847 [00:00<00:00, 67993.64it/s]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"## Load and build data\\n\",\n    \"data_builder = MTDNNDataBuilder(\\n\",\n    \"    tokenizer=tokenizer,\\n\",\n    \"    task_defs=task_defs,\\n\",\n    \"    data_dir=DATA_DIR,\\n\",\n    \"    canonical_data_suffix=\\\"canonical_data\\\",\\n\",\n    \"    dump_rows=True,\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"## Build data to MTDNN Format\\n\",\n    \"## Iterable of each specific task and processed data\\n\",\n    \"vectorized_data = data_builder.vectorize()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Create the Data Processing Object  \\n\",\n    \"\\n\",\n    \"Create a data preprocessing object, `MTDNNDataProcess`. This creates the training, test and development PyTorch dataloaders needed for training and testing. We also need to retrieve the necessary training options required to initialize the model correctly, for all tasks.  
\\n\",\n    \"\\n\",\n    \"Define a data process that handles creating the training, test and development PyTorch dataloaders\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 26,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"06/26/2020 08:14:03 - mtdnn.process_mtdnn - INFO - Starting to process the training data sets\\n\",\n      \"06/26/2020 08:14:03 - mtdnn.process_mtdnn - INFO - Loading mnli_train as task 0\\n\",\n      \"06/26/2020 08:14:03 - mtdnn.dataset_mtdnn - INFO - Loaded 391533 samples out of 392702\\n\",\n      \"06/26/2020 08:14:03 - mtdnn.process_mtdnn - INFO - Starting to process the testing data sets\\n\",\n      \"06/26/2020 08:14:03 - mtdnn.process_mtdnn - INFO - Loading mnli_dev_matched as task 0\\n\",\n      \"06/26/2020 08:14:03 - mtdnn.dataset_mtdnn - INFO - Loaded 9815 samples out of 9815\\n\",\n      \"06/26/2020 08:14:03 - mtdnn.process_mtdnn - INFO - Loading mnli_dev_mismatched as task 0\\n\",\n      \"06/26/2020 08:14:03 - mtdnn.dataset_mtdnn - INFO - Loaded 9832 samples out of 9832\\n\",\n      \"06/26/2020 08:14:03 - mtdnn.process_mtdnn - INFO - Loading mnli_test_matched as task 0\\n\",\n      \"06/26/2020 08:14:03 - mtdnn.dataset_mtdnn - INFO - Loaded 9796 samples out of 9796\\n\",\n      \"06/26/2020 08:14:03 - mtdnn.process_mtdnn - INFO - Loading mnli_test_mismatched as task 0\\n\",\n      \"06/26/2020 08:14:03 - mtdnn.dataset_mtdnn - INFO - Loaded 9847 samples out of 9847\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Make the Data Preprocess step and update the config with training data updates\\n\",\n    \"data_processor = MTDNNDataProcess(\\n\",\n    \"    config=config, task_defs=task_defs, vectorized_data=vectorized_data\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Retrieve the processed batch multitask batch data loaders for training, 
development and test\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 27,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"multitask_train_dataloader = data_processor.get_train_dataloader()\\n\",\n    \"dev_dataloaders_list = data_processor.get_dev_dataloaders()\\n\",\n    \"test_dataloaders_list = data_processor.get_test_dataloaders()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now we can retrieve the training options, from the processor, to initialize model with.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 28,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"decoder_opts = data_processor.get_decoder_options_list()\\n\",\n    \"task_types = data_processor.get_task_types_list()\\n\",\n    \"dropout_list = data_processor.get_tasks_dropout_prob_list()\\n\",\n    \"loss_types = data_processor.get_loss_types_list()\\n\",\n    \"kd_loss_types = data_processor.get_kd_loss_types_list()\\n\",\n    \"tasks_nclass_list = data_processor.get_task_nclass_list()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Let us update the batch steps\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 29,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"num_all_batches = data_processor.get_num_all_batches()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Instantiate the MTDNN Model\\n\",\n    \"\\n\",\n    \"Now we can go ahead and create an `MTDNNModel` model\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 30,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"idx: 0, number of task labels: 3\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"model = MTDNNModel(\\n\",\n    \"    
config,\\n\",\n    \"    task_defs,\\n\",\n    \"    pretrained_model_name=\\\"bert-base-uncased\\\",\\n\",\n    \"    num_train_step=num_all_batches,\\n\",\n    \"    decoder_opts=decoder_opts,\\n\",\n    \"    task_types=task_types,\\n\",\n    \"    dropout_list=dropout_list,\\n\",\n    \"    loss_types=loss_types,\\n\",\n    \"    kd_loss_types=kd_loss_types,\\n\",\n    \"    tasks_nclass_list=tasks_nclass_list,\\n\",\n    \"    multitask_train_dataloader=multitask_train_dataloader,\\n\",\n    \"    dev_dataloaders_list=dev_dataloaders_list,\\n\",\n    \"    test_dataloaders_list=test_dataloaders_list,\\n\",\n    \"    output_dir=OUTPUT_DIR,\\n\",\n    \"    log_dir=LOG_DIR \\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Model Finetuning, Prediction and Evaluation\\n\",\n    \"\\n\",\n    \"### Fit and finetune model on five epochs and predict using the training and test  \\n\",\n    \"\\n\",\n    \"At this point the MT-DNN model allows us to fit the model and create predictions. The fit takes an optional `epochs` parameter that overrides the epochs set in the `MTDNNConfig` object. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 31,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"06/26/2020 08:14:07 - mtdnn.modeling_mtdnn - INFO - Total number of params: 109484547\\n\",\n      \"06/26/2020 08:14:07 - mtdnn.modeling_mtdnn - INFO - At epoch 0\\n\",\n      \"06/26/2020 08:14:07 - mtdnn.modeling_mtdnn - INFO - Amount of data to go over: 24471\\n\",\n      \"06/26/2020 08:14:13 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [     1] Training Loss - [1.63923] Time Remaining - [1 day, 13:33:33]\\n\",\n      \"06/26/2020 08:19:40 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [   500] Training Loss - [1.32204] Time Remaining - [4:25:55]\\n\",\n      \"06/26/2020 08:25:09 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  1000] Training Loss - [1.21343] Time Remaining - [4:18:55]\\n\",\n      \"06/26/2020 08:30:42 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  1500] Training Loss - [1.16369] Time Remaining - [4:13:52]\\n\",\n      \"06/26/2020 08:36:15 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  2000] Training Loss - [1.12522] Time Remaining - [4:08:36]\\n\",\n      \"06/26/2020 08:41:48 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  2500] Training Loss - [1.07541] Time Remaining - [4:03:18]\\n\",\n      \"06/26/2020 08:47:23 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  3000] Training Loss - [1.03195] Time Remaining - [3:58:03]\\n\",\n      \"06/26/2020 08:52:55 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  3500] Training Loss - [0.99050] Time Remaining - [3:52:26]\\n\",\n      \"06/26/2020 08:58:28 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  4000] Training Loss - [0.95599] Time Remaining - [3:46:59]\\n\",\n      \"06/26/2020 09:04:03 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  4500] Training Loss - [0.92721] Time Remaining - 
[3:41:34]\\n\",\n      \"06/26/2020 09:09:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  5000] Training Loss - [0.90235] Time Remaining - [3:36:07]\\n\",\n      \"06/26/2020 09:15:10 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  5500] Training Loss - [0.87961] Time Remaining - [3:30:33]\\n\",\n      \"06/26/2020 09:20:41 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  6000] Training Loss - [0.85982] Time Remaining - [3:24:55]\\n\",\n      \"06/26/2020 09:26:15 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  6500] Training Loss - [0.84107] Time Remaining - [3:19:24]\\n\",\n      \"06/26/2020 09:31:47 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  7000] Training Loss - [0.82505] Time Remaining - [3:13:51]\\n\",\n      \"06/26/2020 09:37:23 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  7500] Training Loss - [0.81009] Time Remaining - [3:08:25]\\n\",\n      \"06/26/2020 09:42:55 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  8000] Training Loss - [0.79706] Time Remaining - [3:02:49]\\n\",\n      \"06/26/2020 09:48:26 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  8500] Training Loss - [0.78522] Time Remaining - [2:57:13]\\n\",\n      \"06/26/2020 09:54:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  9000] Training Loss - [0.77296] Time Remaining - [2:51:42]\\n\",\n      \"06/26/2020 09:59:34 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [  9500] Training Loss - [0.76185] Time Remaining - [2:46:11]\\n\",\n      \"06/26/2020 10:05:11 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 10000] Training Loss - [0.75168] Time Remaining - [2:40:42]\\n\",\n      \"06/26/2020 10:10:46 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 10500] Training Loss - [0.74186] Time Remaining - [2:35:11]\\n\",\n      \"06/26/2020 10:16:17 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 11000] Training Loss - [0.73347] Time Remaining - [2:29:37]\\n\",\n      \"06/26/2020 
10:21:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 11500] Training Loss - [0.72535] Time Remaining - [2:24:03]\\n\",\n      \"06/26/2020 10:27:24 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 12000] Training Loss - [0.71798] Time Remaining - [2:18:30]\\n\",\n      \"06/26/2020 10:32:56 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 12500] Training Loss - [0.71132] Time Remaining - [2:12:56]\\n\",\n      \"06/26/2020 10:38:30 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 13000] Training Loss - [0.70462] Time Remaining - [2:07:23]\\n\",\n      \"06/26/2020 10:44:02 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 13500] Training Loss - [0.69882] Time Remaining - [2:01:49]\\n\",\n      \"06/26/2020 10:49:35 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 14000] Training Loss - [0.69229] Time Remaining - [1:56:16]\\n\",\n      \"06/26/2020 10:55:08 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 14500] Training Loss - [0.68647] Time Remaining - [1:50:43]\\n\",\n      \"06/26/2020 11:00:42 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 15000] Training Loss - [0.68061] Time Remaining - [1:45:10]\\n\",\n      \"06/26/2020 11:06:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 15500] Training Loss - [0.67555] Time Remaining - [1:39:39]\\n\",\n      \"06/26/2020 11:11:51 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 16000] Training Loss - [0.67038] Time Remaining - [1:34:05]\\n\",\n      \"06/26/2020 11:17:23 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 16500] Training Loss - [0.66557] Time Remaining - [1:28:32]\\n\",\n      \"06/26/2020 11:22:54 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 17000] Training Loss - [0.66106] Time Remaining - [1:22:57]\\n\",\n      \"06/26/2020 11:28:26 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 17500] Training Loss - [0.65651] Time Remaining - [1:17:24]\\n\",\n      \"06/26/2020 11:34:01 - mtdnn.modeling_mtdnn - 
INFO - Task - [ 0] Updates - [ 18000] Training Loss - [0.65221] Time Remaining - [1:11:51]\\n\",\n      \"06/26/2020 11:39:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 18500] Training Loss - [0.64808] Time Remaining - [1:06:17]\\n\",\n      \"06/26/2020 11:45:03 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 19000] Training Loss - [0.64444] Time Remaining - [1:00:44]\\n\",\n      \"06/26/2020 11:50:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 19500] Training Loss - [0.64039] Time Remaining - [0:55:11]\\n\",\n      \"06/26/2020 11:56:10 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 20000] Training Loss - [0.63708] Time Remaining - [0:49:38]\\n\",\n      \"06/27/2020 12:01:45 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 20500] Training Loss - [0.63337] Time Remaining - [0:44:05]\\n\",\n      \"06/27/2020 12:07:19 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 21000] Training Loss - [0.62972] Time Remaining - [0:38:32]\\n\",\n      \"06/27/2020 12:12:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 21500] Training Loss - [0.62656] Time Remaining - [0:32:59]\\n\",\n      \"06/27/2020 12:18:27 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 22000] Training Loss - [0.62311] Time Remaining - [0:27:26]\\n\",\n      \"06/27/2020 12:23:59 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 22500] Training Loss - [0.62002] Time Remaining - [0:21:53]\\n\",\n      \"06/27/2020 12:29:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 23000] Training Loss - [0.61681] Time Remaining - [0:16:20]\\n\",\n      \"06/27/2020 12:35:04 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 23500] Training Loss - [0.61411] Time Remaining - [0:10:46]\\n\",\n      \"06/27/2020 12:40:36 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 24000] Training Loss - [0.61127] Time Remaining - [0:05:13]\\n\",\n      \"06/27/2020 12:45:48 - mtdnn.modeling_mtdnn - INFO - Saving mt-dnn model to 
/tmp/tmpd9ok4aeo/checkpoint/model_0.pt\\n\",\n      \"06/27/2020 12:45:50 - mtdnn.modeling_mtdnn - INFO - model saved to /tmp/tmpd9ok4aeo/checkpoint/model_0.pt\\n\",\n      \"06/27/2020 12:45:50 - mtdnn.modeling_mtdnn - INFO - At epoch 1\\n\",\n      \"06/27/2020 12:45:50 - mtdnn.modeling_mtdnn - INFO - Amount of data to go over: 24471\\n\",\n      \"06/27/2020 12:46:09 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 24500] Training Loss - [0.60860] Time Remaining - [4:31:07]\\n\",\n      \"06/27/2020 12:51:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 25000] Training Loss - [0.60618] Time Remaining - [4:27:29]\\n\",\n      \"06/27/2020 12:57:16 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 25500] Training Loss - [0.60383] Time Remaining - [4:20:36]\\n\",\n      \"06/27/2020 01:02:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 26000] Training Loss - [0.60122] Time Remaining - [4:15:02]\\n\",\n      \"06/27/2020 01:08:22 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 26500] Training Loss - [0.59883] Time Remaining - [4:09:14]\\n\",\n      \"06/27/2020 01:13:54 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 27000] Training Loss - [0.59667] Time Remaining - [4:03:36]\\n\",\n      \"06/27/2020 01:19:27 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 27500] Training Loss - [0.59434] Time Remaining - [3:58:02]\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"06/27/2020 01:25:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 28000] Training Loss - [0.59204] Time Remaining - [3:52:28]\\n\",\n      \"06/27/2020 01:30:34 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 28500] Training Loss - [0.58952] Time Remaining - [3:46:57]\\n\",\n      \"06/27/2020 01:36:07 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 29000] Training Loss - [0.58707] Time Remaining - [3:41:27]\\n\",\n      \"06/27/2020 01:41:39 - 
mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 29500] Training Loss - [0.58480] Time Remaining - [3:35:47]\\n\",\n      \"06/27/2020 01:47:11 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 30000] Training Loss - [0.58238] Time Remaining - [3:30:10]\\n\",\n      \"06/27/2020 01:52:43 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 30500] Training Loss - [0.57984] Time Remaining - [3:24:34]\\n\",\n      \"06/27/2020 01:58:16 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 31000] Training Loss - [0.57737] Time Remaining - [3:19:04]\\n\",\n      \"06/27/2020 02:03:47 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 31500] Training Loss - [0.57507] Time Remaining - [3:13:25]\\n\",\n      \"06/27/2020 02:09:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 32000] Training Loss - [0.57277] Time Remaining - [3:07:56]\\n\",\n      \"06/27/2020 02:14:52 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 32500] Training Loss - [0.57034] Time Remaining - [3:02:20]\\n\",\n      \"06/27/2020 02:20:22 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 33000] Training Loss - [0.56793] Time Remaining - [2:56:42]\\n\",\n      \"06/27/2020 02:25:56 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 33500] Training Loss - [0.56548] Time Remaining - [2:51:11]\\n\",\n      \"06/27/2020 02:31:30 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 34000] Training Loss - [0.56309] Time Remaining - [2:45:41]\\n\",\n      \"06/27/2020 02:37:04 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 34500] Training Loss - [0.56059] Time Remaining - [2:40:11]\\n\",\n      \"06/27/2020 02:42:39 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 35000] Training Loss - [0.55799] Time Remaining - [2:34:41]\\n\",\n      \"06/27/2020 02:48:11 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 35500] Training Loss - [0.55566] Time Remaining - [2:29:07]\\n\",\n      \"06/27/2020 02:53:44 - mtdnn.modeling_mtdnn - INFO - Task - 
[ 0] Updates - [ 36000] Training Loss - [0.55331] Time Remaining - [2:23:34]\\n\",\n      \"06/27/2020 02:59:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 36500] Training Loss - [0.55091] Time Remaining - [2:18:02]\\n\",\n      \"06/27/2020 03:04:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 37000] Training Loss - [0.54856] Time Remaining - [2:12:29]\\n\",\n      \"06/27/2020 03:10:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 37500] Training Loss - [0.54628] Time Remaining - [2:06:55]\\n\",\n      \"06/27/2020 03:15:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 38000] Training Loss - [0.54413] Time Remaining - [2:01:21]\\n\",\n      \"06/27/2020 03:21:26 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 38500] Training Loss - [0.54178] Time Remaining - [1:55:49]\\n\",\n      \"06/27/2020 03:27:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 39000] Training Loss - [0.53955] Time Remaining - [1:50:16]\\n\",\n      \"06/27/2020 03:32:30 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 39500] Training Loss - [0.53732] Time Remaining - [1:44:42]\\n\",\n      \"06/27/2020 03:38:05 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 40000] Training Loss - [0.53530] Time Remaining - [1:39:11]\\n\",\n      \"06/27/2020 03:43:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 40500] Training Loss - [0.53318] Time Remaining - [1:33:38]\\n\",\n      \"06/27/2020 03:49:10 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 41000] Training Loss - [0.53105] Time Remaining - [1:28:05]\\n\",\n      \"06/27/2020 03:54:41 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 41500] Training Loss - [0.52908] Time Remaining - [1:22:32]\\n\",\n      \"06/27/2020 04:00:14 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 42000] Training Loss - [0.52711] Time Remaining - [1:16:59]\\n\",\n      \"06/27/2020 04:05:48 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 42500] Training 
Loss - [0.52516] Time Remaining - [1:11:26]\\n\",\n      \"06/27/2020 04:11:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 43000] Training Loss - [0.52324] Time Remaining - [1:05:53]\\n\",\n      \"06/27/2020 04:16:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 43500] Training Loss - [0.52161] Time Remaining - [1:00:20]\\n\",\n      \"06/27/2020 04:22:23 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 44000] Training Loss - [0.51970] Time Remaining - [0:54:48]\\n\",\n      \"06/27/2020 04:27:54 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 44500] Training Loss - [0.51821] Time Remaining - [0:49:14]\\n\",\n      \"06/27/2020 04:33:27 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 45000] Training Loss - [0.51635] Time Remaining - [0:43:42]\\n\",\n      \"06/27/2020 04:39:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 45500] Training Loss - [0.51451] Time Remaining - [0:38:09]\\n\",\n      \"06/27/2020 04:44:33 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 46000] Training Loss - [0.51286] Time Remaining - [0:32:37]\\n\",\n      \"06/27/2020 04:50:05 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 46500] Training Loss - [0.51112] Time Remaining - [0:27:04]\\n\",\n      \"06/27/2020 04:55:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 47000] Training Loss - [0.50952] Time Remaining - [0:21:31]\\n\",\n      \"06/27/2020 05:01:08 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 47500] Training Loss - [0.50789] Time Remaining - [0:15:59]\\n\",\n      \"06/27/2020 05:06:39 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 48000] Training Loss - [0.50631] Time Remaining - [0:10:26]\\n\",\n      \"06/27/2020 05:12:12 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 48500] Training Loss - [0.50469] Time Remaining - [0:04:53]\\n\",\n      \"06/27/2020 05:17:06 - mtdnn.modeling_mtdnn - INFO - Saving mt-dnn model to /tmp/tmpd9ok4aeo/checkpoint/model_1.pt\\n\",\n      
\"06/27/2020 05:17:07 - mtdnn.modeling_mtdnn - INFO - model saved to /tmp/tmpd9ok4aeo/checkpoint/model_1.pt\\n\",\n      \"06/27/2020 05:17:07 - mtdnn.modeling_mtdnn - INFO - At epoch 2\\n\",\n      \"06/27/2020 05:17:07 - mtdnn.modeling_mtdnn - INFO - Amount of data to go over: 24471\\n\",\n      \"06/27/2020 05:17:46 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 49000] Training Loss - [0.50317] Time Remaining - [4:33:15]\\n\",\n      \"06/27/2020 05:23:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 49500] Training Loss - [0.50171] Time Remaining - [4:26:45]\\n\",\n      \"06/27/2020 05:28:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 50000] Training Loss - [0.50034] Time Remaining - [4:20:18]\\n\",\n      \"06/27/2020 05:34:26 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 50500] Training Loss - [0.49876] Time Remaining - [4:14:39]\\n\",\n      \"06/27/2020 05:39:58 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 51000] Training Loss - [0.49731] Time Remaining - [4:08:48]\\n\",\n      \"06/27/2020 05:45:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 51500] Training Loss - [0.49601] Time Remaining - [4:03:18]\\n\",\n      \"06/27/2020 05:51:06 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 52000] Training Loss - [0.49468] Time Remaining - [3:57:54]\\n\",\n      \"06/27/2020 05:56:40 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 52500] Training Loss - [0.49328] Time Remaining - [3:52:27]\\n\",\n      \"06/27/2020 06:02:14 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 53000] Training Loss - [0.49179] Time Remaining - [3:46:53]\\n\",\n      \"06/27/2020 06:07:48 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 53500] Training Loss - [0.49036] Time Remaining - [3:41:24]\\n\",\n      \"06/27/2020 06:13:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 54000] Training Loss - [0.48902] Time Remaining - [3:35:48]\\n\",\n      \"06/27/2020 06:18:53 - 
mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 54500] Training Loss - [0.48761] Time Remaining - [3:30:10]\\n\",\n      \"06/27/2020 06:24:25 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 55000] Training Loss - [0.48609] Time Remaining - [3:24:32]\\n\",\n      \"06/27/2020 06:29:59 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 55500] Training Loss - [0.48458] Time Remaining - [3:19:02]\\n\",\n      \"06/27/2020 06:35:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 56000] Training Loss - [0.48321] Time Remaining - [3:13:27]\\n\",\n      \"06/27/2020 06:41:07 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 56500] Training Loss - [0.48176] Time Remaining - [3:07:58]\\n\",\n      \"06/27/2020 06:46:38 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 57000] Training Loss - [0.48029] Time Remaining - [3:02:19]\\n\",\n      \"06/27/2020 06:52:09 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 57500] Training Loss - [0.47890] Time Remaining - [2:56:41]\\n\",\n      \"06/27/2020 06:57:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 58000] Training Loss - [0.47741] Time Remaining - [2:51:11]\\n\",\n      \"06/27/2020 07:03:17 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 58500] Training Loss - [0.47591] Time Remaining - [2:45:38]\\n\",\n      \"06/27/2020 07:08:51 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 59000] Training Loss - [0.47436] Time Remaining - [2:40:06]\\n\",\n      \"06/27/2020 07:14:26 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 59500] Training Loss - [0.47282] Time Remaining - [2:34:34]\\n\",\n      \"06/27/2020 07:19:59 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 60000] Training Loss - [0.47137] Time Remaining - [2:29:01]\\n\",\n      \"06/27/2020 07:25:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 60500] Training Loss - [0.46989] Time Remaining - [2:23:27]\\n\",\n      \"06/27/2020 07:31:05 - mtdnn.modeling_mtdnn - INFO - Task - 
[ 0] Updates - [ 61000] Training Loss - [0.46844] Time Remaining - [2:17:54]\\n\",\n      \"06/27/2020 07:36:38 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 61500] Training Loss - [0.46691] Time Remaining - [2:12:20]\\n\",\n      \"06/27/2020 07:42:12 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 62000] Training Loss - [0.46547] Time Remaining - [2:06:48]\\n\",\n      \"06/27/2020 07:47:45 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 62500] Training Loss - [0.46406] Time Remaining - [2:01:14]\\n\",\n      \"06/27/2020 07:53:17 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 63000] Training Loss - [0.46261] Time Remaining - [1:55:40]\\n\",\n      \"06/27/2020 07:58:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 63500] Training Loss - [0.46117] Time Remaining - [1:50:06]\\n\",\n      \"06/27/2020 08:04:23 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 64000] Training Loss - [0.45977] Time Remaining - [1:44:33]\\n\",\n      \"06/27/2020 08:10:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 64500] Training Loss - [0.45842] Time Remaining - [1:39:02]\\n\",\n      \"06/27/2020 08:15:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 65000] Training Loss - [0.45711] Time Remaining - [1:33:28]\\n\",\n      \"06/27/2020 08:21:05 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 65500] Training Loss - [0.45574] Time Remaining - [1:27:54]\\n\",\n      \"06/27/2020 08:26:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 66000] Training Loss - [0.45438] Time Remaining - [1:22:20]\\n\",\n      \"06/27/2020 08:32:09 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 66500] Training Loss - [0.45316] Time Remaining - [1:16:47]\\n\",\n      \"06/27/2020 08:37:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 67000] Training Loss - [0.45187] Time Remaining - [1:11:14]\\n\",\n      \"06/27/2020 08:43:13 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 67500] Training 
Loss - [0.45054] Time Remaining - [1:05:40]\\n\",\n      \"06/27/2020 08:48:47 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 68000] Training Loss - [0.44935] Time Remaining - [1:00:07]\\n\",\n      \"06/27/2020 08:54:20 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 68500] Training Loss - [0.44811] Time Remaining - [0:54:33]\\n\",\n      \"06/27/2020 08:59:52 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 69000] Training Loss - [0.44706] Time Remaining - [0:49:00]\\n\",\n      \"06/27/2020 09:05:25 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 69500] Training Loss - [0.44582] Time Remaining - [0:43:27]\\n\",\n      \"06/27/2020 09:10:59 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 70000] Training Loss - [0.44460] Time Remaining - [0:37:54]\\n\",\n      \"06/27/2020 09:16:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 70500] Training Loss - [0.44340] Time Remaining - [0:32:21]\\n\",\n      \"06/27/2020 09:22:04 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 71000] Training Loss - [0.44224] Time Remaining - [0:26:47]\\n\",\n      \"06/27/2020 09:27:36 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 71500] Training Loss - [0.44109] Time Remaining - [0:21:14]\\n\",\n      \"06/27/2020 09:33:10 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 72000] Training Loss - [0.43992] Time Remaining - [0:15:41]\\n\",\n      \"06/27/2020 09:38:43 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 72500] Training Loss - [0.43884] Time Remaining - [0:10:08]\\n\",\n      \"06/27/2020 09:44:16 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 73000] Training Loss - [0.43772] Time Remaining - [0:04:35]\\n\",\n      \"06/27/2020 09:48:52 - mtdnn.modeling_mtdnn - INFO - Saving mt-dnn model to /tmp/tmpd9ok4aeo/checkpoint/model_2.pt\\n\",\n      \"06/27/2020 09:48:53 - mtdnn.modeling_mtdnn - INFO - model saved to /tmp/tmpd9ok4aeo/checkpoint/model_2.pt\\n\",\n      \"06/27/2020 09:48:53 - 
mtdnn.modeling_mtdnn - INFO - At epoch 3\\n\",\n      \"06/27/2020 09:48:53 - mtdnn.modeling_mtdnn - INFO - Amount of data to go over: 24471\\n\",\n      \"06/27/2020 09:49:51 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 73500] Training Loss - [0.43667] Time Remaining - [4:31:14]\\n\",\n      \"06/27/2020 09:55:24 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 74000] Training Loss - [0.43569] Time Remaining - [4:24:58]\\n\",\n      \"06/27/2020 10:00:54 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 74500] Training Loss - [0.43456] Time Remaining - [4:18:35]\\n\",\n      \"06/27/2020 10:06:28 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 75000] Training Loss - [0.43348] Time Remaining - [4:13:25]\\n\",\n      \"06/27/2020 10:12:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 75500] Training Loss - [0.43240] Time Remaining - [4:07:56]\\n\",\n      \"06/27/2020 10:17:31 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 76000] Training Loss - [0.43145] Time Remaining - [4:02:12]\\n\",\n      \"06/27/2020 10:23:03 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 76500] Training Loss - [0.43042] Time Remaining - [3:56:40]\\n\",\n      \"06/27/2020 10:28:36 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 77000] Training Loss - [0.42942] Time Remaining - [3:51:12]\\n\",\n      \"06/27/2020 10:34:09 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 77500] Training Loss - [0.42829] Time Remaining - [3:45:45]\\n\",\n      \"06/27/2020 10:39:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 78000] Training Loss - [0.42727] Time Remaining - [3:40:23]\\n\",\n      \"06/27/2020 10:45:14 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 78500] Training Loss - [0.42634] Time Remaining - [3:34:42]\\n\",\n      \"06/27/2020 10:50:46 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 79000] Training Loss - [0.42530] Time Remaining - [3:29:08]\\n\",\n      \"06/27/2020 10:56:18 - 
mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 79500] Training Loss - [0.42421] Time Remaining - [3:23:36]\\n\",\n      \"06/27/2020 11:01:51 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 80000] Training Loss - [0.42316] Time Remaining - [3:18:07]\\n\",\n      \"06/27/2020 11:07:23 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 80500] Training Loss - [0.42214] Time Remaining - [3:12:31]\\n\",\n      \"06/27/2020 11:12:56 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 81000] Training Loss - [0.42110] Time Remaining - [3:07:02]\\n\",\n      \"06/27/2020 11:18:28 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 81500] Training Loss - [0.42001] Time Remaining - [3:01:29]\\n\",\n      \"06/27/2020 11:23:59 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 82000] Training Loss - [0.41902] Time Remaining - [2:55:54]\\n\",\n      \"06/27/2020 11:29:30 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 82500] Training Loss - [0.41800] Time Remaining - [2:50:20]\\n\",\n      \"06/27/2020 11:35:03 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 83000] Training Loss - [0.41688] Time Remaining - [2:44:49]\\n\",\n      \"06/27/2020 11:40:36 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 83500] Training Loss - [0.41583] Time Remaining - [2:39:18]\\n\",\n      \"06/27/2020 11:46:08 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 84000] Training Loss - [0.41472] Time Remaining - [2:33:45]\\n\",\n      \"06/27/2020 11:51:38 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 84500] Training Loss - [0.41364] Time Remaining - [2:28:10]\\n\",\n      \"06/27/2020 11:57:10 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 85000] Training Loss - [0.41259] Time Remaining - [2:22:38]\\n\",\n      \"06/27/2020 12:02:42 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 85500] Training Loss - [0.41152] Time Remaining - [2:17:06]\\n\",\n      \"06/27/2020 12:08:14 - mtdnn.modeling_mtdnn - INFO - Task - 
[ 0] Updates - [ 86000] Training Loss - [0.41049] Time Remaining - [2:11:34]\\n\",\n      \"06/27/2020 12:13:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 86500] Training Loss - [0.40944] Time Remaining - [2:05:59]\\n\",\n      \"06/27/2020 12:19:16 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 87000] Training Loss - [0.40839] Time Remaining - [2:00:27]\\n\",\n      \"06/27/2020 12:24:48 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 87500] Training Loss - [0.40739] Time Remaining - [1:54:55]\\n\",\n      \"06/27/2020 12:30:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 88000] Training Loss - [0.40638] Time Remaining - [1:49:24]\\n\",\n      \"06/27/2020 12:35:52 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 88500] Training Loss - [0.40539] Time Remaining - [1:43:51]\\n\",\n      \"06/27/2020 12:41:27 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 89000] Training Loss - [0.40443] Time Remaining - [1:38:21]\\n\",\n      \"06/27/2020 12:47:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 89500] Training Loss - [0.40348] Time Remaining - [1:32:49]\\n\",\n      \"06/27/2020 12:52:33 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 90000] Training Loss - [0.40242] Time Remaining - [1:27:17]\\n\",\n      \"06/27/2020 12:58:02 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 90500] Training Loss - [0.40147] Time Remaining - [1:21:44]\\n\",\n      \"06/27/2020 01:03:34 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 91000] Training Loss - [0.40057] Time Remaining - [1:16:12]\\n\",\n      \"06/27/2020 01:09:05 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 91500] Training Loss - [0.39961] Time Remaining - [1:10:39]\\n\",\n      \"06/27/2020 01:14:35 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 92000] Training Loss - [0.39874] Time Remaining - [1:05:07]\\n\",\n      \"06/27/2020 01:20:06 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 92500] Training 
Loss - [0.39783] Time Remaining - [0:59:34]\\n\",\n      \"06/27/2020 01:25:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 93000] Training Loss - [0.39692] Time Remaining - [0:54:02]\\n\",\n      \"06/27/2020 01:31:09 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 93500] Training Loss - [0.39617] Time Remaining - [0:48:30]\\n\",\n      \"06/27/2020 01:36:42 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 94000] Training Loss - [0.39524] Time Remaining - [0:42:58]\\n\",\n      \"06/27/2020 01:42:15 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 94500] Training Loss - [0.39436] Time Remaining - [0:37:26]\\n\",\n      \"06/27/2020 01:47:46 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 95000] Training Loss - [0.39353] Time Remaining - [0:31:54]\\n\",\n      \"06/27/2020 01:53:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 95500] Training Loss - [0.39261] Time Remaining - [0:26:22]\\n\",\n      \"06/27/2020 01:58:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 96000] Training Loss - [0.39182] Time Remaining - [0:20:50]\\n\",\n      \"06/27/2020 02:04:20 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 96500] Training Loss - [0.39099] Time Remaining - [0:15:18]\\n\",\n      \"06/27/2020 02:09:51 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 97000] Training Loss - [0.39020] Time Remaining - [0:09:46]\\n\",\n      \"06/27/2020 02:15:22 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 97500] Training Loss - [0.38940] Time Remaining - [0:04:14]\\n\",\n      \"06/27/2020 02:19:37 - mtdnn.modeling_mtdnn - INFO - Saving mt-dnn model to /tmp/tmpd9ok4aeo/checkpoint/model_3.pt\\n\",\n      \"06/27/2020 02:19:38 - mtdnn.modeling_mtdnn - INFO - model saved to /tmp/tmpd9ok4aeo/checkpoint/model_3.pt\\n\",\n      \"06/27/2020 02:19:38 - mtdnn.modeling_mtdnn - INFO - At epoch 4\\n\",\n      \"06/27/2020 02:19:38 - mtdnn.modeling_mtdnn - INFO - Amount of data to go over: 24471\\n\",\n      
\"06/27/2020 02:20:57 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 98000] Training Loss - [0.38866] Time Remaining - [4:36:08]\\n\",\n      \"06/27/2020 02:26:30 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 98500] Training Loss - [0.38793] Time Remaining - [4:26:03]\\n\",\n      \"06/27/2020 02:32:03 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 99000] Training Loss - [0.38710] Time Remaining - [4:20:01]\\n\",\n      \"06/27/2020 02:37:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 99500] Training Loss - [0.38627] Time Remaining - [4:14:15]\\n\",\n      \"06/27/2020 02:43:11 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [100000] Training Loss - [0.38549] Time Remaining - [4:08:45]\\n\",\n      \"06/27/2020 02:48:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [100500] Training Loss - [0.38482] Time Remaining - [4:03:04]\\n\",\n      \"06/27/2020 02:54:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [101000] Training Loss - [0.38410] Time Remaining - [3:57:37]\\n\",\n      \"06/27/2020 02:59:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [101500] Training Loss - [0.38333] Time Remaining - [3:52:06]\\n\",\n      \"06/27/2020 03:05:27 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [102000] Training Loss - [0.38250] Time Remaining - [3:46:33]\\n\",\n      \"06/27/2020 03:11:01 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [102500] Training Loss - [0.38174] Time Remaining - [3:41:01]\\n\",\n      \"06/27/2020 03:16:34 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [103000] Training Loss - [0.38101] Time Remaining - [3:35:23]\\n\",\n      \"06/27/2020 03:22:06 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [103500] Training Loss - [0.38023] Time Remaining - [3:29:44]\\n\",\n      \"06/27/2020 03:27:40 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [104000] Training Loss - [0.37941] Time Remaining - [3:24:10]\\n\",\n      \"06/27/2020 03:33:13 - 
mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [104500] Training Loss - [0.37866] Time Remaining - [3:18:34]\\n\",\n      \"06/27/2020 03:38:45 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [105000] Training Loss - [0.37794] Time Remaining - [3:12:57]\\n\",\n      \"06/27/2020 03:44:19 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [105500] Training Loss - [0.37717] Time Remaining - [3:07:24]\\n\",\n      \"06/27/2020 03:49:49 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [106000] Training Loss - [0.37640] Time Remaining - [3:01:44]\\n\",\n      \"06/27/2020 03:55:22 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [106500] Training Loss - [0.37562] Time Remaining - [2:56:09]\\n\",\n      \"06/27/2020 04:00:54 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [107000] Training Loss - [0.37492] Time Remaining - [2:50:34]\\n\",\n      \"06/27/2020 04:06:28 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [107500] Training Loss - [0.37413] Time Remaining - [2:45:01]\\n\",\n      \"06/27/2020 04:12:01 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [108000] Training Loss - [0.37329] Time Remaining - [2:39:28]\\n\",\n      \"06/27/2020 04:17:35 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [108500] Training Loss - [0.37250] Time Remaining - [2:33:56]\\n\",\n      \"06/27/2020 04:23:07 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [109000] Training Loss - [0.37170] Time Remaining - [2:28:21]\\n\",\n      \"06/27/2020 04:28:42 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [109500] Training Loss - [0.37095] Time Remaining - [2:22:50]\\n\",\n      \"06/27/2020 04:34:15 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [110000] Training Loss - [0.37015] Time Remaining - [2:17:16]\\n\",\n      \"06/27/2020 04:39:48 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [110500] Training Loss - [0.36936] Time Remaining - [2:11:43]\\n\",\n      \"06/27/2020 04:45:18 - mtdnn.modeling_mtdnn - INFO - Task - 
[ 0] Updates - [111000] Training Loss - [0.36862] Time Remaining - [2:06:06]\\n\",\n      \"06/27/2020 04:50:52 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [111500] Training Loss - [0.36786] Time Remaining - [2:00:34]\\n\",\n      \"06/27/2020 04:56:24 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [112000] Training Loss - [0.36715] Time Remaining - [1:54:59]\\n\",\n      \"06/27/2020 05:01:59 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [112500] Training Loss - [0.36641] Time Remaining - [1:49:28]\\n\",\n      \"06/27/2020 05:07:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [113000] Training Loss - [0.36561] Time Remaining - [1:43:54]\\n\",\n      \"06/27/2020 05:13:08 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [113500] Training Loss - [0.36493] Time Remaining - [1:38:22]\\n\",\n      \"06/27/2020 05:18:39 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [114000] Training Loss - [0.36422] Time Remaining - [1:32:48]\\n\",\n      \"06/27/2020 05:24:13 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [114500] Training Loss - [0.36346] Time Remaining - [1:27:15]\\n\",\n      \"06/27/2020 05:29:45 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [115000] Training Loss - [0.36276] Time Remaining - [1:21:41]\\n\",\n      \"06/27/2020 05:35:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [115500] Training Loss - [0.36208] Time Remaining - [1:16:08]\\n\",\n      \"06/27/2020 05:40:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [116000] Training Loss - [0.36137] Time Remaining - [1:10:34]\\n\",\n      \"06/27/2020 05:46:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [116500] Training Loss - [0.36070] Time Remaining - [1:05:00]\\n\",\n      \"06/27/2020 05:51:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [117000] Training Loss - [0.36006] Time Remaining - [0:59:27]\\n\",\n      \"06/27/2020 05:57:28 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [117500] Training 
Loss - [0.35940] Time Remaining - [0:53:54]\\n\",\n      \"06/27/2020 06:03:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [118000] Training Loss - [0.35885] Time Remaining - [0:48:21]\\n\",\n      \"06/27/2020 06:08:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [118500] Training Loss - [0.35815] Time Remaining - [0:42:48]\\n\",\n      \"06/27/2020 06:14:07 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [119000] Training Loss - [0.35751] Time Remaining - [0:37:15]\\n\",\n      \"06/27/2020 06:19:40 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [119500] Training Loss - [0.35685] Time Remaining - [0:31:42]\\n\",\n      \"06/27/2020 06:25:16 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [120000] Training Loss - [0.35616] Time Remaining - [0:26:09]\\n\",\n      \"06/27/2020 06:30:47 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [120500] Training Loss - [0.35558] Time Remaining - [0:20:35]\\n\",\n      \"06/27/2020 06:36:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [121000] Training Loss - [0.35499] Time Remaining - [0:15:02]\\n\",\n      \"06/27/2020 06:41:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [121500] Training Loss - [0.35438] Time Remaining - [0:09:29]\\n\",\n      \"06/27/2020 06:47:26 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [122000] Training Loss - [0.35374] Time Remaining - [0:03:56]\\n\",\n      \"06/27/2020 06:51:23 - mtdnn.modeling_mtdnn - INFO - Saving mt-dnn model to /tmp/tmpd9ok4aeo/checkpoint/model_4.pt\\n\",\n      \"06/27/2020 06:51:24 - mtdnn.modeling_mtdnn - INFO - model saved to /tmp/tmpd9ok4aeo/checkpoint/model_4.pt\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"model.fit(epochs=NUM_EPOCHS)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Evaluation and Prediction\\n\",\n    \"Perform inference using the last (best) checkpointed model. 
With 5 epochs, the last model would be `model_4.pt`\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 32,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"06/27/2020 06:51:24 - mtdnn.modeling_mtdnn - INFO - Running predictions using: /tmp/tmpd9ok4aeo/checkpoint/model_4.pt\\n\",\n      \"06/27/2020 06:51:25 - mtdnn.modeling_mtdnn - INFO - predicting 0\\n\",\n      \"06/27/2020 06:51:45 - mtdnn.modeling_mtdnn - INFO - predicting 100\\n\",\n      \"06/27/2020 06:52:05 - mtdnn.modeling_mtdnn - INFO - predicting 200\\n\",\n      \"06/27/2020 06:52:27 - mtdnn.modeling_mtdnn - INFO - predicting 300\\n\",\n      \"06/27/2020 06:52:47 - mtdnn.modeling_mtdnn - INFO - predicting 400\\n\",\n      \"06/27/2020 06:53:07 - mtdnn.modeling_mtdnn - INFO - predicting 500\\n\",\n      \"06/27/2020 06:53:28 - mtdnn.modeling_mtdnn - INFO - predicting 600\\n\",\n      \"06/27/2020 06:53:48 - mtdnn.modeling_mtdnn - INFO - predicting 700\\n\",\n      \"06/27/2020 06:54:10 - mtdnn.modeling_mtdnn - INFO - predicting 800\\n\",\n      \"06/27/2020 06:54:30 - mtdnn.modeling_mtdnn - INFO - predicting 900\\n\",\n      \"06/27/2020 06:54:50 - mtdnn.modeling_mtdnn - INFO - predicting 1000\\n\",\n      \"06/27/2020 06:55:11 - mtdnn.modeling_mtdnn - INFO - predicting 1100\\n\",\n      \"06/27/2020 06:55:31 - mtdnn.modeling_mtdnn - INFO - predicting 1200\\n\",\n      \"06/27/2020 06:55:37 - mtdnn.modeling_mtdnn - INFO - Task mnli_mismatched -- epoch 0 -- Dev ACC: 84.422\\n\",\n      \"06/27/2020 06:55:37 - mtdnn.modeling_mtdnn - INFO - predicting 0\\n\",\n      \"06/27/2020 06:55:59 - mtdnn.modeling_mtdnn - INFO - predicting 100\\n\",\n      \"06/27/2020 06:56:19 - mtdnn.modeling_mtdnn - INFO - predicting 200\\n\",\n      \"06/27/2020 06:56:39 - mtdnn.modeling_mtdnn - INFO - predicting 300\\n\",\n      \"06/27/2020 06:57:00 - mtdnn.modeling_mtdnn - INFO - predicting 400\\n\",\n      
\"06/27/2020 06:57:21 - mtdnn.modeling_mtdnn - INFO - predicting 500\\n\",\n      \"06/27/2020 06:57:42 - mtdnn.modeling_mtdnn - INFO - predicting 600\\n\",\n      \"06/27/2020 06:58:02 - mtdnn.modeling_mtdnn - INFO - predicting 700\\n\",\n      \"06/27/2020 06:58:22 - mtdnn.modeling_mtdnn - INFO - predicting 800\\n\",\n      \"06/27/2020 06:58:42 - mtdnn.modeling_mtdnn - INFO - predicting 900\\n\",\n      \"06/27/2020 06:59:04 - mtdnn.modeling_mtdnn - INFO - predicting 1000\\n\",\n      \"06/27/2020 06:59:24 - mtdnn.modeling_mtdnn - INFO - predicting 1100\\n\",\n      \"06/27/2020 06:59:45 - mtdnn.modeling_mtdnn - INFO - predicting 1200\\n\",\n      \"06/27/2020 06:59:50 - mtdnn.modeling_mtdnn - INFO - [new test scores saved.]\\n\",\n      \"06/27/2020 06:59:50 - mtdnn.modeling_mtdnn - INFO - predicting 0\\n\",\n      \"06/27/2020 07:00:10 - mtdnn.modeling_mtdnn - INFO - predicting 100\\n\",\n      \"06/27/2020 07:00:30 - mtdnn.modeling_mtdnn - INFO - predicting 200\\n\",\n      \"06/27/2020 07:00:52 - mtdnn.modeling_mtdnn - INFO - predicting 300\\n\",\n      \"06/27/2020 07:01:12 - mtdnn.modeling_mtdnn - INFO - predicting 400\\n\",\n      \"06/27/2020 07:01:32 - mtdnn.modeling_mtdnn - INFO - predicting 500\\n\",\n      \"06/27/2020 07:01:52 - mtdnn.modeling_mtdnn - INFO - predicting 600\\n\",\n      \"06/27/2020 07:02:14 - mtdnn.modeling_mtdnn - INFO - predicting 700\\n\",\n      \"06/27/2020 07:02:34 - mtdnn.modeling_mtdnn - INFO - predicting 800\\n\",\n      \"06/27/2020 07:02:55 - mtdnn.modeling_mtdnn - INFO - predicting 900\\n\",\n      \"06/27/2020 07:03:15 - mtdnn.modeling_mtdnn - INFO - predicting 1000\\n\",\n      \"06/27/2020 07:03:35 - mtdnn.modeling_mtdnn - INFO - predicting 1100\\n\",\n      \"06/27/2020 07:03:57 - mtdnn.modeling_mtdnn - INFO - predicting 1200\\n\",\n      \"06/27/2020 07:04:03 - mtdnn.modeling_mtdnn - INFO - Task mnli_matched -- epoch 0 -- Dev ACC: 84.144\\n\",\n      \"06/27/2020 07:04:03 - mtdnn.modeling_mtdnn - INFO - predicting 
0\\n\",\n      \"06/27/2020 07:04:23 - mtdnn.modeling_mtdnn - INFO - predicting 100\\n\",\n      \"06/27/2020 07:04:43 - mtdnn.modeling_mtdnn - INFO - predicting 200\\n\",\n      \"06/27/2020 07:05:04 - mtdnn.modeling_mtdnn - INFO - predicting 300\\n\",\n      \"06/27/2020 07:05:24 - mtdnn.modeling_mtdnn - INFO - predicting 400\\n\",\n      \"06/27/2020 07:05:45 - mtdnn.modeling_mtdnn - INFO - predicting 500\\n\",\n      \"06/27/2020 07:06:06 - mtdnn.modeling_mtdnn - INFO - predicting 600\\n\",\n      \"06/27/2020 07:06:26 - mtdnn.modeling_mtdnn - INFO - predicting 700\\n\",\n      \"06/27/2020 07:06:46 - mtdnn.modeling_mtdnn - INFO - predicting 800\\n\",\n      \"06/27/2020 07:07:08 - mtdnn.modeling_mtdnn - INFO - predicting 900\\n\",\n      \"06/27/2020 07:07:28 - mtdnn.modeling_mtdnn - INFO - predicting 1000\\n\",\n      \"06/27/2020 07:07:48 - mtdnn.modeling_mtdnn - INFO - predicting 1100\\n\",\n      \"06/27/2020 07:08:09 - mtdnn.modeling_mtdnn - INFO - predicting 1200\\n\",\n      \"06/27/2020 07:08:15 - mtdnn.modeling_mtdnn - INFO - [new test scores saved.]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"model.predict(trained_model_chckpt=f\\\"{OUTPUT_DIR}/model_4.pt\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Results\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 44,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" 
class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>Mnli Mismatched Dev</th>\\n\",\n       \"      <th>Mnli Matched Dev</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>ACCURACY</th>\\n\",\n       \"      <td>84.422</td>\\n\",\n       \"      <td>84.144</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"         Mnli Mismatched Dev Mnli Matched Dev\\n\",\n       \"ACCURACY              84.422           84.144\"\n      ]\n     },\n     \"execution_count\": 44,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"results = {}\\n\",\n    \"dev_result_files = list(filter(lambda x: x.endswith('.json') and 'dev' in x, os.listdir(OUTPUT_DIR))) \\n\",\n    \"for d in dev_result_files: \\n\",\n    \"    name =  ' '.join(list(map(str.capitalize, d.split('_')))[:3]) \\n\",\n    \"    file_name = os.path.join(OUTPUT_DIR, d)\\n\",\n    \"    with open(file_name, 'r') as f: \\n\",\n    \"        res = json.load(f) \\n\",\n    \"        results.update(\\n\",\n    \"            {name: {\\n\",\n    \"                'ACCURACY': f\\\"{res['metrics']['ACC']:.3f}\\\"\\n\",\n    \"                }\\n\",\n    \"            }) \\n\",\n    \"df_results = pd.DataFrame(results)   \\n\",\n    \"df_results\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Clean up temporary folders\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"if os.path.exists(ROOT_DIR):\\n\",\n    \"    shutil.rmtree(ROOT_DIR, ignore_errors=True)\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": 
\"Python (nlp_gpu)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "examples/text_classification/tc_mnli_transformers.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"*Copyright (c) Microsoft Corporation. All rights reserved.*\\n\",\n    \"\\n\",\n    \"*Licensed under the MIT License.*\\n\",\n    \"\\n\",\n    \"# Text Classification of MultiNLI Sentences using Multiple Transformer Models\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import json\\n\",\n    \"import os\\n\",\n    \"import sys\\n\",\n    \"from tempfile import TemporaryDirectory\\n\",\n    \"\\n\",\n    \"import numpy as np\\n\",\n    \"import pandas as pd\\n\",\n    \"import scrapbook as sb\\n\",\n    \"import torch\\n\",\n    \"import torch.nn as nn\\n\",\n    \"from sklearn.metrics import accuracy_score, classification_report\\n\",\n    \"from sklearn.model_selection import train_test_split\\n\",\n    \"from sklearn.preprocessing import LabelEncoder\\n\",\n    \"from tqdm import tqdm\\n\",\n    \"from utils_nlp.common.timer import Timer\\n\",\n    \"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\\n\",\n    \"from utils_nlp.dataset.multinli import load_pandas_df\\n\",\n    \"from utils_nlp.models.transformers.sequence_classification import (\\n\",\n    \"    Processor, SequenceClassifier)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Introduction\\n\",\n    \"In this notebook, we fine-tune and evaluate a number of pretrained models on a subset of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset.\\n\",\n    \"\\n\",\n    \"We use a [sequence classifier](../../utils_nlp/models/transformers/sequence_classification.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/transformers) of different transformers, like [BERT](https://github.com/google-research/bert), [XLNet](https://github.com/zihangdai/xlnet), and 
[RoBERTa](https://github.com/pytorch/fairseq).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# notebook parameters\\n\",\n    \"DATA_FOLDER = TemporaryDirectory().name\\n\",\n    \"CACHE_DIR = TemporaryDirectory().name\\n\",\n    \"NUM_EPOCHS = 1\\n\",\n    \"BATCH_SIZE = 16\\n\",\n    \"NUM_GPUS = 2\\n\",\n    \"MAX_LEN = 100\\n\",\n    \"TRAIN_DATA_FRACTION = 0.05\\n\",\n    \"TEST_DATA_FRACTION = 0.05\\n\",\n    \"TRAIN_SIZE = 0.75\\n\",\n    \"LABEL_COL = \\\"genre\\\"\\n\",\n    \"TEXT_COL = \\\"sentence1\\\"\\n\",\n    \"MODEL_NAMES = [\\\"distilbert-base-uncased\\\", \\\"roberta-base\\\", \\\"xlnet-base-cased\\\"]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Read Dataset\\n\",\n    \"We start by loading a subset of the data. The following function also downloads and extracts the files, if they don't exist in the data folder.\\n\",\n    \"\\n\",\n    \"The MultiNLI dataset is mainly used for natural language inference (NLI) tasks, where the inputs are sentence pairs and the labels are entailment indicators. The sentence pairs are also classified into *genres* that allow for more coverage and better evaluation of NLI models.\\n\",\n    \"\\n\",\n    \"For our classification task, we use the first sentence only as the text input, and the corresponding genre as the label. 
We select the examples corresponding to one of the entailment labels (*neutral* in this case) to avoid duplicate rows, as the sentences are not unique, whereas the sentence pairs are.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 222k/222k [01:20<00:00, 2.74kKB/s] \\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"df = load_pandas_df(DATA_FOLDER, \\\"train\\\")\\n\",\n    \"df = df[df[\\\"gold_label\\\"]==\\\"neutral\\\"]  # get unique sentences\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>genre</th>\\n\",\n       \"      <th>sentence1</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>government</td>\\n\",\n       \"      <td>Conceptually cream skimming has two basic dime...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>telephone</td>\\n\",\n       \"      <td>yeah i tell you what though if you go price so...</td>\\n\",\n       \"   
 </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>6</th>\\n\",\n       \"      <td>travel</td>\\n\",\n       \"      <td>But a few Christian mosaics survive above the ...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>12</th>\\n\",\n       \"      <td>slate</td>\\n\",\n       \"      <td>It's not that the questions they asked weren't...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>13</th>\\n\",\n       \"      <td>travel</td>\\n\",\n       \"      <td>Thebes held onto power until the 12th Dynasty,...</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"         genre                                          sentence1\\n\",\n       \"0   government  Conceptually cream skimming has two basic dime...\\n\",\n       \"4    telephone  yeah i tell you what though if you go price so...\\n\",\n       \"6       travel  But a few Christian mosaics survive above the ...\\n\",\n       \"12       slate  It's not that the questions they asked weren't...\\n\",\n       \"13      travel  Thebes held onto power until the 12th Dynasty,...\"\n      ]\n     },\n     \"execution_count\": 4,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"df[[LABEL_COL, TEXT_COL]].head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We split the data for training and testing, sample a fraction for faster execution, and encode the class labels:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always 
complement train_size unless both are specified.\\n\",\n      \"  FutureWarning)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# split\\n\",\n    \"df_train, df_test = train_test_split(df, train_size = TRAIN_SIZE, random_state=0)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# sample\\n\",\n    \"df_train = df_train.sample(frac=TRAIN_DATA_FRACTION).reset_index(drop=True)\\n\",\n    \"df_test = df_test.sample(frac=TEST_DATA_FRACTION).reset_index(drop=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The examples in the dataset are grouped into 5 genres:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"telephone     1043\\n\",\n       \"slate          989\\n\",\n       \"fiction        968\\n\",\n       \"travel         964\\n\",\n       \"government     945\\n\",\n       \"Name: genre, dtype: int64\"\n      ]\n     },\n     \"execution_count\": 7,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"df_train[LABEL_COL].value_counts()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# encode labels\\n\",\n    \"label_encoder = LabelEncoder()\\n\",\n    \"df_train[LABEL_COL] = label_encoder.fit_transform(df_train[LABEL_COL])\\n\",\n    \"df_test[LABEL_COL] = label_encoder.transform(df_test[LABEL_COL])\\n\",\n    \"\\n\",\n    \"num_labels = len(np.unique(df_train[LABEL_COL]))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Number of unique labels: 5\\n\",\n      \"Number of 
training examples: 4909\\n\",\n      \"Number of testing examples: 1636\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(\\\"Number of unique labels: {}\\\".format(num_labels))\\n\",\n    \"print(\\\"Number of training examples: {}\\\".format(df_train.shape[0]))\\n\",\n    \"print(\\\"Number of testing examples: {}\\\".format(df_test.shape[0]))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Select Pretrained Models\\n\",\n    \"\\n\",\n    \"Several pretrained models have been made available by [Hugging Face](https://github.com/huggingface/transformers). For text classification, the following pretrained models are supported.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>model_name</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>bert-base-uncased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>bert-large-uncased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>bert-base-cased</td>\\n\",\n  
     \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>bert-large-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>bert-base-multilingual-uncased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>5</th>\\n\",\n       \"      <td>bert-base-multilingual-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>6</th>\\n\",\n       \"      <td>bert-base-chinese</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>7</th>\\n\",\n       \"      <td>bert-base-german-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>8</th>\\n\",\n       \"      <td>bert-large-uncased-whole-word-masking</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>9</th>\\n\",\n       \"      <td>bert-large-cased-whole-word-masking</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>10</th>\\n\",\n       \"      <td>bert-large-uncased-whole-word-masking-finetune...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>11</th>\\n\",\n       \"      <td>bert-large-cased-whole-word-masking-finetuned-...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>12</th>\\n\",\n       \"      <td>bert-base-cased-finetuned-mrpc</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>13</th>\\n\",\n       \"      <td>bert-base-german-dbmdz-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>14</th>\\n\",\n       \"      <td>bert-base-german-dbmdz-uncased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>15</th>\\n\",\n       \"      <td>bert-base-japanese</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>16</th>\\n\",\n       \"     
 <td>bert-base-japanese-whole-word-masking</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>17</th>\\n\",\n       \"      <td>bert-base-japanese-char</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>18</th>\\n\",\n       \"      <td>bert-base-japanese-char-whole-word-masking</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>19</th>\\n\",\n       \"      <td>bert-base-finnish-cased-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>20</th>\\n\",\n       \"      <td>bert-base-finnish-uncased-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>21</th>\\n\",\n       \"      <td>roberta-base</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>22</th>\\n\",\n       \"      <td>roberta-large</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>23</th>\\n\",\n       \"      <td>roberta-large-mnli</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>24</th>\\n\",\n       \"      <td>distilroberta-base</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>25</th>\\n\",\n       \"      <td>roberta-base-openai-detector</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>26</th>\\n\",\n       \"      <td>roberta-large-openai-detector</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>27</th>\\n\",\n       \"      <td>xlnet-base-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>28</th>\\n\",\n       \"      <td>xlnet-large-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>29</th>\\n\",\n       \"      <td>distilbert-base-uncased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>30</th>\\n\",\n       \"      
<td>distilbert-base-uncased-distilled-squad</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>31</th>\\n\",\n       \"      <td>distilbert-base-german-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>32</th>\\n\",\n       \"      <td>distilbert-base-multilingual-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>33</th>\\n\",\n       \"      <td>albert-base-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>34</th>\\n\",\n       \"      <td>albert-large-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>35</th>\\n\",\n       \"      <td>albert-xlarge-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>36</th>\\n\",\n       \"      <td>albert-xxlarge-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>37</th>\\n\",\n       \"      <td>albert-base-v2</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>38</th>\\n\",\n       \"      <td>albert-large-v2</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>39</th>\\n\",\n       \"      <td>albert-xlarge-v2</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>40</th>\\n\",\n       \"      <td>albert-xxlarge-v2</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"                                           model_name\\n\",\n       \"0                                   bert-base-uncased\\n\",\n       \"1                                  bert-large-uncased\\n\",\n       \"2                                     bert-base-cased\\n\",\n       \"3                                    bert-large-cased\\n\",\n       \"4                      bert-base-multilingual-uncased\\n\",\n       \"5                        
bert-base-multilingual-cased\\n\",\n       \"6                                   bert-base-chinese\\n\",\n       \"7                              bert-base-german-cased\\n\",\n       \"8               bert-large-uncased-whole-word-masking\\n\",\n       \"9                 bert-large-cased-whole-word-masking\\n\",\n       \"10  bert-large-uncased-whole-word-masking-finetune...\\n\",\n       \"11  bert-large-cased-whole-word-masking-finetuned-...\\n\",\n       \"12                     bert-base-cased-finetuned-mrpc\\n\",\n       \"13                       bert-base-german-dbmdz-cased\\n\",\n       \"14                     bert-base-german-dbmdz-uncased\\n\",\n       \"15                                 bert-base-japanese\\n\",\n       \"16              bert-base-japanese-whole-word-masking\\n\",\n       \"17                            bert-base-japanese-char\\n\",\n       \"18         bert-base-japanese-char-whole-word-masking\\n\",\n       \"19                         bert-base-finnish-cased-v1\\n\",\n       \"20                       bert-base-finnish-uncased-v1\\n\",\n       \"21                                       roberta-base\\n\",\n       \"22                                      roberta-large\\n\",\n       \"23                                 roberta-large-mnli\\n\",\n       \"24                                 distilroberta-base\\n\",\n       \"25                       roberta-base-openai-detector\\n\",\n       \"26                      roberta-large-openai-detector\\n\",\n       \"27                                   xlnet-base-cased\\n\",\n       \"28                                  xlnet-large-cased\\n\",\n       \"29                            distilbert-base-uncased\\n\",\n       \"30            distilbert-base-uncased-distilled-squad\\n\",\n       \"31                       distilbert-base-german-cased\\n\",\n       \"32                 distilbert-base-multilingual-cased\\n\",\n       \"33                                     albert-base-v1\\n\",\n    
   \"34                                    albert-large-v1\\n\",\n       \"35                                   albert-xlarge-v1\\n\",\n       \"36                                  albert-xxlarge-v1\\n\",\n       \"37                                     albert-base-v2\\n\",\n       \"38                                    albert-large-v2\\n\",\n       \"39                                   albert-xlarge-v2\\n\",\n       \"40                                  albert-xxlarge-v2\"\n      ]\n     },\n     \"execution_count\": 10,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"pd.DataFrame({\\\"model_name\\\": SequenceClassifier.list_supported_models()})\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Fine-tune\\n\",\n    \"\\n\",\n    \"Our wrappers make it easy to fine-tune different models in a unified way, hiding the preprocessing details that are needed before training. In this example, we're going to select the following models and use the same piece of code to fine-tune them on our genre classification task. 
Note that some models were pretrained on multilingual datasets and can be used with non-English datasets.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"['distilbert-base-uncased', 'roberta-base', 'xlnet-base-cased']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(MODEL_NAMES)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"For each pretrained model, we preprocess the data, fine-tune the classifier, score the test set, and store the evaluation results.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\\n\",\n      \"  warnings.warn('Was asked to gather along dimension 0, but all '\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"results = {}\\n\",\n    \"\\n\",\n    \"for model_name in tqdm(MODEL_NAMES, disable=True):\\n\",\n    \"\\n\",\n    \"    # preprocess\\n\",\n    \"    processor = Processor(\\n\",\n    \"        model_name=model_name,\\n\",\n    \"        to_lower=model_name.endswith(\\\"uncased\\\"),\\n\",\n    \"        cache_dir=CACHE_DIR,\\n\",\n    \"    )\\n\",\n    \"    train_dataset = processor.dataset_from_dataframe(\\n\",\n    \"        df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN\\n\",\n    \"    )\\n\",\n    \"    train_dataloader = dataloader_from_dataset(\\n\",\n    \"        train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\\n\",\n    \"    )\\n\",\n    \"    test_dataset = 
processor.dataset_from_dataframe(\\n\",\n    \"        df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN\\n\",\n    \"    )\\n\",\n    \"    test_dataloader = dataloader_from_dataset(\\n\",\n    \"        test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"    # fine-tune\\n\",\n    \"    classifier = SequenceClassifier(\\n\",\n    \"        model_name=model_name, num_labels=num_labels, cache_dir=CACHE_DIR\\n\",\n    \"    )\\n\",\n    \"    with Timer() as t:\\n\",\n    \"        classifier.fit(\\n\",\n    \"            train_dataloader, num_epochs=NUM_EPOCHS, num_gpus=NUM_GPUS, verbose=False,\\n\",\n    \"        )\\n\",\n    \"    train_time = t.interval / 3600\\n\",\n    \"\\n\",\n    \"    # predict\\n\",\n    \"    preds = classifier.predict(test_dataloader, num_gpus=NUM_GPUS, verbose=False)\\n\",\n    \"\\n\",\n    \"    # eval\\n\",\n    \"    accuracy = accuracy_score(df_test[LABEL_COL], preds)\\n\",\n    \"    class_report = classification_report(\\n\",\n    \"        df_test[LABEL_COL], preds, target_names=label_encoder.classes_, output_dict=True\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"    # save results\\n\",\n    \"    results[model_name] = {\\n\",\n    \"        \\\"accuracy\\\": accuracy,\\n\",\n    \"        \\\"f1-score\\\": class_report[\\\"macro avg\\\"][\\\"f1-score\\\"],\\n\",\n    \"        \\\"time(hrs)\\\": train_time,\\n\",\n    \"    }\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Evaluate\\n\",\n    \"\\n\",\n    \"Finally, we report the accuracy and F1-score metrics for each model, as well as the fine-tuning time in hours.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        
vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>distilbert-base-uncased</th>\\n\",\n       \"      <th>roberta-base</th>\\n\",\n       \"      <th>xlnet-base-cased</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>accuracy</th>\\n\",\n       \"      <td>0.889364</td>\\n\",\n       \"      <td>0.885697</td>\\n\",\n       \"      <td>0.886308</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>f1-score</th>\\n\",\n       \"      <td>0.885225</td>\\n\",\n       \"      <td>0.880926</td>\\n\",\n       \"      <td>0.881819</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>time(hrs)</th>\\n\",\n       \"      <td>0.023326</td>\\n\",\n       \"      <td>0.044209</td>\\n\",\n       \"      <td>0.052801</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"           distilbert-base-uncased  roberta-base  xlnet-base-cased\\n\",\n       \"accuracy                  0.889364      0.885697          0.886308\\n\",\n       \"f1-score                  0.885225      0.880926          0.881819\\n\",\n       \"time(hrs)                 0.023326      0.044209          0.052801\"\n      ]\n     },\n     \"execution_count\": 13,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"df_results = 
pd.DataFrame(results)\\n\",\n    \"df_results\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": 0.887123064384678,\n       \"encoder\": \"json\",\n       \"name\": \"accuracy\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"accuracy\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": 0.8826569624491233,\n       \"encoder\": \"json\",\n       \"name\": \"f1\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"f1\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"# for testing\\n\",\n    \"sb.glue(\\\"accuracy\\\", df_results.iloc[0, :].mean())\\n\",\n    \"sb.glue(\\\"f1\\\", df_results.iloc[1, :].mean())\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3.6.8 64-bit ('nlp_gpu': conda)\",\n   \"language\": \"python\",\n   \"name\": \"python36864bitnlpgpucondaa579511bcea84c65877ff3dca4205921\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/text_classification/tc_multi_languages_transformers.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"*Copyright (c) Microsoft Corporation. All rights reserved.*\\n\",\n    \"\\n\",\n    \"*Licensed under the MIT License.*\\n\",\n    \"\\n\",\n    \"# Text Classification of Multi Language Datasets using Transformer Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import scrapbook as sb\\n\",\n    \"import pandas as pd\\n\",\n    \"import torch\\n\",\n    \"import numpy as np\\n\",\n    \"\\n\",\n    \"from tempfile import TemporaryDirectory\\n\",\n    \"from utils_nlp.common.timer import Timer\\n\",\n    \"from sklearn.metrics import classification_report\\n\",\n    \"from utils_nlp.models.transformers.sequence_classification import SequenceClassifier\\n\",\n    \"\\n\",\n    \"from utils_nlp.dataset import multinli\\n\",\n    \"from utils_nlp.dataset import dac\\n\",\n    \"from utils_nlp.dataset import bbc_hindi\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Introduction\\n\",\n    \"\\n\",\n    \"In this notebook, we fine-tune and evaluate a pretrained Transformer model using the BERT architecture on three different language datasets:\\n\",\n    \"\\n\",\n    \"- [MultiNLI dataset](https://www.nyu.edu/projects/bowman/multinli/): The Multi-Genre NLI corpus, in English\\n\",\n    \"- [DAC dataset](https://data.mendeley.com/datasets/v524p5dhpj/2): DataSet for Arabic Classification corpus, in Arabic\\n\",\n    \"- [BBC Hindi dataset](https://github.com/NirantK/hindi2vec/releases/tag/bbc-hindi-v0.1): BBC Hindi News corpus, in Hindi\\n\",\n    \"\\n\",\n    \"If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. 
You can also choose a dataset from three existing datasets (**`MultiNLI`**, **`DAC`**, and **`BBC Hindi`**) to experiment. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Running Time\\n\",\n    \"\\n\",\n    \"The table below provides some reference running time on different datasets.  \\n\",\n    \"\\n\",\n    \"|Dataset|QUICK_RUN|Machine Configurations|Running time|\\n\",\n    \"|:------|:---------|:----------------------|:------------|\\n\",\n    \"|MultiNLI|True|2 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 8 minutes |\\n\",\n    \"|MultiNLI|False|2 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 5.7 hours |\\n\",\n    \"|DAC|True|2 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 13 minutes |\\n\",\n    \"|DAC|False|2 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 5.6 hours |\\n\",\n    \"|BBC Hindi|True|2 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 1 minute |\\n\",\n    \"|BBC Hindi|False|2 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 14 minutes |\\n\",\n    \"\\n\",\n    \"If you run into CUDA out-of-memory error or the jupyter kernel dies constantly, try reducing the `batch_size` and `max_len` in `CONFIG`, but note that model performance may be compromised. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\\n\",\n    \"QUICK_RUN = True\\n\",\n    \"\\n\",\n    \"# the dataset you want to try, valid values are: \\\"multinli\\\", \\\"dac\\\", \\\"bbc-hindi\\\"\\n\",\n    \"USE_DATASET = \\\"dac\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Several pretrained models have been made available by [Hugging Face](https://github.com/huggingface/transformers). 
For text classification, the following pretrained models are supported.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>model_name</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>bert-base-uncased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>bert-large-uncased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>bert-base-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>bert-large-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>bert-base-multilingual-uncased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>5</th>\\n\",\n       \"      <td>bert-base-multilingual-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>6</th>\\n\",\n       \"      <td>bert-base-chinese</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      
<th>7</th>\\n\",\n       \"      <td>bert-base-german-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>8</th>\\n\",\n       \"      <td>bert-large-uncased-whole-word-masking</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>9</th>\\n\",\n       \"      <td>bert-large-cased-whole-word-masking</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>10</th>\\n\",\n       \"      <td>bert-large-uncased-whole-word-masking-finetune...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>11</th>\\n\",\n       \"      <td>bert-large-cased-whole-word-masking-finetuned-...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>12</th>\\n\",\n       \"      <td>bert-base-cased-finetuned-mrpc</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>13</th>\\n\",\n       \"      <td>bert-base-german-dbmdz-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>14</th>\\n\",\n       \"      <td>bert-base-german-dbmdz-uncased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>15</th>\\n\",\n       \"      <td>bert-base-japanese</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>16</th>\\n\",\n       \"      <td>bert-base-japanese-whole-word-masking</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>17</th>\\n\",\n       \"      <td>bert-base-japanese-char</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>18</th>\\n\",\n       \"      <td>bert-base-japanese-char-whole-word-masking</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>19</th>\\n\",\n       \"      <td>bert-base-finnish-cased-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>20</th>\\n\",\n       \"      
<td>bert-base-finnish-uncased-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>21</th>\\n\",\n       \"      <td>roberta-base</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>22</th>\\n\",\n       \"      <td>roberta-large</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>23</th>\\n\",\n       \"      <td>roberta-large-mnli</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>24</th>\\n\",\n       \"      <td>distilroberta-base</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>25</th>\\n\",\n       \"      <td>roberta-base-openai-detector</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>26</th>\\n\",\n       \"      <td>roberta-large-openai-detector</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>27</th>\\n\",\n       \"      <td>xlnet-base-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>28</th>\\n\",\n       \"      <td>xlnet-large-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>29</th>\\n\",\n       \"      <td>distilbert-base-uncased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>30</th>\\n\",\n       \"      <td>distilbert-base-uncased-distilled-squad</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>31</th>\\n\",\n       \"      <td>distilbert-base-german-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>32</th>\\n\",\n       \"      <td>distilbert-base-multilingual-cased</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>33</th>\\n\",\n       \"      <td>albert-base-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>34</th>\\n\",\n       \"      <td>albert-large-v1</td>\\n\",\n       \"    
</tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>35</th>\\n\",\n       \"      <td>albert-xlarge-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>36</th>\\n\",\n       \"      <td>albert-xxlarge-v1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>37</th>\\n\",\n       \"      <td>albert-base-v2</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>38</th>\\n\",\n       \"      <td>albert-large-v2</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>39</th>\\n\",\n       \"      <td>albert-xlarge-v2</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>40</th>\\n\",\n       \"      <td>albert-xxlarge-v2</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"                                           model_name\\n\",\n       \"0                                   bert-base-uncased\\n\",\n       \"1                                  bert-large-uncased\\n\",\n       \"2                                     bert-base-cased\\n\",\n       \"3                                    bert-large-cased\\n\",\n       \"4                      bert-base-multilingual-uncased\\n\",\n       \"5                        bert-base-multilingual-cased\\n\",\n       \"6                                   bert-base-chinese\\n\",\n       \"7                              bert-base-german-cased\\n\",\n       \"8               bert-large-uncased-whole-word-masking\\n\",\n       \"9                 bert-large-cased-whole-word-masking\\n\",\n       \"10  bert-large-uncased-whole-word-masking-finetune...\\n\",\n       \"11  bert-large-cased-whole-word-masking-finetuned-...\\n\",\n       \"12                     bert-base-cased-finetuned-mrpc\\n\",\n       \"13                       bert-base-german-dbmdz-cased\\n\",\n       \"14                     
bert-base-german-dbmdz-uncased\\n\",\n       \"15                                 bert-base-japanese\\n\",\n       \"16              bert-base-japanese-whole-word-masking\\n\",\n       \"17                            bert-base-japanese-char\\n\",\n       \"18         bert-base-japanese-char-whole-word-masking\\n\",\n       \"19                         bert-base-finnish-cased-v1\\n\",\n       \"20                       bert-base-finnish-uncased-v1\\n\",\n       \"21                                       roberta-base\\n\",\n       \"22                                      roberta-large\\n\",\n       \"23                                 roberta-large-mnli\\n\",\n       \"24                                 distilroberta-base\\n\",\n       \"25                       roberta-base-openai-detector\\n\",\n       \"26                      roberta-large-openai-detector\\n\",\n       \"27                                   xlnet-base-cased\\n\",\n       \"28                                  xlnet-large-cased\\n\",\n       \"29                            distilbert-base-uncased\\n\",\n       \"30            distilbert-base-uncased-distilled-squad\\n\",\n       \"31                       distilbert-base-german-cased\\n\",\n       \"32                 distilbert-base-multilingual-cased\\n\",\n       \"33                                     albert-base-v1\\n\",\n       \"34                                    albert-large-v1\\n\",\n       \"35                                   albert-xlarge-v1\\n\",\n       \"36                                  albert-xxlarge-v1\\n\",\n       \"37                                     albert-base-v2\\n\",\n       \"38                                    albert-large-v2\\n\",\n       \"39                                   albert-xlarge-v2\\n\",\n       \"40                                  albert-xxlarge-v2\"\n      ]\n     },\n     \"execution_count\": 3,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": 
[\n    \"pd.DataFrame({\\\"model_name\\\": SequenceClassifier.list_supported_models()})\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"In order to demonstrate the multi-language capability of Transformer models, we only use the model **`distilbert-base-multilingual-cased`** by default in this notebook.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Configuration\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"CONFIG = {\\n\",\n    \"    'local_path': TemporaryDirectory().name,\\n\",\n    \"    'test_fraction': 0.2,\\n\",\n    \"    'random_seed': 100,\\n\",\n    \"    'train_sample_ratio': 1.0,\\n\",\n    \"    'test_sample_ratio': 1.0,\\n\",\n    \"    'model_name': 'distilbert-base-multilingual-cased',\\n\",\n    \"    'to_lower': False,\\n\",\n    \"    'cache_dir': TemporaryDirectory().name,\\n\",\n    \"    'max_len': 150,\\n\",\n    \"    'num_train_epochs': 5,\\n\",\n    \"    'num_gpus': 2,\\n\",\n    \"    'batch_size': 16,\\n\",\n    \"    'verbose': False,\\n\",\n    \"    'load_dataset_func': None,\\n\",\n    \"    'get_labels_func': None\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"if QUICK_RUN:\\n\",\n    \"    CONFIG['train_sample_ratio'] = 0.2\\n\",\n    \"    CONFIG['test_sample_ratio'] = 0.2\\n\",\n    \"    CONFIG['num_train_epochs'] = 1\\n\",\n    \"\\n\",\n    \"torch.manual_seed(CONFIG['random_seed'])\\n\",\n    \"\\n\",\n    \"if torch.cuda.is_available():\\n\",\n    \"    CONFIG['batch_size'] = 32\\n\",\n    \"    \\n\",\n    \"if USE_DATASET == \\\"multinli\\\":\\n\",\n    \"    CONFIG['to_lower'] = True\\n\",\n    \"    CONFIG['load_dataset_func'] = multinli.load_tc_dataset\\n\",\n    \"    CONFIG['get_labels_func'] = multinli.get_label_values\\n\",\n    \"    \\n\",\n    \"    if QUICK_RUN:\\n\",\n    \"        CONFIG['train_sample_ratio'] = 
0.1\\n\",\n    \"        CONFIG['test_sample_ratio'] = 0.1\\n\",\n    \"elif USE_DATASET == \\\"dac\\\":\\n\",\n    \"    CONFIG['load_dataset_func'] = dac.load_tc_dataset\\n\",\n    \"    CONFIG['get_labels_func'] = dac.get_label_values\\n\",\n    \"elif USE_DATASET == \\\"bbc-hindi\\\":\\n\",\n    \"    CONFIG['load_dataset_func'] = bbc_hindi.load_tc_dataset\\n\",\n    \"    CONFIG['get_labels_func'] = bbc_hindi.get_label_values\\n\",\n    \"else:\\n\",\n    \"    raise ValueError(\\\"Supported datasets are: 'multinli', 'dac', and 'bbc-hindi'\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Load Dataset\\n\",\n    \"\\n\",\n    \"By choosing the dataset you want to experiment with, the code snippet below will adaptively select a helper function **`load_tc_dataset`** for the dataset.  The helper function downloads the raw data, splits it into training and testing datasets (also sub-sampling if the sampling ratio is smaller than 1.0), and then processes it for the transformer model. 
Everything is done in one function call, and you can use the processed training and testing PyTorch datasets to fine tune the model and evaluate the performance of the model.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 80.1k/80.1k [00:02<00:00, 30.8kKB/s]\\n\",\n      \"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\\n\",\n      \"  FutureWarning)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"train_dataloader, test_dataloader, label_encoder, test_labels = CONFIG['load_dataset_func'](\\n\",\n    \"    local_path=CONFIG['local_path'],\\n\",\n    \"    test_fraction=CONFIG['test_fraction'],\\n\",\n    \"    random_seed=CONFIG['random_seed'],\\n\",\n    \"    train_sample_ratio=CONFIG['train_sample_ratio'],\\n\",\n    \"    test_sample_ratio=CONFIG['test_sample_ratio'],\\n\",\n    \"    model_name=CONFIG['model_name'],\\n\",\n    \"    to_lower=CONFIG['to_lower'],\\n\",\n    \"    cache_dir=CONFIG['cache_dir'],\\n\",\n    \"    max_len=CONFIG['max_len'],\\n\",\n    \"    num_gpus=CONFIG['num_gpus']\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Fine Tune\\n\",\n    \"\\n\",\n    \"There are two steps to fine tune a transformer model for text classification: 1) instantiate a `SequenceClassifier` class which is a wrapper of the transformer model, and 2) fit the model using the preprocessed training dataset. 
The member method `fit` of `SequenceClassifier` class is used to fine tune the model.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\\n\",\n      \"  warnings.warn('Was asked to gather along dimension 0, but all '\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Training time : 0.190 hrs\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"model = SequenceClassifier(\\n\",\n    \"    model_name=CONFIG['model_name'],\\n\",\n    \"    num_labels=len(label_encoder.classes_),\\n\",\n    \"    cache_dir=CONFIG['cache_dir']\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"# Fine tune the model using the training dataset\\n\",\n    \"with Timer() as t:\\n\",\n    \"    model.fit(\\n\",\n    \"        train_dataloader=train_dataloader,\\n\",\n    \"        num_epochs=CONFIG['num_train_epochs'],\\n\",\n    \"        num_gpus=CONFIG['num_gpus'],\\n\",\n    \"        verbose=CONFIG['verbose'],\\n\",\n    \"        seed=CONFIG['random_seed']\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"print(\\\"Training time : {:.3f} hrs\\\".format(t.interval / 3600))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Evaluate on Testing Dataset\\n\",\n    \"\\n\",\n    \"The `predict` method of the `SequenceClassifier` returns a Numpy ndarray of raw predictions. Each predicting value is a label ID, and if you want to get the label values you will need to call function `get_label_values` from the dataset package. 
An instance of sklearn `LabelEncoder` is returned when loading the dataset and can be used to get the mapping between label ID and label value.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Prediction time : 0.021 hrs\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"with Timer() as t:\\n\",\n    \"    preds = model.predict(\\n\",\n    \"        test_dataloader=test_dataloader,\\n\",\n    \"        num_gpus=CONFIG['num_gpus'],\\n\",\n    \"        verbose=CONFIG['verbose']\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"print(\\\"Prediction time : {:.3f} hrs\\\".format(t.interval / 3600))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Finally, we compute the precision, recall, and F1 metrics of the evaluation on the test set.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"              precision    recall  f1-score   support\\n\",\n      \"\\n\",\n      \"     culture       0.93      0.94      0.93       548\\n\",\n      \"     diverse       0.94      0.94      0.94       640\\n\",\n      \"     economy       0.90      0.88      0.89       570\\n\",\n      \"    politics       0.87      0.88      0.88       809\\n\",\n      \"      sports       0.99      0.98      0.99      1785\\n\",\n      \"\\n\",\n      \"   micro avg       0.94      0.94      0.94      4352\\n\",\n      \"   macro avg       0.93      0.93      0.93      4352\\n\",\n      \"weighted avg       0.94      0.94      0.94      4352\\n\",\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"report = classification_report(\\n\",\n    \"    test_labels, \\n\",\n    \"    preds,\\n\",\n    \"    digits=2,\\n\",\n   
 \"    labels=np.unique(test_labels),\\n\",\n    \"    target_names=label_encoder.classes_\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"print(report)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": 0.94,\n       \"encoder\": \"json\",\n       \"name\": \"precision\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"precision\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": 0.94,\n       \"encoder\": \"json\",\n       \"name\": \"recall\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"recall\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"application/scrapbook.scrap.json+json\": {\n       \"data\": 0.94,\n       \"encoder\": \"json\",\n       \"name\": \"f1\",\n       \"version\": 1\n      }\n     },\n     \"metadata\": {\n      \"scrapbook\": {\n       \"data\": true,\n       \"display\": false,\n       \"name\": \"f1\"\n      }\n     },\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"# for testing\\n\",\n    \"report_splits = report.split('\\\\n')[-2].split()\\n\",\n    \"\\n\",\n    \"sb.glue(\\\"precision\\\", float(report_splits[2]))\\n\",\n    \"sb.glue(\\\"recall\\\", float(report_splits[3]))\\n\",\n    \"sb.glue(\\\"f1\\\", float(report_splits[4]))\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3.6.8 64-bit ('nlp_gpu': conda)\",\n   \"language\": \"python\",\n   \"name\": \"python36864bitnlpgpucondaa579511bcea84c65877ff3dca4205921\"\n  },\n  
\"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/text_summarization/abstractive_summarization_bertsum_cnndm_distributed_train.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport argparse\nimport os\nimport sys\nimport time\nimport torch\nimport torch.distributed as dist\nimport torch.multiprocessing as mp\n\n# torch.set_printoptions(threshold=5000)\n\nnlp_path = os.path.abspath(\"../../\")\nif nlp_path not in sys.path:\n    sys.path.insert(0, nlp_path)\n\nsys.path.insert(0, \"./\")\n\nfrom utils_nlp.models.transformers.abstractive_summarization_bertsum import (\n    BertSumAbs,\n    BertSumAbsProcessor,\n    validate,\n)\nfrom utils_nlp.dataset.cnndm import CNNDMSummarizationDataset\n\nos.environ[\"NCCL_IB_DISABLE\"] = \"0\"\n# os.environ[\"NCCL_DEBUG\"] = \"INFO\"\nos.environ[\"NCCL_DEBUG_SUBSYS\"] = \"ALL\"\n# os.environ[\"MASTER_PORT\"] = \"29952\"\n# os.environ[\"MASTER_ADDR\"] = \"172.12.0.6\"\n# os.environ['NCCL_SOCKET_IFNAME'] = 'lo'\n\n\nparser = argparse.ArgumentParser()\nparser.add_argument(\n    \"--rank\", type=int, default=0, help=\"The rank of the current node in the cluster\"\n)\nparser.add_argument(\n    \"--dist_url\",\n    type=str,\n    default=\"tcp://127.0.0.1:29507\",\n    help=\"URL specifying how to initialize the process groupi.\",\n)\nparser.add_argument(\n    \"--node_count\", type=int, default=1, help=\"Number of nodes in the cluster.\"\n)\n\nparser.add_argument(\n    \"--cache_dir\",\n    type=str,\n    default=\"./abstemp\",\n    help=\"Directory to cache the tokenizer.\",\n)\nparser.add_argument(\n    \"--data_dir\",\n    type=str,\n    default=\"./abstemp\",\n    help=\"Directory to download the preprocessed data.\",\n)\nparser.add_argument(\n    \"--output_dir\",\n    type=str,\n    default=\"./abstemp\",\n    help=\"Directory to save the output model and prediction results.\",\n)\nparser.add_argument(\n    \"--quick_run\",\n    type=str.lower,\n    default=\"false\",\n    choices=[\"true\", \"false\"],\n    help=\"Whether to have a quick run\",\n)\nparser.add_argument(\n    
\"--model_name\",\n    type=str,\n    default=\"bert-base-uncased\",\n    help='Transformer model used in the summarization model, only \\\n                        \"bert-uncased\" is supported so far.',\n)\nparser.add_argument(\n    \"--lr_bert\", type=float, default=2e-3, help=\"Learning rate for the BERT encoder.\"\n)\nparser.add_argument(\n    \"--lr_dec\", type=float, default=2e-1, help=\"Learning rate for the decoder.\"\n)\nparser.add_argument(\n    \"--batch_size\",\n    type=int,\n    default=5,\n    help=\"batch size in terms of input token numbers in training\",\n)\nparser.add_argument(\n    \"--max_pos_length\",\n    type=int,\n    default=512,\n    help=\"maximum input length in terms of input token numbers in training\",\n)\nparser.add_argument(\n    \"--max_steps\",\n    type=int,\n    default=5e4,\n    help=\"\"\"Maximum number of training steps run in training.\n        If quick_run is set, it's not used.\"\"\",\n)\nparser.add_argument(\n    \"--warmup_steps_bert\",\n    type=int,\n    default=2e4,\n    help=\"Warm-up number of training steps run in training for the encoder. \\\n        If quick_run is set, it's not used.\",\n)\nparser.add_argument(\n    \"--warmup_steps_dec\",\n    type=int,\n    default=1e4,\n    help=\"Warm-up number of training steps run in training for the decoder. \\\n        If quick_run is set, it's not used.\",\n)\nparser.add_argument(\n    \"--summary_filename\",\n    type=str,\n    default=\"generated_summaries.txt\",\n    help=\"Summary file name generated by prediction for evaluation.\",\n)\nparser.add_argument(\n    \"--model_filename\",\n    type=str,\n    default=\"dist_abssum_model.pt\",\n    help=\"model file name saved for evaluation.\",\n)\nparser.add_argument(\n    \"--checkpoint_filename\",\n    type=str,\n    default=None,\n    help=\"filename of a checkpoint where the trainging resumes from. 
\\\n                            default path is at cache_dir\",\n)\nparser.add_argument(\n    \"--report_every\",\n    type=int,\n    default=10,\n    help=\"number of steps between each loss report\",\n)\nparser.add_argument(\n    \"--save_every\",\n    type=int,\n    default=500,\n    help=\"number of steps between each model save and validation\",\n)\nparser.add_argument(\n    \"--fp16\",\n    type=str.lower,\n    default=\"false\",\n    choices=[\"true\", \"false\"],\n    help=\"Whether to use mixed precision training\",\n)\nparser.add_argument(\n    \"--fp16_opt_level\",\n    type=str.upper,\n    default=\"O2\",\n    choices=[\"O0\", \"O1\", \"O2\", \"O3\"],\n    help=\"optimization level, refer to \\\n         https://nvidia.github.io/apex/amp.html#opt-levels for details \",\n)\n\n\ndef main():\n\n    args = parser.parse_args()\n\n    print(\"NCCL_IB_DISABLE: {}\".format(os.getenv(\"NCCL_IB_DISABLE\")))\n    print(\"quick_run is {}\".format(args.quick_run))\n    print(\"output_dir is {}\".format(args.output_dir))\n    print(\"data_dir is {}\".format(args.data_dir))\n    print(\"cache_dir is {}\".format(args.cache_dir))\n\n    TOP_N = -1\n    if args.quick_run.lower() == \"false\":\n        TOP_N = 10\n    train_dataset, test_dataset = CNNDMSummarizationDataset(\n        top_n=TOP_N, local_cache_path=args.data_dir, prepare_extractive=False\n    )\n\n    ngpus_per_node = torch.cuda.device_count()\n    processor = BertSumAbsProcessor(\n        cache_dir=args.cache_dir, max_src_len=args.max_pos_length\n    )\n    summarizer = BertSumAbs(\n        processor, cache_dir=args.cache_dir, max_pos_length=args.max_pos_length\n    )\n    mp.spawn(\n        main_worker,\n        nprocs=ngpus_per_node,\n        args=(ngpus_per_node, summarizer, train_dataset, test_dataset, args),\n    )\n\n\ndef main_worker(\n    local_rank, ngpus_per_node, summarizer, train_dataset, test_dataset, args\n):\n    rank = args.rank * ngpus_per_node + local_rank\n    world_size = args.node_count 
* ngpus_per_node\n    print(\"world_size is {}\".format(world_size))\n    print(\"local_rank is {} and rank is {}\".format(local_rank, rank))\n\n    torch.distributed.init_process_group(\n        backend=\"nccl\", init_method=args.dist_url, world_size=world_size, rank=rank,\n    )\n\n    # return\n    ## should not load checkpoint from this place, otherwise, huge memory increase\n    if args.checkpoint_filename:\n        checkpoint = os.path.join(args.cache_dir, args.checkpoint_filename)\n    else:\n        checkpoint = None\n\n    # train_sum_dataset, test_sum_dataset = load_processed_cnndm_abs(args.data_dir)\n    def this_validate(class_obj):\n        return validate(class_obj, test_dataset)\n\n    if rank not in [-1, 0]:\n        save_every = -1\n        this_validate = None\n    else:\n        save_every = args.save_every\n\n    fp16 = args.fp16.lower() == \"true\"\n    print(\"fp16 is {}\".format(fp16))\n    # total number of steps for training\n    MAX_STEPS = 10\n    SAVE_EVERY = 10\n    REPORT_EVERY = 10\n    # number of steps for warm up\n    WARMUP_STEPS_BERT = MAX_STEPS\n    WARMUP_STEPS_DEC = MAX_STEPS\n    if args.quick_run.lower() == \"false\":\n        MAX_STEPS = args.max_steps\n        WARMUP_STEPS_BERT = args.warmup_steps_bert\n        WARMUP_STEPS_DEC = args.warmup_steps_dec\n        SAVE_EVERY = save_every\n        REPORT_EVERY = args.report_every\n\n    print(\"max steps is {}\".format(MAX_STEPS))\n    print(\"warmup steps for encoder bert is {}\".format(WARMUP_STEPS_BERT))\n    print(\"warmup steps for decoder is {}\".format(WARMUP_STEPS_DEC))\n    start = time.time()\n\n    # summarizer.model.load_checkpoint(checkpoint['model'])\n    summarizer.fit(\n        train_dataset,\n        world_size=world_size,\n        num_gpus=None,\n        local_rank=local_rank,\n        rank=rank,\n        batch_size=args.batch_size,\n        max_steps=MAX_STEPS / world_size,\n        learning_rate_bert=args.lr_bert,\n        learning_rate_dec=args.lr_dec,\n    
    warmup_steps_bert=WARMUP_STEPS_BERT,\n        warmup_steps_dec=WARMUP_STEPS_DEC,\n        save_every=SAVE_EVERY,\n        report_every=REPORT_EVERY,\n        validation_function=this_validate,\n        fp16=fp16,\n        fp16_opt_level=args.fp16_opt_level,\n        checkpoint=checkpoint,\n    )\n\n    end = time.time()\n    print(\"rank {0}, duration {1:.6f}s\".format(rank, end - start))\n    if local_rank in [0, -1] and args.rank == 0:\n        TOP_N = -1\n        if args.quick_run.lower() == \"false\":\n            TOP_N = ngpus_per_node\n        saved_model_path = os.path.join(\n            args.output_dir, \"{}_step{}\".format(args.model_filename, MAX_STEPS)\n        )\n        summarizer.save_model(MAX_STEPS, saved_model_path)\n        prediction = summarizer.predict(\n            test_dataset.shorten(top_n=TOP_N), batch_size=ngpus_per_node, num_gpus=ngpus_per_node\n        )\n        print(prediction[0])\n\n        def _write_list_to_file(list_items, filename):\n            with open(filename, \"w\") as filehandle:\n                # for cnt, line in enumerate(filehandle):\n                for item in list_items:\n                    filehandle.write(\"%s\\n\" % item)\n\n        print(\"writing generated summaries\")\n        _write_list_to_file(\n            prediction, os.path.join(args.output_dir, args.summary_filename)\n        )\n\n    # only use the following line when you use your own cluster.\n    # AML distributed training run cleanup for you.\n    dist.destroy_process_group()\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/text_summarization/abstractive_summarization_bertsumabs_cnndm.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Copyright (c) Microsoft Corporation.\\n\",\n    \"Licensed under the MIT License.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Abstractive Summarization using BertSumAbs on CNN/DailyMail Dataset\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Summary\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This notebook demonstrates how to fine tune BERT for abstractive text summarization. Utility functions and classes in the NLP Best Practices repo are used to facilitate data preprocessing, model training, model scoring, result postprocessing, and model evaluation.\\n\",\n    \"\\n\",\n    \"### Abstractive Summarization\\n\",\n    \"Abstractive summarization is the task of taking an input text and summarizing its content in a shorter output text. In contrast to extractive summarization, abstractive summarization doesn't take sentences directly from the input text; instead, it rephrases the input text.\\n\",\n    \"\\n\",\n    \"### BertSumAbs\\n\",\n    \"\\n\",\n    \"BertSumAbs refers to a BERT-based abstractive summarization algorithm in [Text Summarization with Pretrained Encoders](https://arxiv.org/abs/1908.08345) with [published examples](https://github.com/nlpyang/PreSumm). It uses the pretrained BERT model as the encoder and finetunes both the encoder and decoder on a specific labeled summarization dataset like [CNN/DM dataset](https://github.com/harvardnlp/sent-summary). \\n\",\n    \"\\n\",\n    \"The figure below shows the comparison of architecture of the original BERT model (left) and BERTSUM (right), which BertSumAbs is built upon. For BERTSUM, an input document is split into sentences, and [CLS] and [SEP] tokens are inserted before and after each sentence. 
This resulting sequence is followed by the summation of three kinds of embeddings for each token before feeding into the transformer layers. The positional embedding used in BertSumAbs enables input length of more than 512, which is the maximum input length for BERT model. \\n\",\n    \"\\n\",\n    \"It should be noted that the architecture only shows the encoder part. For decoder, BertSumAbs also uses a transformer with multiple layers and random initialization. As pretrained weights are used in the encoder, there is a mismatch in encoder and decoder which may result in unstable finetuning. Therefore, in fine tuning, BertSumAbs uses separate optimizers for encoder and decoder, each with its own scheduling. In text generation, techniques like trigram blocking and beam search can be used to improve model accuracy.\\n\",\n    \"<img src=\\\"https://nlpbp.blob.core.windows.net/images/BertForSummarization.PNG\\\">\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Before you start\\n\",\n    \"\\n\",\n    \"It's recommended to run this notebook on GPU machines as it's very computationally intensive. Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of steps. If QUICK_RUN = False, the notebook takes about 5 hours to run on a VM with 4 16GB NVIDIA V100 GPUs. Finetuning costs around 1.5 hours and inferencing costs around 3.5 hours.  Better performance can be achieved by increasing the MAX_STEPS.\\n\",\n    \"\\n\",\n    \"* **ROUGE Evaluation**: To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](./summarization_evaluation.ipynb) for setup.\\n\",\n    \"\\n\",\n    \"* **Distributed Training**:\\n\",\n    \"Please note that the jupyter notebook only allows to use pytorch [DataParallel](https://pytorch.org/docs/master/nn.html#dataparallel). 
Faster speed and larger batch size can be achieved with pytorch [DistributedDataParallel](https://pytorch.org/docs/master/notes/ddp.html) (DDP). Script [abstractive_summarization_bertsum_cnndm_distributed_train.py](./abstractive_summarization_bertsum_cnndm_distributed_train.py) shows an example of how to use DDP.\\n\",\n    \"\\n\",\n    \"* **Mixed Precision Training**:\\n\",\n    \"Please note that by default this notebook doesn't use mixed precision training. Faster speed and larger batch size can be achieved when you set FP16 to True. Refer to https://nvidia.github.io/apex and https://github.com/nvidia/apex for details on how to use mixed precision training. Check the GPU model on your machine to see if it allows mixed precision training. Please also note that mixed precision inferencing is also enabled in the prediction utility function. When you use mixed precision training and/or inferencing, the model performance can be slightly worse than the full precision mode.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"%load_ext autoreload\\n\",\n    \"%autoreload 2\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"QUICK_RUN = True\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"import shutil\\n\",\n    \"import sys\\n\",\n    \"from tempfile import TemporaryDirectory\\n\",\n    \"import torch\\n\",\n    \"\\n\",\n    \"nlp_path = os.path.abspath(\\\"../../\\\")\\n\",\n    \"if nlp_path not in sys.path:\\n\",\n    \"    sys.path.insert(0, nlp_path)\\n\",\n    \"\\n\",\n    \"from utils_nlp.models.transformers.abstractive_summarization_bertsum import (\\n\",\n    \"    BertSumAbs,\\n\",\n    \"    BertSumAbsProcessor,\\n\",\n    \")\\n\",\n    \"\\n\",\n    
\"from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset\\n\",\n    \"from utils_nlp.eval import compute_rouge_python\\n\",\n    \"\\n\",\n    \"from utils_nlp.models.transformers.datasets import SummarizationDataset\\n\",\n    \"import nltk\\n\",\n    \"from nltk import tokenize\\n\",\n    \"\\n\",\n    \"import pandas as pd\\n\",\n    \"import pprint\\n\",\n    \"import scrapbook as sb\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Data Preprocessing\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The dataset we used for this notebook is CNN/DM dataset which contains the documents and accompanying questions from the news articles of CNN and Daily Mail. The highlights in each article are used as summary. The dataset consists of ~289K training examples, ~11K validation examples and ~11K test examples. The length of the news articles is 781 tokens on average and the summaries are of 3.75 sentences and 56 tokens on average.\\n\",\n    \"\\n\",\n    \"The significant part of data preprocessing only involves splitting the input document into sentences.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# the data path used to save the downloaded data file\\n\",\n    \"DATA_PATH = TemporaryDirectory().name\\n\",\n    \"# The number of lines at the head of data file used for preprocessing. 
-1 means all the lines.\\n\",\n    \"TOP_N = 100\\n\",\n    \"if not QUICK_RUN:\\n\",\n    \"    TOP_N = -1\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"train_dataset, test_dataset = CNNDMSummarizationDataset(\\n\",\n    \"    top_n=TOP_N, local_cache_path=DATA_PATH, prepare_extractive=False\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"len(train_dataset)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"len(test_dataset)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Model Finetuning\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# notebook parameters\\n\",\n    \"# the cache path\\n\",\n    \"CACHE_PATH = TemporaryDirectory().name\\n\",\n    \"\\n\",\n    \"# model parameters\\n\",\n    \"MODEL_NAME = \\\"bert-base-uncased\\\"\\n\",\n    \"MAX_POS = 768\\n\",\n    \"MAX_SOURCE_SEQ_LENGTH = 640\\n\",\n    \"MAX_TARGET_SEQ_LENGTH = 140\\n\",\n    \"\\n\",\n    \"# mixed precision setting. 
To enable mixed precision training, follow instructions in SETUP.md.\\n\",\n    \"FP16 = False\\n\",\n    \"if FP16:\\n\",\n    \"    FP16_OPT_LEVEL = \\\"O2\\\"\\n\",\n    \"\\n\",\n    \"# fine-tuning parameters\\n\",\n    \"# batch size, unit is the number of tokens\\n\",\n    \"BATCH_SIZE_PER_GPU = 1\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# GPU used for training\\n\",\n    \"NUM_GPUS = torch.cuda.device_count()\\n\",\n    \"if NUM_GPUS > 0:\\n\",\n    \"    BATCH_SIZE = NUM_GPUS * BATCH_SIZE_PER_GPU\\n\",\n    \"else:\\n\",\n    \"    BATCH_SIZE = 1\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# Learning rate\\n\",\n    \"LEARNING_RATE_BERT = 5e-4 / 2.0\\n\",\n    \"LEARNING_RATE_DEC = 0.05 / 2.0\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# How often the statistics reports show up in training, unit is step.\\n\",\n    \"REPORT_EVERY = 10\\n\",\n    \"SAVE_EVERY = 500\\n\",\n    \"\\n\",\n    \"# total number of steps for training\\n\",\n    \"MAX_STEPS = 1e3\\n\",\n    \"\\n\",\n    \"if not QUICK_RUN:\\n\",\n    \"    MAX_STEPS = 5e3\\n\",\n    \"\\n\",\n    \"WARMUP_STEPS_BERT = 2000\\n\",\n    \"WARMUP_STEPS_DEC = 1000\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# processor which contains the colloate function to load the preprocessed data\\n\",\n    \"processor = BertSumAbsProcessor(cache_dir=CACHE_PATH, max_src_len=MAX_SOURCE_SEQ_LENGTH, max_tgt_len=MAX_TARGET_SEQ_LENGTH)\\n\",\n    \"# summarizer\\n\",\n    \"summarizer = BertSumAbs(\\n\",\n    \"    processor, cache_dir=CACHE_PATH, max_pos_length=MAX_POS\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"BATCH_SIZE_PER_GPU*NUM_GPUS\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": 
[\n    \"summarizer.fit(\\n\",\n    \"    train_dataset,\\n\",\n    \"    num_gpus=NUM_GPUS,\\n\",\n    \"    batch_size=BATCH_SIZE,\\n\",\n    \"    max_steps=MAX_STEPS,\\n\",\n    \"    learning_rate_bert=LEARNING_RATE_BERT,\\n\",\n    \"    learning_rate_dec=LEARNING_RATE_DEC,\\n\",\n    \"    warmup_steps_bert=WARMUP_STEPS_BERT,\\n\",\n    \"    warmup_steps_dec=WARMUP_STEPS_DEC,\\n\",\n    \"    save_every=SAVE_EVERY,\\n\",\n    \"    report_every=REPORT_EVERY * 5,\\n\",\n    \"    fp16=FP16,\\n\",\n    \"    # checkpoint=\\\"saved checkpoint path\\\"\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"summarizer.save_model(MAX_STEPS, os.path.join(CACHE_PATH, \\\"bertsumabs.pt\\\"))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Model Evaluation\\n\",\n    \"\\n\",\n    \"To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](summarization_evaluation.ipynb) for setup.\\n\",\n    \"For the settings in this notebook with QUICK_RUN=False, you should get ROUGE scores close to the following numbers: <br />\\n\",\n    \"``\\n\",\n    \"{'rouge-1': {'f': 0.34819639878321873,\\n\",\n    \"             'p': 0.39977932634737307,\\n\",\n    \"             'r': 0.34429079596863604},\\n\",\n    \" 'rouge-2': {'f': 0.13919271352557894,\\n\",\n    \"             'p': 0.16129965067780644,\\n\",\n    \"             'r': 0.1372938054050938},\\n\",\n    \" 'rouge-l': {'f': 0.2313282318854973,\\n\",\n    \"             'p': 0.26664667422849747,\\n\",\n    \"             'r': 0.22850294283399628}}\\n\",\n    \" ``\\n\",\n    \" \\n\",\n    \" Better performance can be achieved by increasing the MAX_STEPS.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   
\"source\": [\n    \"\\n\",\n    \"# checkpoint = torch.load(os.path.join(CACHE_PATH, \\\"bertsumabs.pt\\\"), map_location=\\\"cpu\\\")\\n\",\n    \"# summarizer = BertSumAbs(\\n\",\n    \"#     processor, cache_dir=CACHE_PATH, max_pos_length=MAX_POS, test=True\\n\",\n    \"# )\\n\",\n    \"# summarizer.model.load_checkpoint(checkpoint['model'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"TEST_TOP_N = 32\\n\",\n    \"if not QUICK_RUN:\\n\",\n    \"    TEST_TOP_N = len(test_dataset)\\n\",\n    \"\\n\",\n    \"if NUM_GPUS:\\n\",\n    \"    BATCH_SIZE = NUM_GPUS * BATCH_SIZE_PER_GPU\\n\",\n    \"else:\\n\",\n    \"    BATCH_SIZE = 1\\n\",\n    \"    \\n\",\n    \"shortened_dataset = test_dataset.shorten(top_n=TEST_TOP_N)\\n\",\n    \"src = shortened_dataset.get_source()\\n\",\n    \"reference_summaries = [\\\" \\\".join(t).rstrip(\\\"\\\\n\\\") for t in shortened_dataset.get_target()]\\n\",\n    \"generated_summaries = summarizer.predict(\\n\",\n    \"    shortened_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS\\n\",\n    \")\\n\",\n    \"assert len(generated_summaries) == len(reference_summaries)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"src[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"generated_summaries[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"reference_summaries[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"rouge_scores = compute_rouge_python(cand=generated_summaries, ref=reference_summaries)\\n\",\n    \"pprint.pprint(rouge_scores)\"\n   ]\n  
},\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# for testing\\n\",\n    \"sb.glue(\\\"rouge_2_f_score\\\", rouge_scores['rouge-2']['f'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Prediction on a single input sample\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"source = \\\"\\\"\\\"\\n\",\n    \"But under the new rule, set to be announced in the next 48 hours, Border Patrol agents would immediately return anyone to Mexico — without any detainment and without any due process — who attempts to cross the southwestern border between the legal ports of entry. The person would not be held for any length of time in an American facility.\\n\",\n    \"\\n\",\n    \"Although they advised that details could change before the announcement, administration officials said the measure was needed to avert what they fear could be a systemwide outbreak of the coronavirus inside detention facilities along the border. Such an outbreak could spread quickly through the immigrant population and could infect large numbers of Border Patrol agents, leaving the southwestern border defenses weakened, the officials argued.\\n\",\n    \"The Trump administration plans to immediately turn back all asylum seekers and other foreigners attempting to enter the United States from Mexico illegally, saying the nation cannot risk allowing the coronavirus to spread through detention facilities and Border Patrol agents, four administration officials said.\\n\",\n    \"The administration officials said the ports of entry would remain open to American citizens, green-card holders and foreigners with proper documentation. Some foreigners would be blocked, including Europeans currently subject to earlier travel restrictions imposed by the administration. 
The points of entry will also be open to commercial traffic.\\\"\\\"\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"test_dataset = SummarizationDataset(\\n\",\n    \"    None, source=[source], source_preprocessing=[tokenize.sent_tokenize],\\n\",\n    \")\\n\",\n    \"generated_summaries = summarizer.predict(test_dataset, batch_size=1, num_gpus=NUM_GPUS)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"generated_summaries[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Clean up temporary folders\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"if os.path.exists(DATA_PATH):\\n\",\n    \"    shutil.rmtree(DATA_PATH, ignore_errors=True)\\n\",\n    \"if os.path.exists(CACHE_PATH):\\n\",\n    \"    shutil.rmtree(CACHE_PATH, ignore_errors=True)\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}"
  },
  {
    "path": "examples/text_summarization/abstractive_summarization_minilm_cnndm.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Copyright (c) Microsoft Corporation.  \\n\",\n    \"Licensed under the MIT License.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Abstractive Summarization using MiniLM on CNN/DailyMails\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Before you start\\n\",\n    \"Set `QUICK_RUN = True` to run the notebook on a small subset of data and a smaller number of steps. If `QUICK_RUN = False`, the notebook takes about 2 hours to run on a VM with 4 16GB NVIDIA V100 GPUs. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"QUICK_RUN = True\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Summary\\n\",\n    \"This notebook demostrates how to fine-tune the [MiniLM](https://arxiv.org/abs/2002.10957) for abstractive summarization task. Utility functions and classes in the microsoft/nlp-recipes repo are used to facilitate data preprocessing, model training, model scoring, result postprocessing, and model evaluation.\\n\",\n    \"\\n\",\n    \"### Abstractive Summarization\\n\",\n    \"Abstractive summarization is the task of taking an input text and summarizing its content in a shorter output text. In contrast to extractive summarization, abstractive summarization doesn't take sentences directly from the input text, instead, rephrases the input text.\\n\",\n    \"\\n\",\n    \"### MiniLM\\n\",\n    \"[Unified Language Model](https://arxiv.org/abs/1905.03197) (UniLM) is a state of the art model developed by Microsoft Research Asia (MSRA). 
The model is first pre-trained on a large unlabeled natural language corpus (English Wikipedia and BookCorpus) and can be fine-tuned on different types of labeled data for various NLP tasks like text classification and abstractive summarization. For more information, please consult the notebook [Abstractive Summarization using UniLM on CNN/DailyMails](./abstractive_summarization_unilm_cnndm.ipynb).\\n\",\n    \"\\n\",\n    \"Large pre-trained language models like BERT and UniLM usually consist of **hundreds** of millions of parameters and it's challenging to fine-tune such large models and also serve real-life applications due to latency and capacity constraints.\\n\",\n    \"\\n\",\n    \"MiniLM is a small version of UniLM, which is trained to deeply mimic UniLM with deep self-attention knowledge distillation. It only consists of **tens** of millions of parameters (33M), which is less than one third of BERT base model and only half of the size of [DistilBERT](https://arxiv.org/abs/1910.01108). Experimental results demonstrate that MiniLM retains most of the performance of UniLM on various NLP tasks with much less computation.  Our experiments show that to achieve the same performance, MiniLM fine-tuning on CNN/DailyMail dataset can be more than **ten times faster** and inferencing can be **six times faster** than UniLM's. 
\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"%load_ext autoreload\\n\",\n    \"%autoreload 2\\n\",\n    \"import os\\n\",\n    \"import shutil\\n\",\n    \"from tempfile import TemporaryDirectory\\n\",\n    \"import pprint\\n\",\n    \"import scrapbook as sb\\n\",\n    \"import sys\\n\",\n    \"import time\\n\",\n    \"import torch\\n\",\n    \"\\n\",\n    \"nlp_path = os.path.abspath(\\\"../../\\\")\\n\",\n    \"if nlp_path not in sys.path:\\n\",\n    \"    sys.path.insert(0, nlp_path)\\n\",\n    \"\\n\",\n    \"from utils_nlp.dataset.cnndm import CNNDMSummarizationDatasetOrg\\n\",\n    \"from utils_nlp.models.transformers.abstractive_summarization_seq2seq import S2SAbsSumProcessor, S2SAbstractiveSummarizer\\n\",\n    \"from utils_nlp.eval import compute_rouge_python\\n\",\n    \"\\n\",\n    \"from utils_nlp.models.transformers.datasets import SummarizationDataset\\n\",\n    \"from utils_nlp.dataset.cnndm import detokenize\\n\",\n    \"\\n\",\n    \"start_time = time.time()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# model parameters\\n\",\n    \"MODEL_NAME = \\\"minilm-l12-h384-uncased\\\" \\n\",\n    \"MAX_SEQ_LENGTH = 512 \\n\",\n    \"MAX_SOURCE_SEQ_LENGTH = 464 \\n\",\n    \"MAX_TARGET_SEQ_LENGTH = MAX_SEQ_LENGTH - MAX_SOURCE_SEQ_LENGTH \\n\",\n    \"\\n\",\n    \"# use 0 for CPU\\n\",\n    \"NUM_GPUS =  torch.cuda.device_count()\\n\",\n    \"\\n\",\n    \"# fine-tuning parameters\\n\",\n    \"TRAIN_PER_GPU_BATCH_SIZE = 4\\n\",\n    \"GRADIENT_ACCUMULATION_STEPS = 1\\n\",\n    \"LEARNING_RATE = 1e-4\\n\",\n    \"\\n\",\n    \"TOP_N = -1\\n\",\n    \"WARMUP_STEPS = 500\\n\",\n    \"MAX_STEPS = 5000\\n\",\n    \"BEAM_SIZE = 5\\n\",\n    \"if QUICK_RUN:\\n\",\n    \"    TOP_N = 
1000\\n\",\n    \"    WARMUP_STEPS = 500\\n\",\n    \"    MAX_STEPS = 1000\\n\",\n    \"    BEAM_SIZE = 3\\n\",\n    \"    if NUM_GPUS == 0:\\n\",\n    \"        TOP_N = 5\\n\",\n    \"        MAX_STEPS = 10\\n\",\n    \"\\n\",\n    \"# inference parameters\\n\",\n    \"TEST_PER_GPU_BATCH_SIZE = 12\\n\",\n    \"FORBID_IGNORE_WORD = \\\".\\\"\\n\",\n    \"\\n\",\n    \"# mixed precision setting. To enable mixed precision training, follow instructions in SETUP.md. \\n\",\n    \"# You will be able to increase the batch sizes with mixed precision training.\\n\",\n    \"FP16 = False\\n\",\n    \"\\n\",\n    \"CLEANUP_RESULTS = False\\n\",\n    \"\\n\",\n    \"DATA_DIR = TemporaryDirectory().name\\n\",\n    \"CACHE_DIR = TemporaryDirectory().name\\n\",\n    \"\\n\",\n    \"MODEL_DIR = \\\"./minilm_cnndm_model\\\"\\n\",\n    \"RESULT_DIR = \\\"./minilm_cnndm_result\\\"\\n\",\n    \"os.makedirs(MODEL_DIR, exist_ok=True)\\n\",\n    \"os.makedirs(RESULT_DIR, exist_ok=True)\\n\",\n    \"OUTPUT_FILE = os.path.join(RESULT_DIR, 'nlp_cnndm_finetuning_results.txt')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Load the CNN/DailyMail dataset\\n\",\n    \"The [CNN/DailyMail dataset](https://cs.nyu.edu/~kcho/DMQA/) was originally introduced for Q&A research. There are multiple versions of the dataset processed for the summarization task available on the web. The `CNNDMSummarizationDatasetOrg` function downloads a version from the [UniLM repo](https://github.com/microsoft/unilm) with minimal processing. 
The function returns the training and testing dataset as `SummarizationDataset` which can be further processed for model training and testing.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"train_ds, test_ds = CNNDMSummarizationDatasetOrg(local_path=DATA_DIR, top_n=TOP_N)\\n\",\n    \"print(len(train_ds))\\n\",\n    \"print(len(test_ds))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Preprocessing\\n\",\n    \"The `S2SAbsSumProcessor` has multiple methods for converting input data in `SummarizationDataset`, `IterableSummarizationDataset` or json files into the format required for model training and testing. The preprocessing steps include\\n\",\n    \"- Tokenize input text\\n\",\n    \"- Convert tokens into token ids\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"processor = S2SAbsSumProcessor(model_name=MODEL_NAME,  cache_dir=CACHE_DIR)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"train_dataset = processor.s2s_dataset_from_sum_ds(train_ds, train_mode=True)\\n\",\n    \"test_dataset = processor.s2s_dataset_from_sum_ds(test_ds, train_mode=False)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# example code to load preprocessed xsum dataset from UniLM Repo\\n\",\n    \"# train_dataset = processor.s2s_dataset_from_json_or_file(\\\"/dadendev/unilm/data/xsum.train.uncased_tokenized.json\\\", train_mode=True, top_n=TOP_N)\\n\",\n    \"# test_dataset = processor.s2s_dataset_from_json_or_file(\\\"/dadendev/unilm/data/xsum.test.uncased_tokenized.json\\\", train_mode=False, top_n=TOP_N)\"\n   ]\n  
},\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Fine tune model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The `S2SAbstractiveSummarizer` loads a pre-trained UniLM model specified by `model_name`.  \\n\",\n    \"Call `S2SAbstractiveSummarizer.list_supported_models()` to see all the supported models.  \\n\",\n    \"If you want to use a model on the local disk, specify `load_model_from_dir` and `model_file_name`. This is particularly useful if you want to load a previously fine-tuned model and use it for inference directly without fine-tuning. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"S2SAbstractiveSummarizer.list_supported_models()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"abs_summarizer = S2SAbstractiveSummarizer(\\n\",\n    \"    model_name=MODEL_NAME,\\n\",\n    \"    max_seq_length=MAX_SEQ_LENGTH,\\n\",\n    \"    max_source_seq_length=MAX_SOURCE_SEQ_LENGTH,\\n\",\n    \"    max_target_seq_length=MAX_TARGET_SEQ_LENGTH,\\n\",\n    \"    cache_dir=CACHE_DIR\\n\",\n    \")\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"abs_summarizer.model\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# example code to load the model from a saved checkpoint\\n\",\n    \"\\\"\\\"\\\"\\n\",\n    \"abs_summarizer = S2SAbstractiveSummarizer(\\n\",\n    \"     model_name=MODEL_NAME,\\n\",\n    \"     max_seq_length=MAX_SEQ_LENGTH,\\n\",\n    \"     max_source_seq_length=MAX_SOURCE_SEQ_LENGTH,\\n\",\n    \"    
max_target_seq_length=MAX_TARGET_SEQ_LENGTH,\\n\",\n    \"     load_model_from_dir=RESULT_DIR,\\n\",\n    \"    model_file_name=\\\"model.5000.bin\\\",\\n\",\n    \" )\\n\",\n    \"\\\"\\\"\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"%%time\\n\",\n    \"abs_summarizer.fit(\\n\",\n    \"    train_dataset=train_dataset,\\n\",\n    \"    num_gpus=NUM_GPUS,\\n\",\n    \"    per_gpu_batch_size=TRAIN_PER_GPU_BATCH_SIZE,\\n\",\n    \"    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\\n\",\n    \"    learning_rate=LEARNING_RATE,\\n\",\n    \"    warmup_steps=WARMUP_STEPS,\\n\",\n    \"    max_steps=MAX_STEPS,\\n\",\n    \"    fp16=FP16,\\n\",\n    \"    save_model_to_dir=MODEL_DIR\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# save the finetuned model\\n\",\n    \"# abs_summarizer.save_model(RESULT_DIR, 5000, False)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Generate summaries on testing dataset\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"predictions = abs_summarizer.predict(\\n\",\n    \"    test_dataset=test_dataset,\\n\",\n    \"    num_gpus=NUM_GPUS,\\n\",\n    \"    per_gpu_batch_size=TEST_PER_GPU_BATCH_SIZE,\\n\",\n    \"    beam_size=BEAM_SIZE,\\n\",\n    \"    max_tgt_length=MAX_TARGET_SEQ_LENGTH,\\n\",\n    \"    forbid_ignore_word=FORBID_IGNORE_WORD,\\n\",\n    \"    fp16=FP16\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"for r in predictions[:5]:\\n\",\n    \"    print(r)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   
\"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"test_ds.get_source()[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"test_ds.get_target()[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"predictions[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"with open(OUTPUT_FILE, 'w', encoding=\\\"utf-8\\\") as f:\\n\",\n    \"    for line in predictions:\\n\",\n    \"        f.write(line + '\\\\n')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Prediction on a single input sample\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"source = \\\"\\\"\\\"\\n\",\n    \"But under the new rule, set to be announced in the next 48 hours, Border Patrol agents would immediately return anyone to Mexico — without any detainment and without any due process — who attempts to cross the southwestern border between the legal ports of entry. The person would not be held for any length of time in an American facility.\\n\",\n    \"\\n\",\n    \"Although they advised that details could change before the announcement, administration officials said the measure was needed to avert what they fear could be a systemwide outbreak of the coronavirus inside detention facilities along the border. 
Such an outbreak could spread quickly through the immigrant population and could infect large numbers of Border Patrol agents, leaving the southwestern border defenses weakened, the officials argued.\\n\",\n    \"The Trump administration plans to immediately turn back all asylum seekers and other foreigners attempting to enter the United States from Mexico illegally, saying the nation cannot risk allowing the coronavirus to spread through detention facilities and Border Patrol agents, four administration officials said.\\n\",\n    \"The administration officials said the ports of entry would remain open to American citizens, green-card holders and foreigners with proper documentation. Some foreigners would be blocked, including Europeans currently subject to earlier travel restrictions imposed by the administration. The points of entry will also be open to commercial traffic.\\\"\\\"\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"singel_test_ds = SummarizationDataset(\\n\",\n    \"    None, source=[source], source_preprocessing=[detokenize],\\n\",\n    \")\\n\",\n    \"single_test_dataset = processor.s2s_dataset_from_sum_ds(singel_test_ds, train_mode=False)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"single_prediction = abs_summarizer.predict(\\n\",\n    \"    test_dataset=single_test_dataset,\\n\",\n    \"    num_gpus=NUM_GPUS,\\n\",\n    \"    per_gpu_batch_size=1,\\n\",\n    \"    beam_size=BEAM_SIZE,\\n\",\n    \"    forbid_ignore_word=FORBID_IGNORE_WORD,\\n\",\n    \"    fp16=FP16\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"single_prediction[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    
\"## Evaluation\\n\",\n    \"We provide utility functions for evaluating summarization models and details can be found in the [summarization evaluation notebook](./summarization_evaluation.ipynb).  \\n\",\n    \"For the settings in this notebook with QUICK_RUN=False, you should get ROUGE scores close to the following numbers: <br />\\n\",\n    \"```\\n\",\n    \"{'rouge-1': {'f': 0.36208534811461,\\n\",\n    \"             'p': 0.4743143496862804,\\n\",\n    \"             'r': 0.30901813498597874},\\n\",\n    \" 'rouge-2': {'f': 0.1620935174111968,\\n\",\n    \"             'p': 0.2153396681546399,\\n\",\n    \"             'r': 0.13747476622638555},\\n\",\n    \" 'rouge-l': {'f': 0.2612394493528272,\\n\",\n    \"             'p': 0.3426511372716949,\\n\",\n    \"             'r': 0.22311445054693663}}\\n\",\n    \"```\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"rouge_scores = compute_rouge_python(cand=predictions, ref=test_ds.get_target())\\n\",\n    \"pprint.pprint(rouge_scores)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# for testing\\n\",\n    \"sb.glue(\\\"rouge_1_f_score\\\", rouge_scores[\\\"rouge-1\\\"][\\\"f\\\"])\\n\",\n    \"sb.glue(\\\"rouge_2_f_score\\\", rouge_scores[\\\"rouge-2\\\"][\\\"f\\\"])\\n\",\n    \"sb.glue(\\\"rouge_l_f_score\\\", rouge_scores[\\\"rouge-l\\\"][\\\"f\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Distributed training with DistributedDataParallel (DDP)\\n\",\n    \"Please consult the notebook [Abstractive Summarization using UniLM on CNN/DailyMails](./abstractive_summarization_unilm_cnndm.ipynb) for distributed training.    
\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Clean up \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"if os.path.exists(DATA_DIR):\\n\",\n    \"    shutil.rmtree(DATA_DIR, ignore_errors=True)\\n\",\n    \"if os.path.exists(CACHE_DIR):\\n\",\n    \"    shutil.rmtree(CACHE_DIR, ignore_errors=True)\\n\",\n    \"    \\n\",\n    \"if CLEANUP_RESULTS:\\n\",\n    \"    if os.path.exists(MODEL_DIR):\\n\",\n    \"        shutil.rmtree(MODEL_DIR, ignore_errors=True)\\n\",\n    \"    if os.path.exists(RESULT_DIR):\\n\",\n    \"        shutil.rmtree(RESULT_DIR, ignore_errors=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\\"Total notebook running time {}\\\".format(time.time() - start_time))\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/text_summarization/abstractive_summarization_unilm_cnndm.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Copyright (c) Microsoft Corporation.  \\n\",\n    \"Licensed under the MIT License.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Abstractive Summarization using UniLM on CNN/DailyMails\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Before you start\\n\",\n    \"Set `QUICK_RUN = True` to run the notebook on a small subset of data and a smaller number of steps. If `QUICK_RUN = False`, the notebook takes about 9 hours to run on a VM with 4 16GB NVIDIA V100 GPUs. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"QUICK_RUN = True\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Summary\\n\",\n    \"This notebook demostrates how to fine-tune the [Unified Language Model](https://arxiv.org/abs/1905.03197) (UniLM) for abstractive summarization task. Utility functions and classes in the microsoft/nlp-recipes repo are used to facilitate data preprocessing, model training, model scoring, result postprocessing, and model evaluation.\\n\",\n    \"\\n\",\n    \"### Abstractive Summarization\\n\",\n    \"Abstractive summarization is the task of taking an input text and summarizing its content in a shorter output text. In contrast to extractive summarization, abstractive summarization doesn't take sentences directly from the input text, instead, rephrases the input text.\\n\",\n    \"\\n\",\n    \"### UniLM\\n\",\n    \"UniLM is a state of the art model developed by Microsoft Research Asia (MSRA). 
The model is first pre-trained on a large unlabeled natural language corpus (English Wikipedia and BookCorpus) and can be fine-tuned on different types of labeled data for various NLP tasks like text classification and abstractive summarization.   \\n\",\n    \"The figure below shows the UniLM architecture. During pre-training, the model parameters are shared across the LM objectives (i.e., bidirectional LM, unidirectional LM, and sequence-to-sequence LM). For different NLP tasks, UniLM uses different self-attention masks to control the access to context for each word token.  \\n\",\n    \"The seq-to-seq LM in the third row in the figure is used in the summarization task. In seq-to-seq LM, word tokens in the input sequence can access all the other tokens in the input sequence, but cannot access the word tokens in the output sequence. Word tokens in the output sequence can access all the tokens in the input sequence and the tokens in the output sequence generated before the current position. 
\\n\",\n    \"<img src=\\\"https://nlpbp.blob.core.windows.net/images/unilm_architecture.PNG\\\" width=\\\"600\\\" height=\\\"600\\\">\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"%load_ext autoreload\\n\",\n    \"%autoreload 2\\n\",\n    \"import os\\n\",\n    \"import shutil\\n\",\n    \"from tempfile import TemporaryDirectory\\n\",\n    \"import pprint\\n\",\n    \"import scrapbook as sb\\n\",\n    \"import sys\\n\",\n    \"import time\\n\",\n    \"import torch\\n\",\n    \"\\n\",\n    \"nlp_path = os.path.abspath(\\\"../../\\\")\\n\",\n    \"if nlp_path not in sys.path:\\n\",\n    \"    sys.path.insert(0, nlp_path)\\n\",\n    \"\\n\",\n    \"from utils_nlp.dataset.cnndm import CNNDMSummarizationDatasetOrg\\n\",\n    \"from utils_nlp.models.transformers.abstractive_summarization_seq2seq import S2SAbsSumProcessor, S2SAbstractiveSummarizer\\n\",\n    \"from utils_nlp.eval import compute_rouge_python\\n\",\n    \"\\n\",\n    \"from utils_nlp.models.transformers.datasets import SummarizationDataset\\n\",\n    \"from utils_nlp.dataset.cnndm import detokenize\\n\",\n    \"\\n\",\n    \"start_time = time.time()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# model parameters\\n\",\n    \"MODEL_NAME = \\\"unilm-base-cased\\\"\\n\",\n    \"MAX_SEQ_LENGTH = 768\\n\",\n    \"MAX_SOURCE_SEQ_LENGTH = 640\\n\",\n    \"MAX_TARGET_SEQ_LENGTH = 128\\n\",\n    \"\\n\",\n    \"# use 0 for CPU\\n\",\n    \"NUM_GPUS =  torch.cuda.device_count()\\n\",\n    \"\\n\",\n    \"# fine-tuning parameters\\n\",\n    \"TRAIN_PER_GPU_BATCH_SIZE = 1\\n\",\n    \"GRADIENT_ACCUMULATION_STEPS = 2\\n\",\n    \"LEARNING_RATE = 3e-5\\n\",\n    \"\\n\",\n    \"TOP_N = -1\\n\",\n    \"WARMUP_STEPS = 500\\n\",\n    \"MAX_STEPS = 5000\\n\",\n    
\"BEAM_SIZE = 5\\n\",\n    \"if QUICK_RUN:\\n\",\n    \"    TOP_N = 100\\n\",\n    \"    WARMUP_STEPS = 5\\n\",\n    \"    MAX_STEPS = 50\\n\",\n    \"    BEAM_SIZE = 3\\n\",\n    \"    if NUM_GPUS == 0:\\n\",\n    \"        TOP_N = 5\\n\",\n    \"        MAX_STEPS = 10\\n\",\n    \"\\n\",\n    \"# inference parameters\\n\",\n    \"TEST_PER_GPU_BATCH_SIZE = 12\\n\",\n    \"FORBID_IGNORE_WORD = \\\".\\\"\\n\",\n    \"\\n\",\n    \"# mixed precision setting. To enable mixed precision training, follow instructions in SETUP.md. \\n\",\n    \"# You will be able to increase the batch sizes with mixed precision training.\\n\",\n    \"FP16 = False\\n\",\n    \"\\n\",\n    \"DATA_DIR = TemporaryDirectory().name\\n\",\n    \"CACHE_DIR = TemporaryDirectory().name\\n\",\n    \"MODEL_DIR = \\\".\\\"\\n\",\n    \"RESULT_DIR = \\\".\\\"\\n\",\n    \"OUTPUT_FILE = os.path.join(RESULT_DIR, 'nlp_cnndm_finetuning_results.txt')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Load the CNN/DailyMail dataset\\n\",\n    \"The [CNN/DailyMail dataset](https://cs.nyu.edu/~kcho/DMQA/) was original introduced for Q&A research. There are multiple versions of the dataset processed for summarization task available on the web. The `CNNDMSummarizationDatasetOrg` function downloads a version from the [UniLM repo](https://github.com/microsoft/unilm) with minimal processing. 
The function returns the training and testing dataset as `SummarizationDataset` which can be further processed for model training and testing.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"train_ds, test_ds = CNNDMSummarizationDatasetOrg(local_path=DATA_DIR, top_n=TOP_N)\\n\",\n    \"print(len(train_ds))\\n\",\n    \"print(len(test_ds))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Preprocessing\\n\",\n    \"The `S2SAbsSumProcessor` has multiple methods for converting input data in `SummarizationDataset`, `IterableSummarizationDataset` or json files into the format required for model training and testing. The preprocessing steps include\\n\",\n    \"- Tokenize input text\\n\",\n    \"- Convert tokens into token ids\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"processor = S2SAbsSumProcessor(model_name=MODEL_NAME, cache_dir=CACHE_DIR)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"cached_features_file_train = os.path.join(RESULT_DIR, \\\"cached_features_for_training.pt\\\")\\n\",\n    \"cached_features_file_test = os.path.join(RESULT_DIR, \\\"cached_features_for_testing.pt\\\")\\n\",\n    \"train_dataset = processor.s2s_dataset_from_sum_ds(train_ds, cached_features_file=cached_features_file_train, train_mode=True)\\n\",\n    \"test_dataset = processor.s2s_dataset_from_sum_ds(test_ds, cached_features_file=cached_features_file_test, train_mode=False)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Fine tune model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The 
`S2SAbstractiveSummarizer` loads a pre-trained UniLM model specified by `model_name`.  \\n\",\n    \"Call `S2SAbstractiveSummarizer.list_supported_models()` to see all the supported models.  \\n\",\n    \"If you want to use a model on the local disk, specify `load_model_from_dir` and `model_file_name`. This is particularly useful if you want to load a previously fine-tuned model and use it for inference directly without fine-tuning. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"abs_summarizer = S2SAbstractiveSummarizer(\\n\",\n    \"    model_name=MODEL_NAME,\\n\",\n    \"    max_seq_length=MAX_SEQ_LENGTH,\\n\",\n    \"    max_source_seq_length=MAX_SOURCE_SEQ_LENGTH,\\n\",\n    \"    max_target_seq_length=MAX_TARGET_SEQ_LENGTH,\\n\",\n    \"    cache_dir=CACHE_DIR\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"## To load a model on the local disk\\n\",\n    \"# abs_summarizer = S2SAbstractiveSummarizer(\\n\",\n    \"#     model_name=MODEL_NAME,\\n\",\n    \"#     max_seq_length=MAX_SEQ_LENGTH,\\n\",\n    \"#     max_source_seq_length=MAX_SOURCE_SEQ_LENGTH,\\n\",\n    \"#     max_target_seq_length=MAX_TARGET_SEQ_LENGTH,\\n\",\n    \"#     load_model_from_dir=\\\"./\\\",\\n\",\n    \"#     model_file_name=\\\"model.5000.bin\\\",\\n\",\n    \"# )\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"abs_summarizer.fit(\\n\",\n    \"    train_dataset=train_dataset,\\n\",\n    \"    num_gpus=NUM_GPUS,\\n\",\n    \"    per_gpu_batch_size=TRAIN_PER_GPU_BATCH_SIZE,\\n\",\n    \"    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\\n\",\n    \"    learning_rate=LEARNING_RATE,\\n\",\n    \"    warmup_steps=WARMUP_STEPS,\\n\",\n    \"    max_steps=MAX_STEPS,\\n\",\n    \"    fp16=FP16,\\n\",\n    \"    save_model_to_dir=MODEL_DIR\\n\",\n    \")\"\n   ]\n  
},\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Generate summaries on testing dataset\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"predictions = abs_summarizer.predict(\\n\",\n    \"    test_dataset=test_dataset,\\n\",\n    \"    num_gpus=NUM_GPUS,\\n\",\n    \"    per_gpu_batch_size=TEST_PER_GPU_BATCH_SIZE,\\n\",\n    \"    beam_size=BEAM_SIZE,\\n\",\n    \"    forbid_ignore_word=FORBID_IGNORE_WORD,\\n\",\n    \"    fp16=FP16\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"for r in predictions[:TOP_N]:\\n\",\n    \"    print(r)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"test_ds.get_source()[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"test_ds.get_target()[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"predictions[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"with open(OUTPUT_FILE, 'w', encoding=\\\"utf-8\\\") as f:\\n\",\n    \"    for line in predictions:\\n\",\n    \"        f.write(line + '\\\\n')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Prediction on a single input sample\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"source = \\\"\\\"\\\"\\n\",\n    \"But under the new rule, set to be announced in the next 48 
hours, Border Patrol agents would immediately return anyone to Mexico — without any detainment and without any due process — who attempts to cross the southwestern border between the legal ports of entry. The person would not be held for any length of time in an American facility.\\n\",\n    \"\\n\",\n    \"Although they advised that details could change before the announcement, administration officials said the measure was needed to avert what they fear could be a systemwide outbreak of the coronavirus inside detention facilities along the border. Such an outbreak could spread quickly through the immigrant population and could infect large numbers of Border Patrol agents, leaving the southwestern border defenses weakened, the officials argued.\\n\",\n    \"The Trump administration plans to immediately turn back all asylum seekers and other foreigners attempting to enter the United States from Mexico illegally, saying the nation cannot risk allowing the coronavirus to spread through detention facilities and Border Patrol agents, four administration officials said.\\n\",\n    \"The administration officials said the ports of entry would remain open to American citizens, green-card holders and foreigners with proper documentation. Some foreigners would be blocked, including Europeans currently subject to earlier travel restrictions imposed by the administration. 
The points of entry will also be open to commercial traffic.\\\"\\\"\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"singel_test_ds = SummarizationDataset(\\n\",\n    \"    None, source=[source], source_preprocessing=[detokenize],\\n\",\n    \")\\n\",\n    \"single_test_dataset = processor.s2s_dataset_from_sum_ds(singel_test_ds, train_mode=False)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"single_prediction = abs_summarizer.predict(\\n\",\n    \"    test_dataset=single_test_dataset,\\n\",\n    \"    num_gpus=NUM_GPUS,\\n\",\n    \"    per_gpu_batch_size=1,\\n\",\n    \"    beam_size=BEAM_SIZE,\\n\",\n    \"    forbid_ignore_word=FORBID_IGNORE_WORD,\\n\",\n    \"    fp16=FP16\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"single_prediction[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Evaluation\\n\",\n    \"We provide utility functions for evaluating summarization models and details can be found in the [summarization evaluation notebook](./summarization_evaluation.ipynb).  
\\n\",\n    \"For the settings in this notebook with QUICK_RUN=False, you should get ROUGE scores close to the following numbers:  \\n\",\n    \"{'rouge-1': {'f': 0.37109626943068647,\\n\",\n    \"  'p': 0.4692792272280924,\\n\",\n    \"  'r': 0.33322322114381886},  \\n\",\n    \" 'rouge-2': {'f': 0.1690495786379728,\\n\",\n    \"  'p': 0.21782900161918375,\\n\",\n    \"  'r': 0.15079122430118444},  \\n\",\n    \" 'rouge-l': {'f': 0.2671310062443078,\\n\",\n    \"  'p': 0.3414039392451434,\\n\",\n    \"  'r': 0.2392756715930202}}\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"rouge_scores = compute_rouge_python(cand=predictions, ref=test_ds.get_target())\\n\",\n    \"pprint.pprint(rouge_scores)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Distributed training with DistributedDataParallel (DDP)\\n\",\n    \"This notebook uses DataParallel for multi-GPU training by default. In general, DistributedDataParallel(DDP) is recommended because of its better performance. See details [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html).  \\n\",\n    \"Since DDP requires multiprocess and can not be run from the notebook, we provide a python script [abstractive_summarization_unilm_cnndm.py](./abstractive_summarization_unilm_cnndm.py) to demonstrate how to use DDP.  \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"First, we save the training and testing dataset to jsonlines files to be used by the python script. This avoids multiple processes repeating the initial data pre-processing. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"train_ds.save_to_jsonl(os.path.join(RESULT_DIR, \\\"train_ds.jsonl\\\"))\\n\",\n    \"test_ds.save_to_jsonl(os.path.join(RESULT_DIR, \\\"test_ds.jsonl\\\"))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Next, we can execute the Python script using torch.distributed.launch, set `--nproc_per_node` to the number of GPUs on your machine.\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"python -m torch.distributed.launch --nproc_per_node=4 --nnode=1 abstractive_summarization_unilm_cnndm.py\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"**Note that the python script sets `fp16=False` by default. If you have enabled mixed precision training following the instructions in [SETUP.md](../../SETUP.md), you can call the script with an additional argument \\\"--fp16 true\\\". You will be able to increase the batch sizes with mixed precision training.**\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"if os.path.exists(DATA_DIR):\\n\",\n    \"    shutil.rmtree(DATA_DIR, ignore_errors=True)\\n\",\n    \"if os.path.exists(CACHE_DIR):\\n\",\n    \"    shutil.rmtree(CACHE_DIR, ignore_errors=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\\"Total notebook running time {}\\\".format(time.time() - start_time))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# for testing\\n\",\n    \"sb.glue(\\\"rouge_1_f_score\\\", rouge_scores[\\\"rouge-1\\\"][\\\"f\\\"])\\n\",\n    \"sb.glue(\\\"rouge_2_f_score\\\", rouge_scores[\\\"rouge-2\\\"][\\\"f\\\"])\\n\",\n    
\"sb.glue(\\\"rouge_l_f_score\\\", rouge_scores[\\\"rouge-l\\\"][\\\"f\\\"])\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/text_summarization/abstractive_summarization_unilm_cnndm.py",
    "content": "import datetime\nimport argparse\nimport jsonlines\n\nimport torch\n\nfrom utils_nlp.models.transformers.abstractive_summarization_seq2seq import (\n     S2SAbsSumProcessor, \n     S2SAbstractiveSummarizer\n)\n\nfrom utils_nlp.eval import compute_rouge_python\n\nparser = argparse.ArgumentParser()\nparser.add_argument(\n    \"--local_rank\", type=int, default=-1, help=\"For distributed training: local_rank\"\n)\nparser.add_argument(\"--fp16\", type=bool, default=False)\nparser.add_argument(\"--fp16_opt_level\", type=str, default=\"O2\")\nargs = parser.parse_args()\n\n\nQUICK_RUN = True\nOUTPUT_FILE = \"./nlp_cnndm_finetuning_results.txt\"\n\n# model parameters\nMODEL_NAME = \"unilm-large-cased\"\nMAX_SEQ_LENGTH = 768\nMAX_SOURCE_SEQ_LENGTH = 640\nMAX_TARGET_SEQ_LENGTH = 128\n\n# fine-tuning parameters\nTRAIN_PER_GPU_BATCH_SIZE = 1\nGRADIENT_ACCUMULATION_STEPS = 2\nLEARNING_RATE = 3e-5\nif QUICK_RUN:\n    TOP_N = 100\n    WARMUP_STEPS = 10\n    MAX_STEPS = 100\nelse:\n    TOP_N = -1\n    WARMUP_STEPS = 500\n    MAX_STEPS = 5000\n\n# inference parameters\nTEST_PER_GPU_BATCH_SIZE = 8\nBEAM_SIZE = 5\nFORBID_IGNORE_WORD = \".\"\n\ntrain_ds = \"train_ds.jsonl\"\ntest_ds = \"test_ds.jsonl\"\n\n\ndef main():\n    torch.distributed.init_process_group(\n        timeout=datetime.timedelta(0, 5400), backend=\"nccl\",\n    )\n\n    if args.local_rank not in [-1, 0]:\n        torch.distributed.barrier()\n\n    processor = S2SAbsSumProcessor(model_name=MODEL_NAME)\n\n    abs_summarizer = S2SAbstractiveSummarizer(\n        model_name=MODEL_NAME,\n        max_seq_length=MAX_SEQ_LENGTH,\n        max_source_seq_length=MAX_SOURCE_SEQ_LENGTH,\n        max_target_seq_length=MAX_TARGET_SEQ_LENGTH,\n    )\n\n    if args.local_rank == 0:\n        torch.distributed.barrier()\n\n    train_dataset = processor.s2s_dataset_from_json_or_file(\n        train_ds, train_mode=True, local_rank=args.local_rank\n    )\n\n    test_dataset = processor.s2s_dataset_from_json_or_file(\n       
 test_ds, train_mode=False, local_rank=args.local_rank\n    )\n\n    abs_summarizer.fit(\n        train_dataset=train_dataset,\n        per_gpu_batch_size=TRAIN_PER_GPU_BATCH_SIZE,\n        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\n        learning_rate=LEARNING_RATE,\n        warmup_steps=WARMUP_STEPS,\n        max_steps=MAX_STEPS,\n        fp16=args.fp16,\n        fp16_opt_level=args.fp16_opt_level,\n        local_rank=args.local_rank,\n        save_model_to_dir=\".\",\n    )\n\n    torch.distributed.barrier()\n\n    if args.local_rank in [-1, 0]:\n        res = abs_summarizer.predict(\n            test_dataset=test_dataset,\n            per_gpu_batch_size=TEST_PER_GPU_BATCH_SIZE,\n            beam_size=BEAM_SIZE,\n            forbid_ignore_word=FORBID_IGNORE_WORD,\n            fp16=args.fp16,\n        )\n\n        for r in res[:5]:\n            print(r)\n\n        with open(OUTPUT_FILE, \"w\", encoding=\"utf-8\") as f:\n            for line in res:\n                f.write(line + \"\\n\")\n\n        tgt = []\n        with jsonlines.open(test_ds) as reader:\n            for item in reader:\n                tgt.append(item[\"tgt\"])\n\n        for t in tgt[:5]:\n            print(t)\n\n        print(compute_rouge_python(cand=res, ref=tgt))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/text_summarization/extractive_summarization_cnndm_aml_distributed.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Copyright (c) Microsoft Corporation. All rights reserved.\\n\",\n    \"\\n\",\n    \"Licensed under the MIT License\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Distributed Training For Extractive Summarization on CNN/DM Dataset\\n\",\n    \"\\n\",\n    \"## Summary\\n\",\n    \"This notebook demonstrates how to use Azure Machine Learning to run distributed training using Distributed Data Parallel in Pytorch for extractive summarization. For more detailed model related information, please see [extractive_summarization_cnndm_transformer.ipynb](extractive_summarization_cnndm_transformer.ipynb)\\n\",\n    \"\\n\",\n    \"## Prerequisites\\n\",\n    \"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, refer to the [Configuration Notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) first if you haven't already to establish your connection to the AzureML Workspace. Prerequisites are:\\n\",\n    \"\\n\",\n    \"- Azure subscription\\n\",\n    \"- Azure Machine Learning Workspace\\n\",\n    \"- Azure Machine Learning SDK\\n\",\n    \"\\n\",\n    \"To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](summarization_evaluation.ipynb). 
\\n\",\n    \"\\n\",\n    \"You can run this notebook on CPU-only machines.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Import Libraries\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"%load_ext autoreload\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"%autoreload 2\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"import sys\\n\",\n    \"from tempfile import TemporaryDirectory\\n\",\n    \"import torch\\n\",\n    \"\\n\",\n    \"import azureml.core\\n\",\n    \"from azureml.core import Experiment, Workspace, Run\\n\",\n    \"from azureml.core.compute import ComputeTarget, AmlCompute\\n\",\n    \"from azureml.core.compute_target import ComputeTargetException\\n\",\n    \"from azureml.train.dnn import PyTorch\\n\",\n    \"from azureml.train.dnn import Nccl\\n\",\n    \"from azureml.widgets import RunDetails\\n\",\n    \"\\n\",\n    \"nlp_path = os.path.abspath(\\\"../../\\\")\\n\",\n    \"if nlp_path not in sys.path:\\n\",\n    \"    sys.path.insert(0, nlp_path)\\n\",\n    \"from utils_nlp.azureml.azureml_utils import get_or_create_workspace\\n\",\n    \"from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset\\n\",\n    \"from utils_nlp.eval import compute_rouge_python\\n\",\n    \"from utils_nlp.models.transformers.extractive_summarization import (\\n\",\n    \"    ExtractiveSummarizer,\\n\",\n    \"    ExtSumProcessedData,\\n\",\n    \"    ExtSumProcessor,\\n\",\n    \")\\n\",\n    \"# Check core SDK version number\\n\",\n    \"print(\\\"SDK version:\\\", azureml.core.VERSION)\\n\",\n    \"\\n\",\n    \"import pprint\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": 
{},\n   \"source\": [\n    \"## Configuration \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# for Azure ML Workspacen\\n\",\n    \"SUBSRIPTION_ID = \\\"YOUR_SUBSCRIPTION_ID\\\"\\n\",\n    \"LOCATION = \\\"YOUR_RESOURCE_GROUP_NAME\\\"  # example \\\"eastus2\\\"\\n\",\n    \"RESOURCE_GROUP = \\\"YOUR_WORKSPACE_NAME\\\"  # modifiy to use your own\\n\",\n    \"WORKSPACE_NAME = \\\"YOUR_WORKSPACE_REGION\\\"  # modifiy to use your own\\n\",\n    \"\\n\",\n    \"# for creating Azure ML Compute Cluster\\n\",\n    \"AMLCOMPUTE_CLUSTER_NAME = \\\"bertsumext\\\"  # modifiy to use your own\\n\",\n    \"NODE_COUNT = 2\\n\",\n    \"VM_SIZE = \\\"STANDARD_NC6\\\"  # this should be the VM that's supported by Azure and Azure ML\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# for creating Azure ML Experiment\\n\",\n    \"EXPERIMENT_NAME = \\\"NLP-ExtSum\\\"  # modifiy to use your own\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# local folder to save the downloaded data\\n\",\n    \"LOCAL_DATA_FOLDER = (\\n\",\n    \"    \\\"./bertsumext_aml/data/\\\"\\n\",\n    \")  # modify to use your own, the penultimate level folder should exist\\n\",\n    \"LOCAL_CACHE_DIR = (\\n\",\n    \"    \\\"./bertsumext_aml/cache/\\\"\\n\",\n    \") \\n\",\n    \"# Training related parameter\\n\",\n    \"MODEL_NAME = \\\"distilbert-base-uncased\\\"  # limited choice\\n\",\n    \"ENCODER = \\\"transformer\\\"\\n\",\n    \"# folder in the workspace where the data is uploaded to\\n\",\n    \"TARGET_DATA_FOLDER = \\\"/bertsum_processed_data\\\"  # modify to use your own\\n\",\n    \"TARGET_OUTPUT_DIR = f\\\"output/{EXPERIMENT_NAME}/\\\"\\n\",\n    \"# cache dir in the workspace\\n\",\n    \"TARGET_CACHE_DIR = f\\\"cache/{EXPERIMENT_NAME}/\\\"\\n\",\n    \"\\n\",\n    \"TRAIN_FILE = \\\"train.pt\\\"\\n\",\n    \"TEST_FILE = \\\"test.pt\\\"\\n\",\n    \"# file name for saving the prediction\\n\",\n    
\"SUMMARY_FILENAME = \\\"generated_summaries.txt\\\"\\n\",\n    \"# file name for saving the trained model\\n\",\n    \"MODEL_FILENAME = \\\"dist_extsum.pt\\\"\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# local path to download the output from the cluster\\n\",\n    \"LOCAL_OUTPUT_DIR = \\\"./bertsumext_aml/output\\\"  # modifiy to use your own, the penultimate level folder\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# local folder to store all the related files to be copied to the workspace\\n\",\n    \"PROJECT_FOLDER = \\\"./azureml_exp\\\"\\n\",\n    \"# conda environment name, the yaml file will be copied to the workspace\\n\",\n    \"CONDA_ENV_NAME = \\\"nlp_gpu\\\"\\n\",\n    \"\\n\",\n    \"##\\n\",\n    \"# The number of lines at the head of data file used for preprocessing. -1 means all the lines.\\n\",\n    \"TOP_N = 100\\n\",\n    \"QUICK_RUN = True\\n\",\n    \"if not QUICK_RUN:\\n\",\n    \"    TOP_N = -1\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Create an AML Workspace\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Create the workspace using the specified parameters\\n\",\n    \"ws = get_or_create_workspace(\\n\",\n    \"    workspace_name=WORKSPACE_NAME,\\n\",\n    \"    subscription_id=SUBSRIPTION_ID,\\n\",\n    \"    resource_group=RESOURCE_GROUP,\\n\",\n    \"    workspace_region=LOCATION,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\n\",\n    \"    \\\"Workspace name: \\\" + ws.name,\\n\",\n    \"    \\\"Azure region: \\\" + ws.location,\\n\",\n    \"    \\\"Subscription id: \\\" + ws.subscription_id,\\n\",\n    \"    \\\"Resource group: \\\" + ws.resource_group,\\n\",\n    \"    sep=\\\"\\\\n\\\",\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   
\"metadata\": {},\n   \"source\": [\n    \"## Create an AML GPU Compute Cluster\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"try:\\n\",\n    \"    gpu_compute_target = ComputeTarget(workspace=ws, name=AMLCOMPUTE_CLUSTER_NAME)\\n\",\n    \"    print(\\\"Found existing compute target.\\\")\\n\",\n    \"except ComputeTargetException:\\n\",\n    \"    print(\\\"Creating a new compute target...\\\")\\n\",\n    \"    compute_config = AmlCompute.provisioning_configuration(\\n\",\n    \"        vm_size=VM_SIZE, max_nodes=NODE_COUNT, \\n\",\n    \"        idle_seconds_before_scaledown=\\\"600\\\"\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"    # create the cluster\\n\",\n    \"    gpu_compute_target = ComputeTarget.create(\\n\",\n    \"        ws, AMLCOMPUTE_CLUSTER_NAME, compute_config\\n\",\n    \"    )\\n\",\n    \"\\n\",\n    \"    gpu_compute_target.wait_for_completion(show_output=True)\\n\",\n    \"\\n\",\n    \"# use get_status() to get a detailed status for the current AmlCompute.\\n\",\n    \"print(gpu_compute_target.get_status().serialize())\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Create an Experiment\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"experiment = Experiment(ws, name=EXPERIMENT_NAME)\\n\",\n    \"ds = ws.get_default_datastore()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"train_dataset, test_dataset = CNNDMSummarizationDataset(top_n=TOP_N, local_cache_path=LOCAL_DATA_FOLDER)\\n\",\n    \"processor = ExtSumProcessor(model_name=MODEL_NAME, 
cache_dir=LOCAL_CACHE_DIR)\\n\",\n    \"ext_sum_train = processor.preprocess(train_dataset, oracle_mode=\\\"greedy\\\")\\n\",\n    \"ext_sum_test = processor.preprocess(test_dataset, oracle_mode=\\\"greedy\\\")\\n\",\n    \"save_path = os.path.join(LOCAL_DATA_FOLDER, \\\"processed\\\")\\n\",\n    \"os.makedirs(save_path, exist_ok=True)\\n\",\n    \"torch.save(ext_sum_train, os.path.join(save_path, TRAIN_FILE))\\n\",\n    \"torch.save(ext_sum_test, os.path.join(save_path, TEST_FILE))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ds.upload(src_dir=save_path, target_path=TARGET_DATA_FOLDER)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Prepare for the Experiment Run\\n\",\n    \"Prepare the local project folder which is mirror to the workspace for the experiment\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ENTRY_SCRIPT = \\\"extractive_summarization_cnndm_distributed_train.py\\\"\\n\",\n    \"os.makedirs(PROJECT_FOLDER, exist_ok=True)\\n\",\n    \"os.system(\\\"python ../../tools/generate_conda_file.py --gpu --name {}\\\".format(CONDA_ENV_NAME))\\n\",\n    \"os.system(\\\"cp ./nlp_gpu.yaml {}\\\".format(PROJECT_FOLDER))\\n\",\n    \"os.system(\\\"cp {} {}\\\".format(ENTRY_SCRIPT, PROJECT_FOLDER))\\n\",\n    \"os.system(\\\"cp -r ../../utils_nlp {}\\\".format(PROJECT_FOLDER))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Submit Run\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"os.makedirs(LOCAL_OUTPUT_DIR, exist_ok=True)\\n\",\n    \"os.makedirs(os.path.join(LOCAL_OUTPUT_DIR, EXPERIMENT_NAME), exist_ok=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   
\"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"NcclConfig=Nccl()\\n\",\n    \"quick_run = \\\"true\\\" if QUICK_RUN else \\\"false\\\"\\n\",\n    \"estimator = PyTorch(source_directory=PROJECT_FOLDER,\\n\",\n    \"                    compute_target=gpu_compute_target,\\n\",\n    \"                    script_params={\\n\",\n    \"                        \\\"--dist_url\\\": \\\"$AZ_BATCHAI_PYTORCH_INIT_METHOD\\\",\\n\",\n    \"                        \\\"--rank\\\": \\\"$AZ_BATCHAI_TASK_INDEX\\\",\\n\",\n    \"                        \\\"--node_count\\\": NODE_COUNT,\\n\",\n    \"                        \\\"--data_dir\\\":ds.path(f'{TARGET_DATA_FOLDER}').as_mount(),\\n\",\n    \"                        \\\"--cache_dir\\\": ds.path(f'{TARGET_CACHE_DIR}').as_mount(),\\n\",\n    \"                        '--output_dir':ds.path(f'{TARGET_OUTPUT_DIR}').as_mount(),\\n\",\n    \"                        \\\"--quick_run\\\":  quick_run,\\n\",\n    \"                        \\\"--summary_filename\\\": f'{SUMMARY_FILENAME}',\\n\",\n    \"                        \\\"--model_filename\\\": f'{MODEL_FILENAME}',\\n\",\n    \"                        \\\"--model_name\\\": MODEL_NAME,\\n\",\n    \"                        \\\"--encoder\\\": ENCODER,\\n\",\n    \"                        \\\"--train_file\\\": TRAIN_FILE,\\n\",\n    \"                        \\\"--test_file\\\": TEST_FILE\\n\",\n    \"                    },\\n\",\n    \"                    entry_script= ENTRY_SCRIPT,\\n\",\n    \"                    node_count=NODE_COUNT,\\n\",\n    \"                    distributed_training=NcclConfig,\\n\",\n    \"                    conda_dependencies_file=f'{CONDA_ENV_NAME}.yaml',\\n\",\n    \"                    use_gpu=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"run = experiment.submit(estimator)\"\n   ]\n  },\n  {\n   
\"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"RunDetails(run).show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"\\\"\\\"\\\"\\n\",\n    \"If you stop the notebook and come back, \\n\",\n    \"you'll need to use the run_id in the output of the previous cell \\n\",\n    \"to get run details.\\n\",\n    \"\\\"\\\"\\\"\\n\",\n    \"# fetched_run = Run(experiment, \\\"NLP-ExtSum_1579816237_ea238f69\\\")\\n\",\n    \"# RunDetails(fetched_run).show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Download Generated Summaries \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# need to clear the local output dir as the ds.download won't download if the path exists \\n\",\n    \"os.system(\\\"rm -rf  {}/*\\\".format(LOCAL_OUTPUT_DIR))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ds.download(target_path=LOCAL_OUTPUT_DIR,\\n\",\n    \"                   prefix=f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}',\\n\",\n    \"                   show_progress=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# the script uses <q> as sentence separator so it can write the prediction into the files properly\\n\",\n    \"# here we need to replace <q> with \\\"\\\\n\\\" to prepare for evalation\\n\",\n    \"# removing the ending \\\"\\\\n\\\" is also a preparation step for evalution.\\n\",\n    \"prediction = []\\n\",\n    \"with open(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}'), \\\"r\\\") as 
filehandle:\\n\",\n    \"    for cnt, line in enumerate(filehandle):\\n\",\n    \"        prediction.append(line[0:-1].replace(\\\"<q>\\\", \\\"\\\\n\\\")) # remove the ending \\\"\\\\n\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"prediction[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Compare with gold summaries\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"source = []\\n\",\n    \"temp_target = []\\n\",\n    \"for i in ext_sum_test:\\n\",\n    \"    source.append(i[\\\"src_txt\\\"]) \\n\",\n    \"    temp_target.append(\\\" \\\".join(j) for j in i['tgt']) \\n\",\n    \"target = ['\\\\n'.join(i) for i in list(temp_target)]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"target[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"source[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Download and evaluation the trained model\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"## you can also download the saved model and run prediction if you are running the notebook on a gpu machine\\n\",\n    \"#\\\"\\\"\\\"\\n\",\n    \"ds.download(target_path=LOCAL_OUTPUT_DIR,\\n\",\n    \"               prefix=f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}',\\n\",\n    \"               show_progress=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"BATCH_SIZE = 32\\n\",\n    
\"summarizer = ExtractiveSummarizer(processor, encoder=ENCODER, cache_dir=LOCAL_CACHE_DIR)\\n\",\n    \"summarizer.model.load_state_dict(\\n\",\n    \"    torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'),\\n\",\n    \"               map_location=\\\"cpu\\\"))\\n\",\n    \"\\n\",\n    \"prediction = summarizer.predict(ext_sum_test, num_gpus=torch.cuda.device_count(), batch_size=BATCH_SIZE, sentence_separator = \\\"\\\\n\\\")\\n\",\n    \"#\\\"\\\"\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"prediction[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"rouge_scores = compute_rouge_python(cand=prediction, ref=target)\\n\",\n    \"pprint.pprint(rouge_scores)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Cleanup\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import shutil\\n\",\n    \"if os.path.exists(LOCAL_DATA_FOLDER):\\n\",\n    \"    shutil.rmtree(LOCAL_DATA_FOLDER, ignore_errors=True)\\n\",\n    \"if os.path.exists(LOCAL_OUTPUT_DIR):\\n\",\n    \"    shutil.rmtree(LOCAL_OUTPUT_DIR, ignore_errors=True)\\n\",\n    \"if os.path.exists(LOCAL_CACHE_DIR):\\n\",\n    \"    shutil.rmtree(LOCAL_CACHE_DIR, ignore_errors=True)\\n\",\n    \"if os.path.exists(PROJECT_FOLDER):\\n\",\n    \"    shutil.rmtree(PROJECT_FOLDER, ignore_errors=True)\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": 
\"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/text_summarization/extractive_summarization_cnndm_distributed_train.py",
    "content": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport argparse\nimport os\nimport sys\nimport time\nimport torch\nimport torch.distributed as dist\nimport torch.multiprocessing as mp\n\nnlp_path = os.path.abspath(\"../../\")\nif nlp_path not in sys.path:\n    sys.path.insert(0, nlp_path)\n\nsys.path.insert(0, \"./\")\nfrom utils_nlp.dataset.cnndm import CNNDMBertSumProcessedData, CNNDMSummarizationDataset\nfrom utils_nlp.models.transformers.extractive_summarization import (\n    ExtractiveSummarizer,\n    ExtSumProcessedData,\n    ExtSumProcessor,\n)\n\n# os.environ[\"NCCL_BLOCKING_WAIT\"] = \"1\"\n\nos.environ[\"NCCL_IB_DISABLE\"] = \"0\"\nos.environ['OMP_NUM_THREADS'] = str(torch.cuda.device_count())\nos.environ[\"KMP_AFFINITY\"] = \"verbose\"\n\nparser = argparse.ArgumentParser()\nparser.add_argument(\n    \"--rank\", type=int, default=0, help=\"The rank of the current node in the cluster\"\n)\nparser.add_argument(\n    \"--dist_url\",\n    type=str,\n    default=\"tcp://127.0.0.1:29501\",\n    help=\"URL specifying how to initialize the process groupi.\",\n)\nparser.add_argument(\n    \"--node_count\", type=int, default=1, help=\"Number of nodes in the cluster.\"\n)\nparser.add_argument(\n    \"--cache_dir\", type=str, default=\"./\", help=\"Directory to cache the tokenizer.\"\n)\nparser.add_argument(\n    \"--data_dir\",\n    type=str,\n    default=\"./\",\n    help=\"Directory to download the preprocessed data.\",\n)\nparser.add_argument(\n    \"--output_dir\",\n    type=str,\n    default=\"./\",\n    help=\"Directory to save the output model and prediction results.\",\n)\nparser.add_argument(\n    \"--quick_run\",\n    type=str.lower,\n    default=\"false\",\n    choices=[\"true\", \"false\"],\n    help=\"Whether to have a quick run\",\n)\nparser.add_argument(\n    \"--model_name\",\n    type=str,\n    default=\"distilbert-base-uncased\",\n    help='Transformer model used in the extractive summarization, only 
\\\n                        \"bert-uncased\" and \"distilbert-base-uncased\" are supported.',\n)\nparser.add_argument(\n    \"--encoder\",\n    type=str.lower,\n    default=\"transformer\",\n    choices=[\"baseline\", \"classifier\", \"transformer\", \"rnn\"],\n    help=\"Encoder types in the extractive summarizer.\",\n)\nparser.add_argument(\n    \"--max_pos_length\",\n    type=int,\n    default=512,\n    help=\"maximum input length in terms of input token numbers in training\",\n)\nparser.add_argument(\"--learning_rate\", type=float, default=1e-3, help=\"Learning rate.\")\nparser.add_argument(\n    \"--batch_size\",\n    type=int,\n    default=5,\n    help=\"batch size in terms of the number of samples in training\",\n    # default=3000,\n    # help=\"batch size in terms of input token numbers in training\",\n)\nparser.add_argument(\n    \"--max_steps\",\n    type=int,\n    default=1e4,\n    help=\"Maximum number of training steps run in training. If quick_run is set,\\\n                        it's not used.\",\n)\nparser.add_argument(\n    \"--warmup_steps\",\n    type=int,\n    default=5e3,\n    help=\"Warm-up number of training steps run in training. 
If quick_run is set,\\\n                        it's not used.\",\n)\nparser.add_argument(\n    \"--top_n\",\n    type=int,\n    default=3,\n    help=\"Number of sentences selected in prediction for evaluation.\",\n)\nparser.add_argument(\n    \"--summary_filename\",\n    type=str,\n    default=\"generated_summaries.txt\",\n    help=\"Summary file name generated by prediction for evaluation.\",\n)\nparser.add_argument(\n    \"--model_filename\",\n    type=str,\n    default=\"dist_extsum_model.pt\",\n    help=\"model file name saved for evaluation.\",\n)\nparser.add_argument(\n    \"--train_file\",\n    type=str,\n    default=None,\n    help=\"training data file which is saved through torch\",\n)\nparser.add_argument(\n    \"--test_file\",\n    type=str,\n    default=None,\n    help=\"test data file for evaluation.\",\n)\n\n\n\ndef cleanup():\n    dist.destroy_process_group()\n\n\n# How often the statistics reports show up in training, unit is step.\nREPORT_EVERY = 100\nSAVE_EVERY = 1000\n\n\ndef main():\n    print(\"NCCL_IB_DISABLE: {}\".format(os.getenv(\"NCCL_IB_DISABLE\")))\n    args = parser.parse_args()\n    print(\"quick_run is {}\".format(args.quick_run))\n    print(\"output_dir is {}\".format(args.output_dir))\n    print(\"data_dir is {}\".format(args.data_dir))\n    print(\"cache_dir is {}\".format(args.cache_dir))\n\n    # shutil.rmtree(args.output_dir)\n    os.makedirs(args.output_dir, exist_ok=True)\n    os.makedirs(args.cache_dir, exist_ok=True)\n\n    ngpus_per_node = torch.cuda.device_count()\n    processor = ExtSumProcessor(model_name=args.model_name)\n    summarizer = ExtractiveSummarizer(\n        processor, args.model_name, args.encoder, args.max_pos_length, args.cache_dir\n    )\n\n    mp.spawn(\n        main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, summarizer, args)\n    )\n\n\ndef main_worker(local_rank, ngpus_per_node, summarizer, args):\n    rank = args.rank * ngpus_per_node + local_rank\n    world_size = args.node_count * 
ngpus_per_node\n\n    print(\"init_method: {}\".format(args.dist_url))\n    print(\"ngpus_per_node: {}\".format(ngpus_per_node))\n    print(\"rank: {}\".format(rank))\n    print(\"local_rank: {}\".format(local_rank))\n    print(\"world_size: {}\".format(world_size))\n\n    torch.distributed.init_process_group(\n        backend=\"nccl\", init_method=args.dist_url, world_size=world_size, rank=rank,\n    )\n    # total number of steps for training\n    MAX_STEPS = 1e1\n    # number of steps for warm up\n    WARMUP_STEPS = 5e2\n    TOP_N = 10\n    if args.quick_run.lower() == \"false\":\n        MAX_STEPS = args.max_steps\n        WARMUP_STEPS = args.warmup_steps\n        TOP_N = -1\n\n    print(\"max steps is {}\".format(MAX_STEPS))\n    print(\"warmup steps is {}\".format(WARMUP_STEPS))\n\n    if local_rank not in [-1, 0]:\n        torch.distributed.barrier()\n\n    # download_path = CNNDMBertSumProcessedData.download(local_path=args.data_dir)\n    # ext_sum_train, ext_sum_train = ExtSumProcessedData().splits(\n    #    root=download_path, train_iterable=True\n    # )\n    if args.train_file is None or args.test_file is None:\n        train_dataset, test_dataset = CNNDMSummarizationDataset(\n            top_n=TOP_N, local_cache_path=args.data_dir\n        )\n        ext_sum_train = summarizer.processor.preprocess(train_dataset, oracle_mode=\"greedy\")\n        ext_sum_test = summarizer.processor.preprocess(test_dataset, oracle_mode=\"greedy\")\n    else:\n        ext_sum_train = torch.load(os.path.join(args.data_dir, args.train_file))\n        ext_sum_test = torch.load(os.path.join(args.data_dir, args.test_file))\n\n    if local_rank in [-1, 0]:\n        torch.distributed.barrier()\n\n    start = time.time()\n\n    if rank not in [-1, 0]:\n        save_every = -1\n    else:\n        save_every = SAVE_EVERY\n    # \"\"\"\n    print(\"starting training\")\n    summarizer.fit(\n        ext_sum_train,\n        num_gpus=world_size,\n        batch_size=args.batch_size,\n   
     gradient_accumulation_steps=1,\n        max_steps=MAX_STEPS / world_size,\n        learning_rate=args.learning_rate,\n        warmup_steps=WARMUP_STEPS,\n        verbose=True,\n        report_every=REPORT_EVERY,\n        clip_grad_norm=False,\n        local_rank=local_rank,\n        save_every=save_every,\n        world_size=world_size,\n        rank=rank,\n        # use_preprocessed_data=True\n    )\n    end = time.time()\n    print(\"rank {0}, duration {1:.6f}s\".format(rank, end - start))\n    # \"\"\"\n    torch.distributed.barrier()\n    if local_rank in [-1, 0] and args.rank == 0:\n        summarizer.save_model(os.path.join(args.output_dir, args.model_filename))\n        prediction = summarizer.predict(ext_sum_test[0:TOP_N], batch_size=128)\n\n        def _write_list_to_file(list_items, filename):\n            with open(filename, \"w\") as filehandle:\n                # for cnt, line in enumerate(filehandle):\n                for item in list_items:\n                    filehandle.write(\"%s\\n\" % item)\n\n        print(\"writing generated summaries\")\n        _write_list_to_file(\n            prediction, os.path.join(args.output_dir, args.summary_filename)\n        )\n\n    # only use the following line when you use your own cluster.\n    # AML distributed training run cleanup for you.\n    cleanup()\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/text_summarization/extractive_summarization_cnndm_transformer.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Copyright (c) Microsoft Corporation. All rights reserved.\\n\",\n    \"\\n\",\n    \"Licensed under the MIT License.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Extractive Summarization on CNN/DM Dataset using Transformer Version of BertSum\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"### Summary\\n\",\n    \"\\n\",\n    \"This notebook demonstrates how to fine tune Transformers for extractive text summarization. Utility functions and classes in the NLP Best Practices repo are used to facilitate data preprocessing, model training, model scoring, result postprocessing, and model evaluation.\\n\",\n    \"\\n\",\n    \"BertSum refers to  [Fine-tune BERT for Extractive Summarization](https://arxiv.org/pdf/1903.10318.pdf) with [published example](https://github.com/nlpyang/BertSum/). And the Transformer version of Bertsum refers to our modification of BertSum and the source code can be accessed at (https://github.com/daden-ms/BertSum/). \\n\",\n    \"\\n\",\n    \"Extractive summarization is usually used in document summarization where each input document consists of multiple sentences. The preprocessing of the input training data involves assigning label 0 or 1 to the document sentences based on the given summary. The summarization problem is also simplified to classifying whether a document sentence should be included in the summary. \\n\",\n    \"\\n\",\n    \"The figure below illustrates how BERTSum can be fine tuned for extractive summarization task. [CLS] token is inserted at the beginning of each sentence, so is [SEP] token at the end. Interval segment embedding and positional embedding are added upon the token embedding as the input of the BERT model. The [CLS] token representation is used as sentence embedding and only the [CLS] tokens are used as the input for the summarization model. 
The summarization layer predicts the probability for each  sentence being included in the summary. Techniques like trigram blocking can be used to improve model accuracy.   \\n\",\n    \"\\n\",\n    \"<img src=\\\"https://nlpbp.blob.core.windows.net/images/BertSum.PNG\\\">\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"### Before You Start\\n\",\n    \"\\n\",\n    \"The running time shown in this notebook is on a Standard_NC24s_v3 Azure Ubuntu Virtual Machine with 4 NVIDIA Tesla V100 GPUs. \\n\",\n    \"> **Tip**: If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \\n\",\n    \"\\n\",\n    \"Using only 1 NVIDIA Tesla V100 GPU, 16GB GPU memory configuration,\\n\",\n    \"- for data preprocessing, it takes around 1 minute to preprocess the data for quick run. Otherwise it takes ~20 minutes to finish the data preprocessing. This time estimation assumes that the chosen transformer model is \\\"distilbert-base-uncased\\\" and the sentence selection method is \\\"greedy\\\", which is the default. The preprocessing time can be significantly longer if the sentence selection method is \\\"combination\\\", which can achieve better model performance.\\n\",\n    \"\\n\",\n    \"- for model fine tuning, it takes around 2 minutes for quick run. Otherwise, it takes around 3 hours to finish. This estimation assumes the chosen encoder method is \\\"transformer\\\". The model fine tuning time can be shorter if another encoder method is chosen, which may result in worse model performance. 
\\n\",\n    \"\\n\",\n    \"### Additional Notes\\n\",\n    \"\\n\",\n    \"* **ROUGE Evaluation**: To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](./summarization_evaluation.ipynb) for setup.\\n\",\n    \"\\n\",\n    \"* **Distributed Training**:\\n\",\n    \"Please note that the jupyter notebook only allows the use of pytorch [DataParallel](https://pytorch.org/docs/master/nn.html#dataparallel). Faster speed and larger batch size can be achieved with pytorch [DistributedDataParallel](https://pytorch.org/docs/master/notes/ddp.html)(DDP). Script [extractive_summarization_cnndm_distributed_train.py](./extractive_summarization_cnndm_distributed_train.py) shows an example of how to use DDP.\\n\",\n    \"\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"%load_ext autoreload\\n\",\n    \"\\n\",\n    \"%autoreload 2\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"## Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\\n\",\n    \"QUICK_RUN = True\\n\",\n    \"## Set USE_PREPROCSSED_DATA = True to skip the data preprocessing\\n\",\n    \"USE_PREPROCSSED_DATA = False\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Configuration\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"import shutil\\n\",\n    \"import sys\\n\",\n    \"from tempfile import TemporaryDirectory\\n\",\n    \"import torch\\n\",\n    \"\\n\",\n    \"nlp_path = os.path.abspath(\\\"../../\\\")\\n\",\n    \"if nlp_path not in sys.path:\\n\",\n    \"    sys.path.insert(0, 
nlp_path)\\n\",\n    \"\\n\",\n    \"from utils_nlp.dataset.cnndm import CNNDMBertSumProcessedData, CNNDMSummarizationDataset\\n\",\n    \"from utils_nlp.eval import compute_rouge_python, compute_rouge_perl\\n\",\n    \"from utils_nlp.models.transformers.extractive_summarization import (\\n\",\n    \"    ExtractiveSummarizer,\\n\",\n    \"    ExtSumProcessedData,\\n\",\n    \"    ExtSumProcessor,\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"from utils_nlp.models.transformers.datasets import SummarizationDataset\\n\",\n    \"import nltk\\n\",\n    \"from nltk import tokenize\\n\",\n    \"\\n\",\n    \"import pandas as pd\\n\",\n    \"import scrapbook as sb\\n\",\n    \"import pprint\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"\\n\",\n    \"### Configuration: choose the transformer model to be used\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Several pretrained models have been made available by [Hugging Face](https://github.com/huggingface/transformers). For extractive summarization, the following pretrained models are supported. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"pd.DataFrame({\\\"model_name\\\": ExtractiveSummarizer.list_supported_models()})\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# Transformer model being used\\n\",\n    \"MODEL_NAME = \\\"distilbert-base-uncased\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# notebook parameters\\n\",\n    \"# the cache data path during fine tuning\\n\",\n    \"CACHE_DIR = TemporaryDirectory().name\\n\",\n    \"processor = ExtSumProcessor(model_name=MODEL_NAME, cache_dir=CACHE_DIR)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Data Preprocessing\\n\",\n    \"\\n\",\n    \"The dataset we used for this notebook is CNN/DM dataset which contains the documents and accompanying questions from the news articles of CNN and Daily Mail. The highlights in each article are used as summary. The dataset consists of ~289K training examples, ~11K validation examples and ~11K test examples.  You can choose [Option 1] below to preprocess the data or [Option 2] to use the preprocessed version at [BERTSum published example](https://github.com/nlpyang/BertSum/). You don't need to manually download any of these two data sets as the code below will handle downloading. Functions defined specific in [cnndm.py](../../utils_nlp/dataset/cnndm.py) are unique to CNN/DM dataset that's preprocessed by harvardnlp. However, it provides a skeleton of how to preprocess text into the format that model preprocessor takes: sentence tokenization and word tokenization. 
\\n\",\n    \"\\n\",\n    \"##### Details of Data Preprocessing\\n\",\n    \"\\n\",\n    \"The purpose of preprocessing is to process the input articles to the format that model fine-tuning needs. Assuming you have (1) all articles and (2) target summaries, each in a file and line-break separated, the steps to preprocess the data are:\\n\",\n    \"1. sentence tokenization\\n\",\n    \"2. word tokenization\\n\",\n    \"3. **label** the sentences in the article with 1 meaning the sentence is selected and 0 meaning the sentence is not selected. The algorithms for the sentence selection are \\\"greedy\\\" and \\\"combination\\\" and can be found in [sentence_selection.py](../../utils_nlp/dataset/sentence_selection.py)\\n\",\n    \"4. convert each example to  the desired format for extractive summarization\\n\",\n    \"    - filter the sentences in the example based on the min_src_ntokens argument. If the number of remaining sentences is less than min_nsents, the example is discarded.\\n\",\n    \"    - truncate the sentences in the example if the length is greater than max_src_ntokens\\n\",\n    \"    - truncate the sentences in the example and the labels if the total number of sentences is greater than max_nsents\\n\",\n    \"    - [CLS] and [SEP] are inserted before and after each sentence\\n\",\n    \"    - wordPiece tokenization or Byte Pair Encoding (BPE) subword tokenization\\n\",\n    \"    - truncate the example to 512 tokens\\n\",\n    \"    - convert the tokens into token indices corresponding to the transformer tokenizer's vocabulary.\\n\",\n    \"    - segment ids are generated and added\\n\",\n    \"    - [CLS] token positions are logged\\n\",\n    \"    - [CLS] token labels are truncated if it's greater than 512, which is the maximum input length that can be taken by the transformer model.\\n\",\n    \"    \\n\",\n    \"    \\n\",\n    \"Note that the original BERTSum paper uses Stanford CoreNLP for data preprocessing; here we use NLTK for data 
preprocessing. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"##### [Option 1] Preprocess  data (Please skip this part if you choose to use preprocessed data)\\n\",\n    \"The code in the following cell will download the CNN/DM dataset listed at https://github.com/harvardnlp/sent-summary/.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# the data path used to save the downloaded data file\\n\",\n    \"DATA_PATH = TemporaryDirectory().name\\n\",\n    \"# The number of lines at the head of data file used for preprocessing. -1 means all the lines.\\n\",\n    \"TOP_N = 1000\\n\",\n    \"if not QUICK_RUN:\\n\",\n    \"    TOP_N = -1\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"train_dataset, test_dataset = CNNDMSummarizationDataset(top_n=TOP_N, local_cache_path=DATA_PATH)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Preprocess the data.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"\\n\",\n    \"ext_sum_train = processor.preprocess(train_dataset, oracle_mode=\\\"greedy\\\")\\n\",\n    \"ext_sum_test = processor.preprocess(test_dataset, oracle_mode=\\\"greedy\\\")\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"\\\"\\\"\\\"\\n\",\n    \"# save and load preprocessed data\\n\",\n    \"save_path = os.path.join(DATA_PATH, \\\"processed\\\")\\n\",\n    \"torch.save(ext_sum_train, os.path.join(save_path, \\\"train_full.pt\\\"))\\n\",\n    
\"torch.save(ext_sum_test, os.path.join(save_path, \\\"test_full.pt\\\"))\\n\",\n    \"\\n\",\n    \"\\\"\\\"\\\"\\n\",\n    \"# ext_sum_train = torch.load(os.path.join(save_path, \\\"train_full.pt\\\"))\\n\",\n    \"# ext_sum_test = torch.load(os.path.join(save_path, \\\"test_full.pt\\\"))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"len(ext_sum_train)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"len(ext_sum_test)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Inspect Data\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ext_sum_train[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"ext_sum_train[0].keys()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"##### [Option 2] Reuse Preprocessed  data from [BERTSUM Repo](https://github.com/nlpyang/BertSum)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\",\n     \":w\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# the data path used to downloaded the preprocessed data from BERTSUM Repo.\\n\",\n    \"# if you have downloaded the dataset, change the code to use that path where the dataset is.\\n\",\n    \"PROCESSED_DATA_PATH = TemporaryDirectory().name\\n\",\n    \"os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)\\n\",\n    \"#data_path = \\\"./temp_data5/\\\"\\n\",\n    \"#PROCESSED_DATA_PATH = data_path\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   
\"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"if USE_PREPROCSSED_DATA:\\n\",\n    \"    download_path = CNNDMBertSumProcessedData.download(local_path=PROCESSED_DATA_PATH)\\n\",\n    \"    ext_sum_train, ext_sum_test = ExtSumProcessedData().splits(root=download_path, train_iterable=True)\\n\",\n    \"    \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Model training\\n\",\n    \"To start model training, we need to create an instance of ExtractiveSummarizer.\\n\",\n    \"#### Choose the transformer model.\\n\",\n    \"Currently ExtractiveSummarizer supports two models:\\n\",\n    \"- distilbert-base-uncased, \\n\",\n    \"- bert-base-uncased\\n\",\n    \"\\n\",\n    \"Potentially, RoBERTa-based models and XLNet can be supported, but this needs to be tested.\\n\",\n    \"#### Choose the encoder algorithm.\\n\",\n    \"There are four options:\\n\",\n    \"- baseline: it uses a smaller transformer model in place of the BERT model, with a transformer summarization layer\\n\",\n    \"- classifier: it uses pretrained BERT and fine-tunes BERT with a **simple logistic classification** summarization layer\\n\",\n    \"- transformer: it uses pretrained BERT and fine-tunes BERT with a **transformer** summarization layer\\n\",\n    \"- RNN: it uses pretrained BERT and fine-tunes BERT with an **LSTM** summarization layer\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"parameters\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"BATCH_SIZE = 5 # batch size, unit is the number of samples\\n\",\n    \"MAX_POS_LENGTH = 512\\n\",\n    \"if USE_PREPROCSSED_DATA: #if bertsum published data is used\\n\",\n    \"    BATCH_SIZE = 3000 # batch size, unit is the number of tokens\\n\",\n    \"    MAX_POS_LENGTH = 512\\n\",\n    \"    \\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# GPU used for training\\n\",\n    \"NUM_GPUS = 
torch.cuda.device_count()\\n\",\n    \"\\n\",\n    \"# Encoder name. Options are: 1. baseline, classifier, transformer, rnn.\\n\",\n    \"ENCODER = \\\"transformer\\\"\\n\",\n    \"\\n\",\n    \"# Learning rate\\n\",\n    \"LEARNING_RATE=2e-3\\n\",\n    \"\\n\",\n    \"# How often the statistics reports show up in training, unit is step.\\n\",\n    \"REPORT_EVERY=100\\n\",\n    \"\\n\",\n    \"# total number of steps for training\\n\",\n    \"MAX_STEPS=1e2\\n\",\n    \"# number of steps for warm up\\n\",\n    \"WARMUP_STEPS=5e2\\n\",\n    \"    \\n\",\n    \"if not QUICK_RUN:\\n\",\n    \"    MAX_STEPS=5e4\\n\",\n    \"    WARMUP_STEPS=5e3\\n\",\n    \" \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"summarizer = ExtractiveSummarizer(processor, MODEL_NAME, ENCODER, MAX_POS_LENGTH, CACHE_DIR)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"#\\\"\\\"\\\"\\n\",\n    \"\\n\",\n    \"summarizer.fit(\\n\",\n    \"            ext_sum_train,\\n\",\n    \"            num_gpus=NUM_GPUS,\\n\",\n    \"            batch_size=BATCH_SIZE,\\n\",\n    \"            gradient_accumulation_steps=2,\\n\",\n    \"            max_steps=MAX_STEPS,\\n\",\n    \"            learning_rate=LEARNING_RATE,\\n\",\n    \"            warmup_steps=WARMUP_STEPS,\\n\",\n    \"            verbose=True,\\n\",\n    \"            report_every=REPORT_EVERY,\\n\",\n    \"            clip_grad_norm=False,\\n\",\n    \"            use_preprocessed_data=USE_PREPROCSSED_DATA\\n\",\n    \"        )\\n\",\n    \"\\n\",\n    \"#\\\"\\\"\\\"\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"summarizer.save_model(\\n\",\n    \"    os.path.join(\\n\",\n    \"        
CACHE_DIR,\\n\",\n    \"        \\\"extsum_modelname_{0}_usepreprocess{1}_steps_{2}.pt\\\".format(\\n\",\n    \"            MODEL_NAME, USE_PREPROCSSED_DATA, MAX_STEPS\\n\",\n    \"        ),\\n\",\n    \"    )\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# for loading a previous saved model\\n\",\n    \"\\\"\\\"\\\"\\n\",\n    \"import torch\\n\",\n    \"model_path = os.path.join(\\n\",\n    \"        CACHE_DIR,\\n\",\n    \"        \\\"extsum_modelname_{0}_usepreprocess{1}_steps_{2}.pt\\\".format(\\n\",\n    \"            MODEL_NAME, USE_PREPROCSSED_DATA, MAX_STEPS\\n\",\n    \"        ))\\n\",\n    \"summarizer = ExtractiveSummarizer(processor, MODEL_NAME, ENCODER, MAX_POS_LENGTH, CACHE_DIR)\\n\",\n    \"summarizer.model.load_state_dict(torch.load(model_path, map_location=\\\"cpu\\\"))\\n\",\n    \"\\\"\\\"\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Model Evaluation\\n\",\n    \"\\n\",\n    \"[ROUGE](https://en.wikipedia.org/wiki/ROUGE_(metric)), or Recall-Oriented Understudy for Gisting Evaluation has been commonly used for evaluating text summarization.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ext_sum_test[0].keys()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"if \\\"segs\\\" in ext_sum_test[0]: # preprocessed_data\\n\",\n    \"    source = [i['src_txt'] for i in ext_sum_test]\\n\",\n    \"    target = [\\\"\\\\n\\\".join(i['tgt_txt'].split(\\\"<q>\\\")) for i in ext_sum_test]\\n\",\n    \"else:\\n\",\n    \"    source = []\\n\",\n    \"    temp_target = []\\n\",\n    \"    for i in ext_sum_test:\\n\",\n    \"        source.append(i[\\\"src_txt\\\"]) \\n\",\n    \"        
temp_target.append(\\\" \\\".join(j) for j in i['tgt']) \\n\",\n    \"    target = [''.join(i) for i in list(temp_target)]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"%%time\\n\",\n    \"sentence_separator = \\\"\\\\n\\\"\\n\",\n    \"prediction = summarizer.predict(ext_sum_test, num_gpus=NUM_GPUS, batch_size=256, sentence_separator=sentence_separator)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"len(prediction)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"rouge_scores = compute_rouge_python(cand=prediction, ref=target)\\n\",\n    \"pprint.pprint(rouge_scores)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"target[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"prediction[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"source[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# for testing\\n\",\n    \"sb.glue(\\\"rouge_2_f_score\\\", rouge_scores['rouge-2']['f'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Prediction on a single input sample\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"source = \\\"\\\"\\\"\\n\",\n    \"But under the new rule, set to be announced in the next 48 
hours, Border Patrol agents would immediately return anyone to Mexico — without any detainment and without any due process — who attempts to cross the southwestern border between the legal ports of entry. The person would not be held for any length of time in an American facility.\\n\",\n    \"\\n\",\n    \"Although they advised that details could change before the announcement, administration officials said the measure was needed to avert what they fear could be a systemwide outbreak of the coronavirus inside detention facilities along the border. Such an outbreak could spread quickly through the immigrant population and could infect large numbers of Border Patrol agents, leaving the southwestern border defenses weakened, the officials argued.\\n\",\n    \"The Trump administration plans to immediately turn back all asylum seekers and other foreigners attempting to enter the United States from Mexico illegally, saying the nation cannot risk allowing the coronavirus to spread through detention facilities and Border Patrol agents, four administration officials said.\\n\",\n    \"The administration officials said the ports of entry would remain open to American citizens, green-card holders and foreigners with proper documentation. Some foreigners would be blocked, including Europeans currently subject to earlier travel restrictions imposed by the administration. 
The points of entry will also be open to commercial traffic.\\\"\\\"\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"test_dataset = SummarizationDataset(\\n\",\n    \"    None,\\n\",\n    \"    source=[source],\\n\",\n    \"    source_preprocessing=[tokenize.sent_tokenize],\\n\",\n    \"    word_tokenize=nltk.word_tokenize,\\n\",\n    \")\\n\",\n    \"processor = ExtSumProcessor(model_name=MODEL_NAME,  cache_dir=CACHE_DIR)\\n\",\n    \"preprocessed_dataset = processor.preprocess(test_dataset)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"preprocessed_dataset[0].keys()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"prediction = summarizer.predict(preprocessed_dataset, num_gpus=0, batch_size=1, sentence_separator=\\\"\\\\n\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"prediction\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Clean up temporary folders\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"if os.path.exists(DATA_PATH):\\n\",\n    \"    shutil.rmtree(DATA_PATH, ignore_errors=True)\\n\",\n    \"if os.path.exists(CACHE_DIR):\\n\",\n    \"    shutil.rmtree(CACHE_DIR, ignore_errors=True)\\n\",\n    \"if USE_PREPROCSSED_DATA:\\n\",\n    \"    if os.path.exists(PROCESSED_DATA_PATH):\\n\",\n    \"        shutil.rmtree(PROCESSED_DATA_PATH, ignore_errors=True)\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu)\",\n   \"language\": 
\"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/text_summarization/summarization_evaluation.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Copyright (c) Microsoft Corporation.  \\n\",\n    \"Licensed under the MIT License. \\n\",\n    \"\\n\",\n    \"# Summarization Evaluation\\n\",\n    \"This notebook explains the metrics commonly used to evaluate text summarization results and how to use the evaluation utilities provided in the repo. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## ROUGE\\n\",\n    \"Recall-Oriented Understudy for Gisting Evaluation (ROUGE) is a set of metrics for evaluating automatic text summarization and machine translation results. The metrics compare machine-generated summaries or translations against one or multiple reference summaries or translations created by humans.  \\n\",\n    \"Commonly used ROUGE metrics are ROUGE-1, ROUGE-2, and ROUGE-L\\n\",\n    \"* ROUGE-1: Overlap of unigrams (single words) between machine-generated and reference summaries. \\n\",\n    \"* ROUGE-2: Overlap of bigrams (two adjacent words) between machine-generated and reference summaries.\\n\",\n    \"* ROUGE-L: Longest Common Subsequence (LCS), which doesn't require consecutive matches but in-sequence matches that reflect sentence-level structure similarity.  \\n\",\n    \"\\n\",\n    \"For each metric, recall, precision, and F1 score are computed. \\n\",\n    \"\\n\",\n    \"**Utilities for computing ROUGE**\\n\",\n    \"* `compute_rouge_perl`: The [pyrouge](https://github.com/bheinzerling/pyrouge) package based on the ROUGE package written in perl is the most popular package for computing ROUGE scores. We provide the `compute_rouge_perl` function based on pyrouge. This function supports English only. \\n\",\n    \"* `compute_rouge_python`: The [py-rouge](https://pypi.org/project/py-rouge/) package is a Python implementation of the ROUGE metric which produces almost the same results as the perl implementation. 
Since it's easier to install than pyrouge and can be extended to other languages, we provide the `compute_rouge_python` function based on py-rouge. Currently, English and Hindi are supported. Supports for other languages will be added on an as-needed basis.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"import sys\\n\",\n    \"\\n\",\n    \"nlp_path = os.path.abspath('../../')\\n\",\n    \"if nlp_path not in sys.path:\\n\",\n    \"    sys.path.insert(0, nlp_path)\\n\",\n    \"    \\n\",\n    \"from utils_nlp.eval import compute_rouge_perl, compute_rouge_python\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Sample inputs\\n\",\n    \"Both `compute_rouge_perl` and `compute_rouge_python` takes lists of candidate summaries and reference summaries as inputs. Alternatively, you can also provide paths to files containing the candidates and references and set the `is_input_files` argument to `True`. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"summary_candidates = [\\\"The stock market is doing well this year.\\\", \\\"The movie is very popular.\\\"]\\n\",\n    \"summary_references = [\\\"The stock market is doing really well in 2019.\\\", \\\"The movie is very popular among millennials.\\\"]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### compute_rouge_python\\n\",\n    \"To use `compute_rouge_python`, you only need to install the Python packages `py-rouge` and `nltk`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"python_rouge_scores = compute_rouge_python(cand=summary_candidates, ref=summary_references)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\\"ROUGE-1: {}\\\".format(python_rouge_scores[\\\"rouge-1\\\"]))\\n\",\n    \"print(\\\"ROUGE-2: {}\\\".format(python_rouge_scores[\\\"rouge-2\\\"]))\\n\",\n    \"print(\\\"ROUGE-L: {}\\\".format(python_rouge_scores[\\\"rouge-l\\\"]))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The `compute_rouge_python` function can also support non-English languages. Currently, only Hindi is supported. Support for other languages will be added on an as-needed basis.  \\n\",\n    \"Note that the Hindi sample inputs are generated by translation, so they are not perfect, but sufficient for testing.  
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"summary_candidates_hi = [\\n\",\n    \"    \\\"शेयर बाजार इस साल बहुत अच्छा कर रहा है। 2020 के लिए भी यही उम्मीद है।\\\",\\n\",\n    \"    \\\"नई फिल्म बहुत लोकप्रिय है।\\\",\\n\",\n    \"]\\n\",\n    \"summary_references_hi = [\\n\",\n    \"    \\\"शेयर बाजार 2019 में वास्तव में अच्छा कर रहा है। आशा है कि 2020 भी ऐसा ही होगा।\\\",\\n\",\n    \"    \\\"फिल्म सदियों के बीच बहुत लोकप्रिय है।\\\",\\n\",\n    \"]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"python_rouge_scores_hi = compute_rouge_python(cand=summary_candidates_hi, ref=summary_references_hi, language=\\\"hi\\\")\\n\",\n    \"print(\\\"ROUGE-1: {}\\\".format(python_rouge_scores_hi[\\\"rouge-1\\\"]))\\n\",\n    \"print(\\\"ROUGE-2: {}\\\".format(python_rouge_scores_hi[\\\"rouge-2\\\"]))\\n\",\n    \"print(\\\"ROUGE-L: {}\\\".format(python_rouge_scores_hi[\\\"rouge-l\\\"]))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### compute_rouge_perl\\n\",\n    \"To use `compute_rouge_perl`, in addition to installing the Python package `pyrouge`, you also need to go through the following setup steps on a Linux machine.  \\n\",\n    \"**NOTE**: Set `PYTHON_PATH` to the root directory of the conda environment where you installed `pyrouge` first. You can use \\\"conda env list\\\" to find the`PYTHON_PATH`. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"python_path = !which python\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"PYTHON_PATH = \\\"/\\\".join(python_path[0].split(os.pathsep)[0].split(\\\"/\\\")[0:-2])\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"scrolled\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"%%bash\\n\",\n    \"git clone https://github.com/andersjo/pyrouge.git\\n\",\n    \"PYROUGE_PATH=./pyrouge \\n\",\n    \"PYTHON_PATH=$PYTHON_PATH #<root directory of conda environment> #e.g./data/anaconda/envs/nlp_gpu\\n\",\n    \"$PYTHON_PATH/bin/pyrouge_set_rouge_path $PYROUGE_PATH/tools/ROUGE-1.5.5\\n\",\n    \"\\n\",\n    \"# install XML::DOM plugin, instructions https://web.archive.org/web/20171107220839/www.summarizerman.com/post/42675198985/figuring-out-rouge\\n\",\n    \"sudo cpan App::cpanminus\\n\",\n    \"sudo cpanm XML::DOM\\n\",\n    \"\\n\",\n    \"# install XLM::Parser and its dependencies\\n\",\n    \"sudo apt-get update\\n\",\n    \"sudo apt-get install libexpat1-dev -y\\n\",\n    \"sudo cpanm  XML::Parser\\n\",\n    \"\\n\",\n    \"# Fix WordNet issue\\n\",\n    \"# Instructions https://web.archive.org/web/20180812011301/http://kavita-ganesan.com/rouge-howto/#IamHavingWordNetExceptions\\n\",\n    \"cd  $PYROUGE_PATH/tools/ROUGE-1.5.5/data/\\n\",\n    \"rm WordNet-2.0.exc.db\\n\",\n    \"\\n\",\n    \"cd WordNet-2.0-Exceptions/\\n\",\n    \"./buildExeptionDB.pl . 
exc WordNet-2.0.exc.db\\n\",\n    \"cd ..\\n\",\n    \"ln -s WordNet-2.0-Exceptions/WordNet-2.0.exc.db WordNet-2.0.exc.db\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"perl_rouge_scores = compute_rouge_perl(cand=summary_candidates, ref=summary_references)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"perl_rouge_scores\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"For each score, the 95% confidence interval is also computed, i.e. \\\"\\\\_cb\\\" and \\\"\\\\_ce\\\" stand for the beginning  and end of the confidence interval, respectively.  \\n\",\n    \"In addition to ROUGE-1, ROUGE-2, ROUGE-L, the perl script computes a few other ROUGE scores. See details of all scores [here](https://en.wikipedia.org/wiki/ROUGE_%28metric%29).  \"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python (nlp_gpu)\",\n   \"language\": \"python\",\n   \"name\": \"nlp_gpu\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[tool.black]\nline-length = 88\n"
  },
  {
    "path": "setup.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nimport io\nimport re\nfrom os.path import dirname, join\nfrom setuptools import setup\nfrom utils_nlp import VERSION, AUTHOR, TITLE, LICENSE\n\n\ndef read(*names, **kwargs):\n    with io.open(join(dirname(__file__), *names), encoding=kwargs.get(\"encoding\", \"utf8\")) as fh:\n        return fh.read()\n\n\nsetup(\n    name=\"utils_nlp\",\n    version=VERSION,\n    license=LICENSE,\n    description=\"NLP Utility functions that are used for best practices in building state-of-the-art NLP methods and scenarios. Developed by Microsoft AI CAT\",\n    long_description=\"%s\\n%s\"\n    % (\n        re.compile(\"^.. start-badges.*^.. end-badges\", re.M | re.S).sub(\"\", read(\"README.md\")),\n        re.sub(\":[a-z]+:`~?(.*?)`\", r\"``\\1``\", read(\"CONTRIBUTING.md\")),\n    ),\n    author=AUTHOR,\n    author_email=\"teamsharat@microsoft.com\",\n    url=\"https://github.com/microsoft/nlp-recipes\",\n    packages=[\"utils_nlp\"],\n    include_package_data=True,\n    zip_safe=True,\n    classifiers=[\n        # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers\n        \"Development Status :: 5 - Production/Stable\",\n        \"Intended Audience :: Developers\",\n        \"License :: OSI Approved :: MIT License\",\n        \"Operating System :: Unix\",\n        \"Operating System :: POSIX\",\n        \"Operating System :: Microsoft :: Windows\",\n        \"Programming Language :: Python :: 3.6\",\n        \"Programming Language :: Python :: 3.7\",\n        \"Programming Language :: Python :: Implementation :: CPython\",\n        \"Programming Language :: Python :: Implementation :: PyPy\",\n        \"Topic :: Text Processing :: Linguistic\",\n        \"Topic :: Utilities\",\n        \"Intended Audience :: 
Science/Research\",\n        \"Intended Audience :: Developers\",\n        \"Intended Audience :: Education\",\n        \"Intended Audience :: Financial and Insurance Industry\",\n        \"Intended Audience :: Healthcare Industry\",\n        \"Intended Audience :: Information Technology\",\n        \"Intended Audience :: Telecommunications Industry\",\n    ],\n    project_urls={\n        \"Documentation\": \"https://github.com/microsoft/nlp-recipes/\",\n        \"Issue Tracker\": \"https://github.com/microsoft/nlp-recipes/issues\",\n    },\n    keywords=[\n        \"Microsoft NLP\",\n        \"NLP Recipes\",\n        \"Natural Language Processing\",\n        \"Text Processing\",\n        \"Word Embedding\",\n    ],\n    python_requires=\">=3.6\",\n    install_requires=[],\n    dependency_links=[],\n    extras_require={},\n    use_scm_version=False,\n    setup_requires=[],\n)\n"
  },
  {
    "path": "tests/README.md",
    "content": "# Tests\n\nThis project uses unit, smoke and integration tests with Python files and notebooks.\n\n * In the unit tests we just make sure the notebook runs.\n * In the smoke tests, we run them with a small dataset or a small number of epochs to make sure that, apart from running, they provide reasonable metrics.\n * In the integration tests we use a bigger dataset for more epochs and we test that the metrics are what we expect.\n\nFor more information, see a [quick introduction to unit, smoke and integration tests](https://miguelgfierro.com/blog/2018/a-beginners-guide-to-python-testing/). To manually execute the unit tests in the different environments, first **make sure you are in the correct environment as described in the [SETUP.md](../SETUP.md)**.\n\nTests are automatically run as part of a DevOps pipeline. The pipelines are defined in the `.yml` files in [tests/ci](./ci) with filenames that align with pipeline names.\n\n## Test execution\n\n**Click on the following menus** to see more details on how to execute the unit, smoke and integration tests:\n\n<details>\n<summary><strong><em>Unit tests (click to expand)</em></strong></summary>\n\nUnit tests ensure that each class or function behaves as it should. 
Every time a developer makes a pull request to staging or master branch, a battery of unit tests is executed.\n\n**Note that the next instructions execute the tests from the root folder.**\n\nFor executing the Python unit tests for the utilities:\n\n    pytest tests/unit -m \"not notebooks and not gpu and not azureml\"\n\nFor executing the Python unit tests for the notebooks:\n\n    pytest tests/unit -m \"notebooks and not gpu and not azureml\"\n\nFor executing the Python GPU unit tests for the utilities:\n\n    pytest tests/unit -m \"not notebooks and gpu and not azureml\"\n\nFor executing the Python GPU unit tests for the notebooks:\n\n    pytest tests/unit -m \"notebooks and gpu and not azureml\"\n\nFor executing the AzureML unit tests:\n\n    pytest tests/unit -m \"azureml\"\n\n</details>\n\n\n<details>\n<summary><strong><em>Smoke tests (click to expand)</em></strong></summary>\n\nSmoke tests make sure that the system works and are executed just before the integration tests every night.\n\n**Note that the next instructions execute the tests from the root folder.**\n\nFor executing the Python smoke tests:\n\n    pytest --durations=0 tests/smoke -m \"smoke and not gpu and not azureml\"\n\nFor executing the Python GPU smoke tests:\n\n    pytest --durations=0 tests/smoke -m \"smoke and gpu and not azureml\"\n\nFor executing the AzureML smoke tests:\n\n    pytest --durations=0 tests/smoke -m \"azureml\"\n\n</details>\n\n<details>\n<summary><strong><em>Integration tests (click to expand)</em></strong></summary>\n\nIntegration tests make sure that the program results are acceptable\n\n**Note that the next instructions execute the tests from the root folder.**\n\nFor executing the Python integration tests:\n\n    pytest --durations=0 tests/integration -m \"integration and not gpu and not azureml\"\n\nFor executing the Python GPU integration tests:\n\n    pytest --durations=0 tests/integration -m \"integration and gpu and not azureml\"\n\nFor executing the AzureML 
integration tests:\n\n    pytest --durations=0 tests/integration -m \"azureml\"\n\n</details>\n\n\n## How to create tests on notebooks with Papermill\n\nIn the notebooks of this repo we use [Papermill](https://github.com/nteract/papermill) in unit, smoke and integration tests.\n\nIn the unit tests we just make sure the notebook runs. In the smoke tests, we run them with a small dataset or a small number of epochs to make sure that, apart from running, they provide reasonable metrics. Finally, in the integration tests, we use a bigger dataset for more epochs and we test that the metrics are what we expect.\n\nFor an in-depth overview of how to integrate Papermill into unit, smoke and integration tests, please refer to [this guide from Microsoft Recommenders repo](https://github.com/microsoft/recommenders/blob/master/tests/README.md#how-to-create-tests-on-notebooks-with-papermill).\n\nMore details on how to integrate Papermill with notebooks can be found in their [repo](https://github.com/nteract/papermill).\n"
  },
  {
    "path": "tests/__init__.py",
    "content": ""
  },
  {
    "path": "tests/ci/azureml_integration_tests.yml",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# More info on scheduling: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops&tabs=yaml#scheduled-triggers\n# Implementing the scheduler from the dashboard\n# Uncomment in case it wants to be done from using the yml\n#schedules:\n#- cron: \"56 22 * * *\"\n#  displayName: Daily computation of nightly builds\n#  branches:\n#    include:\n#    - master\n#  always: true\n\n# no PR builds\npr: none\n\n# no CI trigger\ntrigger: none\n\nvariables:\n- group: AzureMLConfig\n- name  : 'resource_group'\n  value : 'nlpbp_project_resources'\n- name  : 'workspace_name'\n  value : 'nlpazuremltestws'\n- name  : 'workspace_region'\n  value : 'eastus2'\n- name  : 'junitxml'\n  value : 'reports/test-azureml.xml'\n\njobs:\n- job: AzureMLNotebookTest\n  timeoutInMinutes: 300 # how long to run the job before automatically cancelling\n  pool:\n    vmImage: 'Ubuntu-16.04'\n  steps:\n  - bash: |\n      echo \"##vso[task.prependpath]/usr/share/miniconda/bin\"\n    displayName: Add Conda to PATH\n\n  - bash: |\n      python tools/generate_conda_file.py --gpu\n      conda env create -n nlp_gpu -f nlp_gpu.yaml\n      pip install paramiko==2.4.2\n      source activate nlp_gpu\n      conda env list\n      echo Login Azure Account\n      az login --service-principal -u $(spidentity) -p $(spsecret) --tenant $(sptenant)\n      az account set --subscription $(subscriptionid)\n    displayName: 'Create and activate conda environment'\n\n  - bash: |\n      source activate nlp_gpu\n      pytest --durations=0 tests/integration -m \"azureml\" -q --subscription_id=$(subscriptionid) --resource_group=$(resource_group) --workspace_name=$(workspace_name) --workspace_region=$(workspace_region) --junitxml $(junitxml) \n    displayName: 'Run AzureML notebook tests'\n  \n  - bash: |\n      echo Ensure Resource Group Deletion $(resource_group)\n      
existResponse=$(az group exists -n $(resource_group))\n      if [ \"$existResponse\" == \"true\" ]; then\n        echo Deleting project resource group\n        az group delete --name $(resource_group) --yes\n      else\n        echo Project resource group did not exist\n      fi\n      echo Done Cleanup\n    displayName: 'Cleanup Task'\n    condition: always()\n\n  - task: PublishTestResults@2\n    displayName: 'Publish Test Results **/test-*.xml'\n    inputs:\n      testResultsFiles: '**/test-*.xml'\n      failTaskOnFailedTests: true\n    condition: succeededOrFailed()"
  },
  {
    "path": "tests/ci/component_governance.yml",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# Pull requests against the branches listed below will trigger this build\npr: \n  - master\n\n# no CI trigger\ntrigger: none\n\njobs:\n- job: Component_governance\n  timeoutInMinutes: 20 # how long to run the job before automatically cancelling\n  pool:\n    vmImage: 'ubuntu-16.04'\n\n  steps:\n  - bash: |\n      python tools/generate_requirements_txt.py\n    displayName: 'Generate requirements.txt file from generate_conda_file.py'\n\n  - task: ComponentGovernanceComponentDetection@0\n    inputs:\n      scanType: 'Register'\n      verbosity: 'Verbose'\n      alertWarningLevel: 'High'  \n"
  },
  {
    "path": "tests/ci/cpu_integration_tests_linux.yml",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\n# More info on scheduling: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops&tabs=yaml#scheduled-triggers\n# The schedule is currently configured from the dashboard.\n# Uncomment the block below to define the schedule in this yml file instead.\n#schedules:\n#- cron: \"56 22 * * *\"\n#  displayName: Daily computation of nightly builds\n#  branches:\n#    include:\n#    - master\n#  always: true\n\n\n# no PR builds\npr: none\n\n# no CI trigger\ntrigger: none\n\njobs:\n- job: nightly\n  displayName : 'Nightly tests'\n  timeoutInMinutes: 180 # how long to run the job before automatically cancelling\n  pool:\n    name: nlpagentpool\n\n  steps:\n  - bash: |\n      echo \"##vso[task.prependpath]/data/anaconda/bin\"\n      conda env list\n    displayName: 'Add Conda to PATH'\n\n  # Conda creation can take around 10min\n  - bash: |\n      python tools/generate_conda_file.py \n      conda env create -n integration_cpu -f nlp_cpu.yaml\n    displayName: 'Creating Conda Environment with dependencies'\n\n  - bash: |\n      source activate integration_cpu\n      pytest --durations=0 tests/smoke -m \"smoke and not gpu and not azureml\" --junitxml=junit/test-smoke-test.xml\n    displayName: 'Run smoke tests'\n\n  - bash: |\n      source activate integration_cpu\n      pytest --durations=0 tests/integration -m \"integration and not gpu and not azureml\" --junitxml=junit/test-integration-test.xml\n    displayName: 'Run integration tests'\n\n  - bash: |\n      echo Remove Conda Environment\n      conda remove -n integration_cpu --all -q --force -y\n      echo Done Cleanup\n    displayName: 'Cleanup Task'\n    condition: always()\n\n  - task: PublishTestResults@2\n    inputs:\n      testResultsFiles: '**/test-*-test.xml'\n      testRunTitle: 'Test results for PyTest'"
  },
  {
    "path": "tests/ci/cpu_unit_tests_linux.yml",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# Pull requests against these branches will trigger this build\npr:\n- master\n- staging\n\n# Any commit to these branches will trigger the build.\ntrigger:\n- staging\n- master\n\n\njobs:\n- job: cpu_unit_tests_linux\n  timeoutInMinutes: 12 # how long to run the job before automatically cancelling\n  pool:\n    # vmImage: 'ubuntu-16.04' # hosted machine \n    name: nlpagentpool\n\n  steps:\n\n# Uncomment if hosted machine\n#  - task: UsePythonVersion@0\n#    inputs:\n#      versionSpec: '3.6.8'\n#      architecture: 'x64'\n#      addToPath: true\n#    displayName: 'Use Python 3.6.8'\n  \n  - bash: |\n      echo \"##vso[task.prependpath]/data/anaconda/bin\"\n      conda env list\n    displayName: Add Conda to PATH\n\n# Uncomment if needed\n# Conda creation can take around 10min\n#  - bash: |\n#      python tools/generate_conda_file.py\n#      conda env create -n nlp_cpu -f nlp_cpu.yaml\n#    displayName: 'Creating Conda Environment with dependencies'\n\n  - bash: |\n      source activate nlp_cpu\n      pytest --durations=0 tests/unit -m \"not notebooks and not gpu and not azureml\" --junitxml=junit/test-unitttest.xml\n    displayName: 'Run Unit tests'\n\n# Uncomment if needed\n#  - bash: |\n#      echo Remove Conda Environment\n#      conda remove -n nlp_cpu --all -q --force -y\n#      echo Done Cleanup\n#    displayName: 'Cleanup Task'\n#    condition: always()\n\n  - task: PublishTestResults@2\n    inputs:\n      testResultsFiles: '**/test-unitttest.xml'\n      testRunTitle: 'Test results for PyTest'\n\n\n"
  },
  {
    "path": "tests/ci/gpu_integration_tests_linux.yml",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\n# More info on scheduling: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops&tabs=yaml#scheduled-triggers\n# The schedule is currently configured from the dashboard.\n# Uncomment the block below to define the schedule in this yml file instead.\n#schedules:\n#- cron: \"56 11 * * *\"\n#  displayName: Daily computation of nightly builds\n#  branches:\n#    include:\n#    - master\n#  always: true\n\n\n# no PR builds\npr: none\n\n# no CI trigger\ntrigger: none\n\njobs:\n- job: nightly\n  displayName : 'Nightly tests'\n  timeoutInMinutes: 180 # how long to run the job before automatically cancelling\n  pool:\n    name: nlpagentpool\n\n  steps:\n  - bash: |\n      echo \"##vso[task.prependpath]/data/anaconda/bin\"\n      conda env list\n    displayName: 'Add Conda to PATH'\n\n  # Conda creation can take around 10min\n  - bash: |\n      python tools/generate_conda_file.py --gpu\n      conda env create -n integration_gpu -f nlp_gpu.yaml\n    displayName: 'Creating Conda Environment with dependencies'\n\n  - bash: |\n      source activate integration_gpu\n      pytest --durations=0 tests/smoke -m \"smoke and gpu and not azureml\" --junitxml=junit/test-smoke-test.xml\n    displayName: 'Run smoke tests'\n\n  - bash: |\n      source activate integration_gpu\n      pytest --durations=0 tests/integration -m \"integration and gpu and not azureml\" --junitxml=junit/test-integration-test.xml\n    displayName: 'Run integration tests'\n\n  - bash: |\n      echo Remove Conda Environment\n      conda remove -n integration_gpu --all -q --force -y\n      echo Done Cleanup\n    displayName: 'Cleanup Task'\n    condition: always()\n\n  - task: PublishTestResults@2\n    inputs:\n      testResultsFiles: '**/test-*-test.xml'\n      testRunTitle: 'Test results for PyTest'\n"
  },
  {
    "path": "tests/ci/gpu_unit_tests_linux.yml",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# Pull requests against these branches will trigger this build\npr:\n- master\n- staging\n\n# Any commit to these branches will trigger the build.\ntrigger:\n- staging\n- master\n\njobs:\n- job: gpu_unit_tests_linux\n  timeoutInMinutes: 30 # how long to run the job before automatically cancelling\n  pool:\n    name: nlpagentpool\n\n  steps:\n  - bash: |\n      echo \"##vso[task.prependpath]/data/anaconda/bin\"\n      conda env list\n    displayName: Add Conda to PATH\n\n# Uncomment if needed\n# Conda creation can take around 10min\n#  - bash: |\n#      python tools/generate_conda_file.py --gpu\n#      conda env create -n nlp_gpu -f nlp_gpu.yaml\n#    displayName: 'Creating Conda Environment with dependencies'\n\n  - bash: |\n      source activate nlp_gpu\n      pytest --durations=0 tests/unit -m \"not notebooks and gpu and not azureml\" --junitxml=junit/test-unitttest.xml\n    displayName: 'Run Unit tests'\n\n# Uncomment if needed\n#  - bash: |\n#      echo Remove Conda Environment\n#      conda remove -n nlp_gpu --all -q --force -y\n#      echo Done Cleanup\n#    displayName: 'Cleanup Task'\n#    condition: always()\n\n  - task: PublishTestResults@2\n    inputs:\n      testResultsFiles: '**/test-unitttest.xml'\n      testRunTitle: 'Test results for PyTest'\n\n\n"
  },
  {
    "path": "tests/ci/notebooks_cpu_unit_tests_linux.yml",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# Pull requests against these branches will trigger this build\npr:\n- master\n- staging\n\n# Any commit to these branches will trigger the build.\ntrigger:\n- staging\n- master\n\njobs:\n- job: notebooks_cpu_unit_tests_linux\n  timeoutInMinutes: 10 # how long to run the job before automatically cancelling\n  pool:\n    name: nlpagentpool\n\n  steps:\n  - bash: |\n      echo \"##vso[task.prependpath]/data/anaconda/bin\"\n      conda env list\n    displayName: Add Conda to PATH\n\n# Uncomment if needed\n# Conda creation can take around 10min\n#  - bash: |\n#      python tools/generate_conda_file.py\n#      conda env create -n nlp_cpu -f nlp_cpu.yaml\n#    displayName: 'Creating Conda Environment with dependencies'\n\n  - bash: |\n      source activate nlp_cpu\n      pytest --durations=0 tests/unit -m \"notebooks and not gpu and not azureml\" --junitxml=junit/test-unitttest.xml\n    displayName: 'Run Unit tests'\n\n# Uncomment if needed\n#  - bash: |\n#      echo Remove Conda Environment\n#      conda remove -n nlp_cpu --all -q --force -y\n#      echo Done Cleanup\n#    displayName: 'Cleanup Task'\n#    condition: always()\n\n  - task: PublishTestResults@2\n    inputs:\n      testResultsFiles: '**/test-unitttest.xml'\n      testRunTitle: 'Test results for PyTest'\n\n"
  },
  {
    "path": "tests/ci/notebooks_gpu_unit_tests_linux.yml",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# Pull requests against these branches will trigger this build\npr:\n- master\n- staging\n\n# Any commit to these branches will trigger the build.\ntrigger:\n- staging\n- master\n\njobs:\n- job: notebooks_gpu_unit_tests_linux\n  timeoutInMinutes: 10 # how long to run the job before automatically cancelling\n  pool:\n    name: nlpagentpool\n\n  steps:\n  - bash: |\n      echo \"##vso[task.prependpath]/data/anaconda/bin\"\n      conda env list\n    displayName: Add Conda to PATH\n\n# Uncomment if needed\n# Conda creation can take around 10min\n#  - bash: |\n#      python tools/generate_conda_file.py --gpu\n#      conda env create -n nlp_gpu -f nlp_gpu.yaml\n#    displayName: 'Creating Conda Environment with dependencies'\n\n  - bash: |\n      source activate nlp_gpu\n      pytest --durations=0 tests/unit -m \"notebooks and gpu and not azureml\" --junitxml=junit/test-unitttest.xml\n    displayName: 'Run Unit tests'\n\n# Uncomment if needed\n#  - bash: |\n#      echo Remove Conda Environment\n#      conda remove -n nlp_gpu --all -q --force -y\n#      echo Done Cleanup\n#    displayName: 'Cleanup Task'\n#    condition: always()\n\n  - task: PublishTestResults@2\n    inputs:\n      testResultsFiles: '**/test-unitttest.xml'\n      testRunTitle: 'Test results for PyTest'\n\n"
  },
  {
    "path": "tests/conftest.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# NOTE: This file is used by pytest to inject fixtures automatically.\n# As it is explained in the documentation\n# https://docs.pytest.org/en/latest/fixture.html:\n# \"If during implementing your tests you realize that you want to use\n# a fixture function from multiple test files you can move it to a conftest.py\n# file. You don’t need to import the fixture you want to use in a test, it\n# automatically gets discovered by pytest.\"\n\nimport os\nimport pandas as pd\nfrom tempfile import TemporaryDirectory\n\nimport pytest\nfrom tests.notebooks_common import path_notebooks\n\nfrom utils_nlp.models.bert.common import Language as BERTLanguage\nfrom utils_nlp.models.xlnet.common import Language as XLNetLanguage\nfrom utils_nlp.models.bert.common import Tokenizer as BERTTokenizer\nfrom utils_nlp.models.xlnet.common import Tokenizer as XLNetTokenizer\nfrom utils_nlp.azureml import azureml_utils\nfrom azureml.core.webservice import Webservice\n\n\n@pytest.fixture(scope=\"module\")\ndef scripts():\n    folder_notebooks = path_notebooks()\n    paths = {\n        \"ddp_bertsumext\": os.path.join(\n            folder_notebooks,\n            \"text_summarization\",\n            \"extractive_summarization_cnndm_distributed_train.py\",\n        ),\n        \"ddp_bertsumabs\": os.path.join(\n            folder_notebooks,\n            \"text_summarization\",\n            \"abstractive_summarization_bertsum_cnndm_distributed_train.py\",\n        ),\n    }\n\n    return paths\n\n\n@pytest.fixture(scope=\"module\")\ndef notebooks():\n    folder_notebooks = path_notebooks()\n\n    # Path for the notebooks\n    paths = {\n        \"embedding_trainer\": os.path.join(\n            folder_notebooks, \"embeddings\", \"embedding_trainer.ipynb\"\n        ),\n        \"similarity_embeddings_baseline\": os.path.join(\n            folder_notebooks, \"sentence_similarity\", 
\"baseline_deep_dive.ipynb\"\n        ),\n        \"bert_encoder\": os.path.join(\n            folder_notebooks, \"sentence_similarity\", \"bert_encoder.ipynb\"\n        ),\n        \"gensen_local\": os.path.join(\n            folder_notebooks, \"sentence_similarity\", \"gensen_local.ipynb\"\n        ),\n        \"gensen_aml_deep_dive\": os.path.join(\n            folder_notebooks, \"sentence_similarity\", \"gensen_aml_deep_dive.ipynb\"\n        ),\n        \"automl_local_deployment_aci\": os.path.join(\n            folder_notebooks, \"sentence_similarity\", \"automl_local_deployment_aci.ipynb\"\n        ),\n        \"automl_with_pipelines_deployment_aks\": os.path.join(\n            folder_notebooks,\n            \"sentence_similarity\",\n            \"automl_with_pipelines_deployment_aks.ipynb\",\n        ),\n        \"question_answering_squad_transformers\": os.path.join(\n            folder_notebooks,\n            \"question_answering\",\n            \"question_answering_squad_transformers.ipynb\",\n        ),\n        \"bert_senteval\": os.path.join(\n            folder_notebooks, \"sentence_similarity\", \"bert_senteval.ipynb\"\n        ),\n        \"bert_qa_trainer\": os.path.join(\n            folder_notebooks,\n            \"question_answering\",\n            \"pretrained-BERT-SQuAD-deep-dive-aml.ipynb\",\n        ),\n        \"bidaf_deep_dive\": os.path.join(\n            folder_notebooks, \"question_answering\", \"bidaf_aml_deep_dive.ipynb\"\n        ),\n        \"bidaf_quickstart\": os.path.join(\n            folder_notebooks,\n            \"question_answering\",\n            \"question_answering_system_bidaf_quickstart.ipynb\",\n        ),\n        \"entailment_multinli_transformers\": os.path.join(\n            folder_notebooks, \"entailment\", \"entailment_multinli_transformers.ipynb\"\n        ),\n        \"entailment_xnli_bert_azureml\": os.path.join(\n            folder_notebooks, \"entailment\", \"entailment_xnli_bert_azureml.ipynb\"\n        
),\n        \"tc_bert_azureml\": os.path.join(\n            folder_notebooks, \"text_classification\", \"tc_bert_azureml.ipynb\"\n        ),\n        \"tc_mnli_transformers\": os.path.join(\n            folder_notebooks, \"text_classification\", \"tc_mnli_transformers.ipynb\"\n        ),\n        \"tc_multi_languages_transformers\": os.path.join(\n            folder_notebooks,\n            \"text_classification\",\n            \"tc_multi_languages_transformers.ipynb\",\n        ),\n        \"ner_wikigold_transformer\": os.path.join(\n            folder_notebooks,\n            \"named_entity_recognition\",\n            \"ner_wikigold_transformer.ipynb\",\n        ),\n        \"deep_and_unified_understanding\": os.path.join(\n            folder_notebooks, \"model_explainability\", \"interpret_dnn_layers.ipynb\"\n        ),\n        \"extractive_summarization_cnndm_transformer\": os.path.join(\n            folder_notebooks,\n            \"text_summarization\",\n            \"extractive_summarization_cnndm_transformer.ipynb\",\n        ),\n        \"unilm_abstractive_summarization\": os.path.join(\n            folder_notebooks,\n            \"text_summarization\",\n            \"abstractive_summarization_unilm_cnndm.ipynb\",\n        ),\n        \"minilm_abstractive_summarization\": os.path.join(\n            folder_notebooks,\n            \"text_summarization\",\n            \"abstractive_summarization_minilm_cnndm.ipynb\",\n        ),\n        \"abstractive_summarization_bertsumabs_cnndm\": os.path.join(\n            folder_notebooks,\n            \"text_summarization\",\n            \"abstractive_summarization_bertsumabs_cnndm.ipynb\",\n        ),\n    }\n    return paths\n\n\n@pytest.fixture\ndef tmp(tmp_path_factory):\n    td = TemporaryDirectory(dir=tmp_path_factory.getbasetemp())\n    try:\n        yield td.name\n    finally:\n        td.cleanup()\n\n\n@pytest.fixture(scope=\"module\")\ndef tmp_module(tmp_path_factory):\n    td = 
TemporaryDirectory(dir=tmp_path_factory.getbasetemp())\n    try:\n        yield td.name\n    finally:\n        td.cleanup()\n\n\n@pytest.fixture(scope=\"module\")\ndef ner_test_data():\n    UNIQUE_LABELS = [\"O\", \"I-LOC\", \"I-MISC\", \"I-PER\", \"I-ORG\", \"X\"]\n    LABEL_MAP = {label: i for i, label in enumerate(UNIQUE_LABELS)}\n    TRAILING_TOKEN_MASK = [[True] * 20]\n    false_pos = [1, 2]\n    for p in false_pos:\n        TRAILING_TOKEN_MASK[0][p] = False\n    INPUT_LABEL_IDS = [[3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n    return {\n        \"INPUT_TEXT\": [\n            [\n                \"Johnathan\",\n                \"is\",\n                \"studying\",\n                \"in\",\n                \"the\",\n                \"University\",\n                \"of\",\n                \"Michigan\",\n                \".\",\n            ]\n        ],\n        \"INPUT_TEXT_SINGLE\": [\n            \"Johnathan\",\n            \"is\",\n            \"studying\",\n            \"in\",\n            \"the\",\n            \"University\",\n            \"of\",\n            \"Michigan\",\n            \".\",\n        ],\n        \"INPUT_LABELS\": [[\"I-PER\", \"O\", \"O\", \"O\", \"O\", \"I-ORG\", \"I-ORG\", \"I-ORG\", \"O\"]],\n        \"INPUT_LABELS_SINGLE\": [\n            \"I-PER\",\n            \"O\",\n            \"O\",\n            \"O\",\n            \"O\",\n            \"I-ORG\",\n            \"I-ORG\",\n            \"I-ORG\",\n            \"O\",\n        ],\n        \"INPUT_LABELS_WRONG\": [\n            [\"I-PER\", \"O\", \"O\", \"O\", \"O\", \"I-ORG\", \"I-ORG\", \"I-ORG\"]\n        ],\n        \"INPUT_TOKEN_IDS\": [\n            [\n                1287,\n                9779,\n                1389,\n                1110,\n                5076,\n                1107,\n                1103,\n                1239,\n                1104,\n                3312,\n                119,\n                0,\n                0,\n                0,\n  
              0,\n                0,\n                0,\n                0,\n                0,\n                0,\n            ]\n        ],\n        \"INPUT_LABEL_IDS\": INPUT_LABEL_IDS,\n        \"INPUT_MASK\": [[1] * 11 + [0] * 9],\n        \"PREDICTED_LABELS\": [\n            [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n        ],\n        \"TRAILING_TOKEN_MASK\": TRAILING_TOKEN_MASK,\n        \"UNIQUE_LABELS\": UNIQUE_LABELS,\n        \"LABEL_MAP\": LABEL_MAP,\n        \"EXPECTED_TOKENS_NO_PADDING\": [\n            [\"I-PER\", \"X\", \"X\", \"O\", \"O\", \"O\", \"O\", \"I-ORG\", \"I-ORG\", \"I-ORG\", \"O\"]\n        ],\n        \"EXPECTED_TOKENS_NO_PADDING_NO_TRAILING\": [\n            [\"I-PER\", \"O\", \"O\", \"O\", \"O\", \"I-ORG\", \"I-ORG\", \"I-ORG\", \"O\"]\n        ],\n        \"EXPECTED_TRAILING_TOKEN_MASK\": TRAILING_TOKEN_MASK,\n        \"EXPECTED_LABEL_IDS\": INPUT_LABEL_IDS,\n    }\n\n\n@pytest.fixture(scope=\"module\")\ndef qa_test_df():\n    test_df = pd.DataFrame(\n        {\n            \"doc_text\": [\n                \"The color of the sky is blue.\",\n                \"Architecturally, the school has a Catholic character. Atop the \"\n                \"Main Building's gold dome is a golden statue of the Virgin Mary. \"\n                \"Immediately in front of the Main Building and facing it, is a \"\n                \"copper statue of Christ with arms upraised with the \"\n                'legend \"Venite Ad Me Omnes\". Next to the Main Building is the '\n                \"Basilica of the Sacred Heart. Immediately behind the basilica is \"\n                \"the Grotto, a Marian place of prayer and reflection. It is a \"\n                \"replica of the grotto at Lourdes, France where the Virgin Mary \"\n                \"reputedly appeared to Saint Bernadette Soubirous in 1858. 
At \"\n                \"the end of the main drive (and in a direct line that connects \"\n                \"through 3 statues and the Gold Dome), is a simple, modern stone \"\n                \"statue of Mary.\",\n            ],\n            \"question_text\": [\n                \"What's the color of the sky?\",\n                \"To whom did the Virgin Mary allegedly appear in 1858 in Lourdes \"\n                \"France?\",\n            ],\n            \"answer_start\": [24, 515],\n            \"answer_text\": [\"blue\", \"Saint Bernadette Soubirous\"],\n            \"answer_start_list\": [[24], [515]],\n            \"answer_text_list\": [[\"blue\"], [\"Saint Bernadette Soubirous\"]],\n            \"answer_start_multi\": [[24, 25], [515, 516]],\n            \"answer_text_multi\": [\n                [\"blue\", \"grey\"],\n                [\"Saint Bernadette Soubirous\", \"Bernadette Soubirous\"],\n            ],\n            \"qa_id\": [\"1\", \"2\"],\n            \"is_impossible\": [False, False],\n        }\n    )\n\n    return {\n        \"test_df\": test_df,\n        \"doc_text_col\": \"doc_text\",\n        \"question_text_col\": \"question_text\",\n        \"answer_start_col\": \"answer_start\",\n        \"answer_text_col\": \"answer_text\",\n        \"answer_start_list_col\": \"answer_start_list\",\n        \"answer_text_list_col\": \"answer_text_list\",\n        \"answer_start_multi_col\": \"answer_start_multi\",\n        \"answer_text_multi_col\": \"answer_text_multi\",\n        \"qa_id_col\": \"qa_id\",\n        \"is_impossible_col\": \"is_impossible\",\n    }\n\n\ndef pytest_addoption(parser):\n    parser.addoption(\n        \"--subscription_id\", help=\"Azure Subscription Id to create resources in\"\n    )\n    parser.addoption(\"--resource_group\", help=\"Name of the resource group\")\n    parser.addoption(\"--workspace_name\", help=\"Name of Azure ML Workspace\")\n    parser.addoption(\n        \"--workspace_region\", help=\"Azure region to create 
the workspace in\"\n    )\n    parser.addoption(\"--cluster_name\", help=\"Name of the AzureML Cluster.\")\n\n\n@pytest.fixture(scope=\"module\")\ndef subscription_id(request):\n    return request.config.getoption(\"--subscription_id\")\n\n\n@pytest.fixture(scope=\"module\")\ndef resource_group(request):\n    return request.config.getoption(\"--resource_group\")\n\n\n@pytest.fixture(scope=\"module\")\ndef workspace_name(request):\n    return request.config.getoption(\"--workspace_name\")\n\n\n@pytest.fixture(scope=\"module\")\ndef workspace_region(request):\n    return request.config.getoption(\"--workspace_region\")\n\n\n@pytest.fixture(scope=\"module\")\ndef cluster_name(request):\n    return request.config.getoption(\"--cluster_name\")\n\n\n@pytest.fixture()\ndef bert_english_tokenizer():\n    return BERTTokenizer(language=BERTLanguage.ENGLISHCASED, to_lower=False)\n\n\n@pytest.fixture()\ndef xlnet_english_tokenizer():\n    return XLNetTokenizer(language=XLNetLanguage.ENGLISHCASED)\n\n\n@pytest.fixture(scope=\"module\")\ndef teardown_service(subscription_id, resource_group, workspace_name, workspace_region):\n\n    yield\n\n    # connect to workspace\n    ws = azureml_utils.get_or_create_workspace(\n        subscription_id=subscription_id,\n        resource_group=resource_group,\n        workspace_name=workspace_name,\n        workspace_region=workspace_region,\n    )\n\n    # connect to aci_service\n    aci_service = Webservice(workspace=ws, name=\"aci-test-service\")\n\n    # delete aci_service\n    aci_service.delete()\n"
  },
  {
    "path": "tests/integration/test_ddp_summarization.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport pytest\nimport torch\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_ddp_extractive_summarization_cnndm_transformers(scripts, tmp):\n    ddp_env = os.environ.copy()\n    ddp_env[\"OMP_NUM_THREADS\"] = str(torch.cuda.device_count())\n    ddp_env[\"KMP_AFFINITY\"] = \"verbose\"\n    script = scripts[\"ddp_bertsumext\"]\n    summary_filename = \"bertsumext_prediction.txt\"\n    import subprocess\n\n    process = subprocess.Popen(\n        [\n            \"python\",\n            script,\n            \"--data_dir\",\n            tmp,\n            \"--cache_dir\",\n            tmp,\n            \"--output_dir\",\n            tmp,\n            \"--quick_run\",\n            \"true\",\n            \"--summary_filename\",\n            summary_filename,\n        ],\n        env=ddp_env,\n        stdout=subprocess.PIPE,\n        stderr=subprocess.PIPE,\n    )\n    stdout, stderr = process.communicate()\n    print(stdout)\n    if process.returncode:\n        print(stdout)\n        print(stderr)\n        assert False\n    assert os.path.exists(os.path.join(tmp, summary_filename))\n\n\n@pytest.mark.skip(\n    reason=\"\"\"it takes too long; if the previous test works,\n            and the notebook runs, this should also work.\"\"\"\n)\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_ddp_abstractive_summarization_cnndm_transformers(scripts, tmp):\n    script = scripts[\"ddp_bertsumabs\"]\n    summary_filename = \"bertsumabs_prediction.txt\"\n    import subprocess\n\n    process = subprocess.Popen(\n        [\n            \"python\",\n            script,\n            \"--data_dir\",\n            tmp,\n            \"--cache_dir\",\n            tmp,\n            \"--output_dir\",\n            tmp,\n            \"--quick_run\",\n            \"true\",\n            \"--batch_size\",\n            \"1\",\n            \"--summary_filename\",\n     
       summary_filename,\n        ],\n        stdout=subprocess.PIPE,\n        stderr=subprocess.PIPE,\n    )\n    stdout, stderr = process.communicate()\n    print(stdout)\n    if process.returncode:\n        print(stdout)\n        print(stderr)\n        assert False\n    assert os.path.exists(os.path.join(tmp, summary_filename))\n"
  },
  {
    "path": "tests/integration/test_gpu_utils.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport torch\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_machine_is_gpu_machine():\n    assert torch.cuda.is_available() is True\n"
  },
  {
    "path": "tests/integration/test_notebooks_abstractive_summarization_bertsumabs.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport papermill as pm\nimport pytest\nimport scrapbook as sb\nfrom tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK\nimport torch\n\nABS_TOL = 0.02\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_abstractive_summarization_bertsumabs_cnndm(notebooks, tmp):\n    notebook_path = notebooks[\"abstractive_summarization_bertsumabs_cnndm\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(\n            QUICK_RUN=True,\n            TOP_N=1000,\n            MAX_POS=512,\n            DATA_FOLDER=tmp,\n            CACHE_DIR=tmp,\n            BATCH_SIZE_PER_GPU=3,\n            REPORT_EVERY=50,\n            MAX_STEPS=100,\n            MODEL_NAME=\"bert-base-uncased\",\n        ),\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    assert pytest.approx(result[\"rouge_2_f_score\"], 0.01, abs=ABS_TOL)\n"
  },
  {
    "path": "tests/integration/test_notebooks_embeddings.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\r\n# Licensed under the MIT License.\r\n\r\nimport pytest\r\nimport papermill as pm\r\n\r\nfrom tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME\r\n\r\n@pytest.mark.integration\r\n@pytest.mark.skip(reason=\"\")\r\n@pytest.mark.notebooks\r\ndef test_embedding_trainer_runs(notebooks):\r\n    notebook_path = notebooks[\"embedding_trainer\"]\r\n    pm.execute_notebook(\r\n        notebook_path,\r\n        OUTPUT_NOTEBOOK,\r\n        kernel_name=KERNEL_NAME,\r\n        parameters=dict(NLP_REPO_PATH=\".\")\r\n    )\r\n"
  },
  {
    "path": "tests/integration/test_notebooks_entailment.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport papermill as pm\nimport scrapbook as sb\nimport os\nimport json\nimport shutil\nfrom tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME\n\nABS_TOL = 0.1\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_entailment_multinli_bert(notebooks, tmp):\n    notebook_path = notebooks[\"entailment_multinli_transformers\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        parameters={\n            \"MODEL_NAME\": \"bert-base-uncased\",\n            \"TO_LOWER\": True,\n            \"TRAIN_DATA_USED_FRACTION\": 0.05,\n            \"DEV_DATA_USED_FRACTION\": 0.05,\n            \"NUM_EPOCHS\": 1,\n            \"CACHE_DIR\": tmp\n        },\n        kernel_name=KERNEL_NAME,\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    assert pytest.approx(result[\"matched_precision\"], 0.76, abs=ABS_TOL)\n    assert pytest.approx(result[\"matched_recall\"], 0.76, abs=ABS_TOL)\n    assert pytest.approx(result[\"matched_f1\"], 0.76, abs=ABS_TOL)\n    assert pytest.approx(result[\"mismatched_precision\"], 0.76, abs=ABS_TOL)\n    assert pytest.approx(result[\"mismatched_recall\"], 0.76, abs=ABS_TOL)\n    assert pytest.approx(result[\"mismatched_f1\"], 0.76, abs=ABS_TOL)\n\n@pytest.mark.integration\n@pytest.mark.azureml\ndef test_entailment_xnli_bert_azureml(\n    notebooks, subscription_id, resource_group, workspace_name, workspace_region, cluster_name\n):\n    notebook_path = notebooks[\"entailment_xnli_bert_azureml\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        parameters={\n            \"DATA_PERCENT_USED\": 0.0025,\n            \"subscription_id\": subscription_id,\n            \"resource_group\": resource_group,\n            \"workspace_name\": workspace_name,\n            \"workspace_region\": workspace_region,\n            
\"cluster_name\": cluster_name,\n        },\n        kernel_name=KERNEL_NAME,\n    )\n\n    with open(\"outputs/results.json\", \"r\") as handle:\n        result_dict = json.load(handle)\n        assert result_dict[\"weighted avg\"][\"f1-score\"] == pytest.approx(0.2, abs=ABS_TOL)\n\n    if os.path.exists(\"outputs\"):\n        shutil.rmtree(\"outputs\")\n"
  },
  {
    "path": "tests/integration/test_notebooks_extractive_summarization.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport papermill as pm\nimport pytest\nimport scrapbook as sb\nfrom tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK\n\nABS_TOL = 0.02\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_extractive_summarization_cnndm_transformers(notebooks, tmp):\n    notebook_path = notebooks[\"extractive_summarization_cnndm_transformer\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(\n            QUICK_RUN=True,\n            TOP_N=100,\n            CHUNK_SIZE=200,\n            USE_PREPROCESSED_DATA=False,\n            DATA_PATH=tmp,\n            CACHE_DIR=tmp,\n            BATCH_SIZE=3000,\n            REPORT_EVERY=50,\n            MAX_STEPS=100,\n            WARMUP_STEPS=5e2,\n            MODEL_NAME=\"distilbert-base-uncased\",\n        ),\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    assert pytest.approx(result[\"rouge_2_f_score\"], 0.1, abs=ABS_TOL)\n\n\n@pytest.mark.skip(reason=\"no need to test\")\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_extractive_summarization_cnndm_transformers_processed(notebooks, tmp):\n    notebook_path = notebooks[\"extractive_summarization_cnndm_transformer\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(\n            QUICK_RUN=True,\n            TOP_N=100,\n            CHUNK_SIZE=200,\n            USE_PREPROCESSED_DATA=True,\n            DATA_PATH=tmp,\n            CACHE_DIR=tmp,\n            PROCESSED_DATA_PATH=tmp,\n            BATCH_SIZE=3000,\n            REPORT_EVERY=50,\n            MAX_STEPS=100,\n            WARMUP_STEPS=5e2,\n            MODEL_NAME=\"distilbert-base-uncased\",\n        ),\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    assert 
pytest.approx(result[\"rouge_2_f_score\"], 0.1, abs=ABS_TOL)\n"
  },
  {
    "path": "tests/integration/test_notebooks_interpretability.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport numpy as np\nimport papermill as pm\nimport scrapbook as sb\nfrom tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_deep_and_unified_understanding(notebooks):\n    notebook_path = notebooks[\"deep_and_unified_understanding\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME)\n    \n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    sigma_numbers = [0.00317593, 0.00172284, 0.00634005, 0.00164305, 0.00317159]\n    sigma_bert = [0.1735696 , 0.14028822, 0.14590865, 0.2263149 , 0.20640415,\n       0.21249843, 0.18685372, 0.14112663, 0.25824168, 0.22399105,\n       0.2393731 , 0.12868434, 0.27386534, 0.35876372]\n    \n    np.testing.assert_array_almost_equal(result[\"sigma_numbers\"], sigma_numbers, decimal=3) \n    np.testing.assert_array_almost_equal(result[\"sigma_bert\"], sigma_bert, decimal=1) \n    "
  },
  {
    "path": "tests/integration/test_notebooks_minilm_abstractive_summarization.py",
    "content": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport papermill as pm\nimport pytest\nimport scrapbook as sb\nfrom tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK\nimport torch\n\nABS_TOL = 0.02\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_minilm_abstractive_summarization_gpu(notebooks, tmp):\n    \"\"\"Run the MiniLM summarization notebook with NUM_GPUS = CUDA device count.\n\n    Carries a _gpu suffix so the CPU test defined below does not shadow it.\n    \"\"\"\n    notebook_path = notebooks[\"minilm_abstractive_summarization\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(\n            QUICK_RUN=True,\n            NUM_GPUS=torch.cuda.device_count(),\n            TOP_N=100,\n            WARMUP_STEPS=5,\n            MAX_STEPS=50,\n            GRADIENT_ACCUMULATION_STEPS=1,\n            TEST_PER_GPU_BATCH_SIZE=2,\n            BEAM_SIZE=3,\n            CLEANUP_RESULTS=True,\n        ),\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    # A bare pytest.approx(...) object is always truthy, so compare with == instead.\n    # NOTE(review): these targets were never enforced before; confirm on a real run.\n    assert result[\"rouge_1_f_score\"] == pytest.approx(0.2, abs=ABS_TOL)\n    assert result[\"rouge_2_f_score\"] == pytest.approx(0.07, abs=ABS_TOL)\n    assert result[\"rouge_l_f_score\"] == pytest.approx(0.16, abs=ABS_TOL)\n\n\n@pytest.mark.cpu\n@pytest.mark.integration\ndef test_minilm_abstractive_summarization(notebooks, tmp):\n    \"\"\"Run the MiniLM summarization notebook on CPU (NUM_GPUS=0, TOP_N=2).\"\"\"\n    notebook_path = notebooks[\"minilm_abstractive_summarization\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(\n            QUICK_RUN=True,\n            NUM_GPUS=0,\n            TOP_N=2,\n            WARMUP_STEPS=5,\n            MAX_STEPS=50,\n            GRADIENT_ACCUMULATION_STEPS=1,\n            TEST_PER_GPU_BATCH_SIZE=2,\n            BEAM_SIZE=3,\n            CLEANUP_RESULTS=True,\n        ),\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    # A bare pytest.approx(...) object is always truthy, so compare with == instead.\n    # NOTE(review): these targets were never enforced before; confirm on a real run.\n    assert result[\"rouge_1_f_score\"] == pytest.approx(0.1, abs=ABS_TOL)\n    assert result[\"rouge_2_f_score\"] == pytest.approx(0.05, abs=ABS_TOL)\n    assert result[\"rouge_l_f_score\"] == pytest.approx(0.1, abs=ABS_TOL)\n"
  },
  {
    "path": "tests/integration/test_notebooks_named_entity_recognition.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport papermill as pm\nimport scrapbook as sb\nfrom tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME\n\nABS_TOL = 0.05\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_ner_wikigold_bert(notebooks, tmp):\n    \"\"\"Run the ner_wikigold_transformer notebook and check precision, recall and f1.\"\"\"\n    notebook_path = notebooks[\"ner_wikigold_transformer\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        parameters={\n            \"DATA_PATH\": tmp,\n            \"CACHE_DIR\": tmp\n        },\n        kernel_name=KERNEL_NAME,\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    # A bare pytest.approx(...) object is always truthy, so compare with == instead.\n    # NOTE(review): these targets were never enforced before; confirm on a real run.\n    assert result[\"precision\"] == pytest.approx(0.80, abs=ABS_TOL)\n    assert result[\"recall\"] == pytest.approx(0.83, abs=ABS_TOL)\n    assert result[\"f1\"] == pytest.approx(0.83, abs=ABS_TOL)\n"
  },
  {
    "path": "tests/integration/test_notebooks_question_answering.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport papermill as pm\nimport scrapbook as sb\nfrom tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME\n\nABS_TOL = 0.2\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_question_answering_squad_transformers(notebooks, tmp):\n    \"\"\"Run the question_answering_squad_transformers notebook; check exact and f1.\"\"\"\n    notebook_path = notebooks[\"question_answering_squad_transformers\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        parameters={\n            \"TRAIN_DATA_USED_PERCENT\": 0.15,\n            \"DEV_DATA_USED_PERCENT\": 0.15,\n            \"NUM_EPOCHS\": 1,\n            \"MAX_SEQ_LENGTH\": 384,\n            \"DOC_STRIDE\": 128,\n            \"PER_GPU_BATCH_SIZE\": 4,\n            \"MODEL_NAME\": \"distilbert-base-uncased\",\n            \"DO_LOWER_CASE\": True,\n            \"CACHE_DIR\": tmp\n        },\n        kernel_name=KERNEL_NAME,\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    # A bare pytest.approx(...) object is always truthy, so compare with == instead.\n    # NOTE(review): these targets were never enforced before; confirm whether the\n    # notebook reports exact/f1 as fractions (0-1) or as percentages (0-100).\n    assert result[\"exact\"] == pytest.approx(0.55, abs=ABS_TOL)\n    assert result[\"f1\"] == pytest.approx(0.70, abs=ABS_TOL)\n\n\n@pytest.mark.integration\n@pytest.mark.azureml\ndef test_bidaf_deep_dive(\n    notebooks, subscription_id, resource_group, workspace_name, workspace_region\n):\n    notebook_path = notebooks[\"bidaf_deep_dive\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        parameters={\n            \"NUM_EPOCHS\": 1,\n            \"config_path\": None,\n            \"PROJECT_FOLDER\": \"examples/question_answering/bidaf-question-answering\",\n            \"SQUAD_FOLDER\": \"examples/question_answering/squad\",\n            \"LOGS_FOLDER\": \"examples/question_answering/\",\n            \"BIDAF_CONFIG_PATH\": \"examples/question_answering/\",\n            \"subscription_id\": subscription_id,\n            \"resource_group\": resource_group,\n            \"workspace_name\": workspace_name,\n            \"workspace_region\": workspace_region,\n        },\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict[\"validation_EM\"]\n    assert result == pytest.approx(0.5, abs=ABS_TOL)\n\n\n@pytest.mark.usefixtures(\"teardown_service\")\n@pytest.mark.integration\n@pytest.mark.azureml\ndef test_bidaf_quickstart(\n    notebooks, subscription_id, resource_group, workspace_name, workspace_region\n):\n    notebook_path = notebooks[\"bidaf_quickstart\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        parameters={\n            \"config_path\": None,\n            \"subscription_id\": subscription_id,\n            \"resource_group\": resource_group,\n            \"workspace_name\": workspace_name,\n            \"workspace_region\": workspace_region,\n            \"webservice_name\": \"aci-test-service\",\n        },\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict[\"answer\"]\n    assert result == \"Bi-Directional Attention Flow\"\n\n\n@pytest.mark.integration\n@pytest.mark.azureml\n@pytest.mark.gpu\ndef test_bert_qa_runs(notebooks, subscription_id, resource_group, workspace_name, workspace_region):\n    notebook_path = notebooks[\"bert_qa_trainer\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        parameters=dict(\n            AZUREML_CONFIG_PATH=\".\",\n            DATA_FOLDER=\"./tests/integration/squad\",\n            PROJECT_FOLDER=\"./tests/integration/transformers\",\n            EXPERIMENT_NAME=\"NLP-QA-BERT-deepdive\",\n            BERT_UTIL_PATH=\"./utils_nlp/azureml/azureml_bert_util.py\",\n            EVALUATE_SQAD_PATH=\"./utils_nlp/eval/evaluate_squad.py\",\n            TRAIN_SCRIPT_PATH=\"./examples/question_answering/bert_run_squad_azureml.py\",\n            BERT_MODEL=\"bert-base-uncased\",\n            NUM_TRAIN_EPOCHS=1.0,\n            NODE_COUNT=1,\n            MAX_TOTAL_RUNS=1,\n            MAX_CONCURRENT_RUNS=1,\n            TARGET_GRADIENT_STEPS=1,\n            INIT_GRADIENT_STEPS=1,\n            subscription_id=subscription_id,\n            resource_group=resource_group,\n            workspace_name=workspace_name,\n            workspace_region=workspace_region,\n        ),\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    assert result[\"f1\"] > 70\n    assert result[\"learning_rate\"] >= 5e-5\n    assert result[\"learning_rate\"] <= 9e-5\n"
  },
  {
    "path": "tests/integration/test_notebooks_sentence_similarity.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport papermill as pm\nimport scrapbook as sb\nfrom tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME\n\n\nABS_TOL = 0.2\nABS_TOL_PEARSONS = 0.05\n\n\n@pytest.fixture(scope=\"module\")\ndef baseline_results():\n    return {\n        \"Word2vec Cosine\": 0.6476606845766778,\n        \"Word2vec Cosine with Stop Words\": 0.6683808069062863,\n        \"Word2vec WMD\": 0.6574175839579567,\n        \"Word2vec WMD with Stop Words\": 0.6574175839579567,\n        \"GLoVe Cosine\": 0.6688056947022161,\n        \"GLoVe Cosine with Stop Words\": 0.6049380247374541,\n        \"GLoVe WMD\": 0.6267300417407605,\n        \"GLoVe WMD with Stop Words\": 0.48470008225931194,\n        \"fastText Cosine\": 0.6707510007525627,\n        \"fastText Cosine with Stop Words\": 0.6771300330824099,\n        \"fastText WMD\": 0.6394958913339955,\n        \"fastText WMD with Stop Words\": 0.5177829727556036,\n        \"TF-IDF Cosine\": 0.6749213786510483,\n        \"TF-IDF Cosine with Stop Words\": 0.7118087132257667,\n        \"Doc2vec Cosine\": 0.5337078384749167,\n        \"Doc2vec Cosine with Stop Words\": 0.4498543211602068,\n    }\n\n\n@pytest.mark.integration\ndef test_similarity_embeddings_baseline_runs(notebooks, baseline_results):\n    notebook_path = notebooks[\"similarity_embeddings_baseline\"]\n    pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)\n    results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict[\"results\"]\n    for key, value in baseline_results.items():\n        assert results[key] == pytest.approx(value, abs=ABS_TOL)\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\n@pytest.mark.skip(\n    reason=\"push for release, no horovod installation automation or documentation yet\"\n)\ndef test_gensen_local(notebooks):\n    notebook_path = notebooks[\"gensen_local\"]\n    pm.execute_notebook(\n        
notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(\n            max_epoch=1,\n            config_filepath=\"examples/sentence_similarity/gensen_config.json\",\n            base_data_path=\"data\",\n        ),\n    )\n\n    results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict[\"results\"]\n    expected = {\"0\": {\"0\": 1, \"1\": 0.95}, \"1\": {\"0\": 0.95, \"1\": 1}}\n\n    for key, value in expected.items():\n        for k, v in value.items():\n            assert results[key][k] == pytest.approx(v, abs=ABS_TOL_PEARSONS)\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_bert_encoder(notebooks, tmp):\n    notebook_path = notebooks[\"bert_encoder\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(NUM_GPUS=1, MAX_SEQ_LENGTH=128, CACHE_DIR=tmp),\n    )\n    size_emb = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict[\"size_emb\"]\n    assert size_emb == 768\n\n\n@pytest.mark.integration\n@pytest.mark.azureml\ndef test_bert_senteval(\n    notebooks, subscription_id, resource_group, workspace_name, workspace_region, tmp\n):\n    notebook_path = notebooks[\"bert_senteval\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(\n            subscription_id=subscription_id,\n            resource_group=resource_group,\n            workspace_name=workspace_name,\n            workspace_region=workspace_region,\n            CACHE_DIR=tmp,\n            LOCAL_UTILS=\"utils_nlp\",\n            LOCAL_SENTEVAL=\"utils_nlp/eval/SentEval\",\n            EXPERIMENT_NAME=\"test-nlp-ss-bert\",\n            CLUSTER_NAME=\"eval-gpu\",\n            MAX_NODES=1,\n        ),\n    )\n    pearson = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict[\"pearson\"]\n    mse = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict[\"mse\"]\n    assert pearson == 
pytest.approx(0.6, abs=ABS_TOL)\n    assert mse < 1.8\n\n\n@pytest.mark.integration\n@pytest.mark.azureml\ndef test_similarity_embeddings_baseline_runs(notebooks, baseline_results):\n    notebook_path = notebooks[\"similarity_embeddings_baseline\"]\n    pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)\n    results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict[\"results\"]\n    for key, value in baseline_results.items():\n        assert results[key] == pytest.approx(value, abs=ABS_TOL)\n\n\n@pytest.mark.usefixtures(\"teardown_service\")\n@pytest.mark.integration\n@pytest.mark.azureml\ndef test_automl_local_deployment_aci(\n    notebooks, subscription_id, resource_group, workspace_name, workspace_region\n):\n    notebook_path = notebooks[\"automl_local_deployment_aci\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        parameters={\n            \"automl_iterations\": 1,\n            \"automl_iteration_timeout\": 7,\n            \"config_path\": None,\n            \"webservice_name\": \"aci-test-service\",\n            \"subscription_id\": subscription_id,\n            \"resource_group\": resource_group,\n            \"workspace_name\": workspace_name,\n            \"workspace_region\": workspace_region,\n        },\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict[\"pearson_correlation\"]\n    assert result == pytest.approx(0.5, abs=ABS_TOL)\n\n\n@pytest.mark.integration\n@pytest.mark.azureml\n@pytest.mark.skip(\n    reason=\"push for release, no horovod installation automation or documentation yet\"\n)\ndef test_gensen_aml_deep_dive(notebooks):\n    notebook_path = notebooks[\"gensen_aml_deep_dive\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        parameters=dict(\n            CACHE_DIR=\"./tests/integration/temp\",\n            AZUREML_CONFIG_PATH=\"./tests/integration/.azureml\",\n            UTIL_NLP_PATH=\"./utils_nlp\",\n            
MAX_EPOCH=1,\n            TRAIN_SCRIPT=\"./examples/sentence_similarity/gensen_train.py\",\n            CONFIG_PATH=\"./examples/sentence_similarity/gensen_config.json\",\n            MAX_TOTAL_RUNS=1,\n            MAX_CONCURRENT_RUNS=1,\n        ),\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    assert result[\"min_val_loss\"] > 5\n    assert result[\"learning_rate\"] >= 0.0001\n    assert result[\"learning_rate\"] <= 0.001\n\n\n@pytest.mark.integration\n@pytest.mark.azureml\n@pytest.mark.skip(\n    reason=\"can't run programmatically, AKS cluster takes ~20 minutes to create and there is no blocking call in the notebook to tell that the cluster creation is in progress\"\n)\ndef test_automl_with_pipelines_deployment_aks(notebooks):\n    notebook_path = notebooks[\"automl_with_pipelines_deployment_aks\"]\n    pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK)\n"
  },
  {
    "path": "tests/integration/test_notebooks_text_classification.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport json\nimport shutil\nimport pytest\nimport papermill as pm\nimport scrapbook as sb\nfrom tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME\n\n\nABS_TOL = 0.1\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_tc_mnli_transformers(notebooks, tmp):\n    \"\"\"Run the tc_mnli_transformers notebook and check accuracy and f1.\"\"\"\n    notebook_path = notebooks[\"tc_mnli_transformers\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(\n            NUM_GPUS=1,\n            DATA_FOLDER=tmp,\n            CACHE_DIR=tmp,\n            BATCH_SIZE=16,\n            NUM_EPOCHS=1,\n            TRAIN_DATA_FRACTION=0.05,\n            TEST_DATA_FRACTION=0.05,\n            MODEL_NAMES=[\"distilbert-base-uncased\"],\n        ),\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    # A bare pytest.approx(...) object is always truthy, so compare with == instead.\n    # NOTE(review): these targets were never enforced before; confirm on a real run.\n    assert result[\"accuracy\"] == pytest.approx(0.885, abs=ABS_TOL)\n    assert result[\"f1\"] == pytest.approx(0.885, abs=ABS_TOL)\n\n\n@pytest.mark.integration\n@pytest.mark.azureml\n@pytest.mark.gpu\ndef test_tc_bert_azureml(\n    notebooks, subscription_id, resource_group, workspace_name, workspace_region, tmp\n):\n    \"\"\"Run the tc_bert_azureml notebook and check the weighted-average f1-score.\n\n    The score is read from outputs/results.json; the outputs folder is removed\n    afterwards whether or not the check passes.\n    \"\"\"\n    notebook_path = notebooks[\"tc_bert_azureml\"]\n\n    train_folder = os.path.join(tmp, \"train\")\n    test_folder = os.path.join(tmp, \"test\")\n\n    parameters = {\n        \"config_path\": None,\n        \"subscription_id\": subscription_id,\n        \"resource_group\": resource_group,\n        \"workspace_name\": workspace_name,\n        \"workspace_region\": workspace_region,\n        \"cluster_name\": \"tc-bert-cluster\",\n        \"DATA_FOLDER\": tmp,\n        \"TRAIN_FOLDER\": train_folder,\n        \"TEST_FOLDER\": test_folder,\n        \"PROJECT_FOLDER\": \".\",\n        \"NUM_PARTITIONS\": 1,\n        \"NODE_COUNT\": 1,\n    }\n\n    pm.execute_notebook(\n        notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=parameters\n    )\n\n    try:\n        with open(\"outputs/results.json\", \"r\") as handle:\n            result_dict = json.load(handle)\n        assert result_dict[\"weighted avg\"][\"f1-score\"] == pytest.approx(\n            0.85, abs=ABS_TOL\n        )\n    finally:\n        # Clean up the notebook outputs even when the check above fails.\n        if os.path.exists(\"outputs\"):\n            shutil.rmtree(\"outputs\")\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_multi_languages_transformer(notebooks, tmp):\n    \"\"\"Run the tc_multi_languages_transformers notebook on the dac dataset.\"\"\"\n    notebook_path = notebooks[\"tc_multi_languages_transformers\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters={\"QUICK_RUN\": True, \"USE_DATASET\": \"dac\"},\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    # A bare pytest.approx(...) object is always truthy, so compare with == instead.\n    # NOTE(review): these targets were never enforced before; confirm on a real run.\n    assert result[\"precision\"] == pytest.approx(0.94, abs=ABS_TOL)\n    assert result[\"recall\"] == pytest.approx(0.94, abs=ABS_TOL)\n    assert result[\"f1\"] == pytest.approx(0.94, abs=ABS_TOL)\n"
  },
  {
    "path": "tests/integration/test_notebooks_unilm_abstractive_summarization.py",
    "content": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport papermill as pm\nimport pytest\nimport scrapbook as sb\nfrom tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK\nimport torch\n\nABS_TOL = 0.02\n\n\n@pytest.mark.gpu\n@pytest.mark.integration\ndef test_unilm_abstractive_summarization_gpu(notebooks, tmp):\n    \"\"\"Run the UniLM summarization notebook with NUM_GPUS = CUDA device count.\n\n    Carries a _gpu suffix so the CPU test defined below does not shadow it.\n    \"\"\"\n    notebook_path = notebooks[\"unilm_abstractive_summarization\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(\n            QUICK_RUN=True,\n            NUM_GPUS=torch.cuda.device_count(),\n            TOP_N=100,\n            WARMUP_STEPS=5,\n            MAX_STEPS=50,\n            GRADIENT_ACCUMULATION_STEPS=1,\n            TEST_PER_GPU_BATCH_SIZE=2,\n            BEAM_SIZE=3,\n            MODEL_DIR=tmp,\n            RESULT_DIR=tmp,\n        ),\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    # A bare pytest.approx(...) object is always truthy, so compare with == instead.\n    # NOTE(review): these targets were never enforced before; confirm on a real run.\n    assert result[\"rouge_1_f_score\"] == pytest.approx(0.2, abs=ABS_TOL)\n    assert result[\"rouge_2_f_score\"] == pytest.approx(0.07, abs=ABS_TOL)\n    assert result[\"rouge_l_f_score\"] == pytest.approx(0.16, abs=ABS_TOL)\n\n\n@pytest.mark.cpu\n@pytest.mark.integration\ndef test_unilm_abstractive_summarization(notebooks, tmp):\n    \"\"\"Run the UniLM summarization notebook on CPU (NUM_GPUS=0, TOP_N=2).\"\"\"\n    notebook_path = notebooks[\"unilm_abstractive_summarization\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(\n            QUICK_RUN=True,\n            NUM_GPUS=0,\n            TOP_N=2,\n            WARMUP_STEPS=5,\n            MAX_STEPS=50,\n            GRADIENT_ACCUMULATION_STEPS=1,\n            TEST_PER_GPU_BATCH_SIZE=2,\n            BEAM_SIZE=3,\n            MODEL_DIR=tmp,\n            RESULT_DIR=tmp,\n        ),\n    )\n    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict\n    # A bare pytest.approx(...) object is always truthy, so compare with == instead.\n    # NOTE(review): these targets were never enforced before; confirm on a real run.\n    assert result[\"rouge_1_f_score\"] == pytest.approx(0.1, abs=ABS_TOL)\n    assert result[\"rouge_2_f_score\"] == pytest.approx(0.05, abs=ABS_TOL)\n    assert result[\"rouge_l_f_score\"] == pytest.approx(0.1, abs=ABS_TOL)\n"
  },
  {
    "path": "tests/notebooks_common.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\n\n# Unless manually modified, python3 should be the name of the current jupyter kernel\n# that runs on the activated conda environment\nKERNEL_NAME = \"python3\"\nOUTPUT_NOTEBOOK = \"output.ipynb\"\n\n\ndef path_notebooks():\n    \"\"\"Returns the path of the notebooks folder\"\"\"\n    return os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, \"examples\"))\n"
  },
  {
    "path": "tests/smoke/test_dataset.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport pytest\n\nfrom utils_nlp.dataset import msrpc\nfrom utils_nlp.dataset import xnli\n\n\n@pytest.mark.smoke\ndef test_msrpc_download(tmp_path):\n    filepath = msrpc.download_msrpc(tmp_path)\n    statinfo = os.stat(filepath)\n    assert statinfo.st_size == 1359872\n\n\n@pytest.mark.skip(reason=\"Can't test it programmatically, needs input\")\n@pytest.mark.smoke\ndef test_msrpc_load_df(tmp_path):\n    df_train = msrpc.load_pandas_df(\n        local_cache_path=tmp_path, dataset_type=\"train\"\n    )\n\n\n@pytest.mark.smoke\ndef test_xnli(tmp_path):\n    df_train = xnli.load_pandas_df(\n        local_cache_path=tmp_path, file_split=\"train\"\n    )\n    assert df_train.shape == (392702, 2)\n"
  },
  {
    "path": "tests/smoke/test_gpu_utils.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport torch\n\n\n@pytest.mark.smoke\n@pytest.mark.gpu\ndef test_machine_is_gpu_machine():\n    assert torch.cuda.is_available() is True\n"
  },
  {
    "path": "tests/smoke/test_word_embeddings.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\n\nimport pytest\nfrom gensim.models.fasttext import FastText\nfrom gensim.models.keyedvectors import Word2VecKeyedVectors\n\nfrom utils_nlp.models.pretrained_embeddings.fasttext import (\n    load_pretrained_vectors as load_fasttext,\n)\nfrom utils_nlp.models.pretrained_embeddings.glove import (\n    load_pretrained_vectors as load_glove,\n)\nfrom utils_nlp.models.pretrained_embeddings.word2vec import (\n    load_pretrained_vectors as load_word2vec,\n)\n\n\n@pytest.mark.smoke\ndef test_load_pretrained_vectors_word2vec(tmp_path):\n    filename = \"GoogleNews-vectors-negative300.bin\"\n    model = load_word2vec(tmp_path, limit=500000)\n    filepath = os.path.join(os.path.join(tmp_path, \"word2vec\"), filename)\n    statinfo = os.stat(filepath)\n    assert statinfo.st_size == 3644258522\n    assert isinstance(model, Word2VecKeyedVectors)\n    assert len(model.vocab) == 500000\n\n\n@pytest.mark.smoke\ndef test_load_pretrained_vectors_glove(tmp_path):\n    filename = \"glove.840B.300d.txt\"\n    model = load_glove(tmp_path, limit=50000)\n    filepath = os.path.join(os.path.join(tmp_path, \"gloVe\"), filename)\n    statinfo = os.stat(filepath)\n    assert statinfo.st_size == 5646236541\n    assert isinstance(model, Word2VecKeyedVectors)\n    assert len(model.vocab) == 50000\n\n\n@pytest.mark.smoke\ndef test_load_pretrained_vectors_fasttext(tmp_path):\n    filename = \"wiki.simple.bin\"\n    model = load_fasttext(tmp_path)\n    filepath = os.path.join(os.path.join(tmp_path, \"fastText\"), filename)\n    statinfo = os.stat(filepath)\n    assert statinfo.st_size == 2668450750\n    assert isinstance(model, FastText)\n\n\n"
  },
  {
    "path": "tests/unit/test_abstractive_summarization_bertsum.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport argparse\nimport nltk\nfrom nltk import tokenize\nimport os\nimport pytest\nimport torch\n\ntorch.set_printoptions(threshold=5000)\n\nfrom utils_nlp.models.transformers.datasets import SummarizationDataset\nfrom utils_nlp.models.transformers.abstractive_summarization_bertsum import (\n    BertSumAbs,\n    BertSumAbsProcessor,\n    validate,\n)\n\n# @pytest.fixture()\ndef source_data():\n    return [\n        [\n            \"Boston, MA welcome to Microsoft/nlp\",\n            \"Welcome to text summarization.\",\n            \"Welcome to Microsoft NERD.\",\n            \"Look outside, what a beautiful Charlse River fall view.\",\n        ],\n        [\"I am just another test case\"],\n        [\"want to test more\"],\n    ]\n\n\n# @pytest.fixture()\ndef target_data():\n    return [\n        [\n            \"welcome to microsoft/nlp.\",\n            \"Welcome to text summarization.\",\n            \"Welcome to Microsoft NERD.\",\n        ],\n        [\"I am just another test summary\"],\n        [\"yest, I agree\"],\n    ]\n\n\n#NUM_GPUS = 2\nos.environ[\"NCCL_IB_DISABLE\"] = \"0\"\n\n\n@pytest.fixture(scope=\"module\")\ndef test_dataset_for_bertsumabs(tmp_module):\n    source = source_data()\n    target = target_data()\n    source_file = os.path.join(tmp_module, \"source.txt\")\n    target_file = os.path.join(tmp_module, \"target.txt\")\n    f = open(source_file, \"w\")\n    for i in source:\n        f.write(\" \".join(i))\n        f.write(\"\\n\")\n    f.close()\n    f = open(target_file, \"w\")\n    for i in target:\n        f.write(\" \".join(i))\n        f.write(\"\\n\")\n    f.close()\n    train_dataset = SummarizationDataset(\n        source_file = source_file,\n        target_file = target_file,\n        source_preprocessing = [tokenize.sent_tokenize],\n        target_preprocessing = [tokenize.sent_tokenize],\n    )\n    test_dataset = 
SummarizationDataset(\n        source_file = source_file,\n        target_file = target_file,\n        source_preprocessing = [tokenize.sent_tokenize],\n        target_preprocessing = [tokenize.sent_tokenize],\n    )\n    processor = BertSumAbsProcessor(cache_dir=tmp_module)\n    batch = processor.collate(train_dataset, 512, \"cuda:0\")\n    assert len(batch.src) == 3\n    return train_dataset, test_dataset\n\n\n@pytest.mark.gpu\n@pytest.fixture()\ndef test_train_model(tmp_module, test_dataset_for_bertsumabs, batch_size=1):\n    CACHE_PATH = (\n        tmp_module  \n    )\n    DATA_PATH = (\n        tmp_module \n    )\n    MODEL_PATH = (\n        tmp_module\n    )\n\n    processor = BertSumAbsProcessor(cache_dir=CACHE_PATH)\n    summarizer = BertSumAbs(processor, cache_dir=CACHE_PATH)\n\n    checkpoint = None\n    train_sum_dataset, test_sum_dataset = test_dataset_for_bertsumabs\n\n    def this_validate(class_obj):\n        return validate(class_obj, test_sum_dataset)\n\n    MAX_STEP = 20\n    TOP_N = 8\n    summarizer.fit(\n        train_sum_dataset,\n        batch_size=batch_size,\n        max_steps=MAX_STEP,\n        local_rank=-1,\n        learning_rate_bert=0.002,\n        learning_rate_dec=0.2,\n        warmup_steps_bert=20000,\n        warmup_steps_dec=10000,\n        num_gpus=None,\n        report_every=10,\n        save_every=100,\n        validation_function=this_validate,\n        fp16=False,\n        fp16_opt_level=\"O1\",\n        checkpoint=checkpoint,\n    )\n    saved_model_path = os.path.join(\n        MODEL_PATH, \"summarizer_step_{}.pt\".format(MAX_STEP)\n    )\n    summarizer.save_model(MAX_STEP, saved_model_path)\n\n    return saved_model_path\n\n\n@pytest.mark.gpu\ndef test_finetuned_model(\n    tmp_module,\n    test_train_model,\n    test_dataset_for_bertsumabs,\n    top_n=8,\n    batch_size=1,\n):\n    CACHE_PATH = (\n        tmp_module  \n    )\n    DATA_PATH = (\n        tmp_module \n    )\n    MODEL_PATH = (\n        tmp_module\n    )\n\n 
   # train_sum_dataset, test_sum_dataset = preprocess_cnndm_abs(need_process=False)\n    train_sum_dataset, test_sum_dataset = test_dataset_for_bertsumabs\n\n    processor = BertSumAbsProcessor(cache_dir=CACHE_PATH)\n    checkpoint = torch.load(test_train_model)\n\n    summarizer = BertSumAbs(\n        processor,\n        cache_dir=CACHE_PATH,\n        test=True,\n        max_pos_length=checkpoint[\"max_pos_length\"],\n    )\n    summarizer.model.load_checkpoint(checkpoint[\"model\"])\n    \n    shortened_dataset = test_sum_dataset.shorten(top_n)\n    reference_summaries = [\n        \"\".join(t).rstrip(\"\\n\") for t in shortened_dataset.get_target()\n    ]\n    print(\"start prediction\")\n    generated_summaries = summarizer.predict(\n        shortened_dataset, batch_size=batch_size, num_gpus=None\n    )\n\n    def _write_list_to_file(list_items, filename):\n        with open(filename, \"w\") as filehandle:\n            # for cnt, line in enumerate(filehandle):\n            for item in list_items:\n                filehandle.write(\"%s\\n\" % item)\n\n    print(\"writing generated summaries\")\n    _write_list_to_file(generated_summaries, os.path.join(CACHE_PATH, \"prediction.txt\"))\n\n    assert len(generated_summaries) == len(reference_summaries)\n"
  },
  {
    "path": "tests/unit/test_abstractive_summarization_seq2seq.py",
    "content": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport os\nimport pytest\n\nfrom utils_nlp.models.transformers.abstractive_summarization_seq2seq import (\n    S2SAbsSumProcessor, \n    S2SAbstractiveSummarizer, \n    S2SConfig\n)\n\nfrom utils_nlp.models.transformers.datasets import (\n    IterableSummarizationDataset,\n    SummarizationDataset,\n)\n\nMAX_SEQ_LENGTH = 96\nMAX_SOURCE_SEQ_LENGTH = 64\nMAX_TARGET_SEQ_LENGTH = 16\nMAX_TGT_LENGTH = 16\n\nTRAIN_PER_GPU_BATCH_SIZE = 1\nTEST_PER_GPU_BATCH_SIZE = 1\n\n\n@pytest.fixture()\ndef s2s_test_data():\n    train_ds = [\n        {\n            \"src\": \"Moscow is usually blanketed in snow for four to five months \"\n            \"a year. But this year, Russia's capital had barely any snow cover \"\n            \"in the whole of February.\",\n            \"tgt\": \"Mowcow is unusually snowless this February.\",\n        },\n        {\n            \"src\": \"US stocks rallied back to life on Wednesday, retracing \"\n            \"losses from the previous day over coronavirus fears.\",\n            \"tgt\": \"US stocks retraced losses on Wednesday.\",\n        },\n        {\n            \"src\": \"The Los Angeles County Board of Supervisors and the \"\n            \"Department of Public Health have declared a local and public \"\n            \"health emergency in response to the spread of coronavirus across \"\n            \"the country, which includes six additional cases in L.A. County.\",\n            \"tgt\": \"Los Angeles County declares health emergency due to \"\n            \"coronavirus concerns.\",\n        },\n        {\n            \"src\": \"Tree cover in US cities is shrinking. A study published last \"\n            \"year by the US Forest Service found that we lost 36 million trees \"\n            \"annually from urban and rural communities over a five-year period. 
\"\n            \"That's a 1% drop from 2009 to 2014\",\n            \"tgt\": \"US cities are losing 36 million trees a year.\",\n        },\n    ]\n    test_ds = [\n        {\n            \"src\": \"A 5-year-old student at an elementary school in Vista, \"\n            \"California, collected enough money to pay off the negative lunch \"\n            \"balances of 123 students at her school.\"\n        },\n        {\n            \"src\": \"As counting gets underway in Israel's unprecedented third \"\n            \"election in 11 months, initial exit polls projected Prime Minister \"\n            \"Benjamin Netanyahu's Likud party as the winners.\"\n        },\n        {\n            \"src\": \"The German automaker's refreshed logo ditches the black ring \"\n            \"for a transparent circle. The rest of it, including the typeface, \"\n            \"has a flatter and more modern look. The blue and white emblem inside \"\n            \"the ring remains.\"\n        },\n        {\n            \"src\": \"Before dawn Tuesday, 24 people were killed, and hundreds of \"\n            \"buildings were destroyed by the storms. 
Officials in Putnam County, \"\n            \"which suffered 18 storm-related deaths, said they are working to \"\n            \"locate 17 people who are unaccounted for, down from 38 earlier in the day.\"\n        },\n    ]\n\n    return {\"train_ds\": train_ds, \"test_ds\": test_ds}\n\n\n@pytest.mark.gpu\n@pytest.mark.parametrize(\"model_name\", [\"unilm-base-cased\", \"minilm-l12-h384-uncased\"])\ndef test_S2SAbstractiveSummarizer(s2s_test_data, tmp, model_name):\n    cache_dir = tmp\n    model_dir = tmp\n    processor = S2SAbsSumProcessor(model_name=model_name, cache_dir=cache_dir)\n    train_dataset = processor.s2s_dataset_from_json_or_file(\n        s2s_test_data[\"train_ds\"], train_mode=True\n    )\n    test_dataset = processor.s2s_dataset_from_json_or_file(\n        s2s_test_data[\"test_ds\"], train_mode=False\n    )\n    abs_summarizer = S2SAbstractiveSummarizer(\n        model_name=model_name,\n        max_seq_length=MAX_SEQ_LENGTH,\n        max_source_seq_length=MAX_SOURCE_SEQ_LENGTH,\n        max_target_seq_length=MAX_TARGET_SEQ_LENGTH,\n        cache_dir=cache_dir,\n    )\n\n    # test fit and predict\n    global_step = abs_summarizer.fit(\n        train_dataset,\n        per_gpu_batch_size=TRAIN_PER_GPU_BATCH_SIZE,\n        save_model_to_dir=model_dir,\n    )\n    abs_summarizer.predict(\n        test_dataset,\n        per_gpu_batch_size=TEST_PER_GPU_BATCH_SIZE,\n        max_tgt_length=MAX_TGT_LENGTH,\n    )\n\n    # test load model from local disk\n    abs_summarizer_loaded = S2SAbstractiveSummarizer(\n        model_name=model_name,\n        load_model_from_dir=model_dir,\n        model_file_name=\"model.{}.bin\".format(global_step),\n        max_seq_length=MAX_SEQ_LENGTH,\n        max_source_seq_length=MAX_SOURCE_SEQ_LENGTH,\n        max_target_seq_length=MAX_TARGET_SEQ_LENGTH,\n        cache_dir=cache_dir,\n    )\n\n    abs_summarizer_loaded.predict(\n        test_dataset,\n        per_gpu_batch_size=TEST_PER_GPU_BATCH_SIZE,\n        
max_tgt_length=MAX_TGT_LENGTH,\n    )\n\n    # test recover model\n    abs_summarizer.fit(\n        train_dataset,\n        per_gpu_batch_size=TRAIN_PER_GPU_BATCH_SIZE,\n        save_model_to_dir=model_dir,\n        recover_step=global_step,\n        recover_dir=model_dir,\n        max_steps=global_step + 3,\n    )\n\n    abs_summarizer.predict(\n        test_dataset,\n        per_gpu_batch_size=TEST_PER_GPU_BATCH_SIZE,\n        max_tgt_length=MAX_TGT_LENGTH,\n    )\n\n\ndef test_S2SAbsSumProcessor(s2s_test_data, tmp):\n    expected_output_length = 4\n    # prepare files for testing\n    train_source_file = os.path.join(tmp, \"train.src\")\n    train_target_file = os.path.join(tmp, \"train.tgt\")\n\n    test_source_file = os.path.join(tmp, \"test.src\")\n\n    train_json_file = os.path.join(tmp, \"train.json\")\n    test_json_file = os.path.join(tmp, \"test.json\")\n\n    with open(train_source_file, \"w\") as src_file, open(\n        train_target_file, \"w\"\n    ) as tgt_file:\n        for item in s2s_test_data[\"train_ds\"]:\n            src_file.write(item[\"src\"] + \"\\n\")\n            tgt_file.write(item[\"tgt\"] + \"\\n\")\n\n    with open(test_source_file, \"w\") as src_file:\n        for item in s2s_test_data[\"test_ds\"]:\n            src_file.write(item[\"src\"] + \"\\n\")\n\n    train_iterable_sum_ds = IterableSummarizationDataset(\n        source_file=train_source_file, target_file=train_target_file\n    )\n    test_iterable_sum_ds = IterableSummarizationDataset(source_file=test_source_file)\n\n    train_sum_ds = SummarizationDataset(\n        source_file=train_source_file, target_file=train_target_file\n    )\n    test_sum_ds = SummarizationDataset(source_file=test_source_file)\n\n    train_sum_ds.save_to_jsonl(train_json_file)\n    test_sum_ds.save_to_jsonl(test_json_file)\n\n    processor = S2SAbsSumProcessor(cache_dir=tmp)\n\n    train_json_output = processor.s2s_dataset_from_json_or_file(\n        input_data=s2s_test_data[\"train_ds\"], 
train_mode=True\n    )\n    test_json_output = processor.s2s_dataset_from_json_or_file(\n        input_data=s2s_test_data[\"test_ds\"], train_mode=False\n    )\n\n    assert len(train_json_output) == expected_output_length\n    assert len(test_json_output) == expected_output_length\n\n    train_file_output = processor.s2s_dataset_from_json_or_file(\n        input_data=train_json_file, train_mode=True\n    )\n    test_file_output = processor.s2s_dataset_from_json_or_file(\n        input_data=test_json_file, train_mode=False\n    )\n\n    assert len(train_file_output) == expected_output_length\n    assert len(test_file_output) == expected_output_length\n\n    train_iterable_sum_ds_output = processor.s2s_dataset_from_iterable_sum_ds(\n        sum_ds=train_iterable_sum_ds, train_mode=True\n    )\n    test_iterable_sum_ds_output = processor.s2s_dataset_from_iterable_sum_ds(\n        sum_ds=test_iterable_sum_ds, train_mode=False\n    )\n\n    assert len(train_iterable_sum_ds_output) == expected_output_length\n    assert len(test_iterable_sum_ds_output) == expected_output_length\n\n    train_sum_ds_output = processor.s2s_dataset_from_sum_ds(\n        sum_ds=train_sum_ds, train_mode=True\n    )\n    test_sum_ds_output = processor.s2s_dataset_from_sum_ds(\n        sum_ds=test_sum_ds, train_mode=False\n    )\n\n    assert len(train_sum_ds_output) == expected_output_length\n    assert len(test_sum_ds_output) == expected_output_length\n\n\ndef test_S2SConfig(tmp):\n    config_file = os.path.join(tmp, \"s2s_config.json\")\n\n    config = S2SConfig()\n\n    config.save_to_json(config_file)\n\n    loaded_config = S2SConfig.load_from_json(config_file)\n\n    assert loaded_config.__dict__ == config.__dict__\n"
  },
  {
    "path": "tests/unit/test_bert_common.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\n\nfrom utils_nlp.models.bert.common import create_data_loader\n\n\ndef test_tokenize(bert_english_tokenizer):\n    text = [\"Hello World.\", \"How you doing?\", \"greatttt\"]\n    tokens = bert_english_tokenizer.tokenize(text)\n    assert len(tokens) == len(text)\n    assert len(tokens[0]) == 3\n    assert len(tokens[1]) == 4\n    assert len(tokens[2]) == 3\n    assert tokens[2][1].startswith(\"##\")\n\n\ndef test_tokenize_ner(ner_test_data, bert_english_tokenizer):\n    seq_length = 20\n\n    # test providing labels\n    preprocessed_tokens = bert_english_tokenizer.tokenize_ner(\n        text=ner_test_data[\"INPUT_TEXT\"],\n        labels=ner_test_data[\"INPUT_LABELS\"],\n        label_map=ner_test_data[\"LABEL_MAP\"],\n        max_len=seq_length,\n    )\n\n    assert len(preprocessed_tokens[0][0]) == seq_length\n    assert len(preprocessed_tokens[1][0]) == seq_length\n    assert (\n        preprocessed_tokens[2] == ner_test_data[\"EXPECTED_TRAILING_TOKEN_MASK\"]\n    )\n    assert preprocessed_tokens[3] == ner_test_data[\"EXPECTED_LABEL_IDS\"]\n\n    # test when input is a single list\n    preprocessed_tokens = bert_english_tokenizer.tokenize_ner(\n        text=ner_test_data[\"INPUT_TEXT_SINGLE\"],\n        labels=ner_test_data[\"INPUT_LABELS_SINGLE\"],\n        label_map=ner_test_data[\"LABEL_MAP\"],\n        max_len=seq_length,\n    )\n\n    assert len(preprocessed_tokens[0][0]) == seq_length\n    assert len(preprocessed_tokens[1][0]) == seq_length\n    assert (\n        preprocessed_tokens[2] == ner_test_data[\"EXPECTED_TRAILING_TOKEN_MASK\"]\n    )\n    assert preprocessed_tokens[3] == ner_test_data[\"EXPECTED_LABEL_IDS\"]\n\n    # test not providing labels\n    preprocessed_tokens = bert_english_tokenizer.tokenize_ner(\n        text=ner_test_data[\"INPUT_TEXT\"],\n        label_map=ner_test_data[\"LABEL_MAP\"],\n        
max_len=20,\n    )\n    assert (\n        preprocessed_tokens[2] == ner_test_data[\"EXPECTED_TRAILING_TOKEN_MASK\"]\n    )\n\n    # test exception when number of words and number of labels are different\n    with pytest.raises(ValueError):\n        preprocessed_tokens = bert_english_tokenizer.tokenize_ner(\n            text=ner_test_data[\"INPUT_TEXT\"],\n            labels=ner_test_data[\"INPUT_LABELS_WRONG\"],\n            label_map=ner_test_data[\"LABEL_MAP\"],\n            max_len=seq_length,\n        )\n\n\ndef test_create_data_loader(ner_test_data):\n    with pytest.raises(ValueError):\n        create_data_loader(\n            input_ids=ner_test_data[\"INPUT_TOKEN_IDS\"],\n            input_mask=ner_test_data[\"INPUT_MASK\"],\n            label_ids=ner_test_data[\"INPUT_LABEL_IDS\"],\n            sample_method=\"dummy\",\n        )\n\n    create_data_loader(\n        input_ids=ner_test_data[\"INPUT_TOKEN_IDS\"],\n        input_mask=ner_test_data[\"INPUT_MASK\"],\n        label_ids=ner_test_data[\"INPUT_LABEL_IDS\"],\n        sample_method=\"sequential\",\n    )\n\n    create_data_loader(\n        input_ids=ner_test_data[\"INPUT_TOKEN_IDS\"],\n        input_mask=ner_test_data[\"INPUT_MASK\"],\n        label_ids=ner_test_data[\"INPUT_LABEL_IDS\"],\n        sample_method=\"random\",\n    )\n"
  },
  {
    "path": "tests/unit/test_bert_encoder.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\n\nfrom utils_nlp.models.bert.common import Language\nfrom utils_nlp.models.bert.sequence_encoding import BERTSentenceEncoder\n\n@pytest.fixture()\ndef data():\n    return [\"The quick brown fox jumps over the lazy dog\", \"the coffee is very acidic\"]\n\ndef test_encoder(tmp, data):\n    se = BERTSentenceEncoder(\n        language=Language.ENGLISH,\n        num_gpus=0,\n        cache_dir=tmp,\n    )\n    embeddings = se.encode(data, as_numpy=True)\n    assert len(embeddings) == 2\n    assert len(embeddings[0]) == 768"
  },
  {
    "path": "tests/unit/test_bert_sentence_encoding.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\n\nfrom utils_nlp.models.bert.common import Language\nfrom utils_nlp.models.bert.sequence_encoding import BERTSentenceEncoder, PoolingStrategy\nfrom sklearn.metrics.pairwise import cosine_similarity\n\n\n@pytest.fixture()\ndef data():\n    return [\n        \"how old are you?\",\n        \"what's your age?\",\n        \"my phone is good\",\n        \"your cellphone looks great.\",\n    ]\n\n\ndef test_sentence_encoding(tmp, data):\n    se = BERTSentenceEncoder(\n        language=Language.ENGLISH,\n        num_gpus=0,\n        to_lower=True,\n        max_len=128,\n        layer_index=-2,\n        pooling_strategy=PoolingStrategy.MEAN,\n        cache_dir=tmp,\n    )\n\n    result = se.encode(data, as_numpy=False)\n    similarity = cosine_similarity(result[\"values\"].values.tolist())\n    assert similarity[0, 0] > similarity[1, 0]\n    assert similarity[0, 1] > similarity[0, 2]\n"
  },
  {
    "path": "tests/unit/test_common_pytorch_utils.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"PyTorch utils tests.\"\"\"\n\nimport pytest\nimport torch\nimport torch.nn as nn\nfrom torch.nn.parallel.data_parallel import DataParallel\n\nfrom utils_nlp.common.pytorch_utils import (\n    get_device,\n    move_model_to_device,\n    parallelize_model,\n)\n\n\n@pytest.fixture\ndef model():\n    return nn.Sequential(nn.Linear(24, 8), nn.ReLU(), nn.Linear(8, 2), nn.Sigmoid())\n\n\ndef test_get_device_cpu():\n    device, gpus = get_device(num_gpus=0)\n    assert isinstance(device, torch.device)\n    assert device.type == \"cpu\"\n    assert gpus == 0\n\n    device, gpus = get_device(gpu_ids=[])\n    assert device.type == \"cpu\"\n    assert gpus == 0\n\n\n@pytest.mark.gpu\ndef test_machine_is_gpu_machine():\n    assert torch.cuda.is_available() is True\n\n\n@pytest.mark.gpu\ndef test_get_device_gpu():\n    device, gpus = get_device(num_gpus=1)\n    assert isinstance(device, torch.device)\n    assert device.type == \"cuda\"\n    assert gpus == 1\n\n    device, gpus = get_device(gpu_ids=[0])\n    assert device.type == \"cuda\"\n    assert gpus == 1\n\n\n@pytest.mark.gpu\ndef test_get_device_all_gpus():\n    device, gpus = get_device()\n    assert isinstance(device, torch.device)\n    assert device.type == \"cuda\"\n    assert gpus == torch.cuda.device_count()\n\n\n@pytest.mark.gpu\ndef test_get_device_local_rank():\n    device, gpus = get_device(local_rank=0)\n    assert isinstance(device, torch.device)\n    assert device.type == \"cuda\"\n    assert device.index == 0\n    assert gpus == 1\n\n\ndef test_get_device_local_rank_cpu():\n    device, gpus = get_device(local_rank=-1, num_gpus=0)\n    assert isinstance(device, torch.device)\n    assert device.type == \"cpu\"\n    assert gpus == 0\n\n\ndef test_move_to_device_cpu(model):\n    # test when device.type=\"cpu\"\n    model_cpu = move_model_to_device(model, torch.device(\"cpu\"))\n    assert 
isinstance(model_cpu, nn.modules.container.Sequential)\n    assert next(model_cpu.parameters()).is_cuda is False\n\n\ndef test_move_to_device_cpu_parallelized(model):\n    # test when input model is parallelized\n    model_parallelized = nn.DataParallel(model)\n    model_parallelized_output = move_model_to_device(\n        model_parallelized, torch.device(\"cpu\")\n    )\n    assert isinstance(model_parallelized_output, nn.modules.container.Sequential)\n    assert next(model_parallelized.module.parameters()).is_cuda is False\n\n\ndef test_move_to_device_exception_not_torch_device(model):\n    # test when device is not torch.device\n    with pytest.raises(ValueError):\n        move_model_to_device(model, \"abc\")\n\n\ndef test_move_to_device_exception_wrong_type(model):\n    # test when device.type is not \"cuda\" or \"cpu\"\n    with pytest.raises(Exception):\n        move_model_to_device(model, torch.device(\"opengl\"))\n\n\n@pytest.mark.skipif(\n    torch.cuda.is_available(),\n    reason=\"Skip if we are executing the cpu tests on a gpu machine\",\n)\ndef test_move_to_device_exception_gpu_model_on_cpu_machine(model):\n    # test when the model is moved to a gpu but it is a cpu machine\n    with pytest.raises(Exception):\n        move_model_to_device(model, torch.device(\"cuda\"))\n\n\n@pytest.mark.gpu\ndef test_parallelize_model_exception_cuda_zero_gpus(model):\n    # test when device.type is cuda, but num_gpus is 0\n    with pytest.raises(ValueError):\n        model = move_model_to_device(model, torch.device(\"cuda\"))\n        parallelize_model(model, torch.device(\"cuda\"), num_gpus=0)\n\n\n@pytest.mark.gpu\ndef test_parallelize_model(model):\n    # test when device.type=\"cuda\" and move model to all devices\n    model_cuda = move_model_to_device(model, torch.device(\"cuda\"))\n    model_cuda = parallelize_model(model_cuda, torch.device(\"cuda\"))\n    num_cuda_devices = torch.cuda.device_count()\n    assert isinstance(model_cuda, DataParallel)\n\n    # test 
moving model to only one gpu\n    model_cuda_1_gpu = move_model_to_device(model, torch.device(\"cuda\"))\n    assert next(model_cuda_1_gpu.parameters()).is_cuda is True\n    model_cuda_1_gpu = parallelize_model(\n        model_cuda_1_gpu, torch.device(\"cuda\"), num_gpus=1\n    )\n    assert next(model_cuda_1_gpu.parameters()).is_cuda is True\n\n    # test parallelize_model can limit the number of devices\n    model_cuda_1_more_gpu = move_model_to_device(model, torch.device(\"cuda\"))\n    model_cuda_1_more_gpu = parallelize_model(\n        model_cuda_1_more_gpu, torch.device(\"cuda\"), num_gpus=num_cuda_devices + 1\n    )\n    assert next(model_cuda_1_more_gpu.module.parameters()).is_cuda is True\n\n    # test parallelize_model on the same number of devices\n    model_cuda_same_gpu = move_model_to_device(model, torch.device(\"cuda\"))\n    model_cuda_same_gpu = parallelize_model(\n        model_cuda_same_gpu, torch.device(\"cuda\"), num_gpus=num_cuda_devices\n    )\n    assert next(model_cuda_same_gpu.module.parameters()).is_cuda is True\n\n    # test parallelize_model with gpu id\n    model_base = move_model_to_device(model, torch.device(\"cuda\"))\n    # when gpu id is [], gpu id [0] is used\n    model_cuda_0_gpu = parallelize_model(model_base, torch.device(\"cuda\"), gpu_ids=[])\n    # device has priority ??\n    assert next(model_cuda_1_gpu.parameters()).device == torch.device(\"cuda:0\")\n    assert next(model_cuda_0_gpu.parameters()).is_cuda is True\n\n    # test parallelize_model with gpu id is [0]\n    model_base = move_model_to_device(model, torch.device(\"cuda\"))\n    model_cuda_1_gpu = parallelize_model(model_base, torch.device(\"cuda\"), gpu_ids=[0])\n    assert next(model_cuda_1_gpu.parameters()).is_cuda is True\n\n    # test parallelize_model with gpu id is [0:num_device]\n    model_base = move_model_to_device(model, torch.device(\"cuda\"))\n    model_cuda_same_gpu = parallelize_model(\n        model_base, torch.device(\"cuda\"), 
gpu_ids=list(range(num_cuda_devices))\n    )\n    if num_cuda_devices > 1:\n        assert next(model_cuda_same_gpu.module.parameters()).is_cuda is True\n    else:\n        assert next(model_cuda_same_gpu.parameters()).is_cuda is True\n\n    # test parallelize_model with gpu id is [1: num_devices+3]\n    model_base = move_model_to_device(model, torch.device(\"cuda\"))\n    model_cuda_same_gpu = parallelize_model(\n        model_base,\n        torch.device(\"cuda\"),\n        gpu_ids=[x + 1 for x in list(range(num_cuda_devices + 2))],\n    )\n    if num_cuda_devices > 1:\n        assert next(model_cuda_same_gpu.module.parameters()).is_cuda is True\n    else:\n        assert next(model_cuda_same_gpu.parameters()).is_cuda is True\n\n    # when intersection is only 1\n    model_base = move_model_to_device(model, torch.device(\"cuda\"))\n    gpu_ids = [x + num_cuda_devices - 1 for x in list(range(num_cuda_devices))]\n    model_cuda_intersect_1_gpu = parallelize_model(\n        model_base, torch.device(\"cuda\"), gpu_ids=gpu_ids\n    )\n    assert next(model_cuda_intersect_1_gpu.parameters()).device == torch.device(\n        \"cuda:{}\".format(num_cuda_devices - 1)\n    )\n    assert next(model_cuda_intersect_1_gpu.parameters()).is_cuda is True\n\n    # when there is no intersection, no change to the model \n    model_base = move_model_to_device(model, torch.device(\"cuda\"))\n    model_cuda_intersect_0_gpu = parallelize_model(\n        model_base,\n        torch.device(\"cuda\"),\n        gpu_ids=[x + num_cuda_devices for x in list(range(num_cuda_devices))],\n    )\n    assert (\n        next(model_cuda_intersect_0_gpu.parameters()).device\n        == next(model_base.parameters()).device\n    )\n    assert next(model_cuda_intersect_0_gpu.parameters()).is_cuda is True\n    # test device is cpu original model on gpu\n    model_base = move_model_to_device(model, torch.device(\"cuda\"))\n    model_cuda_cpu = parallelize_model(\n        model_base,\n        
torch.device(\"cpu\"),\n        gpu_ids=[x + num_cuda_devices for x in list(range(num_cuda_devices))],\n    )\n    assert next(model_cuda_cpu.parameters()).is_cuda is True\n    # test device is cpu and original model on cpu\n    model_base = move_model_to_device(model, torch.device(\"cpu\"))\n    model_cuda_cpu = parallelize_model(\n        model_base,\n        torch.device(\"cpu\"),\n        gpu_ids=[x + num_cuda_devices for x in list(range(num_cuda_devices))],\n    )\n    assert next(model_cuda_cpu.parameters()).is_cuda is False\n"
  },
  {
    "path": "tests/unit/test_data_loaders.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport random\n\nimport numpy as np\nimport pytest\nimport json\nimport os\nimport io\n\nfrom utils_nlp.dataset.data_loaders import DaskCSVLoader\nfrom utils_nlp.dataset.data_loaders import DaskJSONLoader\n\nUNIF1 = {\"a\": 4, \"b\": 6, \"n\": 10000}  # some uniform distribution\nrow_size = 5  # \"a,b\\n (5 bytes)\"\njson_row_size = 18  # \"{\"a\": 1, \"b\": 5}\\n (18 bytes)\"\n\n\n@pytest.fixture()\ndef csv_file(tmpdir):\n    random.seed(0)\n    f = tmpdir.mkdir(\"test_loaders\").join(\"tl_data.csv\")\n    f.write(\n        \"\\n\".join(\n            [\n                \"{},{}\".format(\n                    random.randint(0, 1),\n                    random.randint(UNIF1[\"a\"], UNIF1[\"b\"]),\n                )\n                for x in range(UNIF1[\"n\"])\n            ]\n        )\n    )\n    return str(f)\n\n\n@pytest.fixture()\ndef json_file(tmpdir):\n    random.seed(0)\n    json_path = os.path.join(tmpdir, \"test.jsonl\")\n    with io.open(json_path, \"w\", encoding=\"utf8\") as f:\n        for _ in range(UNIF1[\"n\"]):\n            data_dict = {\n                \"a\": random.randint(0, 1),\n                \"b\": random.randint(UNIF1[\"a\"], UNIF1[\"b\"]),\n            }\n            json.dump(data_dict, f)\n            f.write(\"\\n\")\n    return json_path\n\n\ndef test_dask_csv_rnd_loader(csv_file):\n    num_batches = 500\n    batch_size = 12\n    num_partitions = 4\n\n    loader = DaskCSVLoader(\n        csv_file,\n        header=None,\n        block_size=row_size * int(UNIF1[\"n\"] / num_partitions),\n        random_seed=0,\n    )\n\n    sample = []\n    for batch in loader.get_random_batches(num_batches, batch_size):\n        sample.append(list(batch.iloc[:, 1]))\n    sample = np.concatenate(sample)\n\n    assert loader.df.npartitions == num_partitions\n    assert sample.mean().round() == (UNIF1[\"a\"] + UNIF1[\"b\"]) / 2\n    assert 
len(sample) <= num_batches * batch_size\n\n\ndef test_dask_csv_seq_loader(csv_file):\n    batch_size = 12\n    num_partitions = 4\n\n    loader = DaskCSVLoader(\n        csv_file,\n        header=None,\n        block_size=row_size * int(UNIF1[\"n\"] / num_partitions),\n    )\n\n    sample = []\n    for batch in loader.get_sequential_batches(batch_size):\n        sample.append(list(batch.iloc[:, 1]))\n    sample = np.concatenate(sample)\n\n    assert loader.df.npartitions == num_partitions\n    assert sample.mean().round() == (UNIF1[\"a\"] + UNIF1[\"b\"]) / 2\n    assert len(sample) == UNIF1[\"n\"]\n\n\ndef test_dask_json_rnd_loader(json_file):\n    num_batches = 500\n    batch_size = 12\n    num_partitions = 4\n\n    loader = DaskJSONLoader(\n        json_file,\n        block_size=json_row_size * int(UNIF1[\"n\"] / num_partitions),\n        random_seed=0,\n        lines=True,\n    )\n\n    sample = []\n    for batch in loader.get_random_batches(num_batches, batch_size):\n        sample.append(list(batch.iloc[:, 1]))\n    sample = np.concatenate(sample)\n\n    assert loader.df.npartitions == num_partitions\n    assert sample.mean().round() == (UNIF1[\"a\"] + UNIF1[\"b\"]) / 2\n    assert len(sample) <= num_batches * batch_size\n\n\ndef test_dask_json_seq_loader(json_file):\n    batch_size = 12\n    num_partitions = 4\n\n    loader = DaskJSONLoader(\n        json_file,\n        block_size=json_row_size * int(UNIF1[\"n\"] / num_partitions),\n        random_seed=0,\n        lines=True,\n    )\n\n    sample = []\n    for batch in loader.get_sequential_batches(batch_size):\n        sample.append(list(batch.iloc[:, 1]))\n    sample = np.concatenate(sample)\n\n    assert loader.df.npartitions == num_partitions\n    assert sample.mean().round() == (UNIF1[\"a\"] + UNIF1[\"b\"]) / 2\n    assert len(sample) == UNIF1[\"n\"]\n"
  },
  {
    "path": "tests/unit/test_dataset.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport pytest\n\nfrom utils_nlp.dataset.url_utils import maybe_download\nfrom utils_nlp.dataset import msrpc\nfrom utils_nlp.dataset import wikigold\nfrom utils_nlp.dataset import xnli\nfrom utils_nlp.dataset import snli\nfrom utils_nlp.dataset import Split\nfrom utils_nlp.dataset import squad\nfrom utils_nlp.dataset.ner_utils import preprocess_conll\nfrom utils_nlp.dataset.cnndm import CNNDMSummarizationDatasetOrg\nfrom utils_nlp.models.transformers.datasets import (\n    SummarizationDataset,\n    IterableSummarizationDataset,\n)\n\n\n@pytest.fixture\ndef ner_utils_test_data(scope=\"module\"):\n    return {\n        \"input\": \"The O\\n139th I-ORG\\nwas O\\nformed O\\nat O\\nCamp I-LOC\\n\"\n        \"Howe I-LOC\\n, O\\nnear O\\nPittsburgh I-LOC\\n, O\\non O\\n\"\n        \"September O\\n1 O\\n, O\\n1862 O\\n. O\\n\\nFrederick I-PER\\n\"\n        \"H. I-PER\\nCollier I-PER\\nwas O\\nthe O\\nfirst O\\ncolonel O\\n. 
O\",\n        \"expected_output\": (\n            [\n                [\n                    \"The\",\n                    \"139th\",\n                    \"was\",\n                    \"formed\",\n                    \"at\",\n                    \"Camp\",\n                    \"Howe\",\n                    \",\",\n                    \"near\",\n                    \"Pittsburgh\",\n                    \",\",\n                    \"on\",\n                    \"September\",\n                    \"1\",\n                    \",\",\n                    \"1862\",\n                    \".\",\n                ],\n                [\"Frederick\", \"H.\", \"Collier\", \"was\", \"the\", \"first\", \"colonel\", \".\"],\n            ],\n            [\n                [\n                    \"O\",\n                    \"I-ORG\",\n                    \"O\",\n                    \"O\",\n                    \"O\",\n                    \"I-LOC\",\n                    \"I-LOC\",\n                    \"O\",\n                    \"O\",\n                    \"I-LOC\",\n                    \"O\",\n                    \"O\",\n                    \"O\",\n                    \"O\",\n                    \"O\",\n                    \"O\",\n                    \"O\",\n                ],\n                [\"I-PER\", \"I-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"O\", \"O\"],\n            ],\n        ),\n    }\n\n\ndef test_maybe_download():\n    # ToDo: Change this url when repo goes public.\n    file_url = \"https://raw.githubusercontent.com/Microsoft/Recommenders/master/LICENSE\"\n    filepath = \"license.txt\"\n    assert not os.path.exists(filepath)\n    filepath = maybe_download(file_url, \"license.txt\", expected_bytes=1162)\n    assert os.path.exists(filepath)\n    os.remove(filepath)\n    with pytest.raises(IOError):\n        filepath = maybe_download(file_url, \"license.txt\", expected_bytes=0)\n\n\ndef test_msrpc():\n    with pytest.raises(Exception):\n        
msrpc.load_pandas_df(dataset_type=\"Dummy\")\n\n\ndef test_wikigold(tmp_path):\n    wg_sentence_count = 1841\n    wg_test_fraction = 0.5\n    wg_test_sentence_count = round(wg_sentence_count * wg_test_fraction)\n    wg_train_sentence_count = wg_sentence_count - wg_test_sentence_count\n\n    downloaded_file = os.path.join(tmp_path, \"wikigold.conll.txt\")\n    assert not os.path.exists(downloaded_file)\n\n    train_df, test_df = wikigold.load_train_test_dfs(\n        tmp_path, test_fraction=wg_test_fraction\n    )\n\n    assert os.path.exists(downloaded_file)\n\n    assert train_df.shape == (wg_train_sentence_count, 2)\n    assert test_df.shape == (wg_test_sentence_count, 2)\n\n\ndef test_ner_utils(ner_utils_test_data):\n    output = preprocess_conll(ner_utils_test_data[\"input\"], sep=\" \")\n    assert output == ner_utils_test_data[\"expected_output\"]\n\n\ndef test_xnli(tmp_path):\n    # Only test for the dev df as the train dataset takes several\n    # minutes to download\n    dev_df = xnli.load_pandas_df(local_cache_path=tmp_path, file_split=\"dev\")\n    assert dev_df.shape == (2490, 2)\n\n\ndef test_snli(tmp_path):\n    df_train = snli.load_pandas_df(local_cache_path=tmp_path, file_split=Split.TRAIN)\n    assert df_train.shape == (550152, 14)\n    df_test = snli.load_pandas_df(local_cache_path=tmp_path, file_split=Split.TEST)\n    assert df_test.shape == (10000, 14)\n    df_dev = snli.load_pandas_df(local_cache_path=tmp_path, file_split=Split.DEV)\n    assert df_dev.shape == (10000, 14)\n\n\ndef test_squad(tmp_path):\n    v1_train_df = squad.load_pandas_df(\n        local_cache_path=tmp_path, squad_version=\"v1.1\", file_split=\"train\"\n    )\n    assert v1_train_df.shape == (87599, 6)\n\n    v1_dev_df = squad.load_pandas_df(\n        local_cache_path=tmp_path, squad_version=\"v1.1\", file_split=\"dev\"\n    )\n    assert v1_dev_df.shape == (10570, 6)\n\n    v2_train_df = squad.load_pandas_df(\n        local_cache_path=tmp_path, squad_version=\"v2.0\", 
file_split=\"train\"\n    )\n    assert v2_train_df.shape == (130319, 6)\n\n    v2_dev_df = squad.load_pandas_df(\n        local_cache_path=tmp_path, squad_version=\"v2.0\", file_split=\"dev\"\n    )\n    assert v2_dev_df.shape == (11873, 6)\n\n\ndef test_CNNDMSummarizationDatasetOrg(tmp):\n    expected_train_ds_length = 287113\n    expected_test_ds_length = 11490\n    expected_dev_ds_length = 13368\n\n    top_n = 100\n\n    train_sum_ds, test_sum_ds, dev_sum_ds = CNNDMSummarizationDatasetOrg(\n        local_path=tmp, top_n=-1, return_iterable=False, return_dev_data=True\n    )\n\n    assert isinstance(train_sum_ds, SummarizationDataset)\n    assert isinstance(test_sum_ds, SummarizationDataset)\n    assert isinstance(dev_sum_ds, SummarizationDataset)\n    assert len(train_sum_ds) == expected_train_ds_length\n    assert len(test_sum_ds) == expected_test_ds_length\n    assert len(dev_sum_ds) == expected_dev_ds_length\n\n    train_sum_ds_top_n, test_sum_ds_top_n = CNNDMSummarizationDatasetOrg(\n        local_path=tmp, top_n=top_n, return_iterable=False, return_dev_data=False\n    )\n\n    assert len(train_sum_ds_top_n) == top_n\n    assert len(test_sum_ds_top_n) == top_n\n\n    (\n        train_iterable_sum_ds,\n        test_iterable_sum_ds,\n        dev_iterable_sum_ds,\n    ) = CNNDMSummarizationDatasetOrg(\n        local_path=tmp, top_n=100, return_iterable=True, return_dev_data=True\n    )\n\n    assert isinstance(train_iterable_sum_ds, IterableSummarizationDataset)\n    assert isinstance(test_iterable_sum_ds, IterableSummarizationDataset)\n    assert isinstance(dev_iterable_sum_ds, IterableSummarizationDataset)\n"
  },
  {
    "path": "tests/unit/test_dataset_pytorch.py",
    "content": "from utils_nlp.models.transformers.datasets import QADataset\n\n\ndef test_QADataset(qa_test_df):\n    dataset = QADataset(\n        df=qa_test_df[\"test_df\"],\n        doc_text_col=qa_test_df[\"doc_text_col\"],\n        question_text_col=qa_test_df[\"question_text_col\"],\n        answer_start_col=qa_test_df[\"answer_start_col\"],\n        answer_text_col=qa_test_df[\"answer_text_col\"],\n        qa_id_col=qa_test_df[\"qa_id_col\"],\n        is_impossible_col=qa_test_df[\"is_impossible_col\"],\n    )\n\n    for i in range(2):\n        assert dataset[i].doc_text == qa_test_df[\"test_df\"][qa_test_df[\"doc_text_col\"]][i]\n        assert dataset[i].question_text == qa_test_df[\"test_df\"][qa_test_df[\"question_text_col\"]][i]\n        assert dataset[i].answer_start == qa_test_df[\"test_df\"][qa_test_df[\"answer_start_col\"]][i]\n        assert dataset[i].answer_text == qa_test_df[\"test_df\"][qa_test_df[\"answer_text_col\"]][i]\n        assert dataset[i].qa_id == qa_test_df[\"test_df\"][qa_test_df[\"qa_id_col\"]][i]\n        assert dataset[i].is_impossible == qa_test_df[\"test_df\"][qa_test_df[\"is_impossible_col\"]][i]\n\n    dataset_default = QADataset(\n        df=qa_test_df[\"test_df\"],\n        doc_text_col=qa_test_df[\"doc_text_col\"],\n        question_text_col=qa_test_df[\"question_text_col\"],\n    )\n\n    for i in range(2):\n        assert dataset_default[i].doc_text == qa_test_df[\"test_df\"][qa_test_df[\"doc_text_col\"]][i]\n        assert (\n            dataset_default[i].question_text\n            == qa_test_df[\"test_df\"][qa_test_df[\"question_text_col\"]][i]\n        )\n        assert dataset_default[i].answer_start == -1\n        assert dataset_default[i].answer_text == \"\"\n        assert dataset_default[i].qa_id == i\n        assert dataset_default[i].is_impossible == False\n"
  },
  {
    "path": "tests/unit/test_distributed_sampler.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nfrom utils_nlp.models.transformers.extractive_summarization import IterableDistributedSampler\n\n@pytest.mark.cpu\ndef test_sampler():\n    sampler = IterableDistributedSampler(1, 0, -1)\n    samples = list(sampler.iter('abcdefg'))\n    assert ''.join(samples) == 'abcdefg'\n\n    sampler = IterableDistributedSampler(2, 0, -1)\n    samples = list(sampler.iter('abcdefg'))\n    assert ''.join(samples) == 'abcdefg'\n\n    sampler = IterableDistributedSampler(4, 1, 1)\n    samples = list(sampler.iter('abcdefg'))\n    assert ''.join(samples) == 'bf'\n\n    sampler = IterableDistributedSampler(4, 2, 2)\n    samples = list(sampler.iter('abcdefg'))\n    assert ''.join(samples) == 'cg'\n\n    sampler = IterableDistributedSampler(4, 3, 3)\n    samples = list(sampler.iter('abcdefg'))\n    assert ''.join(samples) == 'd'\n\n    sampler = IterableDistributedSampler(8, 7, 3)\n    samples = list(sampler.iter('abcdefghijklmn'))\n    assert ''.join(samples) == 'h'\n\n\n"
  },
  {
    "path": "tests/unit/test_eval_classification.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport numpy as np\n\nfrom utils_nlp.eval.classification import compute_correlation_coefficients\n\n\ndef test_compute():\n    x = np.random.rand(2, 100)\n    df = compute_correlation_coefficients(x)\n    assert df.shape == (2, 2)\n\n    y = np.random.rand(2, 100)\n    df = compute_correlation_coefficients(x, y)\n    assert df.shape == (4, 4)\n"
  },
  {
    "path": "tests/unit/test_eval_compute_rouge.py",
    "content": "import os\nimport pytest\nfrom utils_nlp.eval import compute_rouge_perl, compute_rouge_python\n\n# NOTE: pytest.approx only builds a comparison object; it verifies nothing\n# unless it is compared inside an assert. Every check below is therefore\n# written as `assert actual == pytest.approx(expected, abs=ABS_TOL)`.\nABS_TOL = 0.00001\n\n# Expected English scores: each constant is the mean of the two per-case values\n# derived in the comments inside rouge_test_data.\nR1R = 0.71429\nR1P = 0.77381\nR1F = 0.74176\nR2R = 0.44231\nR2P = 0.49231\nR2F = 0.46504\nRLR = 0.67857\nRLP = 0.73810\nRLF = 0.70605\n\n# Expected Hindi scores, averaged over the two Hindi cases in the same way.\nR1R_hi = 0.53571\nR1P_hi = 0.68125\nR1F_hi = 0.59804\nR2R_hi = 0.28431\nR2P_hi = 0.38334\nR2F_hi = 0.325\nRLR_hi = 0.53571\nRLP_hi = 0.68125\nRLF_hi = 0.59804\n\n\n@pytest.fixture()\ndef rouge_test_data():\n    \"\"\"Provide English and Hindi candidate/reference summaries.\"\"\"\n    ## First English testing case:\n    # Unigrams in candidate: 14\n    # Unigrams in reference: 14\n    # Unigram overlapping: 10\n    # Bigrams in candidate: 13\n    # Bigrams in reference: 13\n    # Bigram overlapping: 5\n    # LCS: 6, 3\n    # ROUGE-1 R: 10/14 = 0.71429\n    # ROUGE-1 P: 10/14 = 0.71429\n    # ROUGE-1 F: 2/(14/10 + 14/10) = 20/28 = 0.71429\n    # ROUGE-2 R: 5/13 = 0.38462\n    # ROUGE-2 P: 5/13 = 0.38462\n    # ROUGE-2 F: 0.38462\n    # ROUGE-L R: (6+3)/(9+5) = 0.64286\n    # ROUGE-L P: 0.64286\n    # ROUGE-L F: 0.64286\n\n    ## Second English testing case:\n    # Unigrams in candidate: 6\n    # Unigrams in reference: 7\n    # Unigram overlapping: 5\n    # Bigrams in candidate: 5\n    # Bigrams in reference: 6\n    # Bigram overlapping: 3\n    # LCS: 5\n    # ROUGE-1 R: 5/7 = 0.71429\n    # ROUGE-1 P: 5/6 = 0.83333\n    # ROUGE-1 F: 2/(7/5 + 6/5) = 10/13 = 0.76923\n    # ROUGE-2 R: 3/6 = 0.5\n    # ROUGE-2 P: 3/5 = 0.6\n    # ROUGE-2 F: 2/(6/3 + 5/3) = 6/11 = 0.54545\n    # ROUGE-L R: 5/7 = 0.71429\n    # ROUGE-L P: 5/6 = 0.83333\n    # ROUGE-L F: 2/(7/5 + 6/5) = 10/13 = 0.76923\n\n    summary_candidates = [\n        \"The stock market is doing very well this year. Hope the same for 2020\",\n        \"The new movie is very popular.\",\n    ]\n    summary_references = [\n        \"The stock market is doing really well in 2019. Hope 2020 is the same.\",\n        \"The movie is very popular among millennials.\",\n    ]\n\n    ## First Hindi testing case:\n    # Unigrams in candidate: 16\n    # Unigrams in reference: 18\n    # Unigram overlapping: 9\n    # Bigrams in candidate: 15\n    # Bigrams in reference: 17\n    # Bigram overlapping: 4\n    # LCS: 6, 3 (for each reference sentence, the code checks each candidate sentence)\n    # ROUGE-1 R: 9/18 = 0.5\n    # ROUGE-1 P: 9/16 = 0.5625\n    # ROUGE-1 F: 2/(18/9 + 16/9) = 18/34 = 0.52941\n    # ROUGE-2 R: 4/17 = 0.23529\n    # ROUGE-2 P: 4/15 = 0.26667\n    # ROUGE-2 F: 2/(17/4 + 15/4) = 8/32 = 0.25\n    # ROUGE-L R: (6+3)/18 = 0.5\n    # ROUGE-L P: (6+3)/16 = 0.5625\n    # ROUGE-L F: 2/(18/9 + 16/9) = 18/34 = 0.52941\n\n    ## Second Hindi testing case:\n    # Unigrams in candidate: 5\n    # Unigrams in reference: 7\n    # Unigram overlapping: 4\n    # Bigrams in candidate: 4\n    # Bigrams in reference: 6\n    # Bigram overlapping: 2\n    # LCS: 4\n    # ROUGE-1 R: 4/7 = 0.57143\n    # ROUGE-1 P: 4/5 = 0.8\n    # ROUGE-1 F: 2/(7/4 + 5/4) = 8/12 = 0.66667\n    # ROUGE-2 R: 2/6 = 0.33333\n    # ROUGE-2 P: 2/4 = 0.5\n    # ROUGE-2 F: 2/(6/2 + 4/2) = 4/10 = 0.4\n    # ROUGE-L R: 4/7 = 0.57143\n    # ROUGE-L P: 4/5 = 0.8\n    # ROUGE-L F: 2/(7/4 + 5/4) = 8/12 = 0.66667\n\n    summary_candidates_hi = [\n        \"शेयर बाजार इस साल बहुत अच्छा कर रहा है। 2020 के लिए भी यही उम्मीद है।\",\n        \"नई फिल्म बहुत लोकप्रिय है।\",\n    ]\n    summary_references_hi = [\n        \"शेयर बाजार 2019 में वास्तव में अच्छा कर रहा है। आशा है कि 2020 भी ऐसा ही होगा।\",\n        \"फिल्म सदियों के बीच बहुत लोकप्रिय है।\",\n    ]\n\n    return {\n        \"candidates\": summary_candidates,\n        \"references\": summary_references,\n        \"candidates_hi\": summary_candidates_hi,\n        \"references_hi\": summary_references_hi,\n    }\n\n\ndef test_compute_rouge_perl(rouge_test_data):\n    \"\"\"Perl ROUGE on in-memory English summaries matches the expected scores.\"\"\"\n    rouge_perl = compute_rouge_perl(\n        cand=rouge_test_data[\"candidates\"], ref=rouge_test_data[\"references\"]\n    )\n\n    assert rouge_perl[\"rouge_1_recall\"] == pytest.approx(R1R, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_1_precision\"] == pytest.approx(R1P, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_1_f_score\"] == pytest.approx(R1F, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_2_recall\"] == pytest.approx(R2R, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_2_precision\"] == pytest.approx(R2P, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_2_f_score\"] == pytest.approx(R2F, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_l_recall\"] == pytest.approx(RLR, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_l_precision\"] == pytest.approx(RLP, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_l_f_score\"] == pytest.approx(RLF, abs=ABS_TOL)\n\n\ndef test_compute_rouge_python(rouge_test_data):\n    \"\"\"Python ROUGE on in-memory English summaries matches the expected scores.\"\"\"\n    rouge_python = compute_rouge_python(\n        cand=rouge_test_data[\"candidates\"], ref=rouge_test_data[\"references\"]\n    )\n\n    assert rouge_python[\"rouge-1\"][\"r\"] == pytest.approx(R1R, abs=ABS_TOL)\n    assert rouge_python[\"rouge-1\"][\"p\"] == pytest.approx(R1P, abs=ABS_TOL)\n    assert rouge_python[\"rouge-1\"][\"f\"] == pytest.approx(R1F, abs=ABS_TOL)\n    assert rouge_python[\"rouge-2\"][\"r\"] == pytest.approx(R2R, abs=ABS_TOL)\n    assert rouge_python[\"rouge-2\"][\"p\"] == pytest.approx(R2P, abs=ABS_TOL)\n    assert rouge_python[\"rouge-2\"][\"f\"] == pytest.approx(R2F, abs=ABS_TOL)\n    assert rouge_python[\"rouge-l\"][\"r\"] == pytest.approx(RLR, abs=ABS_TOL)\n    assert rouge_python[\"rouge-l\"][\"p\"] == pytest.approx(RLP, abs=ABS_TOL)\n    assert rouge_python[\"rouge-l\"][\"f\"] == pytest.approx(RLF, abs=ABS_TOL)\n\n\ndef test_compute_rouge_python_hi(rouge_test_data):\n    \"\"\"Python ROUGE on the Hindi summaries matches the expected Hindi scores.\"\"\"\n    rouge_python = compute_rouge_python(\n        cand=rouge_test_data[\"candidates_hi\"], ref=rouge_test_data[\"references_hi\"], language=\"hi\"\n    )\n\n    assert rouge_python[\"rouge-1\"][\"r\"] == pytest.approx(R1R_hi, abs=ABS_TOL)\n    assert rouge_python[\"rouge-1\"][\"p\"] == pytest.approx(R1P_hi, abs=ABS_TOL)\n    assert rouge_python[\"rouge-1\"][\"f\"] == pytest.approx(R1F_hi, abs=ABS_TOL)\n    assert rouge_python[\"rouge-2\"][\"r\"] == pytest.approx(R2R_hi, abs=ABS_TOL)\n    assert rouge_python[\"rouge-2\"][\"p\"] == pytest.approx(R2P_hi, abs=ABS_TOL)\n    assert rouge_python[\"rouge-2\"][\"f\"] == pytest.approx(R2F_hi, abs=ABS_TOL)\n    assert rouge_python[\"rouge-l\"][\"r\"] == pytest.approx(RLR_hi, abs=ABS_TOL)\n    assert rouge_python[\"rouge-l\"][\"p\"] == pytest.approx(RLP_hi, abs=ABS_TOL)\n    assert rouge_python[\"rouge-l\"][\"f\"] == pytest.approx(RLF_hi, abs=ABS_TOL)\n\n\ndef test_compute_rouge_perl_file(rouge_test_data, tmp):\n    \"\"\"Perl ROUGE reading summaries from files matches the expected scores.\"\"\"\n    tmp_cand_file = os.path.join(tmp, \"cand.txt\")\n    tmp_ref_file = os.path.join(tmp, \"ref.txt\")\n\n    # One summary per line, in the same order in both files.\n    with open(tmp_cand_file, \"w\") as f:\n        for s in rouge_test_data[\"candidates\"]:\n            f.write(s + \"\\n\")\n    with open(tmp_ref_file, \"w\") as f:\n        for s in rouge_test_data[\"references\"]:\n            f.write(s + \"\\n\")\n\n    rouge_perl = compute_rouge_perl(cand=tmp_cand_file, ref=tmp_ref_file, is_input_files=True)\n\n    assert rouge_perl[\"rouge_1_recall\"] == pytest.approx(R1R, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_1_precision\"] == pytest.approx(R1P, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_1_f_score\"] == pytest.approx(R1F, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_2_recall\"] == pytest.approx(R2R, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_2_precision\"] == pytest.approx(R2P, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_2_f_score\"] == pytest.approx(R2F, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_l_recall\"] == pytest.approx(RLR, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_l_precision\"] == pytest.approx(RLP, abs=ABS_TOL)\n    assert rouge_perl[\"rouge_l_f_score\"] == pytest.approx(RLF, abs=ABS_TOL)\n\n\ndef test_compute_rouge_python_file(rouge_test_data, tmp):\n    \"\"\"Python ROUGE reading summaries from files matches the expected scores.\"\"\"\n    tmp_cand_file = os.path.join(tmp, \"cand.txt\")\n    tmp_ref_file = os.path.join(tmp, \"ref.txt\")\n\n    # One summary per line, in the same order in both files.\n    with open(tmp_cand_file, \"w\") as f:\n        for s in rouge_test_data[\"candidates\"]:\n            f.write(s + \"\\n\")\n    with open(tmp_ref_file, \"w\") as f:\n        for s in rouge_test_data[\"references\"]:\n            f.write(s + \"\\n\")\n\n    rouge_python = compute_rouge_python(cand=tmp_cand_file, ref=tmp_ref_file, is_input_files=True)\n\n    assert rouge_python[\"rouge-1\"][\"r\"] == pytest.approx(R1R, abs=ABS_TOL)\n    assert rouge_python[\"rouge-1\"][\"p\"] == pytest.approx(R1P, abs=ABS_TOL)\n    assert rouge_python[\"rouge-1\"][\"f\"] == pytest.approx(R1F, abs=ABS_TOL)\n    assert rouge_python[\"rouge-2\"][\"r\"] == pytest.approx(R2R, abs=ABS_TOL)\n    assert rouge_python[\"rouge-2\"][\"p\"] == pytest.approx(R2P, abs=ABS_TOL)\n    assert rouge_python[\"rouge-2\"][\"f\"] == pytest.approx(R2F, abs=ABS_TOL)\n    assert rouge_python[\"rouge-l\"][\"r\"] == pytest.approx(RLR, abs=ABS_TOL)\n    assert rouge_python[\"rouge-l\"][\"p\"] == pytest.approx(RLP, abs=ABS_TOL)\n    assert rouge_python[\"rouge-l\"][\"f\"] == pytest.approx(RLF, abs=ABS_TOL)\n"
  },
  {
    "path": "tests/unit/test_extractive_summarization.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport nltk\nimport pytest\nfrom nltk import tokenize\n\nfrom utils_nlp.models.transformers.datasets import SummarizationDataset\nfrom utils_nlp.models.transformers.extractive_summarization import (\n    ExtractiveSummarizer,\n    ExtSumProcessor,\n)\n\nnltk.download(\"punkt\")\n\n\n# @pytest.fixture()\ndef source_data():\n    return (\n        \"Boston, MA welcome to Microsoft/nlp. Welcome to text summarization.\"\n        \"Welcome to Microsoft NERD.\"\n        \"Look outside, what a beautiful Charlse River fall view.\"\n    )\n\n\n# @pytest.fixture()\ndef target_data():\n    return (\n        \"welcome to microsoft/nlp.\"\n        \"Welcome to text summarization.\"\n        \"Welcome to Microsoft NERD.\"\n    )\n\n\nMODEL_NAME = \"distilbert-base-uncased\"\n\n@pytest.fixture(scope=\"module\")\ndef data(tmp_module):\n    source = source_data()\n    target = target_data()\n    train_dataset = SummarizationDataset(\n        None,\n        source=[source],\n        target=[target],\n        source_preprocessing=[tokenize.sent_tokenize],\n        target_preprocessing=[tokenize.sent_tokenize],\n        word_tokenize=nltk.word_tokenize,\n    )\n    test_dataset = SummarizationDataset(\n        None,\n        source=[source],\n        source_preprocessing=[tokenize.sent_tokenize],\n        word_tokenize=nltk.word_tokenize,\n    )\n\n    processor = ExtSumProcessor(\n        model_name=MODEL_NAME,\n        cache_dir=tmp_module,\n        max_nsents=200,\n        max_src_ntokens=2000,\n        min_nsents=0,\n        min_src_ntokens=1,\n    )\n    ext_sum_train = processor.preprocess(train_dataset, oracle_mode=\"greedy\")\n    ext_sum_test = processor.preprocess(test_dataset, oracle_mode=\"greedy\")\n    return processor, ext_sum_train, ext_sum_test\n\n\n@pytest.mark.gpu\ndef test_bert_training(data, tmp_module):\n\n    CACHE_DIR = tmp_module\n    ENCODER = 
\"transformer\"\n    MAX_POS = 768\n    BATCH_SIZE = 128\n    LEARNING_RATE = 2e-3\n    REPORT_EVERY = 50\n    MAX_STEPS = 20\n    WARMUP_STEPS = 1e2\n\n    processor, train_dataset, test_dataset = data\n    summarizer = ExtractiveSummarizer(\n        processor, MODEL_NAME, ENCODER, MAX_POS, CACHE_DIR\n    )\n    summarizer.fit(\n        train_dataset,\n        num_gpus=None,\n        batch_size=BATCH_SIZE,\n        gradient_accumulation_steps=1,\n        max_steps=MAX_STEPS,\n        lr=LEARNING_RATE,\n        warmup_steps=WARMUP_STEPS,\n        verbose=True,\n        report_every=REPORT_EVERY,\n        clip_grad_norm=False,\n    )\n\n    prediction = summarizer.predict(test_dataset, num_gpus=None, batch_size=128)\n    assert len(prediction) == 1\n"
  },
  {
    "path": "tests/unit/test_gensen_utils.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\n\nimport pandas as pd\n\nfrom utils_nlp.models.gensen.preprocess_utils import gensen_preprocess\nfrom utils_nlp.models.gensen.utils import DataIterator\n\n\ndef test_gensen_preprocess(tmp_path):\n    data = [\n        [\n            \"neutral\",\n            \"it is a lovely day\",\n            \"the weather is great outside.\",\n            [\"it\", \"is\", \"lovely\", \"day\"],\n            [\"the\", \"weather\", \"is\", \"great\", \"outside\"],\n        ]\n    ]\n\n    df = pd.DataFrame(data)\n    df.columns = [\n        \"score\",\n        \"sentence1\",\n        \"sentence2\",\n        \"sentence1_tokens\",\n        \"sentence2_tokens\",\n    ]\n\n    expected_files = [\n        \"snli_1.0_test.txt.lab\",\n        \"snli_1.0_test.txt.s1.tok\",\n        \"snli_1.0_dev.txt.clean.noblank\",\n        \"snli_1.0_train.txt.s1.tok\",\n        \"snli_1.0_train.txt.lab\",\n        \"snli_1.0_dev.txt.s1.tok\",\n        \"snli_1.0_dev.txt.s2.tok\",\n        \"snli_1.0_test.txt.s2.tok\",\n        \"snli_1.0_train.txt.clean\",\n        \"snli_1.0_train.txt.s2.tok\",\n        \"snli_1.0_test.txt.clean.noblank\",\n        \"snli_1.0_test.txt.clean\",\n        \"snli_1.0_train.txt.clean.noblank\",\n        \"snli_1.0_dev.txt.lab\",\n        \"snli_1.0_dev.txt.clean\",\n    ]\n    path = gensen_preprocess(df, df, df, tmp_path)\n    assert os.path.isdir(path) is True\n    assert set(os.listdir(path)) == set(expected_files)\n\n\ndef test_data_iterator():\n    sentences = [\"it is a lovely day\", \"the weather is great outside.\", ]\n    expected_vocab = [\"it\", \"is\", \"a\", \"lovely\", \"day\", \"the\", \"weather\", \"is\", \"great\", \"outside.\"]\n\n    vocab_size = 10\n    di = DataIterator()\n    word2id, id2word = di.construct_vocab(sentences, vocab_size)\n    assert set(expected_vocab).issubset(word2id.keys())\n    assert 
set(expected_vocab).issubset(id2word.values())\n"
  },
  {
    "path": "tests/unit/test_interpreter.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License\n\nimport random\n\nimport pytest\n\nimport numpy as np\nimport torch\nfrom torch import nn\n\nfrom utils_nlp.interpreter.Interpreter import (\n    Interpreter,\n    calculate_regularization,\n)\n\n\ndef fixed_length_Phi(x):\n    return x[0] * 10 + x[1] * 20 - x[2] * 20 - x[3] * 10\n\n\ndef variable_length_Phi(function):\n    return lambda x: (function(x.unsqueeze(0))[0][0])\n\n\n@pytest.fixture\ndef fixed_length_interp():\n    x = torch.randn(4, 10)\n    regular = torch.randn(10)\n    return Interpreter(x, fixed_length_Phi, regularization=regular)\n\n\n@pytest.fixture\ndef variable_length_interp():\n    function = nn.LSTM(10, 10)\n    x = torch.randn(4, 10)\n    regular = torch.randn(1, 10)\n    return Interpreter(\n        x, variable_length_Phi(function), regularization=regular\n    )\n\n\ndef test_fixed_length_regularization():\n    dataset = torch.randn(10, 4, 10)\n    # calculate all hidden states\n    hidden = [fixed_length_Phi(x).tolist() for x in dataset]\n    # calculate the standard deviation\n    hidden = np.array(hidden)\n    regular_gt = np.std(hidden, axis=0)\n    regular = calculate_regularization(dataset, fixed_length_Phi)\n    assert np.sum(np.abs(regular - regular_gt)) < 1e-5\n\n\ndef test_variable_length_regularization():\n    function = nn.LSTM(10, 10)\n    dataset = [torch.randn(random.randint(5, 9), 10) for _ in range(10)]\n    # calculate all hidden states\n    hidden = [\n        np.mean(\n            variable_length_Phi(function)(x).tolist(), axis=0, keepdims=True\n        )\n        for x in dataset\n    ]\n    # calculate the standard deviation\n    hidden = np.array(hidden)\n    regular_gt = np.std(hidden, axis=0)\n    regular = calculate_regularization(\n        dataset, variable_length_Phi(function), reduced_axes=[0]\n    )\n    assert np.sum(np.abs(regular - regular_gt)) < 1e-5\n\n\ndef test_initialize_interpreter():\n    x = 
torch.randn(4, 10)\n    regular = torch.randn(10)\n    interpreter = Interpreter(x, fixed_length_Phi, regularization=regular)\n    assert interpreter.s == 4\n    assert interpreter.d == 10\n    assert interpreter.regular.tolist() == regular.tolist()\n\n\ndef test_train_fixed_length_interp(fixed_length_interp):\n    init_ratio = fixed_length_interp.ratio + 0.0  # make a copy\n    init_regular = fixed_length_interp.regular + 0.0\n    fixed_length_interp.optimize(iteration=10)\n    after_ratio = fixed_length_interp.ratio + 0.0\n    after_regular = fixed_length_interp.regular + 0.0\n    # make sure the ratio is changed when optimizing\n    assert torch.sum(torch.abs(after_ratio - init_ratio)) > 1e-5\n    # make sure the regular is not changed when optimizing\n    assert torch.sum(torch.abs(after_regular - init_regular)) < 1e-5\n\n\ndef test_train_variable_length_interp(variable_length_interp):\n    init_ratio = variable_length_interp.ratio + 0.0  # make a copy\n    init_regular = variable_length_interp.regular + 0.0\n    variable_length_interp.optimize(iteration=10)\n    after_ratio = variable_length_interp.ratio + 0.0\n    after_regular = variable_length_interp.regular + 0.0\n    # make sure the ratio is changed when optimizing\n    assert torch.sum(torch.abs(after_ratio - init_ratio)) > 1e-5\n    # make sure the regular is not changed when optimizing\n    assert torch.sum(torch.abs(after_regular - init_regular)) < 1e-5\n\n\ndef test_interpreter_get_simga(fixed_length_interp):\n    sigma = fixed_length_interp.get_sigma()\n    assert sigma.shape == (4,)\n"
  },
  {
    "path": "tests/unit/test_models_transformers_question_answering.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\n\nimport pytest\nimport torch\n\nfrom utils_nlp.common.pytorch_utils import dataloader_from_dataset\nfrom utils_nlp.models.transformers.datasets import QADataset\nfrom utils_nlp.models.transformers.question_answering import (\n    CACHED_EXAMPLES_TEST_FILE,\n    CACHED_FEATURES_TEST_FILE,\n    AnswerExtractor,\n    QAProcessor,\n)\n\nNUM_GPUS = max(1, torch.cuda.device_count())\nBATCH_SIZE = 8\n\n\n@pytest.fixture(scope=\"module\")\ndef qa_test_data(qa_test_df, tmp_module):\n\n    train_dataset = QADataset(\n        df=qa_test_df[\"test_df\"],\n        doc_text_col=qa_test_df[\"doc_text_col\"],\n        question_text_col=qa_test_df[\"question_text_col\"],\n        answer_start_col=qa_test_df[\"answer_start_col\"],\n        answer_text_col=qa_test_df[\"answer_text_col\"],\n        qa_id_col=qa_test_df[\"qa_id_col\"],\n    )\n\n    train_dataset_list = QADataset(\n        df=qa_test_df[\"test_df\"],\n        doc_text_col=qa_test_df[\"doc_text_col\"],\n        question_text_col=qa_test_df[\"question_text_col\"],\n        answer_start_col=qa_test_df[\"answer_start_list_col\"],\n        answer_text_col=qa_test_df[\"answer_text_list_col\"],\n        qa_id_col=qa_test_df[\"qa_id_col\"],\n    )\n\n    train_dataset_start_text_mismatch = QADataset(\n        df=qa_test_df[\"test_df\"],\n        doc_text_col=qa_test_df[\"doc_text_col\"],\n        question_text_col=qa_test_df[\"question_text_col\"],\n        answer_start_col=qa_test_df[\"answer_start_list_col\"],\n        answer_text_col=qa_test_df[\"answer_text_col\"],\n        qa_id_col=qa_test_df[\"qa_id_col\"],\n    )\n\n    train_dataset_multi_answers = QADataset(\n        df=qa_test_df[\"test_df\"],\n        doc_text_col=qa_test_df[\"doc_text_col\"],\n        question_text_col=qa_test_df[\"question_text_col\"],\n        answer_start_col=qa_test_df[\"answer_start_multi_col\"],\n        
answer_text_col=qa_test_df[\"answer_text_multi_col\"],\n        qa_id_col=qa_test_df[\"qa_id_col\"],\n    )\n\n    test_dataset = QADataset(\n        df=qa_test_df[\"test_df\"],\n        doc_text_col=qa_test_df[\"doc_text_col\"],\n        question_text_col=qa_test_df[\"question_text_col\"],\n        qa_id_col=qa_test_df[\"qa_id_col\"],\n    )\n\n    # bert\n    qa_processor_bert = QAProcessor(cache_dir=tmp_module)\n    train_features_bert = qa_processor_bert.preprocess(\n        train_dataset,\n        is_training=True,\n        max_question_length=16,\n        max_seq_length=64,\n        doc_stride=32,\n        feature_cache_dir=tmp_module,\n    )\n    test_features_bert = qa_processor_bert.preprocess(\n        test_dataset,\n        is_training=False,\n        max_question_length=16,\n        max_seq_length=64,\n        doc_stride=32,\n        feature_cache_dir=tmp_module,\n    )\n\n    # xlnet\n    qa_processor_xlnet = QAProcessor(\n        model_name=\"xlnet-base-cased\", cache_dir=tmp_module\n    )\n    train_features_xlnet = qa_processor_xlnet.preprocess(\n        train_dataset,\n        is_training=True,\n        max_question_length=16,\n        max_seq_length=64,\n        doc_stride=32,\n        feature_cache_dir=tmp_module,\n    )\n    test_features_xlnet = qa_processor_xlnet.preprocess(\n        test_dataset,\n        is_training=False,\n        max_question_length=16,\n        max_seq_length=64,\n        doc_stride=32,\n        feature_cache_dir=tmp_module,\n    )\n\n    # distilbert\n    qa_processor_distilbert = QAProcessor(\n        model_name=\"distilbert-base-uncased\", cache_dir=tmp_module\n    )\n    train_features_distilbert = qa_processor_distilbert.preprocess(\n        train_dataset,\n        is_training=True,\n        max_question_length=16,\n        max_seq_length=64,\n        doc_stride=32,\n        feature_cache_dir=tmp_module,\n    )\n    test_features_distilbert = qa_processor_distilbert.preprocess(\n        test_dataset,\n        
is_training=False,\n        max_question_length=16,\n        max_seq_length=64,\n        doc_stride=32,\n        feature_cache_dir=tmp_module,\n    )\n\n    return {\n        \"train_dataset\": train_dataset,\n        \"train_dataset_list\": train_dataset_list,\n        \"train_dataset_start_text_mismatch\": train_dataset_start_text_mismatch,\n        \"train_dataset_multi_answers\": train_dataset_multi_answers,\n        \"test_dataset\": test_dataset,\n        \"train_features_bert\": train_features_bert,\n        \"test_features_bert\": test_features_bert,\n        \"train_features_xlnet\": train_features_xlnet,\n        \"test_features_xlnet\": test_features_xlnet,\n        \"train_features_distilbert\": train_features_distilbert,\n        \"test_features_distilbert\": test_features_distilbert,\n    }\n\n\n@pytest.mark.gpu\ndef test_QAProcessor(qa_test_data, tmp_module):\n    for model_name in [\n        \"bert-base-cased\",\n        \"xlnet-base-cased\",\n        \"distilbert-base-uncased\",\n    ]:\n        qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module)\n        qa_processor.preprocess(\n            qa_test_data[\"train_dataset\"],\n            is_training=True,\n            feature_cache_dir=tmp_module,\n        )\n        qa_processor.preprocess(\n            qa_test_data[\"train_dataset_list\"],\n            is_training=True,\n            feature_cache_dir=tmp_module,\n        )\n        qa_processor.preprocess(\n            qa_test_data[\"test_dataset\"],\n            is_training=False,\n            feature_cache_dir=tmp_module,\n        )\n\n    # test unsupported model type\n    with pytest.raises(ValueError):\n        qa_processor = QAProcessor(model_name=\"abc\", cache_dir=tmp_module)\n\n    # test training data has no ground truth exception\n    with pytest.raises(Exception):\n        qa_processor.preprocess(\n            qa_test_data[\"test_dataset\"], is_training=True, feature_cache_dir=tmp_module\n        )\n\n    # test 
when answer start is a list, but answer text is not\n    with pytest.raises(Exception):\n        qa_processor.preprocess(\n            qa_test_data[\"train_dataset_start_text_mismatch\"],\n            is_training=True,\n            feature_cache_dir=tmp_module,\n        )\n\n    # test when training data has multiple answers\n    with pytest.raises(Exception):\n        qa_processor.preprocess(\n            qa_test_data[\"train_dataset_multi_answers\"],\n            is_training=True,\n            feature_cache_dir=tmp_module,\n        )\n\n\ndef test_AnswerExtractor(qa_test_data, tmp_module):\n    # bert\n    qa_extractor_bert = AnswerExtractor(cache_dir=tmp_module)\n    train_loader_bert = dataloader_from_dataset(qa_test_data[\"train_features_bert\"])\n    test_loader_bert = dataloader_from_dataset(\n        qa_test_data[\"test_features_bert\"], shuffle=False\n    )\n    qa_extractor_bert.fit(train_loader_bert, verbose=False, cache_model=True)\n\n    # test saving fine-tuned model\n    model_output_dir = os.path.join(tmp_module, \"fine_tuned\")\n    assert os.path.exists(os.path.join(model_output_dir, \"pytorch_model.bin\"))\n    assert os.path.exists(os.path.join(model_output_dir, \"config.json\"))\n\n    qa_extractor_from_cache = AnswerExtractor(\n        cache_dir=tmp_module, load_model_from_dir=model_output_dir\n    )\n    qa_extractor_from_cache.predict(test_loader_bert, verbose=False)\n\n    # xlnet\n    train_loader_xlnet = dataloader_from_dataset(qa_test_data[\"train_features_xlnet\"])\n    test_loader_xlnet = dataloader_from_dataset(\n        qa_test_data[\"test_features_xlnet\"], shuffle=False\n    )\n    qa_extractor_xlnet = AnswerExtractor(\n        model_name=\"xlnet-base-cased\", cache_dir=tmp_module\n    )\n    qa_extractor_xlnet.fit(train_loader_xlnet, verbose=False, cache_model=False)\n    qa_extractor_xlnet.predict(test_loader_xlnet, verbose=False)\n\n    # distilbert\n    train_loader_xlnet = dataloader_from_dataset(\n        
qa_test_data[\"train_features_distilbert\"]\n    )\n    test_loader_xlnet = dataloader_from_dataset(\n        qa_test_data[\"test_features_distilbert\"], shuffle=False\n    )\n    qa_extractor_distilbert = AnswerExtractor(\n        model_name=\"distilbert-base-uncased\", cache_dir=tmp_module\n    )\n    qa_extractor_distilbert.fit(train_loader_xlnet, verbose=False, cache_model=False)\n    qa_extractor_distilbert.predict(test_loader_xlnet, verbose=False)\n\n\ndef test_postprocess_bert_answer(qa_test_data, tmp_module):\n    qa_processor = QAProcessor(cache_dir=tmp_module)\n    test_features = qa_processor.preprocess(\n        qa_test_data[\"test_dataset\"],\n        is_training=False,\n        max_question_length=16,\n        max_seq_length=64,\n        doc_stride=32,\n        feature_cache_dir=tmp_module,\n    )\n    test_loader = dataloader_from_dataset(test_features, shuffle=False)\n    qa_extractor = AnswerExtractor(cache_dir=tmp_module)\n    predictions = qa_extractor.predict(test_loader)\n\n    qa_processor.postprocess(\n        results=predictions,\n        examples_file=os.path.join(tmp_module, CACHED_EXAMPLES_TEST_FILE),\n        features_file=os.path.join(tmp_module, CACHED_FEATURES_TEST_FILE),\n        output_prediction_file=os.path.join(tmp_module, \"qa_predictions.json\"),\n        output_nbest_file=os.path.join(tmp_module, \"nbest_predictions.json\"),\n        output_null_log_odds_file=os.path.join(tmp_module, \"null_odds.json\"),\n    )\n\n    qa_processor.postprocess(\n        results=predictions,\n        examples_file=os.path.join(tmp_module, CACHED_EXAMPLES_TEST_FILE),\n        features_file=os.path.join(tmp_module, CACHED_FEATURES_TEST_FILE),\n        unanswerable_exists=True,\n        verbose_logging=True,\n        output_prediction_file=os.path.join(tmp_module, \"qa_predictions.json\"),\n        output_nbest_file=os.path.join(tmp_module, \"nbest_predictions.json\"),\n        output_null_log_odds_file=os.path.join(tmp_module, 
\"null_odds.json\"),\n    )\n\n\ndef test_postprocess_xlnet_answer(qa_test_data, tmp_module):\n    qa_processor = QAProcessor(model_name=\"xlnet-base-cased\", cache_dir=tmp_module)\n    test_features = qa_processor.preprocess(\n        qa_test_data[\"test_dataset\"],\n        is_training=False,\n        max_question_length=16,\n        max_seq_length=64,\n        doc_stride=32,\n        feature_cache_dir=tmp_module,\n    )\n    test_loader = dataloader_from_dataset(test_features, shuffle=False)\n    qa_extractor = AnswerExtractor(model_name=\"xlnet-base-cased\", cache_dir=tmp_module)\n    predictions = qa_extractor.predict(test_loader)\n\n    qa_processor.postprocess(\n        results=predictions,\n        examples_file=os.path.join(tmp_module, CACHED_EXAMPLES_TEST_FILE),\n        features_file=os.path.join(tmp_module, CACHED_FEATURES_TEST_FILE),\n        output_prediction_file=os.path.join(tmp_module, \"qa_predictions.json\"),\n        output_nbest_file=os.path.join(tmp_module, \"nbest_predictions.json\"),\n        output_null_log_odds_file=os.path.join(tmp_module, \"null_odds.json\"),\n    )\n\n    qa_processor.postprocess(\n        results=predictions,\n        examples_file=os.path.join(tmp_module, CACHED_EXAMPLES_TEST_FILE),\n        features_file=os.path.join(tmp_module, CACHED_FEATURES_TEST_FILE),\n        unanswerable_exists=True,\n        verbose_logging=True,\n        output_prediction_file=os.path.join(tmp_module, \"qa_predictions.json\"),\n        output_nbest_file=os.path.join(tmp_module, \"nbest_predictions.json\"),\n        output_null_log_odds_file=os.path.join(tmp_module, \"null_odds.json\"),\n    )\n"
  },
  {
    "path": "tests/unit/test_notebooks_cpu.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport pytest\nfrom tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME\nimport papermill as pm\nfrom utils_nlp.models.bert.common import Language\n\n\n@pytest.mark.notebooks\ndef test_bert_encoder(notebooks, tmp):\n    notebook_path = notebooks[\"bert_encoder\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(\n            NUM_GPUS=0, LANGUAGE=Language.ENGLISH, TO_LOWER=True, MAX_SEQ_LENGTH=128, CACHE_DIR=tmp\n        ),\n    )\n"
  },
  {
    "path": "tests/unit/test_notebooks_gpu.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport pytest\nfrom tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME\nimport papermill as pm\nfrom utils_nlp.models.bert.common import Language\n\n\n@pytest.mark.notebooks\n@pytest.mark.gpu\ndef test_bert_encoder(notebooks, tmp):\n    notebook_path = notebooks[\"bert_encoder\"]\n    pm.execute_notebook(\n        notebook_path,\n        OUTPUT_NOTEBOOK,\n        kernel_name=KERNEL_NAME,\n        parameters=dict(\n            NUM_GPUS=1, LANGUAGE=Language.ENGLISH, TO_LOWER=True, MAX_SEQ_LENGTH=128, CACHE_DIR=tmp\n        ),\n    )\n"
  },
  {
    "path": "tests/unit/test_preprocess.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport pandas as pd\nimport numpy as np\n\nimport utils_nlp.dataset.preprocess as preprocess\n\n\n@pytest.fixture(scope=\"module\")\ndef df_sentences():\n    sentences = np.array(\n        [\n            \"The man is playing the piano.\",\n            \"Some men are fighting.\",\n            \"A man is spreading shreded cheese on a pizza.\",\n            \"A man is playing the cello.\",\n            \"A man is spreading shreded cheese on a pizza.\",\n            \"A man is playing a large flute.\",\n            \"A man is playing the cello.\",\n            \"A man is playing on a guitar and singing.\",\n            \"The man is playing the piano.\",\n            \"Some men are fighting.\",\n        ]\n    ).reshape(2, 5)\n\n    return pd.DataFrame(sentences, columns=[\"s1\", \"s2\", \"s3\", \"s4\", \"s5\"])\n\n\ndef test_to_lowercase_all(df_sentences):\n    ldf = preprocess.to_lowercase_all(df_sentences)\n    assert sum(map(lambda x: x.islower(), ldf.values.flatten())) == len(\n        ldf.values.flatten()\n    )\n\n\ndef test_to_lowercase_subset(df_sentences):\n    ldf = preprocess.to_lowercase(df_sentences, column_names=[\"s4\"])\n    assert sum(map(lambda x: x.islower(), ldf.s4.values.flatten())) == len(\n        ldf.s4.values.flatten()\n    )\n\n\ndef test_to_spacy_tokens(df_sentences):\n    sentence_cols = [\"s1\", \"s2\"]\n    token_cols = [\"t1\", \"t2\"]\n    token_df = preprocess.to_spacy_tokens(\n        df_sentences, sentence_cols=sentence_cols, token_cols=token_cols\n    )\n    assert token_df.shape[1] == df_sentences.shape[1] + len(\n        token_cols\n    ) and sum(\n        list(\n            map(lambda x: (token_df[x].apply(type) == list).all(), token_cols)\n        )\n    ) == len(\n        token_cols\n    )\n\n\ndef test_rm_spacy_stopwords(df_sentences):\n    sentence_cols = [\"s1\", \"s2\"]\n    stop_cols = [\"stop1\", 
\"stop2\"]\n    stop_df = preprocess.rm_spacy_stopwords(\n        df_sentences, sentence_cols=sentence_cols, stop_cols=stop_cols\n    )\n    assert stop_df.shape[1] == df_sentences.shape[1] + len(stop_cols) and sum(\n        list(map(lambda x: (stop_df[x].apply(type) == list).all(), stop_cols))\n    ) == len(stop_cols)\n\n\ndef test_to_nltk_tokens(df_sentences):\n    sentence_cols = [\"s1\", \"s2\"]\n    token_cols = [\"t1\", \"t2\"]\n    token_df = preprocess.to_nltk_tokens(\n        df_sentences, sentence_cols=sentence_cols, token_cols=token_cols\n    )\n    assert token_df.shape[1] == df_sentences.shape[1] + len(\n        token_cols\n    ) and sum(\n        list(\n            map(lambda x: (token_df[x].apply(type) == list).all(), token_cols)\n        )\n    ) == len(\n        token_cols\n    )\n\n\ndef test_rm_nltk_stopwords(df_sentences):\n    sentence_cols = [\"s1\", \"s2\"]\n    stop_cols = [\"stop1\", \"stop2\"]\n    stop_df = preprocess.rm_nltk_stopwords(\n        df_sentences, sentence_cols=sentence_cols, stop_cols=stop_cols\n    )\n    assert stop_df.shape[1] == df_sentences.shape[1] + len(stop_cols) and sum(\n        list(map(lambda x: (stop_df[x].apply(type) == list).all(), stop_cols))\n    ) == len(stop_cols)\n\n\ndef test_convert_to_unicode():\n    test_str = \"test\"\n    test_byte = test_str.encode(\"utf-8\")\n\n    assert isinstance(preprocess.convert_to_unicode(test_str), str)\n    assert isinstance(preprocess.convert_to_unicode(test_byte), str)\n"
  },
  {
    "path": "tests/unit/test_timer.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\nimport pytest\nimport time\nfrom utils_nlp.common.timer import Timer\n\n\nTOL = 0.01\n\n\n@pytest.fixture(scope=\"function\")\ndef t():\n    return Timer()\n\n\ndef test_no_time(t):\n    assert t.interval == 0\n    assert t.running == False\n\n\ndef test_stop_before_start(t):\n    with pytest.raises(ValueError):\n        t.stop()\n\n\ndef test_interval_before_stop(t):\n    t.start()\n    with pytest.raises(ValueError):\n        t.interval\n\n\ndef test_timer(t):\n    t.start()\n    assert t.running == True\n    time.sleep(1)\n    t.stop()\n    assert t.running == False\n    assert t.interval == pytest.approx(1, abs=TOL)\n    with Timer() as t2:\n        assert t2.running == True\n        time.sleep(1)\n    assert t2.interval == pytest.approx(1, abs=TOL)\n    assert t2.running == False\n\n\ndef test_timer_format(t):\n    assert str(t) == \"0.0000\"\n    assert str(t.interval) == \"0\"\n"
  },
  {
    "path": "tests/unit/test_transformers_sequence_classification.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport pandas as pd\n\nfrom utils_nlp.models.transformers.sequence_classification import (\n    SequenceClassifier,\n    Processor,\n)\nfrom utils_nlp.common.pytorch_utils import dataloader_from_dataset\n\n\n@pytest.fixture()\ndef data():\n    return ([\"hi\", \"hello\", \"what's wrong with us\", \"can I leave?\"], [0, 0, 1, 2])\n\n\n@pytest.mark.cpu\ndef test_classifier(data, tmpdir):\n\n    df = pd.DataFrame({\"text\": data[0], \"label\": data[1]})\n    num_labels = len(pd.unique(data[1]))\n    model_name = \"bert-base-uncased\"\n    processor = Processor(model_name=model_name, cache_dir=tmpdir)\n    ds = processor.dataset_from_dataframe(df, \"text\", \"label\")\n    dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=0, shuffle=True)\n    classifier = SequenceClassifier(\n        model_name=model_name, num_labels=num_labels, cache_dir=tmpdir\n    )\n    classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=0, verbose=False)\n    preds = classifier.predict(dl, num_gpus=0, verbose=False)\n    assert len(preds) == len(data[1])\n\n\n@pytest.mark.gpu\ndef test_classifier_gpu_train_cpu_predict(data, tmpdir):\n\n    df = pd.DataFrame({\"text\": data[0], \"label\": data[1]})\n    num_labels = len(pd.unique(data[1]))\n    model_name = \"bert-base-uncased\"\n    processor = Processor(model_name=model_name, cache_dir=tmpdir)\n    ds = processor.dataset_from_dataframe(df, \"text\", \"label\")\n    dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=1, shuffle=True)\n    classifier = SequenceClassifier(\n        model_name=model_name, num_labels=num_labels, cache_dir=tmpdir\n    )\n    classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=1, verbose=False)\n\n    # gpu prediction, no model move\n    preds = classifier.predict(dl, num_gpus=1, verbose=False)\n    assert len(preds) == len(data[1])\n    # cpu prediction, need model 
move\n    assert next(classifier.model.parameters()).is_cuda is True\n    preds = classifier.predict(dl, num_gpus=0, verbose=False)\n    assert next(classifier.model.parameters()).is_cuda is False\n"
  },
  {
    "path": "tests/unit/test_transformers_token_classification.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\n\nfrom utils_nlp.common.pytorch_utils import dataloader_from_dataset\nfrom utils_nlp.models.transformers.named_entity_recognition import (\n    TokenClassificationProcessor,\n    TokenClassifier,\n)\nfrom utils_nlp.models.transformers.common import MAX_SEQ_LEN\n\n\n@pytest.mark.cpu\ndef test_token_classifier_fit_predict(tmpdir, ner_test_data):\n    num_labels = 6\n    max_seq_len = MAX_SEQ_LEN\n    token_classifier = TokenClassifier(\n        model_name=\"bert-base-uncased\", num_labels=num_labels, cache_dir=tmpdir\n    )\n    processor = TokenClassificationProcessor(\n        model_name=\"bert-base-uncased\", cache_dir=tmpdir\n    )\n\n    # test fit, no warmup\n    train_dataset = processor.preprocess(\n        text=ner_test_data[\"INPUT_TEXT\"],\n        max_len=max_seq_len,\n        labels=ner_test_data[\"INPUT_LABELS\"],\n        label_map=ner_test_data[\"LABEL_MAP\"],\n    )\n    train_dataloader = dataloader_from_dataset(train_dataset)\n    token_classifier.fit(train_dataloader)\n\n    # test predict, no labels\n    preds = token_classifier.predict(train_dataloader, verbose=False)\n    assert preds.shape == (len(train_dataloader), MAX_SEQ_LEN, num_labels)\n"
  },
  {
    "path": "tools/README.md",
    "content": "# Tools\n\nThis submodule includes:\n1.  A [script](generate_conda_file.py) to generate the Conda environment file for running Python scripts and notebooks in this Git repo.\n2.  A Python [script](remove_pixelserver.py) to remove pixelserver tracking from all example notebooks.\n3.  A [script](generate_requirements_txt.py) to output a `requirements.txt` based on the libraries defined in `generate_conda_file.py`.\n\n"
  },
  {
    "path": "tools/__init__.py",
    "content": ""
  },
  {
    "path": "tools/generate_conda_file.py",
    "content": "#!/usr/bin/python\n\n# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# This script creates yaml files to build conda environments\n# For generating a conda file for running only python code:\n# $ python generate_conda_file.py\n#\n# For generating a conda file for running python gpu:\n# $ python generate_conda_file.py --gpu\n\n\nimport argparse\nimport textwrap\nfrom sys import platform\n\n\nHELP_MSG = \"\"\"\nTo create the conda environment:\n$ conda env create -f {conda_env}.yaml\n\nTo update the conda environment:\n$ conda env update -f {conda_env}.yaml\n\nTo register the conda environment in Jupyter:\n$ conda activate {conda_env}\n$ python -m ipykernel install --user --name {conda_env} \\\n--display-name \"Python ({conda_env})\"\n\"\"\"\n\n\nCHANNELS = [\"defaults\", \"conda-forge\", \"pytorch\"]\n\nCONDA_BASE = {\n    \"python\": \"python==3.6.8\",\n    \"pip\": \"pip>=19.1.1\",\n    \"ipykernel\": \"ipykernel>=4.6.1\",\n    \"jupyter\": \"jupyter>=1.0.0\",\n    \"matplotlib\": \"matplotlib>=2.2.2\",\n    \"numpy\": \"numpy>=1.13.3\",\n    \"pandas\": \"pandas>=0.24.2\",\n    \"pytest\": \"pytest>=3.6.4\",\n    \"pytorch\": \"pytorch-cpu>=1.0.0\",\n    \"scipy\": \"scipy>=1.0.0\",\n    \"h5py\": \"h5py>=2.8.0\",\n    \"tensorflow\": \"tensorflow==1.15.0\",\n    \"tensorflow-hub\": \"tensorflow-hub==0.7.0\",\n    \"dask\": \"dask[dataframe]==1.2.2\",\n    \"papermill\": \"papermill==1.2.1\",\n}\n\nCONDA_GPU = {\n    \"numba\": \"numba>=0.38.1\",\n    \"cudatoolkit\": \"cudatoolkit=10.1\",\n    \"pytorch\": \"pytorch==1.4.0\",\n}\n\nPIP_BASE = {\n    \"allennlp\": \"allennlp==0.8.4\",\n    \"azureml-sdk\": \"azureml-sdk[automl,notebooks,contrib]==1.0.85\",\n    \"azureml-train-automl\": \"azureml-train-automl==1.0.85\",\n    \"azureml-dataprep\": \"azureml-dataprep==1.1.8\",\n    \"azureml-widgets\": \"azureml-widgets==1.0.85\",\n    \"azureml-mlflow\": \"azureml-mlflow==1.0.85\",\n    \"black\": 
\"black>=18.6b4\",\n    \"cached-property\": \"cached-property==1.5.1\",\n    \"jsonlines\": \"jsonlines>=1.2.0\",\n    \"nteract-scrapbook\": \"nteract-scrapbook>=0.2.1\",\n    \"pydocumentdb\": \"pydocumentdb>=2.3.3\",\n    \"pytorch-pretrained-bert\": \"pytorch-pretrained-bert>=0.6\",\n    \"tqdm\": \"tqdm==4.32.2\",\n    \"pyemd\": \"pyemd==0.5.1\",\n    \"ipywebrtc\": \"ipywebrtc==0.4.3\",\n    \"pre-commit\": \"pre-commit>=1.14.4\",\n    \"scikit-learn\": \"scikit-learn>=0.19.0,<=0.20.3\",\n    \"seaborn\": \"seaborn>=0.9.0\",\n    \"sklearn-crfsuite\": \"sklearn-crfsuite>=0.3.6\",\n    \"spacy\": \"spacy==2.1.8\",\n    \"spacy-models\": (\n        \"https://github.com/explosion/spacy-models/releases/download/\"\n        \"en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz\"\n    ),\n    \"transformers\": \"transformers==2.9.0\",\n    \"gensim\": \"gensim>=3.7.0\",\n    \"nltk\": \"nltk>=3.4\",\n    \"seqeval\": \"seqeval>=0.0.12\",\n    \"pyrouge\": \"pyrouge>=0.1.3\",\n    \"py-rouge\": \"py-rouge>=1.1\",\n    \"indic-nlp-library\": \"indic-nlp-library>=0.6\",\n    \"torchtext\": \"torchtext>=0.4.0\",\n    \"multiprocess\": \"multiprocess==0.70.9\",\n    \"tensorboardX\": \"tensorboardX==1.8\",\n    \"Cython\": \"Cython>=0.29.13\",\n    \"googledrivedownloader\": \"googledrivedownloader>=0.4\",\n    \"methodtools\": \"methodtools\",\n    \"s2s-ft\": \"-e git+https://github.com/microsoft/unilm.git\"\n    \"@s2s-ft.v0.3#egg=s2s-ft&subdirectory=s2s-ft\",\n    \"requests\": \"requests==2.22.0\",\n    \"requests-oauthlib\": \"requests-oauthlib==1.2.0\",\n    \"regex\": \"regex==2020.2.20\",\n}\n\nPIP_GPU = {}\n\nPIP_DARWIN = {}\nPIP_DARWIN_GPU = {}\n\nPIP_LINUX = {}\nPIP_LINUX_GPU = {}\n\nPIP_WIN32 = {}\nPIP_WIN32_GPU = {}\n\nCONDA_DARWIN = {}\nCONDA_DARWIN_GPU = {}\n\nCONDA_LINUX = {}\nCONDA_LINUX_GPU = {}\n\nCONDA_WIN32 = {}\nCONDA_WIN32_GPU = {\"pytorch\": \"pytorch==1.0.0\", \"cudatoolkit\": \"cuda90\"}\n\nif __name__ == \"__main__\":\n    parser = 
argparse.ArgumentParser(\n        description=textwrap.dedent(\n            \"\"\"\n        This script generates a conda file for different environments.\n        Plain python is the default,\n        but flags can be used to support GPU functionality.\"\"\"\n        ),\n        epilog=HELP_MSG,\n        formatter_class=argparse.RawDescriptionHelpFormatter,\n    )\n    parser.add_argument(\"--name\", help=\"specify name of conda environment\")\n    parser.add_argument(\n        \"--gpu\", action=\"store_true\", help=\"include packages for GPU support\"\n    )\n    parser.add_argument(\"--cuda_version\", type=str, default=\"10.1\")\n    args = parser.parse_args()\n\n    # set name of environment and output yaml file\n    conda_env = \"nlp_cpu\"\n    if args.gpu:\n        conda_env = \"nlp_gpu\"\n\n    # overwrite environment name with user input\n    if args.name is not None:\n        conda_env = args.name\n\n    # add conda and pip base packages\n    conda_packages = CONDA_BASE\n    pip_packages = PIP_BASE\n\n    # update conda and pip packages based on flags provided\n    CONDA_GPU[\"cudatoolkit\"] = \"cudatoolkit=\" + args.cuda_version\n    if args.gpu:\n        conda_packages.update(CONDA_GPU)\n        pip_packages.update(PIP_GPU)\n\n    # update conda and pip packages based on os platform support\n    if platform == \"darwin\":\n        conda_packages.update(CONDA_DARWIN)\n        pip_packages.update(PIP_DARWIN)\n        if args.gpu:\n            conda_packages.update(CONDA_DARWIN_GPU)\n            pip_packages.update(PIP_DARWIN_GPU)\n    elif platform.startswith(\"linux\"):\n        conda_packages.update(CONDA_LINUX)\n        pip_packages.update(PIP_LINUX)\n        if args.gpu:\n            conda_packages.update(CONDA_LINUX_GPU)\n            pip_packages.update(PIP_LINUX_GPU)\n    elif platform == \"win32\":\n        conda_packages.update(CONDA_WIN32)\n        pip_packages.update(PIP_WIN32)\n        if args.gpu:\n            
conda_packages.update(CONDA_WIN32_GPU)\n            pip_packages.update(PIP_WIN32_GPU)\n    else:\n        raise Exception(\"Unsupported platform. Must be Windows, Linux, or macOS\")\n\n    # write out yaml file\n    conda_file = \"{}.yaml\".format(conda_env)\n    with open(conda_file, \"w\") as f:\n        for line in HELP_MSG.format(conda_env=conda_env).split(\"\\n\"):\n            f.write(\"# {}\\n\".format(line))\n        f.write(\"name: {}\\n\".format(conda_env))\n        f.write(\"channels:\\n\")\n        for channel in CHANNELS:\n            f.write(\"- {}\\n\".format(channel))\n        f.write(\"dependencies:\\n\")\n        for conda_package in conda_packages.values():\n            f.write(\"- {}\\n\".format(conda_package))\n        f.write(\"- pip:\\n\")\n        for pip_package in pip_packages.values():\n            f.write(\"  - {}\\n\".format(pip_package))\n\n    print(\"Generated conda file: {}\".format(conda_file))\n    print(HELP_MSG.format(conda_env=conda_env))\n"
  },
  {
    "path": "tools/generate_requirements_txt.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# This file outputs a requirements.txt based on the libraries defined in generate_conda_file.py\nfrom generate_conda_file import (\n    CONDA_BASE,\n    CONDA_GPU,\n    PIP_BASE,\n    PIP_GPU,\n    PIP_DARWIN,\n    PIP_LINUX,\n    PIP_WIN32,\n    CONDA_DARWIN,\n    CONDA_LINUX,\n    CONDA_WIN32,\n    PIP_DARWIN_GPU,\n    PIP_LINUX_GPU,\n    PIP_WIN32_GPU,\n    CONDA_DARWIN_GPU,\n    CONDA_LINUX_GPU,\n    CONDA_WIN32_GPU,\n)\n\n\nif __name__ == \"__main__\":\n    deps = list(CONDA_BASE.values())\n    deps += list(CONDA_GPU.values())\n    deps += list(PIP_BASE.values())\n    deps += list(PIP_GPU.values())\n    deps += list(PIP_DARWIN.values())\n    deps += list(PIP_LINUX.values())\n    deps += list(PIP_WIN32.values())\n    deps += list(CONDA_DARWIN.values())\n    deps += list(CONDA_LINUX.values())\n    deps += list(CONDA_WIN32.values())\n    deps += list(PIP_DARWIN_GPU.values())\n    deps += list(PIP_LINUX_GPU.values())\n    deps += list(PIP_WIN32_GPU.values())\n    deps += list(CONDA_DARWIN_GPU.values())\n    deps += list(CONDA_LINUX_GPU.values())\n    deps += list(CONDA_WIN32_GPU.values())\n    with open(\"requirements.txt\", \"w\") as f:\n        f.write(\"\\n\".join(set(deps)))\n\n"
  },
  {
    "path": "tools/remove_pixelserver.py",
    "content": "#!/usr/bin/python\n\n# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport json\nimport os\nimport sys\nimport glob\n\n\nSIGNATURE = \"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions\"\n\n\ndef remove_pixelserver_from_notebook(file_path):\n    \"\"\"\n    Remove pixelserver tracking from a notebook. If the pixcelserver signature found in\n    the notebook, the pixelserver cell will be removed from the notebook file. File will\n    be modified only when the pixelserver signature is found in it.\n\n    Args:\n        file_path (str): The notebook file path.\n    \"\"\"\n\n    with open(file_path, encoding='utf-8') as fd:\n        raw_json = json.load(fd)\n\n        if 'cells' not in raw_json:\n            return\n        \n        cells = raw_json['cells']\n        pixel_cells = []\n\n        for idx, cell in enumerate(cells):\n            if cell['cell_type'] != 'markdown':\n                continue\n            \n            source = cell['source']\n            for row in source:\n                if row.startswith(SIGNATURE):\n                    pixel_cells.append(idx)\n                    print(\"Found pixelserver in file: \\\"{}\\\", cell {}\".format(file_path, idx))\n        \n        for cell_id in pixel_cells[::-1]:\n            cells.pop(cell_id)\n\n    if pixel_cells:\n        with open(file_path, 'w', encoding='utf-8') as fd:\n            json.dump(raw_json, fd, indent=1)\n\n\ndef get_all_notebook_files():\n    \"\"\"\n    Get all example notebook files' path and return them as a list.\n\n    Returns:\n        list of str. A list of notebook file paths. 
\n    \"\"\"\n\n    root_path = os.path.dirname(sys.path[0])\n    examples_path = os.path.join(root_path, \"examples\")\n    if not os.path.exists(examples_path):\n        raise ValueError(\"Cannot find examples file path: {}\".format(examples_path))\n\n    files = [f for f in glob.glob(os.path.join(examples_path, \"*/*.ipynb\"), recursive=True)]\n    return files\n\n\ndef main():\n    \"\"\"\n    Remove pixelserver from all example notebooks.\n    \"\"\"\n    \n    notebooks = get_all_notebook_files()\n    for notebook in notebooks:\n        remove_pixelserver_from_notebook(notebook)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "utils_nlp/README.md",
    "content": "# NLP Utilities\n\nModern NLP research and development can involve tedious tasks ranging from data loading, dataset understanding,  model development, model evaluation to productionizing a trained NLP model. Recognizing the need to simplify these tedious tasks, we developed this module (**utils_nlp**) to provide a wide spectrum of classes, functions and utilities. Adoption of this module can greatly speed up development work, as the sample notebooks in the [Examples](../examples) folder demonstrate.  The following provides a short description of the sub-modules. For more details about what functions/classes/utilities are available and how to use them, please review the doc-strings provided with the code and see the sample notebooks in the [Examples](../examples) folder.\n\n## Submodules\n\n### [AzureML](azureml)\n\nThe AzureML submodule contains utilities to connect to an Azure Machine Learning workspace, train, tune and operationalize NLP systems at scale using AzureML.\n\n```python\nfrom utils_nlp.azureml.azureml_utils import get_or_create_workspace\n\n###Note: you do not need to fill in these values if you have a config.json in the same folder as this notebook\nws = get_or_create_workspace(\n    config_path=config_path,\n    subscription_id=subscription_id,\n    resource_group=resource_group,\n    workspace_name=workspace_name,\n    workspace_region=workspace_region,\n)\n```\n\n### [Common](common)\n\nThis submodule contains high-level utilities that are commonly used in multiple algorithms as well as helper functions for managing frameworks like pytorch.\n\n### [Dataset](dataset)\nThis submodule includes helper functions for interacting with well-known datasets,  utility functions to process datasets for different NLP tasks, as well as utilities for splitting data for training/testing. 
For example, the [snli module](dataset/snli.py) will allow you to load a dataframe in pandas from the Stanford Natural Language Inference (SNLI) Corpus dataset, with the option to set the number of rows to load in order to test algorithms and evaluate performance benchmarks. Information on the datasets used in the repo can be found [here](https://github.com/microsoft/nlp-recipes/tree/staging/utils_nlp/dataset#datasets).\n\nMost datasets may be split into `train`, `dev`, and `test`.\n\n```python\nfrom utils_nlp.dataset.snli import load_pandas_df\n\ndf = load_pandas_df(DATA_FOLDER, file_split =\"train\", nrows = 1000)\n```\n\n### [Evaluation](eval)\nThe *eval* submodule includes functionalities for computing common classification evaluation metrics like accuracy, precision, recall, and f1 scores for classification scenarios. It also includes metric utilities for normalizing and finding f1_scores for [The Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/), and utilities to log the means and other coefficients in evaluating the quality of sentence embeddings.\n\n### [Models](models)\nThe models submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new natural language processing systems. A description of which algorithms are used in each scenario can be found on [this table](../README.md#content).\n\nA few highlights are\n* BERT\n* GenSen\n* XLNet\n\n\n### [Model Explainability](interpreter)\nThe interpreter submodule contains utils that help explain or diagnose models, such as interpreting layers of a neural network.\n"
  },
  {
    "path": "utils_nlp/__init__.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n__title__ = \"Microsoft NLP\"\n__author__ = \"AI CAT at Microsoft\"\n__license__ = \"MIT\"\n__copyright__ = \"Copyright 2018-present Microsoft Corporation\"\n__version__ = \"2.0.0\"\n\n# Synonyms\nTITLE = __title__\nAUTHOR = __author__\nLICENSE = __license__\nCOPYRIGHT = __copyright__\nVERSION = __version__\n"
  },
  {
    "path": "utils_nlp/azureml/README.md",
    "content": "## [AzureML](.)\n\nThe AzureML submodule contains utilities to connect to a\n[workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace),\ntrain, tune and operationalize NLP systems at scale using AzureML.   \nFor example, the `DistributedCommunicator` class defined in\n[azureml_bert_util.py](./azureml_bert_util.py) assists in making communication with multiple nodes\nfor distributed training possible. [azureml_utils.py](./azureml_utils.py) contains a few helper functions that make it easy to authenticate, create, or retrieve an AzureML resource.\n"
  },
  {
    "path": "utils_nlp/azureml/__init__.py",
    "content": ""
  },
  {
    "path": "utils_nlp/azureml/azureml_bert_util.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# Original source:\n# https://github.com/microsoft/AzureML-BERT/blob/dec79be13befdd51fa72c05419cf9288d32eb263/finetune/PyTorch/azureml_bert_util.py\n\n\"\"\"\n    Classes and helper functions for fine-tuning BERT models at scale (e.g.\n    distributed training) using AzureML.\n\"\"\"\n\n\nfrom horovod.torch.mpi_ops import allreduce_async_, synchronize\nimport horovod.torch as hvd\nimport torch\n\nfrom collections import OrderedDict\n\ntry:\n    from apex_C import flatten\n    from apex_C import unflatten\nexcept ImportError:\n    try:\n        _ = warned_flatten\n    except NameError:\n        print(\n            \"Warning:  apex was installed without --cpp_ext.  Falling back to Python flatten and \"\n            \"unflatten.\"\n        )\n        warned_flatten = True\n    from torch._utils import _flatten_dense_tensors as flatten\n    from torch._utils import _unflatten_dense_tensors as unflatten\n\n\ndef warmup_linear(x, warmup=0.002):\n    if x < warmup:\n        return x / warmup\n    return 1.0 - x\n\n\ndef adjust_gradient_accumulation_steps(x, initial_steps, target_steps, warmup):\n    return min(max(int(x / warmup * target_steps), initial_steps), target_steps)\n\n\nclass DistributedCommunicator:\n    \"\"\" Assists in making communication with multiple nodes for distributed training\"\"\"\n\n    def __init__(self, accumulation_step=1):\n        hvd.init()\n        self.local_rank = hvd.local_rank()\n        self.world_size = hvd.size()\n        self.rank = hvd.rank()\n        self.n_gpu = torch.cuda.device_count()\n        self.node_count = self.world_size // self.n_gpu\n        self.accumulation_step = accumulation_step\n        self.count_down = accumulation_step - 1\n        self._multi_node = self.node_count > 1\n        if not self._multi_node:\n            # use PyTorch build-in NCCL backend for single node training\n            
torch.distributed.init_process_group(\n                backend=\"nccl\",\n                init_method=\"tcp://127.0.0.1:6000\",\n                world_size=self.n_gpu,\n                rank=self.local_rank,\n            )\n\n    def register_model(self, model, fp16):\n        #  broadcast model parameters\n        if self.node_count > 1:\n            hvd.broadcast_parameters(model.state_dict(), root_rank=0)\n        else:\n            for param in model.parameters():\n                torch.distributed.broadcast_multigpu([param], 0)\n\n        # register hook for reduce when backpropagate\n        self._parameter_names = {v: k for k, v in sorted(model.named_parameters())}\n        self._handles = {}\n        self._requires_update = set()\n        self._grad_accs = []\n        self._grad = []\n        self._compression = hvd.Compression.fp16 if fp16 else hvd.Compression.none\n        for p in model.parameters():\n            if p.requires_grad:\n                p.grad = p.data.new(p.size()).zero_()\n                self._requires_update.add(p)\n                p_tmp = p.expand_as(p)\n                grad_acc = p_tmp.grad_fn.next_functions[0][0]\n                grad_acc.register_hook(self._make_hook(p))\n                self._grad_accs.append(grad_acc)\n\n    def _allreduce_tensor(self, p):\n        assert p not in self._handles\n        assert not p.grad.requires_grad\n        tensor = p.grad\n        name = self._parameter_names.get(p)\n        if self._multi_node:\n            tensor_compressed, ctx = self._compression.compress(tensor)\n            handle = allreduce_async_(tensor_compressed, average=True, name=name)\n            self._handles[p] = (handle, ctx)\n        else:\n            self._handles[p] = tensor\n\n    def _make_hook(self, p):\n        def hook(*ignore):\n            if self.count_down == 0:\n                self._allreduce_tensor(p)\n\n        return hook\n\n    def synchronize(self):\n        synced = False\n        if self.count_down == 0:\n 
           missing_p = self._requires_update - set(self._handles.keys())\n            for p in missing_p:\n                self._allreduce_tensor(p)\n\n            if self._multi_node:\n                for p, value in self._handles.items():\n                    handle, ctx = value\n                    output = synchronize(handle)\n                    p.grad.set_(self._compression.decompress(output, ctx) / self.accumulation_step)\n            else:\n                buckets = OrderedDict()\n                for tensor in self._handles.values():\n                    tp = tensor.type()\n                    if tp not in buckets:\n                        buckets[tp] = []\n                    buckets[tp].append(tensor)\n                for tp in buckets:\n                    bucket = buckets[tp]\n                    coalesced = flatten(bucket) / self.world_size / self.accumulation_step\n                    torch.distributed.all_reduce_multigpu([coalesced])\n                    for buf, synced in zip(bucket, unflatten(coalesced, bucket)):\n                        buf.copy_(synced)\n            self._handles.clear()\n            synced = True\n            self.count_down = self.accumulation_step\n\n        self.count_down -= 1\n        return synced\n\n    def set_accumulation_step(self, accumulation_step):\n        self.accumulation_step = accumulation_step\n        self.count_down = self.accumulation_step - 1\n"
  },
  {
    "path": "utils_nlp/azureml/azureml_utils.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Helper functions for interacting with AzureML Resources.\"\"\"\n\nimport os\nfrom azureml.core.authentication import AzureCliAuthentication\nfrom azureml.core.authentication import InteractiveLoginAuthentication\nfrom azureml.core.authentication import AuthenticationException\nfrom azureml.core import Workspace\nfrom azureml.exceptions import ProjectSystemException\nfrom azureml.core.compute import ComputeTarget, AmlCompute\nfrom azureml.core.compute_target import ComputeTargetException\n\n\ndef get_auth():\n    \"\"\"\n    Method to get the correct Azure ML Authentication type\n\n    Always start with CLI Authentication and if it fails, fall back\n    to interactive login\n    \"\"\"\n    try:\n        auth_type = AzureCliAuthentication()\n        auth_type.get_authentication_header()\n    except AuthenticationException:\n        auth_type = InteractiveLoginAuthentication()\n    return auth_type\n\n\ndef get_or_create_workspace(\n    config_path=\"./.azureml\",\n    subscription_id=None,\n    resource_group=None,\n    workspace_name=None,\n    workspace_region=None,\n):\n    \"\"\"\n    Method to get or create workspace.\n\n    Args:\n        config_path: optional directory to look for / store config.json file (defaults to current\n            directory)\n        subscription_id: Azure subscription id\n        resource_group: Azure resource group to create workspace and related resources\n        workspace_name: name of azure ml workspace\n        workspace_region: region for workspace\n\n    Returns:\n        obj: AzureML workspace if one exists already with the name otherwise creates a new one.\n    \"\"\"\n    config_file_path = \".\"\n\n    if config_path is not None:\n        config_dir, config_file_name = os.path.split(config_path)\n        if config_file_name != \"config.json\":\n            config_file_path = os.path.join(config_path, 
\"config.json\")\n\n    try:\n        # get existing azure ml workspace\n        if os.path.isfile(config_file_path):\n            ws = Workspace.from_config(config_file_path, auth=get_auth())\n        else:\n            ws = Workspace.get(\n                name=workspace_name,\n                subscription_id=subscription_id,\n                resource_group=resource_group,\n                auth=get_auth(),\n            )\n\n    except ProjectSystemException:\n        # this call might take a minute or two.\n        print(\"Creating new workspace\")\n        ws = Workspace.create(\n            name=workspace_name,\n            subscription_id=subscription_id,\n            resource_group=resource_group,\n            create_resource_group=True,\n            location=workspace_region,\n            auth=get_auth(),\n        )\n\n        ws.write_config(path=config_path)\n    return ws\n\n\ndef get_or_create_amlcompute(\n    workspace,\n    compute_name,\n    vm_size=\"\",\n    min_nodes=0,\n    max_nodes=None,\n    idle_seconds_before_scaledown=None,\n    verbose=False,\n):\n    \"\"\"\n        Get or create AmlCompute as the compute target. If a cluster of the same name is found,\n        attach it and rescale accordingly. 
Otherwise, create a new cluster.\n\n    Args:\n        workspace (Workspace): workspace\n        compute_name (str): name\n        vm_size (str, optional): vm size\n        min_nodes (int, optional): minimum number of nodes in cluster\n        max_nodes (None, optional): maximum number of nodes in cluster\n        idle_seconds_before_scaledown (None, optional): how long to wait before the cluster\n            autoscales down\n        verbose (bool, optional): if true, print logs\n    Returns:\n        Compute target\n    \"\"\"\n    try:\n        if verbose:\n            print(\"Found compute target: {}\".format(compute_name))\n\n        compute_target = ComputeTarget(workspace=workspace, name=compute_name)\n        if len(compute_target.list_nodes()) < max_nodes:\n            if verbose:\n                print(\"Rescaling to {} nodes\".format(max_nodes))\n            compute_target.update(max_nodes=max_nodes)\n            compute_target.wait_for_completion(show_output=verbose)\n\n    except ComputeTargetException:\n        if verbose:\n            print(\"Creating new compute target: {}\".format(compute_name))\n\n        compute_config = AmlCompute.provisioning_configuration(\n            vm_size=vm_size,\n            min_nodes=min_nodes,\n            max_nodes=max_nodes,\n            idle_seconds_before_scaledown=idle_seconds_before_scaledown,\n        )\n        compute_target = ComputeTarget.create(workspace, compute_name, compute_config)\n        compute_target.wait_for_completion(show_output=verbose)\n\n    return compute_target\n\n\ndef get_output_files(run, output_path, file_names=None):\n    \"\"\"\n    Method to get the output files from an AzureML output directory.\n\n    Args:\n        file_names(list): Names of the files to download.\n        run(azureml.core.run.Run): Run object of the run.\n        output_path(str): Path to download the output files.\n\n    Returns: None\n\n    \"\"\"\n    os.makedirs(output_path, exist_ok=True)\n\n    if file_names 
is None:\n        file_names = run.get_file_names()\n\n    for f in file_names:\n        dest = os.path.join(output_path, f.split(\"/\")[-1])\n        print(\"Downloading file {} to {}...\".format(f, dest))\n        run.download_file(f, dest)\n"
  },
  {
    "path": "utils_nlp/common/README.md",
    "content": "## [Common](.)\n\nThis submodule contains high-level common utilities used across multiple algorithms and \nframeworks, as well as helper functions for managing aspects of different frameworks like PyTorch.  \nFor example, [pytorch_utils.py](./pytorch_utils.py) contains utilities for working with PyTorch, \nsuch as getting the device type (CPU or GPU), moving a model to a specific device, and handling \nparallelism when multiple GPUs are present.  \n"
  },
  {
    "path": "utils_nlp/common/__init__.py",
    "content": ""
  },
  {
    "path": "utils_nlp/common/pytorch_utils.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Common PyTorch utilities that facilitate building PyTorch models.\"\"\"\n\nimport torch\nfrom torch.utils.data import DataLoader, RandomSampler, SequentialSampler\nfrom torch.utils.data.distributed import DistributedSampler\n\n\ndef get_device(num_gpus=None, gpu_ids=None, local_rank=-1):\n    if gpu_ids is not None:\n        num_gpus = len(gpu_ids)\n    if local_rank == -1:\n        num_gpus = (\n            min(num_gpus, torch.cuda.device_count())\n            if num_gpus is not None\n            else torch.cuda.device_count()\n        )\n        device = torch.device(\n            \"cuda\" if torch.cuda.is_available() and num_gpus > 0 else \"cpu\"\n        )\n    else:\n        torch.cuda.set_device(local_rank)\n        device = torch.device(\"cuda\", local_rank)\n        num_gpus = 1\n    return device, num_gpus\n\n\ndef move_model_to_device(model, device):\n    if not isinstance(device, torch.device):\n        raise ValueError(\"device must be of type torch.device.\")\n\n    # unwrap model\n    # if isinstance(model, torch.nn.DataParallel):\n    model = (\n        model.module if hasattr(model, \"module\") else model\n    )  # Take care of distributed/parallel training\n\n    # move to device\n    return model.to(device)\n\n\ndef parallelize_model(model, device, num_gpus=None, gpu_ids=None, local_rank=-1):\n    \"\"\"Moves a model to the specified device (cpu or gpu/s)\n       and implements data parallelism when multiple gpus are specified.\n    Args:\n        model (Module): A PyTorch model.\n        device (torch.device): A PyTorch device.\n        num_gpus (int): The number of GPUs to be used.\n            If set to None, all available GPUs will be used.\n            Defaults to None.\n        gpu_ids (list): List of GPU IDs to be used.\n            If None, the first num_gpus GPUs will be used.\n            If not None, overrides 
num_gpus. if gpu_ids is an empty list\n            or there is no valid gpu devices are specified,\n            and device is \"cuda\", model will not be moved or parallelized.\n            Defaults to None.\n        local_rank (int): Local GPU ID within a node. Used in distributed environments.\n            If not -1, num_gpus and gpu_ids are ignored.\n            Defaults to -1.\n    Returns:\n        Module, DataParallel, DistributedDataParallel: A PyTorch Module or\n            a DataParallel/DistributedDataParallel wrapper,\n            when one or multiple gpus are used.\n    \"\"\"\n    if not isinstance(device, torch.device):\n        raise ValueError(\"device must be of type torch.device.\")\n\n    model_module = (\n        model.module if hasattr(model, \"module\") else model\n    )  # Take care of distributed/parallel training\n\n    if local_rank != -1:\n        model = torch.nn.parallel.DistributedDataParallel(\n            model_module,\n            device_ids=[local_rank],\n            output_device=local_rank,\n            find_unused_parameters=True,\n        )\n    else:\n        if device.type == \"cuda\":\n            if num_gpus is not None:\n                if num_gpus < 1:\n                    raise ValueError(\"num_gpus must be at least 1 or None\")\n            num_cuda_devices = torch.cuda.device_count()\n            if num_cuda_devices < 1:\n                raise Exception(\"CUDA devices are not available.\")\n            if gpu_ids is None:\n                num_gpus = (\n                    num_cuda_devices\n                    if num_gpus is None\n                    else min(num_gpus, num_cuda_devices)\n                )\n                gpu_ids = list(range(num_gpus))\n            else:\n                gpu_ids = list(set(list(range(num_cuda_devices))).intersection(gpu_ids))\n            if len(gpu_ids) > 0:\n                model = torch.nn.DataParallel(model_module, device_ids=gpu_ids)\n    return model\n\n\ndef 
dataloader_from_dataset(\n    ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False\n):\n    \"\"\"Creates a PyTorch DataLoader given a Dataset object.\n\n    Args:\n        ds (torch.utils.data.DataSet): A PyTorch dataset.\n        batch_size (int, optional): Batch size.\n            If more than 1 gpu is used, this would be the batch size per gpu.\n            Defaults to 32.\n        num_gpus (int, optional): The number of GPUs to be used. Defaults to None.\n        shuffle (bool, optional): If True, a RandomSampler is used. Defaults to False.\n        distributed (book, optional): If True, a DistributedSampler is used.\n        Defaults to False.\n\n    Returns:\n        Module, DataParallel: A PyTorch Module or\n            a DataParallel wrapper (when multiple gpus are used).\n    \"\"\"\n    if num_gpus is None:\n        num_gpus = torch.cuda.device_count()\n\n    batch_size = batch_size * max(1, num_gpus)\n\n    if distributed:\n        sampler = DistributedSampler(ds)\n    else:\n        sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds)\n\n    return DataLoader(ds, sampler=sampler, batch_size=batch_size)\n\n\ndef compute_training_steps(\n    dataloader, num_epochs=1, max_steps=-1, gradient_accumulation_steps=1\n):\n    \"\"\"Computes the max training steps given a dataloader.\n\n    Args:\n        dataloader (Dataloader): A PyTorch DataLoader.\n        num_epochs (int, optional): Number of training epochs. 
Defaults to 1.\n        max_steps (int, optional): Total number of training steps.\n            If set to a positive value, it overrides num_epochs.\n            Otherwise, it's determined by the dataset length,\n            gradient_accumulation_steps, and num_epochs.\n            Defaults to -1.\n        gradient_accumulation_steps (int, optional): Number of steps to accumulate\n            before performing a backward/update pass.\n            Default to 1.\n\n    Returns:\n        int: The max number of steps to be used in a training loop.\n    \"\"\"\n    try:\n        dataset_length = len(dataloader)\n    except Exception:\n        dataset_length = -1\n    if max_steps <= 0:\n        if dataset_length != -1 and num_epochs > 0:\n            max_steps = dataset_length // gradient_accumulation_steps * num_epochs\n    if max_steps <= 0:\n        raise Exception(\"Max steps cannot be determined.\")\n    return max_steps\n\n\ndef get_amp(fp16):\n    \"\"\"This function ensures that fp16 execution of torch.einsum is enabled\n        if fp16 is set. Otherwise, it'll default to \"promote\" mode,\n        where the operations are in fp32.\n        Note that setting `fp16_opt_level=\"O2\"` will remove the need for this code.\n    \"\"\"\n    # Before we do anything with models, we want to\n    if fp16:\n        try:\n            from apex import amp\n\n            amp.register_half_function(torch, \"einsum\")\n        except ImportError:\n            raise ImportError(\n                \"Please install apex from https://www.github.com/nvidia/apex\"\n            )\n    else:\n        amp = None\n    return amp\n"
  },
  {
    "path": "utils_nlp/common/timer.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Timer utilities for benchmarking running times of executions.\"\"\"\n\nfrom timeit import default_timer\n\n\nclass Timer(object):\n    \"\"\"Timer class.\n    Original code: https://github.com/miguelgfierro/codebase\n\n    Examples:\n        >>> import time\n        >>> t = Timer()\n        >>> t.start()\n        >>> time.sleep(1)\n        >>> t.stop()\n        >>> t.interval >= 1\n        True\n        >>> with Timer() as t:\n        ...   time.sleep(1)\n        >>> t.interval >= 1\n        True\n        >>> \"Time elapsed {}\".format(t) #doctest: +ELLIPSIS\n        'Time elapsed 1...'\n    \"\"\"\n\n    def __init__(self):\n        self._timer = default_timer\n        self._interval = 0\n        self.running = False\n\n    def __enter__(self):\n        self.start()\n        return self\n\n    def __exit__(self, *args):\n        self.stop()\n\n    def __str__(self):\n        return \"{:0.4f}\".format(self.interval)\n\n    def start(self):\n        \"\"\"Start the timer.\"\"\"\n        self.init = self._timer()\n        self.running = True\n\n    def stop(self):\n        \"\"\"Stop the timer. Calculate the interval in seconds.\"\"\"\n        self.end = self._timer()\n        try:\n            self._interval = self.end - self.init\n            self.running = False\n        except AttributeError:\n            raise ValueError(\n                \"Timer has not been initialized: use start() or the contextual form with Timer() \"\n                \"as t:\"\n            )\n\n    @property\n    def interval(self):\n        if self.running:\n            raise ValueError(\"Timer has not been stopped, please use stop().\")\n        else:\n            return self._interval\n"
  },
  {
    "path": "utils_nlp/dataset/README.md",
    "content": "## [Dataset](.)\nThis submodule includes helper functions for downloading datasets and formatting them appropriately as well as utilities for splitting data for training / testing.\n\n## Data Loading\nThere are dataloaders for several datasets. For example, the snli module will allow you to load a dataframe in pandas from the SNLI dataset, with the option to set the number of rows to load in order to test algorithms and evaluate performance benchmarks.\nMost datasets may be split into `train`, `dev`, and `test`, for example:\n\n```python\nfrom utils_nlp.dataset.snli import load_pandas_df\n\ndf = load_pandas_df(DATA_FOLDER, file_split =\"train\", nrows = 1000)\n```\n## Dataset List\n|Dataset|Dataloader script|\n|-------|-----------------|\n|[Microsoft Research Paraphrase Corpus](https://www.microsoft.com/en-us/download/details.aspx?id=52398)|[msrpc.py](./msrpc.py)|\n|[The Multi-Genre NLI (MultiNLI) Corpus](https://www.nyu.edu/projects/bowman/multinli/)|[multinli.py](./multinli.py)|\n|[The Stanford Natural Language Inference (SNLI) Corpus](https://nlp.stanford.edu/projects/snli/)|[snli.py](./snli.py)|\n|[Wikigold NER](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold/CONLL-format/data)|[wikigold.py](./wikigold.py)|\n|[The Cross-Lingual NLI (XNLI) Corpus](https://www.nyu.edu/projects/bowman/xnli/)|[xnli.py](./xnli.py)|\n|[The STSbenchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark)|[stsbenchmark.py](./stsbenchmark.py)|\n|[The Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/)|[squad.py](./squad.py)|\n|[CNN/Daily Mail(CNN/DM) Dataset](https://github.com/harvardnlp/sent-summary)|[cnndm.py](./cnndm.py)|\n|[Preprocessed CNN/Daily Mail(CNN/DM) Dataset for Extractive Summarization](https://github.com/nlpyang/BertSum)|[cnndm.py](./cnndm.py)|\n\n## Dataset References\nPlease see [Dataset References](../../DatasetReferences.md) for notice and information regarding datasets 
used.\n"
  },
  {
    "path": "utils_nlp/dataset/__init__.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nfrom enum import Enum\nimport nltk\n\nnltk.download(\"punkt\", quiet=True)\nnltk.download(\"stopwords\", quiet=True)\n\n\nclass Split(str, Enum):\n    TRAIN: str = \"train\"\n    DEV: str = \"dev\"\n    TEST: str = \"test\"\n"
  },
  {
    "path": "utils_nlp/dataset/bbc_hindi.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n    Utility functions for downloading, extracting, and reading the\n    BBC Hindi News Corpus.\n    https://github.com/NirantK/hindi2vec/releases/tag/bbc-hindi-v0.1\n\"\"\"\n\nimport logging\nimport os\nimport tarfile\nfrom tempfile import TemporaryDirectory\n\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\n\nfrom utils_nlp.common.pytorch_utils import dataloader_from_dataset\nfrom utils_nlp.dataset.url_utils import maybe_download\nfrom utils_nlp.models.transformers.common import MAX_SEQ_LEN\nfrom utils_nlp.models.transformers.sequence_classification import Processor\n\nURL = \"https://github.com/NirantK/hindi2vec/releases/\" \"download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz\"\n\n\ndef load_pandas_df(local_cache_path=TemporaryDirectory().name):\n    \"\"\"\n    Downloads and extracts the dataset files\n\n    Args:\n        local_cache_path (str, optional): The local file path to save the raw file.\n        Defaults to TemporaryDirectory().name.\n    Returns:\n        pd.DataFrame: pandas DataFrame containing the loaded dataset.\n    \"\"\"\n\n    zipped_file = URL.split(\"/\")[-1]\n    maybe_download(URL, zipped_file, local_cache_path)\n\n    zipped_file_path = os.path.join(local_cache_path, zipped_file)\n    tar = tarfile.open(zipped_file_path, \"r:gz\")\n    tar.extractall(path=local_cache_path)\n    tar.close()\n\n    train_csv_file_path = os.path.join(local_cache_path, \"hindi-train.csv\")\n    test_csv_file_path = os.path.join(local_cache_path, \"hindi-test.csv\")\n\n    train_df = pd.read_csv(train_csv_file_path, sep=\"\\t\", encoding=\"utf-8\", header=None)\n\n    test_df = pd.read_csv(test_csv_file_path, sep=\"\\t\", encoding=\"utf-8\", header=None)\n\n    train_df = train_df.fillna(\"\")\n    test_df = test_df.fillna(\"\")\n\n    return (train_df, test_df)\n\n\ndef 
load_tc_dataset(\n    local_path=TemporaryDirectory().name,\n    test_fraction=0.25,\n    random_seed=None,\n    train_sample_ratio=1.0,\n    test_sample_ratio=1.0,\n    model_name=\"bert-base-uncased\",\n    to_lower=True,\n    cache_dir=TemporaryDirectory().name,\n    max_len=MAX_SEQ_LEN,\n    batch_size=32,\n    num_gpus=None,\n):\n    \"\"\"\n    Load the multinli dataset and split into training and testing datasets.\n    The datasets are preprocessed and can be used to train a NER model or evaluate\n    on the testing dataset.\n\n    Args:\n        local_path (str, optional): The local file path to save the raw wikigold file.\n            Defautls to TemporaryDirectory().name.\n        test_fraction (float, optional): The fraction of testing dataset when splitting.\n            Defaults to 0.25.\n        random_seed (float, optional): Random seed used to shuffle the data.\n            Defaults to None.\n        train_sample_ratio (float, optional): The ratio that used to sub-sampling for training.\n            Defaults to 1.0.\n        test_sample_ratio (float, optional): The ratio that used to sub-sampling for testing.\n            Defaults to 1.0.\n        model_name (str, optional): The pretained model name.\n            Defaults to \"bert-base-uncased\".\n        to_lower (bool, optional): Lower case text input.\n            Defaults to True.\n        cache_dir (str, optional): The default folder for saving cache files.\n            Defaults to TemporaryDirectory().name.\n        max_len (int, optional): Maximum length of the list of tokens. Lists longer\n            than this are truncated and shorter ones are padded with \"O\"s.\n            Default value is BERT_MAX_LEN=512.\n        batch_size (int, optional): The batch size for training and testing.\n            Defaults to 32.\n        num_gpus (int, optional): The number of GPUs.\n            Defaults to None.\n\n    Returns:\n        tuple. 
The tuple contains four elements:\n        train_dataloader (DataLoader): a PyTorch DataLoader instance for training.\n\n        test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.\n\n        label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values\n            can be retrieved by calling the `inverse_transform` function.\n\n        test_labels (Series): a Pandas Series of testing label (in label ID format). If\n            the labels are in raw label values format, we will need to transform it to\n            label IDs by using the label_encoder.transform function.\n    \"\"\"\n\n    # download and load the original dataset\n    train, test = load_pandas_df(local_cache_path=local_path)\n    all_df = pd.concat([train, test], ignore_index=True)\n    all_df.columns = [\"label\", \"text\"]\n    text_col = \"text\"\n    label_col = \"label\"\n\n    # encode labels, use the \"genre\" column as the label column\n    label_encoder = LabelEncoder()\n    label_encoder.fit(all_df[label_col])\n\n    if test_fraction < 0 or test_fraction >= 1.0:\n        logging.warning(\"Invalid test fraction value: {}, changed to 0.25\".format(test_fraction))\n        test_fraction = 0.25\n\n    train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed)\n\n    if train_sample_ratio > 1.0:\n        train_sample_ratio = 1.0\n        logging.warning(\"Setting the training sample ratio to 1.0\")\n    elif train_sample_ratio < 0:\n        logging.error(\"Invalid training sample ration: {}\".format(train_sample_ratio))\n        raise ValueError(\"Invalid training sample ration: {}\".format(train_sample_ratio))\n\n    if test_sample_ratio > 1.0:\n        test_sample_ratio = 1.0\n        logging.warning(\"Setting the testing sample ratio to 1.0\")\n    elif test_sample_ratio < 0:\n        logging.error(\"Invalid testing sample ration: {}\".format(test_sample_ratio))\n        raise ValueError(\"Invalid testing 
sample ration: {}\".format(test_sample_ratio))\n\n    if train_sample_ratio < 1.0:\n        train_df = train_df.sample(frac=train_sample_ratio).reset_index(drop=True)\n    if test_sample_ratio < 1.0:\n        test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)\n\n    train_labels = label_encoder.transform(train_df[label_col])\n    train_df[label_col] = train_labels\n    test_labels = label_encoder.transform(test_df[label_col])\n    test_df[label_col] = test_labels\n\n    processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)\n\n    train_dataset = processor.dataset_from_dataframe(\n        df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,\n    )\n    train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True)\n\n    test_dataset = processor.dataset_from_dataframe(\n        df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,\n    )\n    test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False)\n\n    return (train_dataloader, test_dataloader, label_encoder, test_labels)\n\n\ndef get_label_values(label_encoder, label_ids):\n    \"\"\"\n    Get the label values from label IDs.\n\n    Args:\n        label_encoder (LabelEncoder): a fitted sklearn LabelEncoder instance\n        label_ids (Numpy array): a Numpy array of label IDs.\n\n    Returns:\n        Numpy array. A Numpy array of label values.\n    \"\"\"\n\n    return label_encoder.inverse_transform(label_ids)\n"
  },
  {
    "path": "utils_nlp/dataset/cnndm.py",
    "content": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\n# This script reuses some code from https://github.com/nlpyang/BertSum\n\n\"\"\"\n    Utility functions for downloading, extracting, and reading the\n    CNN/DM dataset at https://github.com/harvardnlp/sent-summary.\n\n\"\"\"\n\nimport nltk\n\n# nltk.download(\"punkt\")\nfrom nltk import tokenize\nfrom nltk.tokenize.treebank import TreebankWordDetokenizer\nimport os\nimport regex as re\nfrom torchtext.utils import extract_archive\n\n\nfrom utils_nlp.dataset.url_utils import (\n    maybe_download,\n    maybe_download_googledrive,\n    extract_zip,\n)\nfrom utils_nlp.models.transformers.datasets import (\n    SummarizationDataset,\n    IterableSummarizationDataset,\n)\n\n\n## have to move these local functions outside due to use of pool of processes\nREMAP = {\n    \"-lrb-\": \"(\",\n    \"-rrb-\": \")\",\n    \"-lcb-\": \"{\",\n    \"-rcb-\": \"}\",\n    \"-lsb-\": \"[\",\n    \"-rsb-\": \"]\",\n    \"``\": '\"',\n    \"''\": '\"',\n}\n\n\ndef _clean(x):\n    return re.sub(\n        r\"-lrb-|-rrb-|-lcb-|-rcb-|-lsb-|-rsb-|``|''\", lambda m: REMAP.get(m.group()), x,\n    )\n\n\ndef _remove_ttags(line):\n    line = re.sub(r\"<t>\", \"\", line)\n    # change </t> to <q>\n    # pyrouge test requires <q> as sentence splitter\n    line = re.sub(r\"</t>\", \"<q>\", line)\n    return line\n\n\ndef _target_sentence_tokenization(line):\n    return line.split(\"<q>\")\n\n\ndef join(sentences):\n    return \" \".join(sentences)\n\n\ndef CNNDMSummarizationDataset(*args, **kwargs):\n    \"\"\"Load the CNN/Daily Mail dataset preprocessed by harvardnlp group.\"\"\"\n\n    URLS = [\"https://s3.amazonaws.com/opennmt-models/Summary/cnndm.tar.gz\"]\n\n    def _setup_datasets(\n        url, top_n=-1, local_cache_path=\".data\", prepare_extractive=True\n    ):\n        FILE_NAME = \"cnndm.tar.gz\"\n        maybe_download(url, FILE_NAME, local_cache_path)\n        dataset_tar = 
os.path.join(local_cache_path, FILE_NAME)\n        extracted_files = extract_archive(dataset_tar)\n        for fname in extracted_files:\n            if fname.endswith(\"train.txt.src\"):\n                train_source_file = fname\n            if fname.endswith(\"train.txt.tgt.tagged\"):\n                train_target_file = fname\n            if fname.endswith(\"test.txt.src\"):\n                test_source_file = fname\n            if fname.endswith(\"test.txt.tgt.tagged\"):\n                test_target_file = fname\n\n        if prepare_extractive:\n\n            return (\n                SummarizationDataset(\n                    train_source_file,\n                    target_file=train_target_file,\n                    source_preprocessing=[_clean, tokenize.sent_tokenize],\n                    target_preprocessing=[\n                        _clean,\n                        _remove_ttags,\n                        _target_sentence_tokenization,\n                    ],\n                    word_tokenize=nltk.word_tokenize,\n                    top_n=top_n,\n                ),\n                SummarizationDataset(\n                    test_source_file,\n                    target_file=test_target_file,\n                    source_preprocessing=[_clean, tokenize.sent_tokenize],\n                    target_preprocessing=[\n                        _clean,\n                        _remove_ttags,\n                        _target_sentence_tokenization,\n                    ],\n                    word_tokenize=nltk.word_tokenize,\n                    top_n=top_n,\n                ),\n            )\n        else:\n            return (\n                SummarizationDataset(\n                    train_source_file,\n                    target_file=train_target_file,\n                    source_preprocessing=[_clean, tokenize.sent_tokenize],\n                    target_preprocessing=[\n                        _clean,\n                        _remove_ttags,\n                  
      _target_sentence_tokenization,\n                    ],\n                    top_n=top_n,\n                ),\n                SummarizationDataset(\n                    test_source_file,\n                    target_file=test_target_file,\n                    source_preprocessing=[_clean, tokenize.sent_tokenize],\n                    target_preprocessing=[\n                        _clean,\n                        _remove_ttags,\n                        _target_sentence_tokenization,\n                    ],\n                    top_n=top_n,\n                ),\n            )\n\n    return _setup_datasets(*((URLS[0],) + args), **kwargs)\n\n\nclass CNNDMBertSumProcessedData:\n    \"\"\"Class to load dataset preprocessed by BertSum paper at\n        https://github.com/nlpyang/BertSum\n    \"\"\"\n\n    @staticmethod\n    def download(local_path=\".data\"):\n        FILE_ID = \"1x0d61LP9UAN389YN00z0Pv-7jQgirVg6\"\n        FILE_NAME = \"bertsum_data.zip\"\n        os.makedirs(local_path, exist_ok=True)\n        output_dir = os.path.join(local_path, \"processed_data\")\n        os.makedirs(output_dir, exist_ok=True)\n        maybe_download_googledrive(\n            google_file_id=FILE_ID, file_name=FILE_NAME, work_directory=local_path\n        )\n        extract_zip(\n            file_path=os.path.join(local_path, FILE_NAME), dest_path=output_dir,\n        )\n        return output_dir\n\n\ndef detokenize(line):\n    \"\"\"\n    Detokenizes the processed CNN/DM dataset to recover the original dataset,\n    e.g. 
converts \"-LRB-\" back to \"(\" and \"-RRB-\" back to \")\".\n    \"\"\"\n    line = line.strip().replace(\"``\", '\"').replace(\"''\", '\"').replace(\"`\", \"'\")\n    twd = TreebankWordDetokenizer()\n    s_list = [\n        twd.detokenize(x.strip().split(\" \"), convert_parentheses=True)\n        for x in line.split(\"<S_SEP>\")\n    ]\n    return \" \".join(s_list)\n\n\ndef CNNDMSummarizationDatasetOrg(\n    local_path=\".\", top_n=-1, return_iterable=False, return_dev_data=False\n):\n    \"\"\"\n    Downloads a version of the CNN/DailyMail dataset with minimal processing\n    from https://github.com/microsoft/unilm/tree/master/unilm-v1\n    This version of the CNN/DM dataset was originally downloaded from\n    https://github.com/harvardnlp/sent-summary\n    and preprocessed following https://github.com/abisee/cnn-dailymail.\n\n    Args:\n        local_path (str): Path to store the downloaded data. If the data file\n            doesn't exist in this path, it's downloaded and unzipped.\n        top_n (int): Number of lines to read. Defaults to -1 and the entire dataset\n            is read.\n        return_iterable (bool): If False, returns SummarizationDataset.\n            If True, returns IterableSummarizationDataset. Defaults to False.\n        return_dev_data (bool): if False, returns train and test data splits.\n            If True, returns train, test, and dev data splits. 
Defaults to False.\n\n    Returns:\n        tuple: tuple containing train, test (, and dev) datasets.\n    \"\"\"\n\n    # Download and unzip the data\n    FILE_ID = \"1jiDbDbAsqy_5BM79SmX6aSu5DQVCAZq1\"\n    FILE_NAME = \"cnndm_data.zip\"\n\n    output_dir = os.path.join(local_path, \"cnndm_data\")\n    os.makedirs(output_dir, exist_ok=True)\n\n    # This folder contains the a version of the dataset with minimal processing\n    org_data_dir = os.path.join(output_dir, \"org_data\")\n\n    expected_data_files = set(\n        [\n            \"train.src\",\n            \"org_data\",\n            \"dev.src\",\n            \"test.tgt\",\n            \"train.tgt\",\n            \"dev.tgt\",\n            \"test.src\",\n        ]\n    )\n    expected_org_data_files = set(\n        [\n            \"training.summary\",\n            \"test.article\",\n            \"dev.article\",\n            \"training.article\",\n            \"dev.summary\",\n            \"test.summary\",\n        ]\n    )\n\n    maybe_download_googledrive(\n        google_file_id=FILE_ID, file_name=FILE_NAME, work_directory=local_path\n    )\n\n    if (\n        set(os.listdir(output_dir)) != expected_data_files\n        or set(os.listdir(org_data_dir)) != expected_org_data_files\n    ):\n        extract_zip(\n            file_path=os.path.join(local_path, FILE_NAME),\n            dest_path=output_dir,\n        )\n\n    train_source_file = os.path.join(org_data_dir, \"training.article\")\n    train_target_file = os.path.join(org_data_dir, \"training.summary\")\n    test_source_file = os.path.join(org_data_dir, \"test.article\")\n    test_target_file = os.path.join(org_data_dir, \"test.summary\")\n    dev_source_file = os.path.join(org_data_dir, \"dev.article\")\n    dev_target_file = os.path.join(org_data_dir, \"dev.summary\")\n\n    source_preprocessing = [detokenize]\n    target_preprocessing = [detokenize]\n\n    if return_iterable:\n        train_dataset = IterableSummarizationDataset(\n            
source_file=train_source_file,\n            target_file=train_target_file,\n            source_preprocessing=source_preprocessing,\n            target_preprocessing=target_preprocessing,\n            top_n=top_n,\n        )\n\n        test_dataset = IterableSummarizationDataset(\n            source_file=test_source_file,\n            target_file=test_target_file,\n            source_preprocessing=source_preprocessing,\n            target_preprocessing=target_preprocessing,\n            top_n=top_n,\n        )\n    else:\n        train_dataset = SummarizationDataset(\n            source_file=train_source_file,\n            target_file=train_target_file,\n            source_preprocessing=source_preprocessing,\n            target_preprocessing=target_preprocessing,\n            top_n=top_n,\n        )\n\n        test_dataset = SummarizationDataset(\n            source_file=test_source_file,\n            target_file=test_target_file,\n            source_preprocessing=source_preprocessing,\n            target_preprocessing=target_preprocessing,\n            top_n=top_n,\n        )\n\n    if return_dev_data:\n        if return_iterable:\n            dev_dataset = IterableSummarizationDataset(\n                source_file=dev_source_file,\n                target_file=dev_target_file,\n                source_preprocessing=source_preprocessing,\n                target_preprocessing=target_preprocessing,\n                top_n=top_n,\n            )\n        else:\n            dev_dataset = SummarizationDataset(\n                source_file=dev_source_file,\n                target_file=dev_target_file,\n                source_preprocessing=source_preprocessing,\n                target_preprocessing=target_preprocessing,\n                top_n=top_n,\n            )\n\n        return train_dataset, test_dataset, dev_dataset\n    else:\n        return train_dataset, test_dataset\n"
  },
  {
    "path": "utils_nlp/dataset/dac.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Dataset for Arabic Classification utils\nhttps://data.mendeley.com/datasets/v524p5dhpj/2\nMohamed, BINIZ (2018), “DataSet for Arabic Classification”, Mendeley Data, v2\npaper link:  (\"https://www.mendeley.com/catalogue/\n        arabic-text-classification-using-deep-learning-technics/\")\n\"\"\"\n\nimport logging\nimport os\nfrom tempfile import TemporaryDirectory\n\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\n\nfrom utils_nlp.common.pytorch_utils import dataloader_from_dataset\nfrom utils_nlp.dataset.url_utils import extract_zip, maybe_download\nfrom utils_nlp.models.transformers.common import MAX_SEQ_LEN\nfrom utils_nlp.models.transformers.sequence_classification import Processor\n\nURL = (\n    \"https://data.mendeley.com/datasets/v524p5dhpj/2\"\n    \"/files/72c2e306-9538-4c74-a28f-558fbe87c382/\"\n    \"arabic_dataset_classifiction.csv.zip\"\n)\n\n\ndef load_pandas_df(local_cache_path=None, num_rows=None):\n    \"\"\"Downloads and extracts the dataset files\n    Args:\n        local_cache_path ([type], optional): [description]. Defaults to None.\n        num_rows (int): Number of rows to load. 
If None, all data is loaded.\n    Returns:\n        pd.DataFrame: pandas DataFrame containing the loaded dataset.\n    \"\"\"\n    zip_file = URL.split(\"/\")[-1]\n    maybe_download(URL, zip_file, local_cache_path)\n\n    zip_file_path = os.path.join(local_cache_path, zip_file)\n    csv_file_path = os.path.join(local_cache_path, zip_file.replace(\".zip\", \"\"))\n\n    if not os.path.exists(csv_file_path):\n        extract_zip(file_path=zip_file_path, dest_path=local_cache_path)\n    return pd.read_csv(csv_file_path, nrows=num_rows)\n\n\ndef load_tc_dataset(\n    local_path=TemporaryDirectory().name,\n    test_fraction=0.25,\n    random_seed=None,\n    train_sample_ratio=1.0,\n    test_sample_ratio=1.0,\n    model_name=\"bert-base-uncased\",\n    to_lower=True,\n    cache_dir=TemporaryDirectory().name,\n    max_len=MAX_SEQ_LEN,\n    batch_size=32,\n    num_gpus=None,\n):\n    \"\"\"\n    Load the multinli dataset and split into training and testing datasets.\n    The datasets are preprocessed and can be used to train a NER model or evaluate\n    on the testing dataset.\n\n    Args:\n        local_path (str, optional): The local file path to save the raw wikigold file.\n            Defautls to TemporaryDirectory().name.\n        test_fraction (float, optional): The fraction of testing dataset when splitting.\n            Defaults to 0.25.\n        random_seed (float, optional): Random seed used to shuffle the data.\n            Defaults to None.\n        train_sample_ratio (float, optional): The ratio that used to sub-sampling for training.\n            Defaults to 1.0.\n        test_sample_ratio (float, optional): The ratio that used to sub-sampling for testing.\n            Defaults to 1.0.\n        model_name (str, optional): The pretained model name.\n            Defaults to \"bert-base-uncased\".\n        to_lower (bool, optional): Lower case text input.\n            Defaults to True.\n        cache_dir (str, optional): The default folder for saving cache 
files.\n            Defaults to TemporaryDirectory().name.\n        max_len (int, optional): Maximum length of the list of tokens. Lists longer\n            than this are truncated and shorter ones are padded with \"O\"s. \n            Default value is BERT_MAX_LEN=512.\n        batch_size (int, optional): The batch size for training and testing.\n            Defaults to 32.\n        num_gpus (int, optional): The number of GPUs.\n            Defaults to None.\n\n    Returns:\n        tuple. The tuple contains four elements:\n        train_dataloader (DataLoader): a PyTorch DataLoader instance for training.\n\n        test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.\n        \n        label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values\n            can be retrieved by calling the `inverse_transform` function.\n        \n        test_labels (Series): a Pandas Series of testing label (in label ID format). If\n            the labels are in raw label values format, we will need to transform it to \n            label IDs by using the label_encoder.transform function.\n    \"\"\"\n\n    # download and load the original dataset\n    all_df = load_pandas_df(local_cache_path=local_path, num_rows=None)\n\n    # set the text and label columns\n    text_col = all_df.columns[0]\n    label_col = all_df.columns[1]\n\n    label_encoder = LabelEncoder()\n    label_encoder.fit([\"culture\", \"diverse\", \"economy\", \"politics\", \"sports\"])\n\n    # remove empty documents\n    all_df = all_df[all_df[text_col].isna() == False]\n\n    if test_fraction < 0 or test_fraction >= 1.0:\n        logging.warning(\"Invalid test fraction value: {}, changed to 0.25\".format(test_fraction))\n        test_fraction = 0.25\n\n    train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed)\n\n    if train_sample_ratio > 1.0:\n        train_sample_ratio = 1.0\n        logging.warning(\"Setting the 
training sample ratio to 1.0\")\n    elif train_sample_ratio < 0:\n        logging.error(\"Invalid training sample ration: {}\".format(train_sample_ratio))\n        raise ValueError(\"Invalid training sample ration: {}\".format(train_sample_ratio))\n\n    if test_sample_ratio > 1.0:\n        test_sample_ratio = 1.0\n        logging.warning(\"Setting the testing sample ratio to 1.0\")\n    elif test_sample_ratio < 0:\n        logging.error(\"Invalid testing sample ration: {}\".format(test_sample_ratio))\n        raise ValueError(\"Invalid testing sample ration: {}\".format(test_sample_ratio))\n\n    if train_sample_ratio < 1.0:\n        train_df = train_df.sample(frac=train_sample_ratio).reset_index(drop=True)\n    if test_sample_ratio < 1.0:\n        test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)\n\n    processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)\n\n    train_dataset = processor.dataset_from_dataframe(\n        df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,\n    )\n    train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True)\n\n    test_dataset = processor.dataset_from_dataframe(\n        df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,\n    )\n    test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False)\n\n    # the DAC dataset already converted the labels to label ID format\n    test_labels = test_df[label_col]\n    return (train_dataloader, test_dataloader, label_encoder, test_labels)\n\n\ndef get_label_values(label_encoder, label_ids):\n    \"\"\"\n    Get the label values from label IDs. \n\n    Args:\n        label_encoder (LabelEncoder): a fitted sklearn LabelEncoder instance\n        label_ids (Numpy array): a Numpy array of label IDs.\n\n    Returns:\n        Numpy array. 
A Numpy array of label values.\n    \"\"\"\n\n    return label_encoder.inverse_transform(label_ids)\n"
  },
  {
    "path": "utils_nlp/dataset/data_loaders.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Data loaders for sampling and reading large files that can not fit into the memory.\"\"\"\n\nimport random\nimport dask.dataframe as dd\n\n\nclass DaskCSVLoader:\n    \"\"\"Class for creating and using a loader for large file of type csv or other delimited\n    files. The loader uses dask to read smaller partitions of a file into memory (one partition\n    at a time), before sampling batches from the partitions.\"\"\"\n\n    def __init__(self, file_path, sep=\",\", header=\"infer\", block_size=10e6, random_seed=None):\n        \"\"\"Initializes the loader.\n\n        Args:\n            file_path (str): Path to delimited file.\n            sep (str, optional): Delimiter. Defaults to \",\".\n            header (str, optional): Number of rows to be used as the header.\n                See pandas.read_csv()\n                Defaults to \"infer\".\n            block_size (int, optional): Size of partition in bytes.\n                See dask.dataframe.read_csv()\n                Defaults to 10e6.\n            random_seed (int, optional): Random seed. 
See random.seed().\n                Defaults to None.\n        \"\"\"\n\n        self.df = dd.read_csv(file_path, sep=sep, header=header, blocksize=block_size)\n\n        self.random_seed = random_seed\n        random.seed(random_seed)\n\n    def get_random_batches(self, num_batches, batch_size):\n        \"\"\"Creates a random-batch generator.\n            Batches returned are pandas dataframes of length=batch_size.\n            Note: If the sampled partition has less rows than the\n            specified batch_size, then a smaller batch of the same\n            size as that partition's number of rows is returned.\n\n        Args:\n            num_batches (int): Number of batches to generate.\n            batch_size (int]): Batch size.\n        \"\"\"\n        for i in range(num_batches):\n            rnd_part_idx = random.randint(0, self.df.npartitions - 1)\n            sample_part = self.df.partitions[rnd_part_idx].compute()\n            if sample_part.shape[0] > batch_size:\n                yield sample_part.sample(batch_size, random_state=self.random_seed)\n            else:\n                yield sample_part\n\n    def get_sequential_batches(self, batch_size):\n        \"\"\"Creates a sequential generator.\n            Batches returned are pandas dataframes of length=batch_size.\n            Note: Final batch might be of smaller size.\n\n        Args:\n            batch_size (int): Batch size.\n        \"\"\"\n        for i in range(self.df.npartitions):\n            part = self.df.partitions[i].compute()\n            for j in range(0, part.shape[0], batch_size):\n                yield part.iloc[j : j + batch_size, :]\n\n\nclass DaskJSONLoader:\n    \"\"\"Class for creating and using a loader for large file in json format. 
The loader uses dask to\n    read smaller partitions of a file into memory (one partition at a time), before sampling\n    batches from the partitions.\"\"\"\n\n    def __init__(self, file_path, block_size=10e6, random_seed=None, lines=True):\n        \"\"\"Initializes the loader.\n\n        Args:\n            file_path (str): Path to delimited file.\n            block_size (int, optional): Size of partition in bytes.\n                See dask.dataframe.read_csv()\n                Defaults to 10e6.\n            random_seed (int, optional): Random seed. See random.seed().\n                Defaults to None.\n            lines (bool, optional): Read the file as a json object per line. Defaults to True.\n        \"\"\"\n\n        self.df = dd.read_json(file_path, blocksize=block_size, lines=lines)\n\n        self.random_seed = random_seed\n        random.seed(random_seed)\n\n    def get_random_batches(self, num_batches, batch_size):\n        \"\"\"Creates a random-batch generator.\n            Batches returned are pandas dataframes of length=batch_size.\n            Note: If the sampled partition has less rows than the\n            specified batch_size, then a smaller batch of the same\n            size as that partition's number of rows is returned.\n\n        Args:\n            num_batches (int): Number of batches to generate.\n            batch_size (int]): Batch size.\n        \"\"\"\n        for i in range(num_batches):\n            rnd_part_idx = random.randint(0, self.df.npartitions - 1)\n            sample_part = self.df.partitions[rnd_part_idx].compute()\n            if sample_part.shape[0] > batch_size:\n                yield sample_part.sample(batch_size, random_state=self.random_seed)\n            else:\n                yield sample_part\n\n    def get_sequential_batches(self, batch_size, num_batches=None):\n        \"\"\"Creates a sequential generator.\n            Batches returned are pandas dataframes of length=batch_size.\n            Note: Final batch 
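might be of smaller size.\n\n        Args:\n            num_batches (int, optional): Number of leading partitions to read.\n                Each partition is yielded in chunks of batch_size rows, so more\n                than num_batches batches may be produced.\n                Defaults to None, in which case all partitions are read.\n            batch_size (int): Batch size.\n        \"\"\"\n\n        if num_batches is None:\n            num_batches = self.df.npartitions\n        for i in range(num_batches):\n            part = self.df.partitions[i].compute()\n            for j in range(0, part.shape[0], batch_size):\n                yield part.iloc[j : j + batch_size, :]\n"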
  },
  {
    "path": "utils_nlp/dataset/msrpc.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n    Utility functions for downloading, extracting, and reading the Microsoft\n    Research Paraphrase Corpus (MSRPC) dataset.\n    https://www.microsoft.com/en-us/download/details.aspx?id=52398\n\"\"\"\n\nimport os\nimport pathlib\n\nimport pandas as pd\n\nfrom utils_nlp.dataset.url_utils import maybe_download, download_path\n\nDATASET_DICT = {\n    \"train\": \"msr_paraphrase_train.txt\",\n    \"test\": \"msr_paraphrase_test.txt\",\n    \"all\": \"msr_paraphrase_data.txt\",\n}\n\n\ndef download_msrpc(download_dir):\n    \"\"\"Downloads Windows Installer for Microsoft Paraphrase Corpus.\n    \n    Args:\n        download_dir (str): File path for the downloaded file\n\n    Returns:\n        str: file_path to the downloaded dataset.\n    \"\"\"\n\n    url = (\n        \"https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B\"\n        \"-3604ED519838/MSRParaphraseCorpus.msi\"\n    )\n    return maybe_download(url, work_directory=download_dir)\n\n\ndef load_pandas_df(local_cache_path=None, dataset_type=\"train\"):\n    \"\"\"Load pandas dataframe and clean the data from the downloaded dataset\n\n    Args:\n        the dataset is already downloaded.\n        dataset_type (str): Key to the DATASET_DICT item. 
Loads the dataset specified.\n        Could be train or test.\n        local_cache_path (str): Path to download the dataset installer.\n\n    Returns:\n        pd.DataFrame: A pandas dataframe with 3 columns, Sentence 1, Sentence 2 and\n        score.\n\n    \"\"\"\n\n    if dataset_type not in DATASET_DICT.keys():\n        raise Exception(\"Dataset type not found!\")\n\n    with download_path(local_cache_path) as path:\n        path = pathlib.Path(path)\n        installer_datapath = download_msrpc(path)\n\n        print(\n            \"The Windows Installer for Mircosoft Paraphrase Corpus has been \" \"downloaded at \",\n            installer_datapath,\n            \"\\n\",\n        )\n        data_directory = input(\"Please install and provide the installed directory. Thanks! \\n\")\n\n        data_directory = pathlib.Path(data_directory)\n        assert os.path.exists(data_directory)\n\n        fields = [\"Quality\", \"#1 String\", \"#2 String\"]\n        file_path = os.path.join(data_directory, DATASET_DICT[dataset_type])\n        df = (\n            pd.read_csv(file_path, delimiter=\"\\t\", error_bad_lines=False, usecols=fields)\n            .dropna()\n            .rename(\n                index=str,\n                columns={\"Quality\": \"score\", \"#1 String\": \"sentence1\", \"#2 String\": \"sentence2\"},\n            )\n        )\n        return df\n"
  },
  {
    "path": "utils_nlp/dataset/multinli.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n    Utility functions for downloading, extracting, and reading the\n    Multi-Genre NLI (MultiNLI) Corpus.\n    https://www.nyu.edu/projects/bowman/multinli/\n\"\"\"\n\nimport logging\nimport os\nfrom tempfile import TemporaryDirectory\n\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\n\nfrom utils_nlp.common.pytorch_utils import dataloader_from_dataset\nfrom utils_nlp.dataset.data_loaders import DaskJSONLoader\nfrom utils_nlp.dataset.url_utils import extract_zip, maybe_download\nfrom utils_nlp.models.transformers.common import MAX_SEQ_LEN\nfrom utils_nlp.models.transformers.sequence_classification import Processor\n\nURL = \"http://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip\"\n\n# Source - https://github.com/nyu-mll/jiant/blob/master/scripts/download_glue_data.py\nURL_JIANT_MNLI_TSV = \"https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce\"\nDATA_FILES = {\n    \"train\": \"multinli_1.0/multinli_1.0_train.jsonl\",\n    \"dev_matched\": \"multinli_1.0/multinli_1.0_dev_matched.jsonl\",\n    \"dev_mismatched\": \"multinli_1.0/multinli_1.0_dev_mismatched.jsonl\",\n}\n\n\ndef download_file_and_extract(\n    local_cache_path: str = \".\", file_split: str = \"train\"\n) -> None:\n    \"\"\"Download and extract the dataset files\n\n    Args:\n        local_cache_path (str [optional]) -- Directory to cache files to. 
Defaults to current working directory (default: {\".\"})\n        file_split {str} -- [description] (default: {\"train\"})\n    \n    Returns:\n        None -- Nothing is returned\n    \"\"\"\n    file_name = URL.split(\"/\")[-1]\n    maybe_download(URL, file_name, local_cache_path)\n\n    if not os.path.exists(os.path.join(local_cache_path, DATA_FILES[file_split])):\n        extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)\n\n\ndef download_tsv_files_and_extract(local_cache_path: str = \".\") -> None:\n    \"\"\"Download and extract the dataset files in tsv format from NYU Jiant \n        downloads both original and tsv formatted data. \n\n    Args:\n        local_cache_path (str [optional]) -- Directory to cache files to. Defaults to current working directory (default: {\".\"})\n    \n    Returns:\n        None -- Nothing is returned\n    \"\"\"\n    try:\n        folder_name = \"MNLI\"\n        file_name = f\"{folder_name}.zip\"\n        maybe_download(URL_JIANT_MNLI_TSV, file_name, local_cache_path)\n        if not os.path.exists(os.path.join(local_cache_path, folder_name)):\n            extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)\n\n        # Clean up zip download\n        if os.path.exists(os.path.join(local_cache_path, file_name)):\n            os.remove(os.path.join(local_cache_path, file_name))\n    except IOError as e:\n        raise (e)\n    print(\"Downloaded file to: \", os.path.join(local_cache_path, folder_name))\n\n\ndef load_pandas_df(local_cache_path=\".\", file_split=\"train\"):\n    \"\"\"Loads extracted dataset into pandas\n    Args:\n        local_cache_path ([type], optional): [description]. 
Defaults to current working directory.\n        file_split (str, optional): The subset to load.\n            One of: {\"train\", \"dev_matched\", \"dev_mismatched\"}\n            Defaults to \"train\".\n    Returns:\n        pd.DataFrame: pandas DataFrame containing the specified\n            MultiNLI subset.\n    \"\"\"\n    try:\n        download_file_and_extract(local_cache_path, file_split)\n    except Exception as e:\n        raise e\n    return pd.read_json(\n        os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True\n    )\n\n\ndef get_generator(\n    local_cache_path=\".\",\n    file_split=\"train\",\n    block_size=10e6,\n    batch_size=10e6,\n    num_batches=None,\n):\n    \"\"\" Returns an extracted dataset as a random batch generator that\n    yields pandas dataframes.\n    Args:\n        local_cache_path ([type], optional): [description]. Defaults to None.\n        file_split (str, optional): The subset to load.\n            One of: {\"train\", \"dev_matched\", \"dev_mismatched\"}\n            Defaults to \"train\".\n        block_size (int, optional): Size of partition in bytes.\n        num_batches (int): Number of batches to generate.\n        batch_size (int]): Batch size.\n    Returns:\n        Generator[pd.Dataframe, None, None] : Random batch generator that yields pandas dataframes.\n    \"\"\"\n\n    try:\n        download_file_and_extract(local_cache_path, file_split)\n    except Exception as e:\n        raise e\n\n    loader = DaskJSONLoader(\n        os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size\n    )\n\n    return loader.get_sequential_batches(\n        batch_size=int(batch_size), num_batches=num_batches\n    )\n\n\ndef load_tc_dataset(\n    local_path=TemporaryDirectory().name,\n    test_fraction=0.25,\n    random_seed=None,\n    train_sample_ratio=1.0,\n    test_sample_ratio=1.0,\n    model_name=\"bert-base-uncased\",\n    to_lower=True,\n    cache_dir=TemporaryDirectory().name,\n    
max_len=MAX_SEQ_LEN,\n    batch_size=32,\n    num_gpus=None,\n):\n    \"\"\"\n    Load the multinli dataset and split into training and testing datasets.\n    The datasets are preprocessed and can be used to train a NER model or evaluate\n    on the testing dataset.\n\n    Args:\n        local_path (str, optional): The local file path to save the raw wikigold file.\n            Defautls to TemporaryDirectory().name.\n        test_fraction (float, optional): The fraction of testing dataset when splitting.\n            Defaults to 0.25.\n        random_seed (float, optional): Random seed used to shuffle the data.\n            Defaults to None.\n        train_sample_ratio (float, optional): The ratio that used to sub-sampling for training.\n            Defaults to 1.0.\n        test_sample_ratio (float, optional): The ratio that used to sub-sampling for testing.\n            Defaults to 1.0.\n        model_name (str, optional): The pretained model name.\n            Defaults to \"bert-base-uncased\".\n        to_lower (bool, optional): Lower case text input.\n            Defaults to True.\n        cache_dir (str, optional): The default folder for saving cache files.\n            Defaults to TemporaryDirectory().name.\n        max_len (int, optional): Maximum length of the list of tokens. Lists longer\n            than this are truncated and shorter ones are padded with \"O\"s. \n            Default value is BERT_MAX_LEN=512.\n        batch_size (int, optional): The batch size for training and testing.\n            Defaults to 32.\n        num_gpus (int, optional): The number of GPUs.\n            Defaults to None.\n\n    Returns:\n        tuple. The tuple contains four elements:\n        train_dataloader (DataLoader): a PyTorch DataLoader instance for training.\n\n        test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.\n        \n        label_encoder (LabelEncoder): a sklearn LabelEncoder instance. 
The label values\n            can be retrieved by calling the `inverse_transform` function.\n        \n        test_labels (Series): a Pandas Series of testing label (in label ID format). If\n            the labels are in raw label values format, we will need to transform it to \n            label IDs by using the label_encoder.transform function.\n    \"\"\"\n\n    # download and load the original dataset\n    all_df = load_pandas_df(local_cache_path=local_path, file_split=\"train\")\n\n    # select the examples corresponding to one of the entailment labels (neutral\n    # in this case) to avoid duplicate rows, as the sentences are not unique,\n    # whereas the sentence pairs are.\n    all_df = all_df[all_df[\"gold_label\"] == \"neutral\"]\n    text_col = \"sentence1\"\n    label_col = \"genre\"\n\n    # encode labels, use the \"genre\" column as the label column\n    label_encoder = LabelEncoder()\n    label_encoder.fit(all_df[label_col])\n\n    if test_fraction < 0 or test_fraction >= 1.0:\n        logging.warning(\n            \"Invalid test fraction value: {}, changed to 0.25\".format(test_fraction)\n        )\n        test_fraction = 0.25\n\n    train_df, test_df = train_test_split(\n        all_df, train_size=(1.0 - test_fraction), random_state=random_seed\n    )\n\n    if train_sample_ratio > 1.0:\n        train_sample_ratio = 1.0\n        logging.warning(\"Setting the training sample ratio to 1.0\")\n    elif train_sample_ratio < 0:\n        logging.error(\"Invalid training sample ration: {}\".format(train_sample_ratio))\n        raise ValueError(\n            \"Invalid training sample ration: {}\".format(train_sample_ratio)\n        )\n\n    if test_sample_ratio > 1.0:\n        test_sample_ratio = 1.0\n        logging.warning(\"Setting the testing sample ratio to 1.0\")\n    elif test_sample_ratio < 0:\n        logging.error(\"Invalid testing sample ration: {}\".format(test_sample_ratio))\n        raise ValueError(\"Invalid testing sample ration: 
{}\".format(test_sample_ratio))\n\n    if train_sample_ratio < 1.0:\n        train_df = train_df.sample(frac=train_sample_ratio).reset_index(drop=True)\n    if test_sample_ratio < 1.0:\n        test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)\n\n    train_labels = label_encoder.transform(train_df[label_col])\n    train_df[label_col] = train_labels\n    test_labels = label_encoder.transform(test_df[label_col])\n    test_df[label_col] = test_labels\n\n    processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)\n\n    train_dataset = processor.dataset_from_dataframe(\n        df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,\n    )\n    train_dataloader = dataloader_from_dataset(\n        train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True\n    )\n\n    test_dataset = processor.dataset_from_dataframe(\n        df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,\n    )\n    test_dataloader = dataloader_from_dataset(\n        test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False\n    )\n\n    return (train_dataloader, test_dataloader, label_encoder, test_labels)\n\n\ndef get_label_values(label_encoder, label_ids):\n    \"\"\"\n    Get the label values from label IDs. \n\n    Args:\n        label_encoder (LabelEncoder): a fitted sklearn LabelEncoder instance\n        label_ids (Numpy array): a Numpy array of label IDs.\n\n    Returns:\n        Numpy array. A Numpy array of label values.\n    \"\"\"\n\n    return label_encoder.inverse_transform(label_ids)\n"
  },
  {
    "path": "utils_nlp/dataset/ner_utils.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Common helper functions for preprocessing Named Entity Recognition (NER) datasets.\"\"\"\n\n\ndef preprocess_conll(text, sep=\"\\t\"):\n    \"\"\"\n    Converts data in CoNLL format to word and label lists.\n\n    Args:\n        text (str): Text string in conll format, e.g.\n            \"Amy B-PER\n             ADAMS I-PER\n             works O\n             at O\n             the O\n             University B-ORG\n             of I-ORG\n             Minnesota I-ORG\n             . O\"\n        sep (str, optional): Column separator\n            Defaults to \\t\n    Returns:\n        tuple:\n            (list of word lists, list of token label lists)\n    \"\"\"\n    text_list = text.split(\"\\n\\n\")\n    if text_list[-1] in (\" \", \"\"):\n        text_list = text_list[:-1]\n\n    max_seq_len = 0\n    sentence_list = []\n    labels_list = []\n    for s in text_list:\n        # split each sentence string into \"word label\" pairs\n        s_split = s.split(\"\\n\")\n        # split \"word label\" pairs\n        s_split_split = [t.split(sep) for t in s_split]\n        sentence_list.append([t[0] for t in s_split_split if len(t) > 1])\n        labels_list.append([t[1] for t in s_split_split if len(t) > 1])\n        if len(s_split_split) > max_seq_len:\n            max_seq_len = len(s_split_split)\n    print(\"Maximum sequence length is: {0}\".format(max_seq_len))\n    return sentence_list, labels_list\n\n\ndef read_conll_file(file_path, sep=\"\\t\", encoding=None):\n    \"\"\"\n    Reads a data file in CoNLL format and returns word and label lists.\n\n    Args:\n        file_path (str): Data file path.\n        sep (str, optional): Column separator. 
Defaults to \"\\t\".\n        encoding (str): File encoding used when reading the file.\n            Defaults to None.\n\n    Returns:\n        (list, list): A tuple of word and label lists (list of lists).\n    \"\"\"\n    with open(file_path, encoding=encoding) as f:\n        data = f.read()\n    return preprocess_conll(data, sep=sep)\n"
  },
  {
    "path": "utils_nlp/dataset/preprocess.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n    Utility functions for common text preprocessing tasks like converting to\n    lower case, removing stop words, convert to unicode, etc.\n\"\"\"\n\nimport pandas as pd\nimport spacy\nimport nltk\nfrom nltk.corpus import stopwords\n\n\ndef to_lowercase_all(df):\n    \"\"\"\n    This function transforms all strings in the dataframe to lowercase\n\n    Args:\n        df (pd.DataFrame): Raw dataframe with some text columns.\n\n    Returns:\n        pd.DataFrame: Dataframe with lowercase standardization.\n    \"\"\"\n    return df.applymap(lambda s: s.lower() if type(s) == str else s)\n\n\ndef to_lowercase(df, column_names=[]):\n    \"\"\"\n    This function transforms strings of the column names in the dataframe\n    passed to lowercase\n\n    Args:\n        df (pd.DataFrame): Raw dataframe with some text columns.\n        column_names(list, optional): column names to be changed to lowercase.\n\n    Returns:\n        pd.DataFrame: Dataframe with columns with lowercase standardization.\n    \"\"\"\n    if not column_names:\n        return to_lowercase_all(df)\n    else:\n        df[column_names] = df[column_names].applymap(lambda s: s.lower() if type(s) == str else s)\n        return df\n\n\ndef to_spacy_tokens(\n    df,\n    sentence_cols=[\"sentence1\", \"sentence2\"],\n    token_cols=[\"sentence1_tokens\", \"sentence2_tokens\"],\n):\n    \"\"\"\n    This function tokenizes the sentence pairs using spaCy, defaulting to the\n    spaCy en_core_web_sm model\n\n    Args:\n        df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.\n        sentence_cols (list, optional): Column names of the raw sentence pairs.\n        token_cols (list, optional): Column names for the tokenized sentences.\n\n    Returns:\n        pd.DataFrame: Dataframe with new columns token_cols, each containing\n                            a list of tokens for 
their respective sentences.\n    \"\"\"\n    nlp = spacy.load(\"en_core_web_sm\")\n    text_df = df[sentence_cols]\n    nlp_df = text_df.applymap(lambda x: nlp(x))\n    tok_df = nlp_df.applymap(lambda doc: [token.text for token in doc])\n    tok_df.columns = token_cols\n    tokenized = pd.concat([df, tok_df], axis=1)\n    return tokenized\n\n\ndef rm_spacy_stopwords(\n    df,\n    sentence_cols=[\"sentence1\", \"sentence2\"],\n    stop_cols=[\"sentence1_tokens_rm_stopwords\", \"sentence2_tokens_rm_stopwords\"],\n    custom_stopwords=[],\n):\n    \"\"\"\n    This function tokenizes the sentence pairs using spaCy and remove\n    stopwords, defaulting to the spaCy en_core_web_sm model\n\n    Args:\n        df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.\n        sentence_cols (list, optional): Column names for the raw sentence\n        pairs.\n        stop_cols (list, optional): Column names for the tokenized sentences\n            without stop words.\n        custom_stopwords (list of str, optional): List of custom stopwords to\n            register with the spaCy model.\n\n    Returns:\n        pd.DataFrame: Dataframe with new columns stop_cols, each containing a\n            list of tokens for their respective sentences.\n    \"\"\"\n    nlp = spacy.load(\"en_core_web_sm\")\n    if len(custom_stopwords) > 0:\n        for csw in custom_stopwords:\n            nlp.vocab[csw].is_stop = True\n    text_df = df[sentence_cols]\n    nlp_df = text_df.applymap(lambda x: nlp(x))\n    stop_df = nlp_df.applymap(lambda doc: [token.text for token in doc if not token.is_stop])\n    stop_df.columns = stop_cols\n    return pd.concat([df, stop_df], axis=1)\n\n\ndef to_nltk_tokens(\n    df,\n    sentence_cols=[\"sentence1\", \"sentence2\"],\n    token_cols=[\"sentence1_tokens\", \"sentence2_tokens\"],\n):\n    \"\"\"\n    This function converts a sentence to word tokens using nltk.\n\n    Args:\n        df (pd.DataFrame): Dataframe with columns sentence_cols to 
tokenize.\n        sentence_cols (list, optional): Column names for the raw sentences.\n        token_cols (list, optional): Column names for the tokenized sentences.\n\n    Returns:\n        pd.DataFrame: Dataframe with new columns token_cols, each containing a\n        list of tokens for their respective sentences.\n    \"\"\"\n    text_df = df[sentence_cols]\n    tok_df = text_df.applymap(lambda sentence: nltk.word_tokenize(sentence))\n    tok_df.columns = token_cols\n    tokenized = pd.concat([df, tok_df], axis=1)\n    return tokenized\n\n\ndef rm_nltk_stopwords(\n    df,\n    sentence_cols=[\"sentence1\", \"sentence2\"],\n    stop_cols=[\"sentence1_tokens_rm_stopwords\", \"sentence2_tokens_rm_stopwords\"],\n):\n    \"\"\"\n    This function removes stop words from a sentence using nltk.\n\n    Args:\n        df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.\n        sentence_cols (list, optional): Column names for the raw sentences.\n        stop_cols (list, optional): Column names for the tokenized sentences\n            without stop words.\n\n    Returns:\n        pd.DataFrame: Dataframe with new columns stop_cols, each containing a\n        list of tokens for their respective sentences.\n    \"\"\"\n\n    nltk.download(\"stopwords\")\n    stop_words = tuple(stopwords.words(\"english\"))\n    text_df = df[sentence_cols]\n    stop_df = text_df.applymap(lambda sentence: nltk.word_tokenize(sentence)).applymap(\n        lambda l: [word for word in l if word not in stop_words]\n    )\n\n    stop_df.columns = stop_cols\n    return pd.concat([df, stop_df], axis=1)\n\n\ndef convert_to_unicode(input_text, encoding=\"utf-8\"):\n    \"\"\"Converts input_text to Unicode. Input must be utf-8.\"\"\"\n    if isinstance(input_text, str):\n        return input_text\n    elif isinstance(input_text, bytes):\n        return input_text.decode(encoding, \"ignore\")\n"
  },
  {
    "path": "utils_nlp/dataset/sentence_selection.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# This script reuses some code from https://github.com/nlpyang/BertSum\n\n\nimport itertools\nimport re\n\n\ndef _get_ngrams(n, text):\n    \"\"\"Calcualtes n-grams.\n    Args:\n      n: which n-grams to calculate\n      text: An array of tokens\n    Returns:\n      A set of n-grams\n    \"\"\"\n    ngram_set = set()\n    text_length = len(text)\n    max_index_ngram_start = text_length - n\n    for i in range(max_index_ngram_start + 1):\n        ngram_set.add(tuple(text[i:i + n]))\n    return ngram_set\n\n\ndef _get_word_ngrams(n, sentences):\n    \"\"\"Calculates word n-grams for multiple sentences.\n    \"\"\"\n    assert len(sentences) > 0\n    assert n > 0\n\n    # words = _split_into_words(sentences)\n\n    words = sum(sentences, [])\n    # words = [w for w in words if w not in stopwords]\n    return _get_ngrams(n, words)\n\n\ndef cal_rouge(evaluated_ngrams, reference_ngrams):\n    reference_count = len(reference_ngrams)\n    evaluated_count = len(evaluated_ngrams)\n\n    overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)\n    overlapping_count = len(overlapping_ngrams)\n\n    if evaluated_count == 0:\n        precision = 0.0\n    else:\n        precision = overlapping_count / evaluated_count\n\n    if reference_count == 0:\n        recall = 0.0\n    else:\n        recall = overlapping_count / reference_count\n\n    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))\n    return {\"f\": f1_score, \"p\": precision, \"r\": recall}\n\n\ndef combination_selection(doc_sent_list, abstract_sent_list, summary_size):\n    def _rouge_clean(s):\n        return re.sub(r'[^a-zA-Z0-9 ]', '', s)\n\n    max_rouge = 0.0\n    max_idx = (0, 0)\n    abstract = sum(abstract_sent_list, [])\n    abstract = _rouge_clean(' '.join(abstract)).split()\n    sents = [_rouge_clean(' '.join(s)).split() for s in doc_sent_list]\n    
evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]\n    reference_1grams = _get_word_ngrams(1, [abstract])\n    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]\n    reference_2grams = _get_word_ngrams(2, [abstract])\n\n    impossible_sents = []\n    for s in range(summary_size + 1):\n        combinations = itertools.combinations([i for i in range(len(sents)) if i not in impossible_sents], s + 1)\n        for c in combinations:\n            candidates_1 = [evaluated_1grams[idx] for idx in c]\n            candidates_1 = set.union(*map(set, candidates_1))\n            candidates_2 = [evaluated_2grams[idx] for idx in c]\n            candidates_2 = set.union(*map(set, candidates_2))\n            rouge_1 = cal_rouge(candidates_1, reference_1grams)['f']\n            rouge_2 = cal_rouge(candidates_2, reference_2grams)['f']\n\n            rouge_score = rouge_1 + rouge_2\n            if (s == 0 and rouge_score == 0):\n                impossible_sents.append(c[0])\n            if rouge_score > max_rouge:\n                max_idx = c\n                max_rouge = rouge_score\n    return sorted(list(max_idx))\n\n\ndef greedy_selection(doc_sent_list, abstract_sent_list, summary_size):\n    def _rouge_clean(s):\n        return re.sub(r'[^a-zA-Z0-9 ]', '', s)\n\n    max_rouge = 0.0\n    abstract = sum(abstract_sent_list, [])\n    abstract = _rouge_clean(' '.join(abstract)).split()\n    sents = [_rouge_clean(' '.join(s)).split() for s in doc_sent_list]\n    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]\n    reference_1grams = _get_word_ngrams(1, [abstract])\n    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]\n    reference_2grams = _get_word_ngrams(2, [abstract])\n\n    selected = []\n    for s in range(summary_size):\n        cur_max_rouge = max_rouge\n        cur_id = -1\n        for i in range(len(sents)):\n            if (i in selected):\n                continue\n            c = selected + [i]\n         
   candidates_1 = [evaluated_1grams[idx] for idx in c]\n            candidates_1 = set.union(*map(set, candidates_1))\n            candidates_2 = [evaluated_2grams[idx] for idx in c]\n            candidates_2 = set.union(*map(set, candidates_2))\n            rouge_1 = cal_rouge(candidates_1, reference_1grams)['f']\n            rouge_2 = cal_rouge(candidates_2, reference_2grams)['f']\n            rouge_score = rouge_1 + rouge_2\n            if rouge_score > cur_max_rouge:\n                cur_max_rouge = rouge_score\n                cur_id = i\n        if (cur_id == -1):\n            return selected\n        selected.append(cur_id)\n        max_rouge = cur_max_rouge\n\n    return sorted(selected)\n"
  },
  {
    "path": "utils_nlp/dataset/snli.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n    Utility functions for downloading, extracting, and reading the Stanford\n    Natural Language Inference (SNLI) Corpus.\n    https://nlp.stanford.edu/projects/snli/\n\"\"\"\nimport os\nimport shutil\nimport azureml.dataprep as dprep\nimport pandas as pd\nfrom zipfile import ZipFile\nfrom utils_nlp.dataset.url_utils import maybe_download, download_path\nfrom utils_nlp.dataset import Split\n\n# constants\nSNLI_URL = \"https://nlp.stanford.edu/projects/snli/snli_1.0.zip\"\nSNLI_DIRNAME = \"snli_1.0\"\nSNLI_FILE_PREFIX = \"snli_1.0\"\n\n# clean col names\nS1_COL = \"sentence1\"\nS2_COL = \"sentence2\"\nLABEL_COL = \"score\"\n\n\ndef load_pandas_df(local_cache_path=None, file_split=Split.TRAIN, file_type=\"txt\", nrows=None):\n    \"\"\"\n    Loads the SNLI dataset as pd.DataFrame\n    Download the dataset from \"https://nlp.stanford.edu/projects/snli/snli_1.0.zip\", unzip, and load\n\n    Args:\n        local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.\n            If None, all the intermediate files will be stored in a temporary directory and removed\n            after use.\n        file_split (str): File split to load, defaults to \"train\"\n        file_type (str): File type to load, defaults to \"txt\"\n        nrows (int): Number of rows to load, defaults to None (in which all rows will be returned)\n\n    Returns:\n        pd.DataFrame: SNLI dataset.\n    \"\"\"\n    with download_path(local_cache_path) as path:\n        filepath = os.path.join(path, \"snli_1.0.zip\")\n        snlipath = _maybe_download_and_extract(filepath, file_split, file_type)\n\n        if file_type == \"txt\":\n            snli_df = pd.read_csv(snlipath, sep=\"\\t\", nrows=nrows)\n        else:\n            snli_df = pd.read_json(snlipath, lines=True)\n            if nrows:\n                snli_df = snli_df[:nrows]\n\n    return 
snli_df\n\n\ndef _maybe_download_and_extract(zip_path, file_split, file_type):\n    \"\"\"\n    Downloads SNLI dataset zip and extract provided datafile split if they don’t already exist\n    Args:\n        zip_path (str): Path (directory or a zip file) to cache the downloaded zip file\n        file_split (str): File split to load\n        file_type(str) : File type to load\n\n    Returns:\n         str: File path where data file is extracted\n    \"\"\"\n    dirs, _ = os.path.split(zip_path)\n    if not os.path.exists(dirs):\n        os.makedirs(dirs)\n\n    # store raw data here\n    dir_path = os.path.join(dirs, \"raw\", SNLI_DIRNAME)\n\n    if not os.path.exists(dir_path):\n        os.makedirs(dir_path)\n\n    # format csv filename\n    file_name = \"{0}_{1}.{2}\".format(SNLI_FILE_PREFIX, file_split.value, file_type)\n    extract_path = os.path.join(dir_path, file_name)\n\n    if not os.path.exists(extract_path):\n        _ = download_snli(zip_path)\n        extract_snli(zip_path, source_path=SNLI_DIRNAME + \"/\" + file_name, dest_path=extract_path)\n\n    return extract_path\n\n\ndef download_snli(dest_path):\n    \"\"\"\n    Download the SNLI dataset\n    Args:\n        dest_path (str): file path where SNLI dataset should be downloaded\n\n    Returns:\n        str: file path where SNLI dataset is downloaded\n\n    \"\"\"\n    dirs, file = os.path.split(dest_path)\n    maybe_download(SNLI_URL, file, work_directory=dirs)\n\n\ndef extract_snli(zip_path, source_path, dest_path):\n    \"\"\"\n    Extract SNLI datafile from the SNLI raw zip file.\n    Args:\n        zip_path (str): zip file location\n        source_path (str): datafile location\n        dest_path (str): file path for extracted SNLI\n\n    \"\"\"\n    with ZipFile(zip_path, \"r\") as z:\n        with z.open(source_path) as zf, open(dest_path, \"wb\") as f:\n            shutil.copyfileobj(zf, f)\n\n\ndef clean_cols(df):\n    \"\"\"\n    Drop irrelevant columns from the input dataframe\n    Args:\n    
    df(pd.DataFrame): Input dataframe\n\n    Returns:\n        pd.DataFrame\n    \"\"\"\n    snli_df = df.drop(\n        [\n            \"sentence1_binary_parse\",\n            \"sentence2_binary_parse\",\n            \"sentence1_parse\",\n            \"sentence2_parse\",\n            \"captionID\",\n            \"pairID\",\n            \"label1\",\n            \"label2\",\n            \"label3\",\n            \"label4\",\n            \"label5\",\n        ],\n        axis=1,\n    )\n\n    snli_df = snli_df.rename(\n        columns={\"sentence1\": S1_COL, \"sentence2\": S2_COL, \"gold_label\": LABEL_COL}\n    )\n\n    return snli_df\n\n\ndef clean_rows(df, label_col=LABEL_COL):\n    \"\"\"Drop badly formatted rows from the input dataframe\n\n    Args:\n        df (pd.DataFrame): Input dataframe\n        label_col (str): Name of label column.\n            Defaults to the standardized column name that is set after running the clean_col method.\n\n    Returns:\n        pd.DataFrame\n    \"\"\"\n    snli_df = df.dropna()\n    snli_df = snli_df.loc[snli_df[label_col] != \"-\"].copy()\n\n    return snli_df\n\n\ndef clean_df(df, label_col=LABEL_COL):\n    df = clean_cols(df)\n    df = clean_rows(df, label_col)\n\n    return df\n\n\ndef load_azureml_df(local_cache_path=None, file_split=Split.TRAIN, file_type=\"txt\"):\n    \"\"\"\n    Loads the SNLI dataset as AzureML dataflow object\n    Download the dataset from \"https://nlp.stanford.edu/projects/snli/snli_1.0.zip\", unzip,\n    and load.\n\n    Args:\n        local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.\n            If None, all the intermediate files will be stored in a temporary directory and removed\n            after use.\n        file_split (str): File split to load. One of (dev, test, train)\n        file_type (str): File type to load. 
One of (txt, jsonl)\n\n    Returns:\n        AzureML dataflow: SNLI dataset\n\n    \"\"\"\n    with download_path(local_cache_path) as path:\n        filepath = os.path.join(path, \"snli_1.0.zip\")\n        snlipath = _maybe_download_and_extract(filepath, file_split, file_type)\n\n        # NOTE: this works for the txt format but not the jsonl format\n        df = dprep.auto_read_file(snlipath)\n\n    return df\n"
  },
  {
    "path": "utils_nlp/dataset/squad.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\r\n# Licensed under the MIT License.\r\n\r\nimport os\r\nimport json\r\nimport pandas as pd\r\n\r\nfrom utils_nlp.dataset.url_utils import maybe_download\r\n\r\nURL_DICT = {\r\n    \"v1.1\": {\r\n        \"train\": \"https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/\"\r\n        \"master/dataset/train-v1.1.json\",\r\n        \"dev\": \"https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/\"\r\n        \"master/dataset/dev-v1.1.json\",\r\n    },\r\n    \"v2.0\": {\r\n        \"train\": \"https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/\"\r\n        \"master/dataset/train-v2.0.json\",\r\n        \"dev\": \"https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/\"\r\n        \"master/dataset/dev-v2.0.json\",\r\n    },\r\n}\r\n\r\n\r\ndef load_pandas_df(local_cache_path=\".\", squad_version=\"v1.1\", file_split=\"train\"):\r\n    \"\"\"Loads the SQuAD dataset in pandas data frame.\r\n\r\n    Args:\r\n        local_cache_path (str, optional): Path to load the data from. If the file doesn't exist,\r\n            download it first. Defaults to the current directory.\r\n        squad_version (str, optional): Version of the SQuAD dataset, accepted values are: \r\n            \"v1.1\" and \"v2.0\". 
Defaults to \"v1.1\".\r\n        file_split (str, optional): Dataset split to load, accepted values are: \"train\" and \"dev\".\r\n            Defaults to \"train\".\r\n    \"\"\"\r\n\r\n    if file_split not in [\"train\", \"dev\"]:\r\n        raise ValueError(\"file_split should be either train or dev\")\r\n\r\n    URL = URL_DICT[squad_version][file_split]\r\n    file_name = URL.split(\"/\")[-1]\r\n    maybe_download(URL, file_name, local_cache_path)\r\n\r\n    file_path = os.path.join(local_cache_path, file_name)\r\n\r\n    with open(file_path, \"r\", encoding=\"utf-8\") as reader:\r\n        input_data = json.load(reader)[\"data\"]\r\n\r\n    paragraph_text_list = []\r\n    question_text_list = []\r\n    answer_start_list = []\r\n    answer_text_list = []\r\n    qa_id_list = []\r\n    is_impossible_list = []\r\n    for entry in input_data:\r\n        for paragraph in entry[\"paragraphs\"]:\r\n            paragraph_text = paragraph[\"context\"]\r\n\r\n            for qa in paragraph[\"qas\"]:\r\n                qas_id = qa[\"id\"]\r\n                question_text = qa[\"question\"]\r\n                answer_offset = None\r\n                is_impossible = False\r\n\r\n                if squad_version == \"v2.0\":\r\n                    is_impossible = qa[\"is_impossible\"]\r\n\r\n                if file_split == \"train\":\r\n                    if (len(qa[\"answers\"]) != 1) and (not is_impossible):\r\n                        raise ValueError(\r\n                            \"For training, each question should have exactly 1 answer.\"\r\n                        )\r\n                    if not is_impossible:\r\n                        answer = qa[\"answers\"][0]\r\n                        orig_answer_text = answer[\"text\"]\r\n                        answer_offset = answer[\"answer_start\"]\r\n                    else:\r\n                        orig_answer_text = \"\"\r\n                else:\r\n                    if not is_impossible:\r\n                      
  orig_answer_text = []\r\n                        answer_offset = []\r\n                        for answer in qa[\"answers\"]:\r\n                            orig_answer_text.append(answer[\"text\"])\r\n                            answer_offset.append(answer[\"answer_start\"])\r\n                    else:\r\n                        orig_answer_text = \"\"\r\n\r\n                paragraph_text_list.append(paragraph_text)\r\n                question_text_list.append(question_text)\r\n                answer_start_list.append(answer_offset)\r\n                answer_text_list.append(orig_answer_text)\r\n                qa_id_list.append(qas_id)\r\n                is_impossible_list.append(is_impossible)\r\n\r\n    output_df = pd.DataFrame(\r\n        {\r\n            \"doc_text\": paragraph_text_list,\r\n            \"question_text\": question_text_list,\r\n            \"answer_start\": answer_start_list,\r\n            \"answer_text\": answer_text_list,\r\n            \"qa_id\": qa_id_list,\r\n            \"is_impossible\": is_impossible_list,\r\n        }\r\n    )\r\n\r\n    return output_df\r\n"
  },
  {
    "path": "utils_nlp/dataset/stsbenchmark.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n    Utility functions for downloading, extracting, and reading the\n    STSbenchmark dataset.\n    http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark\n\"\"\"\n\n\nimport os\nimport tarfile\nimport pandas as pd\n\nfrom utils_nlp.dataset.url_utils import maybe_download\n\nSTS_URL = \"http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz\"\nDEFAULT_FILE_SPLIT = \"train\"\n\n\ndef load_pandas_df(data_path, file_split=DEFAULT_FILE_SPLIT):\n    \"\"\"Load the STS Benchmark dataset as a pd.DataFrame\n\n    Args:\n        data_path (str): Path to data directory\n        file_split (str, optional): File split to load.\n        One of (train, dev, test).\n        Defaults to train.\n\n    Returns:\n        pd.DataFrame: STS Benchmark dataset\n    \"\"\"\n    file_name = \"sts-{}.csv\".format(file_split)\n    df = _maybe_download_and_extract(file_name, data_path)\n    return df\n\n\ndef _maybe_download_and_extract(sts_file, base_data_path):\n    raw_data_path = os.path.join(base_data_path, \"raw\")\n    if not os.path.exists(raw_data_path):\n        os.makedirs(raw_data_path)\n    sts_path = _download_sts(raw_data_path)\n    df = _load_sts(os.path.join(sts_path, sts_file))\n    return df\n\n\ndef _download_sts(dirpath):\n    \"\"\"Download and extract data from\n        http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz\n\n    Args:\n        dirpath (str): Path to data directory.\n\n    Returns:\n        str: Path to extracted STS Benchmark data.\n    \"\"\"\n    filepath = maybe_download(STS_URL, work_directory=dirpath)\n    extracted_path = _extract_sts(filepath, target_dirpath=dirpath, tmode=\"r:gz\")\n    print(\"Data downloaded to {}\".format(extracted_path))\n    return extracted_path\n\n\ndef _extract_sts(tarpath, target_dirpath=\".\", tmode=\"r\"):\n    \"\"\"Extract data from the sts tar.gz archive\n\n    Args:\n        
tarpath (str): Path to tarfile, to be deleted after extraction.\n        target_dirpath (str, optional): Directory in which to save\n            the extracted files.\n        tmode (str, optional): The mode for reading,\n            of the form \"filemode[:compression]\".\n        Defaults to \"r\".\n\n    Returns:\n        str: Path to extracted STS Benchmark data.\n    \"\"\"\n    with tarfile.open(tarpath, mode=tmode) as t:\n        t.extractall(target_dirpath)\n        extracted = t.getnames()[0]\n    os.remove(tarpath)\n    return os.path.join(target_dirpath, extracted)\n\n\ndef _load_sts(src_file_path):\n    \"\"\"Load datafile as dataframe\n\n    Args:\n        src_file_path (str): filepath to train/dev/test csv files.\n    \"\"\"\n    with open(src_file_path, \"r\", encoding=\"utf-8\") as f:\n        sent_pairs = []\n        for line in f:\n            line = line.strip().split(\"\\t\")\n            sent_pairs.append(\n                [\n                    line[0].strip(),\n                    line[1].strip(),\n                    line[2].strip(),\n                    line[3].strip(),\n                    float(line[4]),\n                    line[5].strip(),\n                    line[6].strip(),\n                ]\n            )\n\n        sdf = pd.DataFrame(\n            sent_pairs,\n            columns=[\n                \"column_0\",\n                \"column_1\",\n                \"column_2\",\n                \"column_3\",\n                \"column_4\",\n                \"column_5\",\n                \"column_6\",\n            ],\n        )\n        return sdf\n\n\ndef clean_sts(df):\n    \"\"\"Drop columns containing irrelevant metadata and\n    save as new csv files in the target_dir.\n\n    Args:\n        df (pandas.Dataframe): drop columns from train/test/dev files.\n    \"\"\"\n    clean_df = df.drop([\"column_0\", \"column_1\", \"column_2\", \"column_3\"], axis=1)\n    clean_df = clean_df.rename(\n        index=str, columns={\"column_4\": 
\"score\", \"column_5\": \"sentence1\", \"column_6\": \"sentence2\"}\n    )\n    return clean_df\n"
  },
  {
    "path": "utils_nlp/dataset/url_utils.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Common utilities for downloading and extracting datasets.\"\"\"\n\nimport logging\nimport math\nimport os\nimport tarfile\nimport zipfile\nfrom contextlib import contextmanager\nfrom tempfile import TemporaryDirectory\n\nimport requests\nfrom tqdm import tqdm\nfrom google_drive_downloader import GoogleDriveDownloader as gdd\n\nlogger = logging.getLogger(__name__)\n\n\ndef maybe_download(url, filename=None, work_directory=\".\", expected_bytes=None):\n    \"\"\"Download a file if it is not already downloaded.\n\n    Args:\n        filename (str): File name.\n        work_directory (str): Working directory.\n        url (str): URL of the file to download.\n        expected_bytes (int): Expected file size in bytes.\n    Returns:\n        str: File path of the file downloaded.\n    \"\"\"\n    if filename is None:\n        filename = url.split(\"/\")[-1]\n    os.makedirs(work_directory, exist_ok=True)\n    filepath = os.path.join(work_directory, filename)\n    if not os.path.exists(filepath):\n        if not os.path.isdir(work_directory):\n            os.makedirs(work_directory)\n        r = requests.get(url, stream=True)\n        total_size = int(r.headers.get(\"content-length\", 0))\n        block_size = 1024\n        num_iterables = math.ceil(total_size / block_size)\n\n        with open(filepath, \"wb\") as file:\n            for data in tqdm(\n                r.iter_content(block_size),\n                total=num_iterables,\n                unit=\"KB\",\n                unit_scale=True,\n            ):\n                file.write(data)\n    else:\n        logger.info(\"File {} already downloaded\".format(filepath))\n    if expected_bytes is not None:\n        statinfo = os.stat(filepath)\n        if statinfo.st_size != expected_bytes:\n            os.remove(filepath)\n            raise IOError(\"Failed to verify {}\".format(filepath))\n\n    
return filepath\n\n\ndef maybe_download_googledrive(\n    google_file_id, file_name, work_directory=\".\", expected_bytes=None\n):\n    \"\"\"Download a file from google drive if it is not already downloaded.\n\n    Args:\n        google_file_id (str): The ID of the google file which can be found in\n            the file link, e.g. https://drive.google.com/file/d/{google_file_id}/view\n        file_name (str): Name of the downloaded file.\n        work_directory (str, optional): Directory to download the file to.\n            Defaults to \".\".\n        expected_bytes (int, optional): Expected file size in bytes.\n    Returns:\n        str: File path of the file downloaded.\n    \"\"\"\n\n    os.makedirs(work_directory, exist_ok=True)\n    filepath = os.path.join(work_directory, file_name)\n    if not os.path.exists(filepath):\n        gdd.download_file_from_google_drive(file_id=google_file_id, dest_path=filepath)\n    else:\n        logger.info(\"File {} already downloaded\".format(filepath))\n    if expected_bytes is not None:\n        statinfo = os.stat(filepath)\n        if statinfo.st_size != expected_bytes:\n            os.remove(filepath)\n            raise IOError(\"Failed to verify {}\".format(filepath))\n\n    return filepath\n\n\ndef extract_tar(file_path, dest_path=\".\"):\n    \"\"\"Extracts all contents of a tar archive file.\n    Args:\n        file_path (str): Path of file to extract.\n        dest_path (str, optional): Destination directory. 
Defaults to \".\".\n    \"\"\"\n    if not os.path.exists(file_path):\n        raise IOError(\"File doesn't exist\")\n    if not os.path.exists(dest_path):\n        raise IOError(\"Destination directory doesn't exist\")\n    with tarfile.open(file_path) as t:\n        t.extractall(path=dest_path)\n\n\ndef extract_zip(file_path, dest_path=\".\"):\n    \"\"\"Extracts all contents of a zip archive file.\n    Args:\n        file_path (str): Path of file to extract.\n        dest_path (str, optional): Destination directory. Defaults to \".\".\n    \"\"\"\n    if not os.path.exists(file_path):\n        raise IOError(\"File doesn't exist\")\n    if not os.path.exists(dest_path):\n        raise IOError(\"Destination directory doesn't exist\")\n    with zipfile.ZipFile(file_path) as z:\n        z.extractall(dest_path, filter(lambda f: not f.endswith(\"\\r\"), z.namelist()))\n\n\n@contextmanager\ndef download_path(path):\n    tmp_dir = TemporaryDirectory()\n    if path is None:\n        path = tmp_dir.name\n    else:\n        path = os.path.realpath(path)\n\n    try:\n        yield path\n    finally:\n        tmp_dir.cleanup()\n"
  },
  {
    "path": "utils_nlp/dataset/wikigold.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n    Utility functions for downloading and reading the wikigold dataset for\n    Named Entity Recognition (NER).\n    https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold/CONLL-format/data\n\"\"\"\n\nimport logging\nimport os\nimport random\nfrom tempfile import TemporaryDirectory\n\nimport pandas as pd\n\nfrom utils_nlp.common.pytorch_utils import dataloader_from_dataset\nfrom utils_nlp.dataset.ner_utils import preprocess_conll\nfrom utils_nlp.dataset.url_utils import maybe_download\nfrom utils_nlp.models.transformers.common import MAX_SEQ_LEN\nfrom utils_nlp.models.transformers.named_entity_recognition import (\n    TokenClassificationProcessor,\n)\n\nURL = (\n    \"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets\"\n    \"/master/data/wikigold/CONLL-format/data/wikigold.conll.txt\"\n)\n\n\ndef load_train_test_dfs(local_cache_path=\"./\", test_fraction=0.5, random_seed=None):\n    \"\"\"\n    Get the training and testing data frames based on test_fraction.\n\n    Args:\n        local_cache_path (str): Path to store the data. If the data file\n            doesn't exist in this path, it's downloaded.\n        test_fraction (float, optional): Fraction of data ot use for\n            testing. 
Since this is a small dataset, the default testing\n            fraction is set to 0.5\n        random_seed (float, optional): Random seed used to shuffle the data.\n\n    Returns:\n        tuple: (train_pandas_df, test_pandas_df), each data frame contains\n            two columns\n            \"sentence\": sentences in strings.\n            \"labels\": list of entity labels of the words in the sentence.\n\n    \"\"\"\n    file_name = URL.split(\"/\")[-1]\n    maybe_download(URL, file_name, local_cache_path)\n\n    data_file = os.path.join(local_cache_path, file_name)\n\n    with open(data_file, \"r\", encoding=\"utf8\") as file:\n        text = file.read()\n\n    sentence_list, labels_list = preprocess_conll(text)\n\n    if random_seed:\n        random.seed(random_seed)\n    sentence_and_labels = list(zip(sentence_list, labels_list))\n    random.shuffle(sentence_and_labels)\n    sentence_list[:], labels_list[:] = zip(*sentence_and_labels)\n\n    sentence_count = len(sentence_list)\n    test_sentence_count = round(sentence_count * test_fraction)\n    test_sentence_list = sentence_list[:test_sentence_count]\n    test_labels_list = labels_list[:test_sentence_count]\n    train_sentence_list = sentence_list[test_sentence_count:]\n    train_labels_list = labels_list[test_sentence_count:]\n\n    train_df = pd.DataFrame(\n        {\"sentence\": train_sentence_list, \"labels\": train_labels_list}\n    )\n\n    test_df = pd.DataFrame({\"sentence\": test_sentence_list, \"labels\": test_labels_list})\n\n    return (train_df, test_df)\n\n\ndef get_unique_labels():\n    \"\"\"Get the unique labels in the wikigold dataset.\"\"\"\n    return [\"O\", \"I-LOC\", \"I-MISC\", \"I-PER\", \"I-ORG\"]\n\n\ndef load_dataset(\n    local_path=TemporaryDirectory().name,\n    test_fraction=0.3,\n    random_seed=None,\n    train_sample_ratio=1.0,\n    test_sample_ratio=1.0,\n    model_name=\"bert-base-uncased\",\n    to_lower=True,\n    cache_dir=TemporaryDirectory().name,\n    
max_len=MAX_SEQ_LEN,\n    trailing_piece_tag=\"X\",\n    batch_size=32,\n    num_gpus=None,\n):\n    \"\"\"\n    Load the wikigold dataset and split into training and testing datasets.\n    The datasets are preprocessed and can be used to train a NER model or evaluate\n    on the testing dataset.\n\n    Args:\n        local_path (str, optional): The local file path to save the raw wikigold file.\n            Defautls to \"~/.nlp_utils/datasets/\".\n        test_fraction (float, optional): The fraction of testing dataset when splitting.\n            Defaults to 0.3.\n        random_seed (float, optional): Random seed used to shuffle the data.\n            Defaults to None.\n        train_sample_ratio (float, optional): The ratio that used to sub-sampling for training.\n            Defaults to 1.0.\n        test_sample_ratio (float, optional): The ratio that used to sub-sampling for testing.\n            Defaults to 1.0.\n        model_name (str, optional): The pretained model name.\n            Defaults to \"bert-base-uncased\".\n        to_lower (bool, optional): Lower case text input.\n            Defaults to True.\n        cache_dir (str, optional): The default folder for saving cache files.\n            Defaults to './temp'.\n        max_len (int, optional): Maximum length of the list of tokens. Lists longer\n            than this are truncated and shorter ones are padded with \"O\"s.\n            Default value is BERT_MAX_LEN=512.\n        trailing_piece_tag (str, optional): Tag used to label trailing word pieces.\n            For example, \"criticize\" is broken into \"critic\" and \"##ize\", \"critic\"\n            preserves its original label and \"##ize\" is labeled as trailing_piece_tag.\n            Default value is \"X\".\n        batch_size (int, optional): The batch size for training and testing.\n            Defaults to 32.\n        num_gpus (int, optional): The number of GPUs.\n            Defaults to None.\n\n    Returns:\n        tuple. 
The tuple contains four elements.\n        train_dataloader (DataLoader): a PyTorch DataLoader instance for training.\n        test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.\n        label_map (dict): A dictionary object to map a label (str) to an ID (int).\n        test_dataset (TensorDataset): A TensorDataset containing the following four tensors.\n            1. input_ids_all: Tensor. Each sublist contains numerical values,\n                i.e. token ids, corresponding to the tokens in the input text data.\n            2. input_mask_all: Tensor. Each sublist contains the attention\n                mask of the input token id list, 1 for input tokens and 0 for\n                padded tokens, so that padded tokens are not attended to.\n            3. trailing_token_mask_all: Tensor. Each sublist is\n                a boolean list, True for the first word piece of each\n                original word, False for the trailing word pieces,\n                e.g. \"##ize\". This mask is useful for removing the\n                predictions on trailing word pieces, so that each\n                original word in the input text has a unique predicted\n                label.\n            4. label_ids_all: Tensor, each sublist contains token labels of\n                a input sentence/paragraph, if labels is provided. 
If the\n                `labels` argument is not provided, it will not return this tensor.\n    \"\"\"\n\n    train_df, test_df = load_train_test_dfs(\n        local_cache_path=local_path,\n        test_fraction=test_fraction,\n        random_seed=random_seed,\n    )\n\n    if train_sample_ratio > 1.0:\n        train_sample_ratio = 1.0\n        logging.warning(\"Setting the training sample ratio to 1.0\")\n    elif train_sample_ratio < 0:\n        logging.error(\"Invalid training sample ratio: {}\".format(train_sample_ratio))\n        raise ValueError(\n            \"Invalid training sample ratio: {}\".format(train_sample_ratio)\n        )\n\n    if test_sample_ratio > 1.0:\n        test_sample_ratio = 1.0\n        logging.warning(\"Setting the testing sample ratio to 1.0\")\n    elif test_sample_ratio < 0:\n        logging.error(\"Invalid testing sample ratio: {}\".format(test_sample_ratio))\n        raise ValueError(\"Invalid testing sample ratio: {}\".format(test_sample_ratio))\n\n    if train_sample_ratio < 1.0:\n        train_df = train_df.sample(frac=train_sample_ratio).reset_index(drop=True)\n    if test_sample_ratio < 1.0:\n        test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)\n\n    processor = TokenClassificationProcessor(\n        model_name=model_name, to_lower=to_lower, cache_dir=cache_dir\n    )\n\n    label_map = TokenClassificationProcessor.create_label_map(\n        label_lists=train_df[\"labels\"], trailing_piece_tag=trailing_piece_tag\n    )\n    train_dataset = processor.preprocess(\n        text=train_df[\"sentence\"],\n        max_len=max_len,\n        labels=train_df[\"labels\"],\n        label_map=label_map,\n        trailing_piece_tag=trailing_piece_tag,\n    )\n\n    test_dataset = processor.preprocess(\n        text=test_df[\"sentence\"],\n        max_len=max_len,\n        labels=test_df[\"labels\"],\n        label_map=label_map,\n        trailing_piece_tag=trailing_piece_tag,\n    )\n\n    train_dataloader = 
dataloader_from_dataset(\n        train_dataset,\n        batch_size=batch_size,\n        num_gpus=num_gpus,\n        shuffle=True,\n        distributed=False,\n    )\n\n    test_dataloader = dataloader_from_dataset(\n        test_dataset,\n        batch_size=batch_size,\n        num_gpus=num_gpus,\n        shuffle=False,\n        distributed=False,\n    )\n\n    return (train_dataloader, test_dataloader, label_map, test_dataset)\n"
  },
  {
    "path": "utils_nlp/dataset/xnli.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n    Utility functions for downloading, extracting, and reading the\n    Cross-Lingual NLI Corpus (XNLI).\n    https://www.nyu.edu/projects/bowman/xnli/\n\"\"\"\n\n\nimport os\nimport pandas as pd\n\nfrom utils_nlp.dataset.url_utils import extract_zip, maybe_download\nfrom utils_nlp.dataset.preprocess import convert_to_unicode\n\nURL_XNLI = \"https://cims.nyu.edu/~sbowman/xnli/XNLI-1.0.zip\"\nURL_XNLI_MT = \"https://cims.nyu.edu/~sbowman/xnli/XNLI-MT-1.0.zip\"\n\n\ndef load_pandas_df(local_cache_path=\".\", file_split=\"dev\", language=\"zh\"):\n    \"\"\"Downloads and extracts the dataset files.\n\n    Utilities information can be found `on this link <https://www.nyu.edu/projects/bowman/xnli/>`_.\n\n    Args:\n        local_cache_path (str, optional): Path to store the data.\n            Defaults to \"./\".\n        file_split (str, optional): The subset to load.\n            One of: {\"train\", \"dev\", \"test\"}\n            Defaults to \"dev\".\n        language (str, optional): language subset to read.\n            One of: {\"en\", \"fr\", \"es\", \"de\", \"el\", \"bg\", \"ru\",\n            \"tr\", \"ar\", \"vi\", \"th\", \"zh\", \"hi\", \"sw\", \"ur\"}\n            Defaults to \"zh\" (Chinese).\n    Returns:\n        pd.DataFrame: pandas DataFrame containing the specified\n            XNLI subset.\n    \"\"\"\n\n    if file_split in (\"dev\", \"test\"):\n        url = URL_XNLI\n        sentence_1_index = 6\n        sentence_2_index = 7\n        label_index = 1\n\n        zip_file_name = url.split(\"/\")[-1]\n        folder_name = \".\".join(zip_file_name.split(\".\")[:-1])\n        file_name = folder_name + \"/\" + \".\".join([\"xnli\", file_split, \"tsv\"])\n    elif file_split == \"train\":\n        url = URL_XNLI_MT\n        sentence_1_index = 0\n        sentence_2_index = 1\n        label_index = 2\n\n        zip_file_name = 
url.split(\"/\")[-1]\n        folder_name = \".\".join(zip_file_name.split(\".\")[:-1])\n        file_name = folder_name + \"/multinli/\" + \".\".join([\"multinli\", file_split, language, \"tsv\"])\n\n    maybe_download(url, zip_file_name, local_cache_path)\n\n    if not os.path.exists(os.path.join(local_cache_path, folder_name)):\n        extract_zip(os.path.join(local_cache_path, zip_file_name), local_cache_path)\n\n    with open(os.path.join(local_cache_path, file_name), \"r\", encoding=\"utf-8\") as f:\n        lines = f.read().splitlines()\n\n    line_list = [line.split(\"\\t\") for line in lines]\n\n    # Remove the column name row\n    line_list.pop(0)\n    if file_split != \"train\":\n        line_list = [line for line in line_list if line[0] == language]\n\n    valid_lines = [\n        True if line[sentence_1_index] and line[sentence_2_index] else False for line in line_list\n    ]\n    total_line_count = len(line_list)\n    line_list = [line for line, valid in zip(line_list, valid_lines) if valid]\n    valid_line_count = len(line_list)\n\n    if valid_line_count != total_line_count:\n        print(\"{} invalid lines removed.\".format(total_line_count - valid_line_count))\n\n    label_list = [convert_to_unicode(line[label_index]) for line in line_list]\n    old_contradict_label = convert_to_unicode(\"contradictory\")\n    new_contradict_label = convert_to_unicode(\"contradiction\")\n    label_list = [\n        new_contradict_label if label == old_contradict_label else label for label in label_list\n    ]\n    text_list = [\n        (convert_to_unicode(line[sentence_1_index]), convert_to_unicode(line[sentence_2_index]))\n        for line in line_list\n    ]\n\n    df = pd.DataFrame({\"text\": text_list, \"label\": label_list})\n\n    return df\n"
  },
  {
    "path": "utils_nlp/dataset/xnli_torch_dataset.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Utility functions for loading the Cross-Lingual NLI Corpus (XNLI) as a PyTorch Dataset.\"\"\"\n\nimport numpy as np\nimport torch\nfrom utils_nlp.models.bert.common import Language, Tokenizer\nfrom torch.utils import data\nfrom utils_nlp.dataset.xnli import load_pandas_df\nfrom sklearn.preprocessing import LabelEncoder\n\nMAX_SEQ_LENGTH = 128\nTEXT_COL = \"text\"\nLABEL_COL = \"label\"\nDATA_PERCENT_USED = 1.0\nTRAIN_FILE_SPLIT = \"train\"\nTEST_FILE_SPLIT = \"test\"\nVALIDATION_FILE_SPLIT = \"dev\"\nCACHE_DIR = \"./\"\nLANGUAGE_ENGLISH = \"en\"\nTO_LOWER_CASE = False\nTOK_ENGLISH = Language.ENGLISH\nVALID_FILE_SPLIT = [TRAIN_FILE_SPLIT, VALIDATION_FILE_SPLIT, TEST_FILE_SPLIT]\n\n\ndef _load_pandas_df(cache_dir, file_split, language, data_percent_used):\n    df = load_pandas_df(local_cache_path=cache_dir, file_split=file_split, language=language)\n    data_used_count = round(data_percent_used * df.shape[0])\n    df = df.loc[:data_used_count]\n    return df\n\n\ndef _tokenize(tok_language, to_lowercase, cache_dir, df):\n    print(\"Create a tokenizer...\")\n    tokenizer = Tokenizer(language=tok_language, to_lower=to_lowercase, cache_dir=cache_dir)\n    tokens = tokenizer.tokenize(df[TEXT_COL])\n\n    print(\"Tokenize and preprocess text...\")\n    # tokenize\n    token_ids, input_mask, token_type_ids = tokenizer.preprocess_classification_tokens(\n        tokens, max_len=MAX_SEQ_LENGTH\n    )\n    return token_ids, input_mask, token_type_ids\n\n\ndef _fit_train_labels(df):\n    label_encoder = LabelEncoder()\n    train_labels = label_encoder.fit_transform(df[LABEL_COL])\n    train_labels = np.array(train_labels)\n    return label_encoder, train_labels\n\n\nclass XnliDataset(data.Dataset):\n    def __init__(\n        self,\n        file_split=TRAIN_FILE_SPLIT,\n        cache_dir=CACHE_DIR,\n        language=LANGUAGE_ENGLISH,\n        
to_lowercase=TO_LOWER_CASE,\n        tok_language=TOK_ENGLISH,\n        data_percent_used=DATA_PERCENT_USED,\n    ):\n        \"\"\"\n            Load the dataset here\n        Args:\n            file_split (str, optional):The subset to load.\n                One of: {\"train\", \"dev\", \"test\"}\n                Defaults to \"train\".\n            cache_dir (str, optional):Path to store the data.\n                Defaults to \"./\".\n            language(str):Language required to load which xnli file (eg - \"en\", \"zh\")\n            to_lowercase(bool):flag to convert samples in dataset to lowercase\n            tok_language(Language, optional): language (Language, optional): The pretrained model's\n                language. Defaults to Language.ENGLISH.\n            data_percent_used(float, optional): Data used to create Torch Dataset.\n                Defaults to \"1.0\" which is 100% data\n        \"\"\"\n        if file_split not in VALID_FILE_SPLIT:\n            raise ValueError(\"The file split is not part of \", VALID_FILE_SPLIT)\n\n        self.file_split = file_split\n        self.cache_dir = cache_dir\n        self.language = language\n        self.to_lowercase = to_lowercase\n        self.tok_language = tok_language\n        self.data_percent_used = data_percent_used\n\n        df = _load_pandas_df(self.cache_dir, self.file_split, self.language, self.data_percent_used)\n\n        self.df = df\n\n        token_ids, input_mask, token_type_ids = _tokenize(\n            tok_language, to_lowercase, cache_dir, self.df\n        )\n\n        self.token_ids = token_ids\n        self.input_mask = input_mask\n        self.token_type_ids = token_type_ids\n\n        if file_split == TRAIN_FILE_SPLIT:\n            label_encoder, train_labels = _fit_train_labels(self.df)\n            self.label_encoder = label_encoder\n            self.labels = train_labels\n        else:\n            # use the label_encoder passed when you create the test/validate dataset\n         
   self.labels = self.df[LABEL_COL]\n\n    def __len__(self):\n        \"\"\" Denotes the total number of samples \"\"\"\n        return len(self.df)\n\n    def __getitem__(self, index):\n        \"\"\" Generates one sample of data \"\"\"\n        token_ids = self.token_ids[index]\n        input_mask = self.input_mask[index]\n        token_type_ids = self.token_type_ids[index]\n        labels = self.labels[index]\n\n        return {\n            \"token_ids\": torch.tensor(token_ids, dtype=torch.long),\n            \"input_mask\": torch.tensor(input_mask, dtype=torch.long),\n            \"token_type_ids\": torch.tensor(token_type_ids, dtype=torch.long),\n            \"labels\": labels,\n        }\n"
  },
  {
    "path": "utils_nlp/eval/README.md",
    "content": "## [Evaluation](.)\nThe evaluation (eval) submodule includes functionalities for computing metrics for evaluating NLP model performance.  There are general evaluation metrics like accuracy, precision, recall, and f1 scores for classification scenarios. In addition, we also include evaluation utilities for specialized tasks like question answering and sentence embedding.\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/.gitignore",
    "content": "# SentEval data and .pyc files\n\n\n\n# python\n__pycache__/\n*.py[cod]\n*$py.class\n\n# log files\n*.log\n*.txt\n\n# data files\ndata/senteval_data*\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/LICENSE",
    "content": "BSD License\n\nFor SentEval software\n\nCopyright (c) 2017-present, Facebook, Inc. All rights reserved.\n\nRedistribution and use in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n * Redistributions of source code must retain the above copyright notice, this\n   list of conditions and the following disclaimer.\n\n * Redistributions in binary form must reproduce the above copyright notice,\n   this list of conditions and the following disclaimer in the documentation\n   and/or other materials provided with the distribution.\n\n * Neither the name Facebook nor the names of its contributors may be used to\n   endorse or promote products derived from this software without specific\n   prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\nANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\nWARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR\nANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\n(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\nLOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON\nANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\nSOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/README.md",
    "content": "# SentEval: evaluation toolkit for sentence embeddings\n\nSentEval is a library for evaluating the quality of sentence embeddings. We assess their generalization power by using them as features on a broad and diverse set of \"transfer\" tasks. **SentEval currently includes 17 downstream tasks**. We also include a suite of **10 probing tasks** which evaluate what linguistic properties are encoded in sentence embeddings. Our goal is to ease the study and the development of general-purpose fixed-size sentence representations.\n\n\n**(04/22) SentEval new tasks: Added probing tasks for evaluating what linguistic properties are encoded in sentence embeddings**\n\n**(10/04) SentEval example scripts for three sentence encoders: [SkipThought-LN](https://github.com/ryankiros/layer-norm#skip-thoughts)/[GenSen](https://github.com/Maluuba/gensen)/[Google-USE](https://tfhub.dev/google/universal-sentence-encoder/1)**\n\n## Dependencies\n\nThis code is written in python. The dependencies are:\n\n* Python 2/3 with [NumPy](http://www.numpy.org/)/[SciPy](http://www.scipy.org/)\n* [Pytorch](http://pytorch.org/)>=0.4\n* [scikit-learn](http://scikit-learn.org/stable/index.html)>=0.18.0\n\n## Transfer tasks\n\n### Downstream tasks\nSentEval allows you to evaluate your sentence embeddings as features for the following *downstream* tasks:\n\n| Task     \t| Type                         \t| #train \t| #test \t| needs_train \t| set_classifier |\n|----------\t|------------------------------\t|-----------:|----------:|:-----------:|:----------:|\n| [MR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm)       \t| movie review                 \t| 11k     \t| 11k    \t| 1 | 1 |\n| [CR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm)       \t| product review               \t| 4k      \t| 4k     \t| 1 | 1 |\n| [SUBJ](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm)     \t| subjectivity status          \t| 10k     \t| 10k    \t| 1 | 1 |\n| 
[MPQA](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm)     \t| opinion-polarity  | 11k     \t| 11k    \t| 1 | 1 |\n| [SST](https://nlp.stanford.edu/sentiment/index.html)      \t| binary sentiment analysis  \t| 67k     \t| 1.8k   \t| 1 | 1 |\n| **[SST](https://nlp.stanford.edu/sentiment/index.html)**      \t| **fine-grained sentiment analysis**  \t| 8.5k     \t| 2.2k   \t| 1 | 1 |\n| [TREC](http://cogcomp.cs.illinois.edu/Data/QA/QC/)     \t| question-type classification \t| 6k      \t| 0.5k    \t| 1 | 1 |\n| [SICK-E](http://clic.cimec.unitn.it/composes/sick.html)   \t| natural language inference \t| 4.5k    \t| 4.9k   \t| 1 | 1 |\n| [SNLI](https://nlp.stanford.edu/projects/snli/)     \t| natural language inference   \t| 550k    \t| 9.8k   \t| 1 | 1 |\n| [MRPC](https://aclweb.org/aclwiki/Paraphrase_Identification_(State_of_the_art)) | paraphrase detection  | 4.1k | 1.7k | 1 | 1 |\n| [STS 2012](https://www.cs.york.ac.uk/semeval-2012/task6/) \t| semantic textual similarity  \t| N/A     \t| 3.1k   \t| 0  | 0 |\n| [STS 2013](http://ixa2.si.ehu.es/sts/) \t| semantic textual similarity  \t| N/A     \t| 1.5k   \t| 0  | 0 |\n| [STS 2014](http://alt.qcri.org/semeval2014/task10/) \t| semantic textual similarity  \t| N/A     \t| 3.7k   \t| 0  | 0 |\n| [STS 2015](http://alt.qcri.org/semeval2015/task2/) \t| semantic textual similarity  \t| N/A     \t| 8.5k   \t| 0  | 0 |\n| [STS 2016](http://alt.qcri.org/semeval2016/task1/) \t| semantic textual similarity  \t| N/A     \t| 9.2k   \t| 0  | 0 |\n| [STS B](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#Results)    \t| semantic textual similarity  \t| 5.7k    \t| 1.4k   \t| 1 | 0 |\n| [SICK-R](http://clic.cimec.unitn.it/composes/sick.html)   \t| semantic textual similarity | 4.5k    \t| 4.9k   \t| 1 | 0 |\n| [COCO](http://mscoco.org/)     \t| image-caption retrieval      \t| 567k    \t| 5*1k   \t| 1 | 0 |\n\nwhere **needs_train** means a model with parameters is learned on top of the sentence embeddings, and 
**set_classifier** means you can define the parameters of the classifier in the case of a classification task (see below).\n\nNote: COCO comes with ResNet-101 2048d image embeddings. [More details on the tasks.](https://arxiv.org/pdf/1705.02364.pdf)\n\n### Probing tasks\nSentEval also includes a series of [*probing* tasks](https://github.com/facebookresearch/SentEval/tree/master/data/probing) to evaluate what linguistic properties are encoded in your sentence embeddings:\n\n| Task     \t| Type                         \t| #train \t| #test \t| needs_train \t| set_classifier |\n|----------\t|------------------------------\t|-----------:|----------:|:-----------:|:----------:|\n| [SentLen](https://github.com/facebookresearch/SentEval/tree/master/data/probing)\t| Length prediction\t| 100k     \t| 10k    \t| 1 | 1 |\n| [WC](https://github.com/facebookresearch/SentEval/tree/master/data/probing)\t| Word Content analysis\t| 100k     \t| 10k    \t| 1 | 1 |\n| [TreeDepth](https://github.com/facebookresearch/SentEval/tree/master/data/probing)\t| Tree depth prediction\t| 100k     \t| 10k    \t| 1 | 1 |\n| [TopConst](https://github.com/facebookresearch/SentEval/tree/master/data/probing)\t| Top Constituents prediction\t| 100k     \t| 10k    \t| 1 | 1 |\n| [BShift](https://github.com/facebookresearch/SentEval/tree/master/data/probing)\t| Word order analysis\t| 100k     \t| 10k    \t| 1 | 1 |\n| [Tense](https://github.com/facebookresearch/SentEval/tree/master/data/probing)\t| Verb tense prediction\t| 100k     \t| 10k    \t| 1 | 1 |\n| [SubjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing)\t| Subject number prediction\t| 100k     \t| 10k    \t| 1 | 1 |\n| [ObjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing)\t| Object number prediction\t| 100k     \t| 10k    \t| 1 | 1 |\n| [SOMO](https://github.com/facebookresearch/SentEval/tree/master/data/probing)\t| Semantic odd man out\t| 100k     \t| 10k    \t| 1 | 1 |\n| 
[CoordInv](https://github.com/facebookresearch/SentEval/tree/master/data/probing)\t| Coordination Inversion | 100k     \t| 10k    \t| 1 | 1 |\n\n## Download datasets\nTo get all the transfer tasks datasets, run (in data/downstream/):\n```bash\n./get_transfer_data.bash\n```\nThis will automatically download and preprocess the downstream datasets, and store them in data/downstream (warning: for MacOS users, you may have to use p7zip instead of unzip). The probing tasks are already in data/probing by default.\n\n## How to use SentEval: examples\n\n### examples/bow.py\n\nIn examples/bow.py, we evaluate the quality of the average of word embeddings.\n\nTo download state-of-the-art fastText embeddings:\n\n```bash\ncurl -Lo glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip\ncurl -Lo crawl-300d-2M.vec.zip https://s3-us-west-1.amazonaws.com/fasttext-vectors/crawl-300d-2M.vec.zip\n```\n\nTo reproduce the results for bag-of-vectors, run (in examples/):  \n```bash\npython bow.py\n```\n\nAs required by SentEval, this script implements two functions: **prepare** (optional) and **batcher** (required) that turn text sentences into sentence embeddings. 
Then SentEval takes care of the evaluation on the transfer tasks using the embeddings as features.\n\n### examples/infersent.py\n\nTo get the **[InferSent](https://www.github.com/facebookresearch/InferSent)** model and reproduce our results, download our best models and run infersent.py (in examples/):\n```bash\ncurl -Lo examples/infersent1.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent1.pkl\ncurl -Lo examples/infersent2.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent2.pkl\n```\n\n### examples/skipthought.py - examples/gensen.py - examples/googleuse.py\n\nWe also provide example scripts for three other encoders:\n\n* [SkipThought with Layer-Normalization](https://github.com/ryankiros/layer-norm#skip-thoughts) in Theano\n* [GenSen encoder](https://github.com/Maluuba/gensen) in Pytorch\n* [Google encoder](https://tfhub.dev/google/universal-sentence-encoder/1) in TensorFlow\n\nNote that for SkipThought and GenSen, following the steps of the associated githubs is necessary.\nThe Google encoder script should work as-is.\n\n## How to use SentEval\n\nTo evaluate your sentence embeddings, SentEval requires that you implement two functions:\n\n1. **prepare** (sees the whole dataset of each task and can thus construct the word vocabulary, the dictionary of word vectors etc)\n2. **batcher** (transforms a batch of text sentences into sentence embeddings)\n\n\n### 1.) prepare(params, samples) (optional)\n\n*batcher* only sees one batch at a time while the *samples* argument of *prepare* contains all the sentences of a task.\n\n```\nprepare(params, samples)\n```\n* *params*: senteval parameters.\n* *samples*: list of all sentences from the transfer task.\n* *output*: No output. Arguments stored in \"params\" can further be used by *batcher*.\n\n*Example*: in bow.py, prepare is used to build the vocabulary of words and construct the *params.word_vec* dictionary of word vectors.\n\n\n### 2.) 
batcher(params, batch)\n```\nbatcher(params, batch)\n```\n* *params*: senteval parameters.\n* *batch*: numpy array of text sentences (of size params.batch_size)\n* *output*: numpy array of sentence embeddings (of size params.batch_size)\n\n*Example*: in bow.py, batcher is used to compute the mean of the word vectors for each sentence in the batch using params.word_vec. Use your own encoder in that function to encode sentences.\n\n### 3.) evaluation on transfer tasks\n\nAfter having implemented the batch and prepare function for your own sentence encoder,\n\n1) to perform the actual evaluation, first import senteval and set its parameters:\n```python\nimport senteval\nparams = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}\n```\n\n2) (optional) set the parameters of the classifier (when applicable):\n```python\nparams['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,\n                                 'tenacity': 5, 'epoch_size': 4}\n```\nYou can choose **nhid=0** (Logistic Regression) or **nhid>0** (MLP) and define the parameters for training.\n\n3) Create an instance of the class SE:\n```python\nse = senteval.engine.SE(params, batcher, prepare)\n```\n\n4) define the set of transfer tasks and run the evaluation:\n```python\ntransfer_tasks = ['MR', 'SICKEntailment', 'STS14', 'STSBenchmark']\nresults = se.eval(transfer_tasks)\n```\nThe current list of available tasks is:\n```python\n['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SNLI',\n'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 'ImageCaptionRetrieval',\n'STS12', 'STS13', 'STS14', 'STS15', 'STS16',\n'Length', 'WordContent', 'Depth', 'TopConstituents','BigramShift', 'Tense',\n'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion']\n```\n\n## SentEval parameters\nGlobal parameters of SentEval:\n```bash\n# senteval parameters\ntask_path                   # path to SentEval datasets (required)\nseed                        # seed\nusepytorch                  # 
use cuda-pytorch (else scikit-learn) where possible\nkfold                       # k-fold validation for MR/CR/SUBJ/MPQA.\n```\n\nParameters of the classifier:\n```bash\nnhid:                       # number of hidden units (0: Logistic Regression, >0: MLP); Default nonlinearity: Tanh\noptim:                      # optimizer (\"sgd,lr=0.1\", \"adam\", \"rmsprop\" ..)\ntenacity:                   # how many times dev acc does not increase before training stops\nepoch_size:                 # each epoch corresponds to epoch_size passes on the train set\nmax_epoch:                  # max number of epochs\ndropout:                    # dropout for MLP\n```\n\nNote that to get a proxy of the results while **dramatically reducing computation time**,\nwe suggest the **prototyping config**:\n```python\nparams = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}\nparams['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,\n                                 'tenacity': 3, 'epoch_size': 2}\n```\nwhich will result in a 5 times speedup for classification tasks.\n\nTo produce results that are **comparable to the literature**, use the **default config**:\n```python\nparams = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}\nparams['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,\n                                 'tenacity': 5, 'epoch_size': 4}\n```\nwhich takes longer but will produce better and comparable results.\n\nFor probing tasks, we used an MLP with a Sigmoid nonlinearity and tuned the nhid (in [50, 100, 200]) and dropout (in [0.0, 0.1, 0.2]) on the dev set.\n\n## References\n\nPlease consider citing [[1]](https://arxiv.org/abs/1803.05449) if using this code for evaluating sentence embedding methods.\n\n### SentEval: An Evaluation Toolkit for Universal Sentence Representations\n\n[1] A. Conneau, D. 
Kiela, [*SentEval: An Evaluation Toolkit for Universal Sentence Representations*](https://arxiv.org/abs/1803.05449)\n\n```\n@article{conneau2018senteval,\n  title={SentEval: An Evaluation Toolkit for Universal Sentence Representations},\n  author={Conneau, Alexis and Kiela, Douwe},\n  journal={arXiv preprint arXiv:1803.05449},\n  year={2018}\n}\n```\n\nContact: [aconneau@fb.com](mailto:aconneau@fb.com), [dkiela@fb.com](mailto:dkiela@fb.com)\n\n### Related work\n* [J. R Kiros, Y. Zhu, R. Salakhutdinov, R. S. Zemel, A. Torralba, R. Urtasun, S. Fidler - SkipThought Vectors, NIPS 2015](https://arxiv.org/abs/1506.06726)\n* [S. Arora, Y. Liang, T. Ma - A Simple but Tough-to-Beat Baseline for Sentence Embeddings, ICLR 2017](https://openreview.net/pdf?id=SyK00v5xx)\n* [Y. Adi, E. Kermany, Y. Belinkov, O. Lavi, Y. Goldberg - Fine-grained analysis of sentence embeddings using auxiliary prediction tasks, ICLR 2017](https://arxiv.org/abs/1608.04207)\n* [A. Conneau, D. Kiela, L. Barrault, H. Schwenk, A. Bordes - Supervised Learning of Universal Sentence Representations from Natural Language Inference Data, EMNLP 2017](https://arxiv.org/abs/1705.02364)\n* [S. Subramanian, A. Trischler, Y. Bengio, C. J Pal - Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning, ICLR 2018](https://arxiv.org/abs/1804.00079)\n* [A. Nie, E. D. Bennett, N. D. Goodman - DisSent: Sentence Representation Learning from Explicit Discourse Relations, 2018](https://arxiv.org/abs/1710.04334)\n* [D. Cer, Y. Yang, S. Kong, N. Hua, N. Limtiaco, R. St. John, N. Constant, M. Guajardo-Cespedes, S. Yuan, C. Tar, Y. Sung, B. Strope, R. Kurzweil - Universal Sentence Encoder, 2018](https://arxiv.org/abs/1803.11175)\n* [A. Conneau, G. Kruszewski, G. Lample, L. Barrault, M. Baroni - What you can cram into a single vector: Probing sentence embeddings for linguistic properties, ACL 2018](https://arxiv.org/abs/1805.01070)\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/__init__.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\nfrom __future__ import absolute_import\n\nfrom senteval.engine import SE\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/binary.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n'''\nBinary classifier and corresponding datasets : MR, CR, SUBJ, MPQA\n'''\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport io\nimport os\nimport numpy as np\nimport logging\n\nfrom senteval.tools.validation import InnerKFoldClassifier\n\n\nclass BinaryClassifierEval(object):\n    def __init__(self, pos, neg, seed=1111):\n        self.seed = seed\n        self.samples, self.labels = pos + neg, [1] * len(pos) + [0] * len(neg)\n        self.n_samples = len(self.samples)\n\n    def do_prepare(self, params, prepare):\n        # prepare is given the whole text\n        return prepare(params, self.samples)\n        # prepare puts everything it outputs in \"params\" : params.word2id etc\n        # Those output will be further used by \"batcher\".\n\n    def loadFile(self, fpath):\n        with io.open(fpath, 'r', encoding='latin-1') as f:\n            return [line.split() for line in f.read().splitlines()]\n\n    def run(self, params, batcher):\n        enc_input = []\n        # Sort to reduce padding\n        sorted_corpus = sorted(zip(self.samples, self.labels),\n                               key=lambda z: (len(z[0]), z[1]))\n        sorted_samples = [x for (x, y) in sorted_corpus]\n        sorted_labels = [y for (x, y) in sorted_corpus]\n        logging.info('Generating sentence embeddings')\n        for ii in range(0, self.n_samples, params.batch_size):\n            batch = sorted_samples[ii:ii + params.batch_size]\n            embeddings = batcher(params, batch)\n            enc_input.append(embeddings)\n        enc_input = np.vstack(enc_input)\n        logging.info('Generated sentence embeddings')\n\n        config = {'nclasses': 2, 'seed': self.seed,\n                  'usepytorch': params.usepytorch,\n                  
'classifier': params.classifier,\n                  'nhid': params.nhid, 'kfold': params.kfold}\n        clf = InnerKFoldClassifier(enc_input, np.array(sorted_labels), config)\n        devacc, testacc = clf.run()\n        logging.debug('Dev acc : {0} Test acc : {1}\\n'.format(devacc, testacc))\n        return {'devacc': devacc, 'acc': testacc, 'ndev': self.n_samples,\n                'ntest': self.n_samples}\n\n\nclass CREval(BinaryClassifierEval):\n    def __init__(self, task_path, seed=1111):\n        logging.debug('***** Transfer task : CR *****\\n\\n')\n        pos = self.loadFile(os.path.join(task_path, 'custrev.pos'))\n        neg = self.loadFile(os.path.join(task_path, 'custrev.neg'))\n        super(self.__class__, self).__init__(pos, neg, seed)\n\n\nclass MREval(BinaryClassifierEval):\n    def __init__(self, task_path, seed=1111):\n        logging.debug('***** Transfer task : MR *****\\n\\n')\n        pos = self.loadFile(os.path.join(task_path, 'rt-polarity.pos'))\n        neg = self.loadFile(os.path.join(task_path, 'rt-polarity.neg'))\n        super(self.__class__, self).__init__(pos, neg, seed)\n\n\nclass SUBJEval(BinaryClassifierEval):\n    def __init__(self, task_path, seed=1111):\n        logging.debug('***** Transfer task : SUBJ *****\\n\\n')\n        obj = self.loadFile(os.path.join(task_path, 'subj.objective'))\n        subj = self.loadFile(os.path.join(task_path, 'subj.subjective'))\n        super(self.__class__, self).__init__(obj, subj, seed)\n\n\nclass MPQAEval(BinaryClassifierEval):\n    def __init__(self, task_path, seed=1111):\n        logging.debug('***** Transfer task : MPQA *****\\n\\n')\n        pos = self.loadFile(os.path.join(task_path, 'mpqa.pos'))\n        neg = self.loadFile(os.path.join(task_path, 'mpqa.neg'))\n        super(self.__class__, self).__init__(pos, neg, seed)\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/engine.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n'''\n\nGeneric sentence evaluation scripts wrapper\n\n'''\nfrom __future__ import absolute_import, division, unicode_literals\n\nfrom senteval import utils\nfrom senteval.binary import CREval, MREval, MPQAEval, SUBJEval\nfrom senteval.snli import SNLIEval\nfrom senteval.trec import TRECEval\nfrom senteval.sick import SICKRelatednessEval, SICKEntailmentEval\nfrom senteval.mrpc import MRPCEval\nfrom senteval.sts import STS12Eval, STS13Eval, STS14Eval, STS15Eval, STS16Eval, STSBenchmarkEval\nfrom senteval.sst import SSTEval\nfrom senteval.rank import ImageCaptionRetrievalEval\nfrom senteval.probing import *\n\nclass SE(object):\n    def __init__(self, params, batcher, prepare=None):\n        # parameters\n        params = utils.dotdict(params)\n        params.usepytorch = True if 'usepytorch' not in params else params.usepytorch\n        params.seed = 1111 if 'seed' not in params else params.seed\n\n        params.batch_size = 128 if 'batch_size' not in params else params.batch_size\n        params.nhid = 0 if 'nhid' not in params else params.nhid\n        params.kfold = 5 if 'kfold' not in params else params.kfold\n\n        if 'classifier' not in params or not params['classifier']:\n            params.classifier = {'nhid': 0}\n\n        assert 'nhid' in params.classifier, 'Set number of hidden units in classifier config!!'\n\n        self.params = params\n\n        # batcher and prepare\n        self.batcher = batcher\n        self.prepare = prepare if prepare else lambda x, y: None\n\n        self.list_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',\n                           'SICKRelatedness', 'SICKEntailment', 'STSBenchmark',\n                           'SNLI', 'ImageCaptionRetrieval', 'STS12', 'STS13',\n                          
 'STS14', 'STS15', 'STS16',\n                           'Length', 'WordContent', 'Depth', 'TopConstituents',\n                           'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',\n                           'OddManOut', 'CoordinationInversion']\n\n    def eval(self, name):\n        # evaluate on evaluation [name], either takes string or list of strings\n        if (isinstance(name, list)):\n            self.results = {x: self.eval(x) for x in name}\n            return self.results\n\n        tpath = self.params.task_path\n        assert name in self.list_tasks, str(name) + ' not in ' + str(self.list_tasks)\n\n        # Original SentEval tasks\n        if name == 'CR':\n            self.evaluation = CREval(tpath + '/downstream/CR', seed=self.params.seed)\n        elif name == 'MR':\n            self.evaluation = MREval(tpath + '/downstream/MR', seed=self.params.seed)\n        elif name == 'MPQA':\n            self.evaluation = MPQAEval(tpath + '/downstream/MPQA', seed=self.params.seed)\n        elif name == 'SUBJ':\n            self.evaluation = SUBJEval(tpath + '/downstream/SUBJ', seed=self.params.seed)\n        elif name == 'SST2':\n            self.evaluation = SSTEval(tpath + '/downstream/SST/binary', nclasses=2, seed=self.params.seed)\n        elif name == 'SST5':\n            self.evaluation = SSTEval(tpath + '/downstream/SST/fine', nclasses=5, seed=self.params.seed)\n        elif name == 'TREC':\n            self.evaluation = TRECEval(tpath + '/downstream/TREC', seed=self.params.seed)\n        elif name == 'MRPC':\n            self.evaluation = MRPCEval(tpath + '/downstream/MRPC', seed=self.params.seed)\n        elif name == 'SICKRelatedness':\n            self.evaluation = SICKRelatednessEval(tpath + '/downstream/SICK', seed=self.params.seed)\n        elif name == 'STSBenchmark':\n            self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark', seed=self.params.seed)\n        elif name == 'SICKEntailment':\n            
self.evaluation = SICKEntailmentEval(tpath + '/downstream/SICK', seed=self.params.seed)\n        elif name == 'SNLI':\n            self.evaluation = SNLIEval(tpath + '/downstream/SNLI', seed=self.params.seed)\n        elif name in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']:\n            fpath = name + '-en-test'\n            self.evaluation = eval(name + 'Eval')(tpath + '/downstream/STS/' + fpath, seed=self.params.seed)\n        elif name == 'ImageCaptionRetrieval':\n            self.evaluation = ImageCaptionRetrievalEval(tpath + '/downstream/COCO', seed=self.params.seed)\n\n        # Probing Tasks\n        elif name == 'Length':\n                self.evaluation = LengthEval(tpath + '/probing', seed=self.params.seed)\n        elif name == 'WordContent':\n                self.evaluation = WordContentEval(tpath + '/probing', seed=self.params.seed)\n        elif name == 'Depth':\n                self.evaluation = DepthEval(tpath + '/probing', seed=self.params.seed)\n        elif name == 'TopConstituents':\n                self.evaluation = TopConstituentsEval(tpath + '/probing', seed=self.params.seed)\n        elif name == 'BigramShift':\n                self.evaluation = BigramShiftEval(tpath + '/probing', seed=self.params.seed)\n        elif name == 'Tense':\n                self.evaluation = TenseEval(tpath + '/probing', seed=self.params.seed)\n        elif name == 'SubjNumber':\n                self.evaluation = SubjNumberEval(tpath + '/probing', seed=self.params.seed)\n        elif name == 'ObjNumber':\n                self.evaluation = ObjNumberEval(tpath + '/probing', seed=self.params.seed)\n        elif name == 'OddManOut':\n                self.evaluation = OddManOutEval(tpath + '/probing', seed=self.params.seed)\n        elif name == 'CoordinationInversion':\n                self.evaluation = CoordinationInversionEval(tpath + '/probing', seed=self.params.seed)\n\n        self.params.current_task = name\n        self.evaluation.do_prepare(self.params, 
self.prepare)\n\n        self.results = self.evaluation.run(self.params, self.batcher)\n\n        return self.results\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/mrpc.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n'''\nMRPC : Microsoft Research Paraphrase (detection) Corpus\n'''\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport os\nimport logging\nimport numpy as np\nimport io\n\nfrom senteval.tools.validation import KFoldClassifier\n\nfrom sklearn.metrics import f1_score\n\n\nclass MRPCEval(object):\n    def __init__(self, task_path, seed=1111):\n        logging.info('***** Transfer task : MRPC *****\\n\\n')\n        self.seed = seed\n        train = self.loadFile(os.path.join(task_path,\n                              'msr_paraphrase_train.txt'))\n        test = self.loadFile(os.path.join(task_path,\n                             'msr_paraphrase_test.txt'))\n        self.mrpc_data = {'train': train, 'test': test}\n\n    def do_prepare(self, params, prepare):\n        # TODO : Should we separate samples in \"train, test\"?\n        samples = self.mrpc_data['train']['X_A'] + \\\n                  self.mrpc_data['train']['X_B'] + \\\n                  self.mrpc_data['test']['X_A'] + self.mrpc_data['test']['X_B']\n        return prepare(params, samples)\n\n    def loadFile(self, fpath):\n        mrpc_data = {'X_A': [], 'X_B': [], 'y': []}\n        with io.open(fpath, 'r', encoding='utf-8') as f:\n            for line in f:\n                text = line.strip().split('\\t')\n                mrpc_data['X_A'].append(text[3].split())\n                mrpc_data['X_B'].append(text[4].split())\n                mrpc_data['y'].append(text[0])\n\n        mrpc_data['X_A'] = mrpc_data['X_A'][1:]\n        mrpc_data['X_B'] = mrpc_data['X_B'][1:]\n        mrpc_data['y'] = [int(s) for s in mrpc_data['y'][1:]]\n        return mrpc_data\n\n    def run(self, params, batcher):\n        mrpc_embed = {'train': {}, 'test': {}}\n\n        for key in 
self.mrpc_data:\n            logging.info('Computing embedding for {0}'.format(key))\n            # Sort to reduce padding\n            text_data = {}\n            sorted_corpus = sorted(zip(self.mrpc_data[key]['X_A'],\n                                       self.mrpc_data[key]['X_B'],\n                                       self.mrpc_data[key]['y']),\n                                   key=lambda z: (len(z[0]), len(z[1]), z[2]))\n\n            text_data['A'] = [x for (x, y, z) in sorted_corpus]\n            text_data['B'] = [y for (x, y, z) in sorted_corpus]\n            text_data['y'] = [z for (x, y, z) in sorted_corpus]\n\n            for txt_type in ['A', 'B']:\n                mrpc_embed[key][txt_type] = []\n                for ii in range(0, len(text_data['y']), params.batch_size):\n                    batch = text_data[txt_type][ii:ii + params.batch_size]\n                    embeddings = batcher(params, batch)\n                    mrpc_embed[key][txt_type].append(embeddings)\n                mrpc_embed[key][txt_type] = np.vstack(mrpc_embed[key][txt_type])\n            mrpc_embed[key]['y'] = np.array(text_data['y'])\n            logging.info('Computed {0} embeddings'.format(key))\n\n        # Train\n        trainA = mrpc_embed['train']['A']\n        trainB = mrpc_embed['train']['B']\n        trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]\n        trainY = mrpc_embed['train']['y']\n\n        # Test\n        testA = mrpc_embed['test']['A']\n        testB = mrpc_embed['test']['B']\n        testF = np.c_[np.abs(testA - testB), testA * testB]\n        testY = mrpc_embed['test']['y']\n\n        config = {'nclasses': 2, 'seed': self.seed,\n                  'usepytorch': params.usepytorch,\n                  'classifier': params.classifier,\n                  'nhid': params.nhid, 'kfold': params.kfold}\n        clf = KFoldClassifier(train={'X': trainF, 'y': trainY},\n                              test={'X': testF, 'y': testY}, config=config)\n\n        
devacc, testacc, yhat = clf.run()\n        testf1 = round(100*f1_score(testY, yhat), 2)\n        logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\\n'\n                      .format(devacc, testacc, testf1))\n        return {'devacc': devacc, 'acc': testacc, 'f1': testf1,\n                'ndev': len(trainA), 'ntest': len(testA)}\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/probing.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n'''\nprobing tasks\n'''\n\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport os\nimport io\nimport copy\nimport logging\nimport numpy as np\n\nfrom senteval.tools.validation import SplitClassifier\n\n\nclass PROBINGEval(object):\n    def __init__(self, task, task_path, seed=1111):\n        self.seed = seed\n        self.task = task\n        logging.debug('***** (Probing) Transfer task : %s classification *****', self.task.upper())\n        self.task_data = {'train': {'X': [], 'y': []},\n                          'dev': {'X': [], 'y': []},\n                          'test': {'X': [], 'y': []}}\n        self.loadFile(task_path)\n        logging.info('Loaded %s train - %s dev - %s test for %s' %\n                     (len(self.task_data['train']['y']), len(self.task_data['dev']['y']),\n                      len(self.task_data['test']['y']), self.task))\n\n    def do_prepare(self, params, prepare):\n        samples = self.task_data['train']['X'] + self.task_data['dev']['X'] + \\\n                  self.task_data['test']['X']\n        return prepare(params, samples)\n\n    def loadFile(self, fpath):\n        self.tok2split = {'tr': 'train', 'va': 'dev', 'te': 'test'}\n        with io.open(fpath, 'r', encoding='utf-8') as f:\n            for line in f:\n                line = line.rstrip().split('\\t')\n                self.task_data[self.tok2split[line[0]]]['X'].append(line[-1].split())\n                self.task_data[self.tok2split[line[0]]]['y'].append(line[1])\n\n        labels = sorted(np.unique(self.task_data['train']['y']))\n        self.tok2label = dict(zip(labels, range(len(labels))))\n        self.nclasses = len(self.tok2label)\n\n        for split in self.task_data:\n            for i, y in 
enumerate(self.task_data[split]['y']):\n                self.task_data[split]['y'][i] = self.tok2label[y]\n\n    def run(self, params, batcher):\n        task_embed = {'train': {}, 'dev': {}, 'test': {}}\n        bsize = params.batch_size\n        logging.info('Computing embeddings for train/dev/test')\n        for key in self.task_data:\n            # Sort to reduce padding\n            sorted_data = sorted(zip(self.task_data[key]['X'],\n                                     self.task_data[key]['y']),\n                                 key=lambda z: (len(z[0]), z[1]))\n            self.task_data[key]['X'], self.task_data[key]['y'] = map(list, zip(*sorted_data))\n\n            task_embed[key]['X'] = []\n            for ii in range(0, len(self.task_data[key]['y']), bsize):\n                batch = self.task_data[key]['X'][ii:ii + bsize]\n                embeddings = batcher(params, batch)\n                task_embed[key]['X'].append(embeddings)\n            task_embed[key]['X'] = np.vstack(task_embed[key]['X'])\n            task_embed[key]['y'] = np.array(self.task_data[key]['y'])\n        logging.info('Computed embeddings')\n\n        config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,\n                             'usepytorch': params.usepytorch,\n                             'classifier': params.classifier}\n\n        if self.task == \"WordContent\" and params.classifier['nhid'] > 0:\n            config_classifier = copy.deepcopy(config_classifier)\n            config_classifier['classifier']['nhid'] = 0\n            print(params.classifier['nhid'])\n\n        clf = SplitClassifier(X={'train': task_embed['train']['X'],\n                                 'valid': task_embed['dev']['X'],\n                                 'test': task_embed['test']['X']},\n                              y={'train': task_embed['train']['y'],\n                                 'valid': task_embed['dev']['y'],\n                                 'test': 
task_embed['test']['y']},\n                              config=config_classifier)\n\n        devacc, testacc = clf.run()\n        logging.debug('\\nDev acc : %.1f Test acc : %.1f for %s classification\\n' % (devacc, testacc, self.task.upper()))\n\n        return {'devacc': devacc, 'acc': testacc,\n                'ndev': len(task_embed['dev']['X']),\n                'ntest': len(task_embed['test']['X'])}\n\n\"\"\"\nSurface Information\n\"\"\"\nclass LengthEval(PROBINGEval):\n    def __init__(self, task_path, seed=1111):\n        task_path = os.path.join(task_path, 'sentence_length.txt')\n        # labels: bins\n        PROBINGEval.__init__(self, 'Length', task_path, seed)\n\nclass WordContentEval(PROBINGEval):\n    def __init__(self, task_path, seed=1111):\n        task_path = os.path.join(task_path, 'word_content.txt')\n        # labels: 200 target words\n        PROBINGEval.__init__(self, 'WordContent', task_path, seed)\n\n\"\"\"\nLatent Structural Information\n\"\"\"\nclass DepthEval(PROBINGEval):\n    def __init__(self, task_path, seed=1111):\n        task_path = os.path.join(task_path, 'tree_depth.txt')\n        # labels: bins\n        PROBINGEval.__init__(self, 'Depth', task_path, seed)\n\nclass TopConstituentsEval(PROBINGEval):\n    def __init__(self, task_path, seed=1111):\n        task_path = os.path.join(task_path, 'top_constituents.txt')\n        # labels: 'PP_NP_VP_.' .. 
(20 classes)\n        PROBINGEval.__init__(self, 'TopConstituents', task_path, seed)\n\nclass BigramShiftEval(PROBINGEval):\n    def __init__(self, task_path, seed=1111):\n        task_path = os.path.join(task_path, 'bigram_shift.txt')\n        # labels: 0 or 1\n        PROBINGEval.__init__(self, 'BigramShift', task_path, seed)\n\n# TODO: Voice?\n\n\"\"\"\nLatent Semantic Information\n\"\"\"\n\nclass TenseEval(PROBINGEval):\n    def __init__(self, task_path, seed=1111):\n        task_path = os.path.join(task_path, 'past_present.txt')\n        # labels: 'PRES', 'PAST'\n        PROBINGEval.__init__(self, 'Tense', task_path, seed)\n\nclass SubjNumberEval(PROBINGEval):\n    def __init__(self, task_path, seed=1111):\n        task_path = os.path.join(task_path, 'subj_number.txt')\n        # labels: 'NN', 'NNS'\n        PROBINGEval.__init__(self, 'SubjNumber', task_path, seed)\n\nclass ObjNumberEval(PROBINGEval):\n    def __init__(self, task_path, seed=1111):\n        task_path = os.path.join(task_path, 'obj_number.txt')\n        # labels: 'NN', 'NNS'\n        PROBINGEval.__init__(self, 'ObjNumber', task_path, seed)\n\nclass OddManOutEval(PROBINGEval):\n    def __init__(self, task_path, seed=1111):\n        task_path = os.path.join(task_path, 'odd_man_out.txt')\n        # labels: 'O', 'C'\n        PROBINGEval.__init__(self, 'OddManOut', task_path, seed)\n\nclass CoordinationInversionEval(PROBINGEval):\n    def __init__(self, task_path, seed=1111):\n        task_path = os.path.join(task_path, 'coordination_inversion.txt')\n        # labels: 'O', 'I'\n        PROBINGEval.__init__(self, 'CoordinationInversion', task_path, seed)\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/rank.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n'''\nImage-Caption Retrieval with COCO dataset\n'''\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport os\nimport sys\nimport logging\nimport numpy as np\n\ntry:\n    import cPickle as pickle\nexcept ImportError:\n    import pickle\n\nfrom senteval.tools.ranking import ImageSentenceRankingPytorch\n\n\nclass ImageCaptionRetrievalEval(object):\n    def __init__(self, task_path, seed=1111):\n        logging.debug('***** Transfer task: Image Caption Retrieval *****\\n\\n')\n\n        # Get captions and image features\n        self.seed = seed\n        train, dev, test = self.loadFile(task_path)\n        self.coco_data = {'train': train, 'dev': dev, 'test': test}\n\n    def do_prepare(self, params, prepare):\n        samples = self.coco_data['train']['sent'] + \\\n                  self.coco_data['dev']['sent'] + \\\n                  self.coco_data['test']['sent']\n        prepare(params, samples)\n\n    def loadFile(self, fpath):\n        coco = {}\n\n        for split in ['train', 'valid', 'test']:\n            list_sent = []\n            list_img_feat = []\n            if sys.version_info < (3, 0):\n                with open(os.path.join(fpath, split + '.pkl')) as f:\n                    cocodata = pickle.load(f)\n            else:\n                with open(os.path.join(fpath, split + '.pkl'), 'rb') as f:\n                    cocodata = pickle.load(f, encoding='latin1')\n\n            for imgkey in range(len(cocodata['features'])):\n                assert len(cocodata['image_to_caption_ids'][imgkey]) >= 5, \\\n                       cocodata['image_to_caption_ids'][imgkey]\n                for captkey in cocodata['image_to_caption_ids'][imgkey][0:5]:\n                    sent = 
cocodata['captions'][captkey]['cleaned_caption']\n                    sent += ' .'  # add punctuation to end of sentence in COCO\n                    list_sent.append(sent.encode('utf-8').split())\n                    list_img_feat.append(cocodata['features'][imgkey])\n            assert len(list_sent) == len(list_img_feat) and \\\n                len(list_sent) % 5 == 0\n            list_img_feat = np.array(list_img_feat).astype('float32')\n            coco[split] = {'sent': list_sent, 'imgfeat': list_img_feat}\n        return coco['train'], coco['valid'], coco['test']\n\n    def run(self, params, batcher):\n        coco_embed = {'train': {'sentfeat': [], 'imgfeat': []},\n                      'dev': {'sentfeat': [], 'imgfeat': []},\n                      'test': {'sentfeat': [], 'imgfeat': []}}\n\n        for key in self.coco_data:\n            logging.info('Computing embedding for {0}'.format(key))\n            # Sort to reduce padding\n            self.coco_data[key]['sent'] = np.array(self.coco_data[key]['sent'])\n            self.coco_data[key]['sent'], idx_sort = np.sort(self.coco_data[key]['sent']), np.argsort(self.coco_data[key]['sent'])\n            idx_unsort = np.argsort(idx_sort)\n\n            coco_embed[key]['X'] = []\n            nsent = len(self.coco_data[key]['sent'])\n            for ii in range(0, nsent, params.batch_size):\n                batch = self.coco_data[key]['sent'][ii:ii + params.batch_size]\n                embeddings = batcher(params, batch)\n                coco_embed[key]['sentfeat'].append(embeddings)\n            coco_embed[key]['sentfeat'] = np.vstack(coco_embed[key]['sentfeat'])[idx_unsort]\n            coco_embed[key]['imgfeat'] = np.array(self.coco_data[key]['imgfeat'])\n            logging.info('Computed {0} embeddings'.format(key))\n\n        config = {'seed': self.seed, 'projdim': 1000, 'margin': 0.2}\n        clf = ImageSentenceRankingPytorch(train=coco_embed['train'],\n                                          
valid=coco_embed['dev'],\n                                          test=coco_embed['test'],\n                                          config=config)\n\n        bestdevscore, r1_i2t, r5_i2t, r10_i2t, medr_i2t, \\\n            r1_t2i, r5_t2i, r10_t2i, medr_t2i = clf.run()\n\n        logging.debug(\"\\nTest scores | Image to text: \\\n            {0}, {1}, {2}, {3}\".format(r1_i2t, r5_i2t, r10_i2t, medr_i2t))\n        logging.debug(\"Test scores | Text to image: \\\n            {0}, {1}, {2}, {3}\\n\".format(r1_t2i, r5_t2i, r10_t2i, medr_t2i))\n\n        return {'devacc': bestdevscore,\n                'acc': [(r1_i2t, r5_i2t, r10_i2t, medr_i2t),\n                        (r1_t2i, r5_t2i, r10_t2i, medr_t2i)],\n                'ndev': len(coco_embed['dev']['sentfeat']),\n                'ntest': len(coco_embed['test']['sentfeat'])}\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/sick.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n'''\nSICK Relatedness and Entailment\n'''\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport os\nimport io\nimport logging\nimport numpy as np\n\nfrom sklearn.metrics import mean_squared_error\nfrom scipy.stats import pearsonr, spearmanr\n\nfrom senteval.tools.relatedness import RelatednessPytorch\nfrom senteval.tools.validation import SplitClassifier\n\n\nclass SICKRelatednessEval(object):\n    def __init__(self, task_path, seed=1111):\n        logging.debug('***** Transfer task : SICK-Relatedness*****\\n\\n')\n        self.seed = seed\n        train = self.loadFile(os.path.join(task_path, 'SICK_train.txt'))\n        dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt'))\n        test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt'))\n        self.sick_data = {'train': train, 'dev': dev, 'test': test}\n\n    def do_prepare(self, params, prepare):\n        samples = self.sick_data['train']['X_A'] + \\\n                  self.sick_data['train']['X_B'] + \\\n                  self.sick_data['dev']['X_A'] + \\\n                  self.sick_data['dev']['X_B'] + \\\n                  self.sick_data['test']['X_A'] + self.sick_data['test']['X_B']\n        return prepare(params, samples)\n\n    def loadFile(self, fpath):\n        skipFirstLine = True\n        sick_data = {'X_A': [], 'X_B': [], 'y': []}\n        with io.open(fpath, 'r', encoding='utf-8') as f:\n            for line in f:\n                if skipFirstLine:\n                    skipFirstLine = False\n                else:\n                    text = line.strip().split('\\t')\n                    sick_data['X_A'].append(text[1].split())\n                    sick_data['X_B'].append(text[2].split())\n                    
sick_data['y'].append(text[3])\n\n        sick_data['y'] = [float(s) for s in sick_data['y']]\n        return sick_data\n\n    def run(self, params, batcher):\n        sick_embed = {'train': {}, 'dev': {}, 'test': {}}\n        bsize = params.batch_size\n\n        for key in self.sick_data:\n            logging.info('Computing embedding for {0}'.format(key))\n            # Sort to reduce padding\n            sorted_corpus = sorted(zip(self.sick_data[key]['X_A'],\n                                       self.sick_data[key]['X_B'],\n                                       self.sick_data[key]['y']),\n                                   key=lambda z: (len(z[0]), len(z[1]), z[2]))\n\n            self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus]\n            self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus]\n            self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus]\n\n            for txt_type in ['X_A', 'X_B']:\n                sick_embed[key][txt_type] = []\n                for ii in range(0, len(self.sick_data[key]['y']), bsize):\n                    batch = self.sick_data[key][txt_type][ii:ii + bsize]\n                    embeddings = batcher(params, batch)\n                    sick_embed[key][txt_type].append(embeddings)\n                sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type])\n            sick_embed[key]['y'] = np.array(self.sick_data[key]['y'])\n            logging.info('Computed {0} embeddings'.format(key))\n\n        # Train\n        trainA = sick_embed['train']['X_A']\n        trainB = sick_embed['train']['X_B']\n        trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]\n        trainY = self.encode_labels(self.sick_data['train']['y'])\n\n        # Dev\n        devA = sick_embed['dev']['X_A']\n        devB = sick_embed['dev']['X_B']\n        devF = np.c_[np.abs(devA - devB), devA * devB]\n        devY = self.encode_labels(self.sick_data['dev']['y'])\n\n        # Test\n        testA = 
sick_embed['test']['X_A']\n        testB = sick_embed['test']['X_B']\n        testF = np.c_[np.abs(testA - testB), testA * testB]\n        testY = self.encode_labels(self.sick_data['test']['y'])\n\n        config = {'seed': self.seed, 'nclasses': 5}\n        clf = RelatednessPytorch(train={'X': trainF, 'y': trainY},\n                                 valid={'X': devF, 'y': devY},\n                                 test={'X': testF, 'y': testY},\n                                 devscores=self.sick_data['dev']['y'],\n                                 config=config)\n\n        devpr, yhat = clf.run()\n\n        pr = pearsonr(yhat, self.sick_data['test']['y'])[0]\n        sr = spearmanr(yhat, self.sick_data['test']['y'])[0]\n        pr = 0 if pr != pr else pr\n        sr = 0 if sr != sr else sr\n        se = mean_squared_error(yhat, self.sick_data['test']['y'])\n        logging.debug('Dev : Pearson {0}'.format(devpr))\n        logging.debug('Test : Pearson {0} Spearman {1} MSE {2} \\\n                       for SICK Relatedness\\n'.format(pr, sr, se))\n\n        return {'devpearson': devpr, 'pearson': pr, 'spearman': sr, 'mse': se,\n                'yhat': yhat, 'ndev': len(devA), 'ntest': len(testA)}\n\n    def encode_labels(self, labels, nclass=5):\n        \"\"\"\n        Label encoding from Tree LSTM paper (Tai, Socher, Manning)\n        \"\"\"\n        Y = np.zeros((len(labels), nclass)).astype('float32')\n        for j, y in enumerate(labels):\n            for i in range(nclass):\n                if i+1 == np.floor(y) + 1:\n                    Y[j, i] = y - np.floor(y)\n                if i+1 == np.floor(y):\n                    Y[j, i] = np.floor(y) - y + 1\n        return Y\n\n\nclass SICKEntailmentEval(SICKRelatednessEval):\n    def __init__(self, task_path, seed=1111):\n        logging.debug('***** Transfer task : SICK-Entailment*****\\n\\n')\n        self.seed = seed\n        train = self.loadFile(os.path.join(task_path, 'SICK_train.txt'))\n        dev = 
self.loadFile(os.path.join(task_path, 'SICK_trial.txt'))\n        test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt'))\n        self.sick_data = {'train': train, 'dev': dev, 'test': test}\n\n    def loadFile(self, fpath):\n        label2id = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2}\n        skipFirstLine = True\n        sick_data = {'X_A': [], 'X_B': [], 'y': []}\n        with io.open(fpath, 'r', encoding='utf-8') as f:\n            for line in f:\n                if skipFirstLine:\n                    skipFirstLine = False\n                else:\n                    text = line.strip().split('\\t')\n                    sick_data['X_A'].append(text[1].split())\n                    sick_data['X_B'].append(text[2].split())\n                    sick_data['y'].append(text[4])\n        sick_data['y'] = [label2id[s] for s in sick_data['y']]\n        return sick_data\n\n    def run(self, params, batcher):\n        sick_embed = {'train': {}, 'dev': {}, 'test': {}}\n        bsize = params.batch_size\n\n        for key in self.sick_data:\n            logging.info('Computing embedding for {0}'.format(key))\n            # Sort to reduce padding\n            sorted_corpus = sorted(zip(self.sick_data[key]['X_A'],\n                                       self.sick_data[key]['X_B'],\n                                       self.sick_data[key]['y']),\n                                   key=lambda z: (len(z[0]), len(z[1]), z[2]))\n\n            self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus]\n            self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus]\n            self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus]\n\n            for txt_type in ['X_A', 'X_B']:\n                sick_embed[key][txt_type] = []\n                for ii in range(0, len(self.sick_data[key]['y']), bsize):\n                    batch = self.sick_data[key][txt_type][ii:ii + bsize]\n                    embeddings = batcher(params, batch)\n  
                  sick_embed[key][txt_type].append(embeddings)\n                sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type])\n            logging.info('Computed {0} embeddings'.format(key))\n\n        # Train\n        trainA = sick_embed['train']['X_A']\n        trainB = sick_embed['train']['X_B']\n        trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]\n        trainY = np.array(self.sick_data['train']['y'])\n\n        # Dev\n        devA = sick_embed['dev']['X_A']\n        devB = sick_embed['dev']['X_B']\n        devF = np.c_[np.abs(devA - devB), devA * devB]\n        devY = np.array(self.sick_data['dev']['y'])\n\n        # Test\n        testA = sick_embed['test']['X_A']\n        testB = sick_embed['test']['X_B']\n        testF = np.c_[np.abs(testA - testB), testA * testB]\n        testY = np.array(self.sick_data['test']['y'])\n\n        config = {'nclasses': 3, 'seed': self.seed,\n                  'usepytorch': params.usepytorch,\n                  'classifier': params.classifier,\n                  'nhid': params.nhid}\n        clf = SplitClassifier(X={'train': trainF, 'valid': devF, 'test': testF},\n                              y={'train': trainY, 'valid': devY, 'test': testY},\n                              config=config)\n\n        devacc, testacc = clf.run()\n        logging.debug('\\nDev acc : {0} Test acc : {1} for \\\n                       SICK entailment\\n'.format(devacc, testacc))\n        return {'devacc': devacc, 'acc': testacc,\n                'ndev': len(devA), 'ntest': len(testA)}\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/snli.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n'''\nSNLI - Entailment\n'''\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport codecs\nimport os\nimport io\nimport copy\nimport logging\nimport numpy as np\n\nfrom senteval.tools.validation import SplitClassifier\n\n\nclass SNLIEval(object):\n    def __init__(self, taskpath, seed=1111):\n        logging.debug('***** Transfer task : SNLI Entailment*****\\n\\n')\n        self.seed = seed\n        train1 = self.loadFile(os.path.join(taskpath, 's1.train'))\n        train2 = self.loadFile(os.path.join(taskpath, 's2.train'))\n\n        trainlabels = io.open(os.path.join(taskpath, 'labels.train'),\n                              encoding='utf-8').read().splitlines()\n\n        valid1 = self.loadFile(os.path.join(taskpath, 's1.dev'))\n        valid2 = self.loadFile(os.path.join(taskpath, 's2.dev'))\n        validlabels = io.open(os.path.join(taskpath, 'labels.dev'),\n                              encoding='utf-8').read().splitlines()\n\n        test1 = self.loadFile(os.path.join(taskpath, 's1.test'))\n        test2 = self.loadFile(os.path.join(taskpath, 's2.test'))\n        testlabels = io.open(os.path.join(taskpath, 'labels.test'),\n                             encoding='utf-8').read().splitlines()\n\n        # sort data (by s2 first) to reduce padding\n        sorted_train = sorted(zip(train2, train1, trainlabels),\n                              key=lambda z: (len(z[0]), len(z[1]), z[2]))\n        train2, train1, trainlabels = map(list, zip(*sorted_train))\n\n        sorted_valid = sorted(zip(valid2, valid1, validlabels),\n                              key=lambda z: (len(z[0]), len(z[1]), z[2]))\n        valid2, valid1, validlabels = map(list, zip(*sorted_valid))\n\n        sorted_test = sorted(zip(test2, test1, testlabels),\n       
                      key=lambda z: (len(z[0]), len(z[1]), z[2]))\n        test2, test1, testlabels = map(list, zip(*sorted_test))\n\n        self.samples = train1 + train2 + valid1 + valid2 + test1 + test2\n        self.data = {'train': (train1, train2, trainlabels),\n                     'valid': (valid1, valid2, validlabels),\n                     'test': (test1, test2, testlabels)\n                     }\n\n    def do_prepare(self, params, prepare):\n        return prepare(params, self.samples)\n\n    def loadFile(self, fpath):\n        with codecs.open(fpath, 'rb', 'latin-1') as f:\n            return [line.split() for line in\n                    f.read().splitlines()]\n\n    def run(self, params, batcher):\n        self.X, self.y = {}, {}\n        dico_label = {'entailment': 0,  'neutral': 1, 'contradiction': 2}\n        for key in self.data:\n            if key not in self.X:\n                self.X[key] = []\n            if key not in self.y:\n                self.y[key] = []\n\n            input1, input2, mylabels = self.data[key]\n            enc_input = []\n            n_labels = len(mylabels)\n            for ii in range(0, n_labels, params.batch_size):\n                batch1 = input1[ii:ii + params.batch_size]\n                batch2 = input2[ii:ii + params.batch_size]\n\n                if len(batch1) == len(batch2) and len(batch1) > 0:\n                    enc1 = batcher(params, batch1)\n                    enc2 = batcher(params, batch2)\n                    enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,\n                                                np.abs(enc1 - enc2))))\n                if (ii*params.batch_size) % (20000*params.batch_size) == 0:\n                    logging.info(\"PROGRESS (encoding): %.2f%%\" %\n                                 (100 * ii / n_labels))\n            self.X[key] = np.vstack(enc_input)\n            self.y[key] = [dico_label[y] for y in mylabels]\n\n        config = {'nclasses': 3, 'seed': self.seed,\n        
          'usepytorch': params.usepytorch,\n                  'cudaEfficient': True,\n                  'nhid': params.nhid, 'noreg': True}\n\n        config_classifier = copy.deepcopy(params.classifier)\n        config_classifier['max_epoch'] = 15\n        config_classifier['epoch_size'] = 1\n        config['classifier'] = config_classifier\n\n        clf = SplitClassifier(self.X, self.y, config)\n        devacc, testacc = clf.run()\n        logging.debug('Dev acc : {0} Test acc : {1} for SNLI\\n'\n                      .format(devacc, testacc))\n        return {'devacc': devacc, 'acc': testacc,\n                'ndev': len(self.data['valid'][0]),\n                'ntest': len(self.data['test'][0])}\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/sst.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n'''\nSST - binary classification\n'''\n\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport os\nimport io\nimport logging\nimport numpy as np\n\nfrom senteval.tools.validation import SplitClassifier\n\n\nclass SSTEval(object):\n    def __init__(self, task_path, nclasses=2, seed=1111):\n        self.seed = seed\n\n        # binary of fine-grained\n        assert nclasses in [2, 5]\n        self.nclasses = nclasses\n        self.task_name = 'Binary' if self.nclasses == 2 else 'Fine-Grained'\n        logging.debug('***** Transfer task : SST %s classification *****\\n\\n', self.task_name)\n\n        train = self.loadFile(os.path.join(task_path, 'sentiment-train'))\n        dev = self.loadFile(os.path.join(task_path, 'sentiment-dev'))\n        test = self.loadFile(os.path.join(task_path, 'sentiment-test'))\n        self.sst_data = {'train': train, 'dev': dev, 'test': test}\n\n    def do_prepare(self, params, prepare):\n        samples = self.sst_data['train']['X'] + self.sst_data['dev']['X'] + \\\n                  self.sst_data['test']['X']\n        return prepare(params, samples)\n\n    def loadFile(self, fpath):\n        sst_data = {'X': [], 'y': []}\n        with io.open(fpath, 'r', encoding='utf-8') as f:\n            for line in f:\n                if self.nclasses == 2:\n                    sample = line.strip().split('\\t')\n                    sst_data['y'].append(int(sample[1]))\n                    sst_data['X'].append(sample[0].split())\n                elif self.nclasses == 5:\n                    sample = line.strip().split(' ', 1)\n                    sst_data['y'].append(int(sample[0]))\n                    sst_data['X'].append(sample[1].split())\n        assert max(sst_data['y']) == self.nclasses - 1\n        return 
sst_data\n\n    def run(self, params, batcher):\n        sst_embed = {'train': {}, 'dev': {}, 'test': {}}\n        bsize = params.batch_size\n\n        for key in self.sst_data:\n            logging.info('Computing embedding for {0}'.format(key))\n            # Sort to reduce padding\n            sorted_data = sorted(zip(self.sst_data[key]['X'],\n                                     self.sst_data[key]['y']),\n                                 key=lambda z: (len(z[0]), z[1]))\n            self.sst_data[key]['X'], self.sst_data[key]['y'] = map(list, zip(*sorted_data))\n\n            sst_embed[key]['X'] = []\n            for ii in range(0, len(self.sst_data[key]['y']), bsize):\n                batch = self.sst_data[key]['X'][ii:ii + bsize]\n                embeddings = batcher(params, batch)\n                sst_embed[key]['X'].append(embeddings)\n            sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])\n            sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])\n            logging.info('Computed {0} embeddings'.format(key))\n\n        config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,\n                             'usepytorch': params.usepytorch,\n                             'classifier': params.classifier}\n\n        clf = SplitClassifier(X={'train': sst_embed['train']['X'],\n                                 'valid': sst_embed['dev']['X'],\n                                 'test': sst_embed['test']['X']},\n                              y={'train': sst_embed['train']['y'],\n                                 'valid': sst_embed['dev']['y'],\n                                 'test': sst_embed['test']['y']},\n                              config=config_classifier)\n\n        devacc, testacc = clf.run()\n        logging.debug('\\nDev acc : {0} Test acc : {1} for \\\n            SST {2} classification\\n'.format(devacc, testacc, self.task_name))\n\n        return {'devacc': devacc, 'acc': testacc,\n                'ndev': 
len(sst_embed['dev']['X']),\n                'ntest': len(sst_embed['test']['X'])}\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/sts.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n'''\nSTS-{2012,2013,2014,2015,2016} (unsupervised) and\nSTS-benchmark (supervised) tasks\n'''\n\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport os\nimport io\nimport numpy as np\nimport logging\n\nfrom scipy.stats import spearmanr, pearsonr\n\nfrom senteval.utils import cosine\nfrom senteval.sick import SICKRelatednessEval\n\n\nclass STSEval(object):\n    def loadFile(self, fpath):\n        self.data = {}\n        self.samples = []\n\n        for dataset in self.datasets:\n            sent1, sent2 = zip(*[l.split(\"\\t\") for l in\n                               io.open(fpath + '/STS.input.%s.txt' % dataset,\n                                       encoding='utf8').read().splitlines()])\n            raw_scores = np.array([x for x in\n                                   io.open(fpath + '/STS.gs.%s.txt' % dataset,\n                                           encoding='utf8')\n                                   .read().splitlines()])\n            not_empty_idx = raw_scores != ''\n\n            gs_scores = [float(x) for x in raw_scores[not_empty_idx]]\n            sent1 = np.array([s.split() for s in sent1])[not_empty_idx]\n            sent2 = np.array([s.split() for s in sent2])[not_empty_idx]\n            # sort data by length to minimize padding in batcher\n            sorted_data = sorted(zip(sent1, sent2, gs_scores),\n                                 key=lambda z: (len(z[0]), len(z[1]), z[2]))\n            sent1, sent2, gs_scores = map(list, zip(*sorted_data))\n\n            self.data[dataset] = (sent1, sent2, gs_scores)\n            self.samples += sent1 + sent2\n\n    def do_prepare(self, params, prepare):\n        if 'similarity' in params:\n            self.similarity = params.similarity\n        else:  # Default 
similarity is cosine\n            self.similarity = lambda s1, s2: np.nan_to_num(cosine(np.nan_to_num(s1), np.nan_to_num(s2)))\n        return prepare(params, self.samples)\n\n    def run(self, params, batcher):\n        results = {}\n        for dataset in self.datasets:\n            sys_scores = []\n            input1, input2, gs_scores = self.data[dataset]\n            for ii in range(0, len(gs_scores), params.batch_size):\n                batch1 = input1[ii:ii + params.batch_size]\n                batch2 = input2[ii:ii + params.batch_size]\n\n                # we assume get_batch already throws out the faulty ones\n                if len(batch1) == len(batch2) and len(batch1) > 0:\n                    enc1 = batcher(params, batch1)\n                    enc2 = batcher(params, batch2)\n\n                    for kk in range(enc2.shape[0]):\n                        sys_score = self.similarity(enc1[kk], enc2[kk])\n                        sys_scores.append(sys_score)\n\n            results[dataset] = {'pearson': pearsonr(sys_scores, gs_scores),\n                                'spearman': spearmanr(sys_scores, gs_scores),\n                                'nsamples': len(sys_scores)}\n            logging.debug('%s : pearson = %.4f, spearman = %.4f' %\n                          (dataset, results[dataset]['pearson'][0],\n                           results[dataset]['spearman'][0]))\n\n        weights = [results[dset]['nsamples'] for dset in results.keys()]\n        list_prs = np.array([results[dset]['pearson'][0] for\n                            dset in results.keys()])\n        list_spr = np.array([results[dset]['spearman'][0] for\n                            dset in results.keys()])\n\n        avg_pearson = np.average(list_prs)\n        avg_spearman = np.average(list_spr)\n        wavg_pearson = np.average(list_prs, weights=weights)\n        wavg_spearman = np.average(list_spr, weights=weights)\n\n        results['all'] = {'pearson': {'mean': avg_pearson,\n             
                         'wmean': wavg_pearson},\n                          'spearman': {'mean': avg_spearman,\n                                       'wmean': wavg_spearman}}\n        logging.debug('ALL (weighted average) : Pearson = %.4f, \\\n            Spearman = %.4f' % (wavg_pearson, wavg_spearman))\n        logging.debug('ALL (average) : Pearson = %.4f, \\\n            Spearman = %.4f\\n' % (avg_pearson, avg_spearman))\n\n        return results\n\n\nclass STS12Eval(STSEval):\n    def __init__(self, taskpath, seed=1111):\n        logging.debug('***** Transfer task : STS12 *****\\n\\n')\n        self.seed = seed\n        self.datasets = ['MSRpar', 'MSRvid', 'SMTeuroparl',\n                         'surprise.OnWN', 'surprise.SMTnews']\n        self.loadFile(taskpath)\n\n\nclass STS13Eval(STSEval):\n    # STS13 here does not contain the \"SMT\" subtask due to LICENSE issue\n    def __init__(self, taskpath, seed=1111):\n        logging.debug('***** Transfer task : STS13 (-SMT) *****\\n\\n')\n        self.seed = seed\n        self.datasets = ['FNWN', 'headlines', 'OnWN']\n        self.loadFile(taskpath)\n\n\nclass STS14Eval(STSEval):\n    def __init__(self, taskpath, seed=1111):\n        logging.debug('***** Transfer task : STS14 *****\\n\\n')\n        self.seed = seed\n        self.datasets = ['deft-forum', 'deft-news', 'headlines',\n                         'images', 'OnWN', 'tweet-news']\n        self.loadFile(taskpath)\n\n\nclass STS15Eval(STSEval):\n    def __init__(self, taskpath, seed=1111):\n        logging.debug('***** Transfer task : STS15 *****\\n\\n')\n        self.seed = seed\n        self.datasets = ['answers-forums', 'answers-students',\n                         'belief', 'headlines', 'images']\n        self.loadFile(taskpath)\n\n\nclass STS16Eval(STSEval):\n    def __init__(self, taskpath, seed=1111):\n        logging.debug('***** Transfer task : STS16 *****\\n\\n')\n        self.seed = seed\n        self.datasets = ['answer-answer', 'headlines', 
'plagiarism',\n                         'postediting', 'question-question']\n        self.loadFile(taskpath)\n\n\nclass STSBenchmarkEval(SICKRelatednessEval):\n    def __init__(self, task_path, seed=1111):\n        logging.debug('\\n\\n***** Transfer task : STSBenchmark*****\\n\\n')\n        self.seed = seed\n        train = self.loadFile(os.path.join(task_path, 'sts-train.csv'))\n        dev = self.loadFile(os.path.join(task_path, 'sts-dev.csv'))\n        test = self.loadFile(os.path.join(task_path, 'sts-test.csv'))\n        self.sick_data = {'train': train, 'dev': dev, 'test': test}\n\n    def loadFile(self, fpath):\n        sick_data = {'X_A': [], 'X_B': [], 'y': []}\n        with io.open(fpath, 'r', encoding='utf-8') as f:\n            for line in f:\n                text = line.strip().split('\\t')\n                sick_data['X_A'].append(text[5].split())\n                sick_data['X_B'].append(text[6].split())\n                sick_data['y'].append(text[4])\n\n        sick_data['y'] = [float(s) for s in sick_data['y']]\n        return sick_data\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/tools/__init__.py",
    "content": ""
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/tools/classifier.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n\"\"\"\nPytorch Classifier class in the style of scikit-learn\nClassifiers include Logistic Regression and MLP\n\"\"\"\n\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport numpy as np\nimport copy\nfrom senteval import utils\n\nimport torch\nfrom torch import nn\nimport torch.nn.functional as F\n\n\nclass PyTorchClassifier(object):\n    def __init__(self, inputdim, nclasses, l2reg=0., batch_size=64, seed=1111,\n                 cudaEfficient=False):\n        # fix seed\n        np.random.seed(seed)\n        torch.manual_seed(seed)\n        torch.cuda.manual_seed(seed)\n\n        self.inputdim = inputdim\n        self.nclasses = nclasses\n        self.l2reg = l2reg\n        self.batch_size = batch_size\n        self.cudaEfficient = cudaEfficient\n\n    def prepare_split(self, X, y, validation_data=None, validation_split=None):\n        # Preparing validation data\n        assert validation_split or validation_data\n        if validation_data is not None:\n            trainX, trainy = X, y\n            devX, devy = validation_data\n        else:\n            permutation = np.random.permutation(len(X))\n            trainidx = permutation[int(validation_split * len(X)):]\n            devidx = permutation[0:int(validation_split * len(X))]\n            trainX, trainy = X[trainidx], y[trainidx]\n            devX, devy = X[devidx], y[devidx]\n\n        device = torch.device('cpu') if self.cudaEfficient else torch.device('cuda')\n\n        trainX = torch.from_numpy(trainX).to(device, dtype=torch.float32)\n        trainy = torch.from_numpy(trainy).to(device, dtype=torch.int64)\n        devX = torch.from_numpy(devX).to(device, dtype=torch.float32)\n        devy = torch.from_numpy(devy).to(device, dtype=torch.int64)\n\n        return 
trainX, trainy, devX, devy\n\n    def fit(self, X, y, validation_data=None, validation_split=None,\n            early_stop=True):\n        self.nepoch = 0\n        bestaccuracy = -1\n        stop_train = False\n        early_stop_count = 0\n\n        # Preparing validation data\n        trainX, trainy, devX, devy = self.prepare_split(X, y, validation_data,\n                                                        validation_split)\n\n        # Training\n        while not stop_train and self.nepoch <= self.max_epoch:\n            self.trainepoch(trainX, trainy, epoch_size=self.epoch_size)\n            accuracy = self.score(devX, devy)\n            if accuracy > bestaccuracy:\n                bestaccuracy = accuracy\n                bestmodel = copy.deepcopy(self.model)\n            elif early_stop:\n                if early_stop_count >= self.tenacity:\n                    stop_train = True\n                early_stop_count += 1\n        self.model = bestmodel\n        return bestaccuracy\n\n    def trainepoch(self, X, y, epoch_size=1):\n        self.model.train()\n        for _ in range(self.nepoch, self.nepoch + epoch_size):\n            permutation = np.random.permutation(len(X))\n            all_costs = []\n            for i in range(0, len(X), self.batch_size):\n                # forward\n                idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().to(X.device)\n\n                Xbatch = X[idx]\n                ybatch = y[idx]\n\n                if self.cudaEfficient:\n                    Xbatch = Xbatch.cuda()\n                    ybatch = ybatch.cuda()\n                output = self.model(Xbatch)\n                # loss\n                loss = self.loss_fn(output, ybatch)\n                all_costs.append(loss.data.item())\n                # backward\n                self.optimizer.zero_grad()\n                loss.backward()\n                # Update parameters\n                self.optimizer.step()\n        self.nepoch += epoch_size\n\n 
   def score(self, devX, devy):\n        self.model.eval()\n        correct = 0\n        if not isinstance(devX, torch.cuda.FloatTensor) or self.cudaEfficient:\n            devX = torch.FloatTensor(devX).cuda()\n            devy = torch.LongTensor(devy).cuda()\n        with torch.no_grad():\n            for i in range(0, len(devX), self.batch_size):\n                Xbatch = devX[i:i + self.batch_size]\n                ybatch = devy[i:i + self.batch_size]\n                if self.cudaEfficient:\n                    Xbatch = Xbatch.cuda()\n                    ybatch = ybatch.cuda()\n                output = self.model(Xbatch)\n                pred = output.data.max(1)[1]\n                correct += pred.long().eq(ybatch.data.long()).sum().item()\n            accuracy = 1.0 * correct / len(devX)\n        return accuracy\n\n    def predict(self, devX):\n        self.model.eval()\n        if not isinstance(devX, torch.cuda.FloatTensor):\n            devX = torch.FloatTensor(devX).cuda()\n        yhat = np.array([])\n        with torch.no_grad():\n            for i in range(0, len(devX), self.batch_size):\n                Xbatch = devX[i:i + self.batch_size]\n                output = self.model(Xbatch)\n                yhat = np.append(yhat,\n                                 output.data.max(1)[1].cpu().numpy())\n        yhat = np.vstack(yhat)\n        return yhat\n\n    def predict_proba(self, devX):\n        self.model.eval()\n        probas = []\n        with torch.no_grad():\n            for i in range(0, len(devX), self.batch_size):\n                Xbatch = devX[i:i + self.batch_size]\n                vals = F.softmax(self.model(Xbatch).data.cpu().numpy())\n                if not probas:\n                    probas = vals\n                else:\n                    probas = np.concatenate(probas, vals, axis=0)\n        return probas\n\n\n\"\"\"\nMLP with Pytorch (nhid=0 --> Logistic Regression)\n\"\"\"\n\nclass MLP(PyTorchClassifier):\n    def __init__(self, 
params, inputdim, nclasses, l2reg=0., batch_size=64,\n                 seed=1111, cudaEfficient=False):\n        super(self.__class__, self).__init__(inputdim, nclasses, l2reg,\n                                             batch_size, seed, cudaEfficient)\n        \"\"\"\n        PARAMETERS:\n        -nhid:       number of hidden units (0: Logistic Regression)\n        -optim:      optimizer (\"sgd,lr=0.1\", \"adam\", \"rmsprop\" ..)\n        -tenacity:   how many times dev acc does not increase before stopping\n        -epoch_size: each epoch corresponds to epoch_size pass on the train set\n        -max_epoch:  max number of epoches\n        -dropout:    dropout for MLP\n        \"\"\"\n\n        self.nhid = 0 if \"nhid\" not in params else params[\"nhid\"]\n        self.optim = \"adam\" if \"optim\" not in params else params[\"optim\"]\n        self.tenacity = 5 if \"tenacity\" not in params else params[\"tenacity\"]\n        self.epoch_size = 4 if \"epoch_size\" not in params else params[\"epoch_size\"]\n        self.max_epoch = 200 if \"max_epoch\" not in params else params[\"max_epoch\"]\n        self.dropout = 0. 
if \"dropout\" not in params else params[\"dropout\"]\n        self.batch_size = 64 if \"batch_size\" not in params else params[\"batch_size\"]\n\n        if params[\"nhid\"] == 0:\n            self.model = nn.Sequential(\n                nn.Linear(self.inputdim, self.nclasses),\n            ).cuda()\n        else:\n            self.model = nn.Sequential(\n                nn.Linear(self.inputdim, params[\"nhid\"]),\n                nn.Dropout(p=self.dropout),\n                nn.Sigmoid(),\n                nn.Linear(params[\"nhid\"], self.nclasses),\n            ).cuda()\n\n        self.loss_fn = nn.CrossEntropyLoss().cuda()\n        self.loss_fn.size_average = False\n\n        optim_fn, optim_params = utils.get_optimizer(self.optim)\n        self.optimizer = optim_fn(self.model.parameters(), **optim_params)\n        self.optimizer.param_groups[0]['weight_decay'] = self.l2reg\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/tools/ranking.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n\"\"\"\nImage Annotation/Search for COCO with Pytorch\n\"\"\"\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport logging\nimport copy\nimport numpy as np\n\nimport torch\nfrom torch import nn\nfrom torch.autograd import Variable\nimport torch.optim as optim\n\n\nclass COCOProjNet(nn.Module):\n    def __init__(self, config):\n        super(COCOProjNet, self).__init__()\n        self.imgdim = config['imgdim']\n        self.sentdim = config['sentdim']\n        self.projdim = config['projdim']\n        self.imgproj = nn.Sequential(\n                        nn.Linear(self.imgdim, self.projdim),\n                        )\n        self.sentproj = nn.Sequential(\n                        nn.Linear(self.sentdim, self.projdim),\n                        )\n\n    def forward(self, img, sent, imgc, sentc):\n        # imgc : (bsize, ncontrast, imgdim)\n        # sentc : (bsize, ncontrast, sentdim)\n        # img : (bsize, imgdim)\n        # sent : (bsize, sentdim)\n        img = img.unsqueeze(1).expand_as(imgc).contiguous()\n        img = img.view(-1, self.imgdim)\n        imgc = imgc.view(-1, self.imgdim)\n        sent = sent.unsqueeze(1).expand_as(sentc).contiguous()\n        sent = sent.view(-1, self.sentdim)\n        sentc = sentc.view(-1, self.sentdim)\n\n        imgproj = self.imgproj(img)\n        imgproj = imgproj / torch.sqrt(torch.pow(imgproj, 2).sum(1, keepdim=True)).expand_as(imgproj)\n        imgcproj = self.imgproj(imgc)\n        imgcproj = imgcproj / torch.sqrt(torch.pow(imgcproj, 2).sum(1, keepdim=True)).expand_as(imgcproj)\n        sentproj = self.sentproj(sent)\n        sentproj = sentproj / torch.sqrt(torch.pow(sentproj, 2).sum(1, keepdim=True)).expand_as(sentproj)\n        sentcproj = self.sentproj(sentc)\n        
sentcproj = sentcproj / torch.sqrt(torch.pow(sentcproj, 2).sum(1, keepdim=True)).expand_as(sentcproj)\n        # (bsize*ncontrast, projdim)\n\n        anchor1 = torch.sum((imgproj*sentproj), 1)\n        anchor2 = torch.sum((sentproj*imgproj), 1)\n        img_sentc = torch.sum((imgproj*sentcproj), 1)\n        sent_imgc = torch.sum((sentproj*imgcproj), 1)\n\n        # (bsize*ncontrast)\n        return anchor1, anchor2, img_sentc, sent_imgc\n\n    def proj_sentence(self, sent):\n        output = self.sentproj(sent)\n        output = output / torch.sqrt(torch.pow(output, 2).sum(1, keepdim=True)).expand_as(output)\n        return output # (bsize, projdim)\n\n    def proj_image(self, img):\n        output = self.imgproj(img)\n        output = output / torch.sqrt(torch.pow(output, 2).sum(1, keepdim=True)).expand_as(output)\n        return output # (bsize, projdim)\n\n\nclass PairwiseRankingLoss(nn.Module):\n    \"\"\"\n    Pairwise ranking loss\n    \"\"\"\n    def __init__(self, margin):\n        super(PairwiseRankingLoss, self).__init__()\n        self.margin = margin\n\n    def forward(self, anchor1, anchor2, img_sentc, sent_imgc):\n\n        cost_sent = torch.clamp(self.margin - anchor1 + img_sentc,\n                                min=0.0).sum()\n        cost_img = torch.clamp(self.margin - anchor2 + sent_imgc,\n                               min=0.0).sum()\n        loss = cost_sent + cost_img\n        return loss\n\n\nclass ImageSentenceRankingPytorch(object):\n    # Image Sentence Ranking on COCO with Pytorch\n    def __init__(self, train, valid, test, config):\n        # fix seed\n        self.seed = config['seed']\n        np.random.seed(self.seed)\n        torch.manual_seed(self.seed)\n        torch.cuda.manual_seed(self.seed)\n\n        self.train = train\n        self.valid = valid\n        self.test = test\n\n        self.imgdim = len(train['imgfeat'][0])\n        self.sentdim = len(train['sentfeat'][0])\n        self.projdim = config['projdim']\n        
self.margin = config['margin']\n\n        self.batch_size = 128\n        self.ncontrast = 30\n        self.maxepoch = 20\n        self.early_stop = True\n\n        config_model = {'imgdim': self.imgdim,'sentdim': self.sentdim,\n                        'projdim': self.projdim}\n        self.model = COCOProjNet(config_model).cuda()\n\n        self.loss_fn = PairwiseRankingLoss(margin=self.margin).cuda()\n\n        self.optimizer = optim.Adam(self.model.parameters())\n\n    def prepare_data(self, trainTxt, trainImg, devTxt, devImg,\n                     testTxt, testImg):\n        trainTxt = torch.FloatTensor(trainTxt)\n        trainImg = torch.FloatTensor(trainImg)\n        devTxt = torch.FloatTensor(devTxt).cuda()\n        devImg = torch.FloatTensor(devImg).cuda()\n        testTxt = torch.FloatTensor(testTxt).cuda()\n        testImg = torch.FloatTensor(testImg).cuda()\n\n        return trainTxt, trainImg, devTxt, devImg, testTxt, testImg\n\n    def run(self):\n        self.nepoch = 0\n        bestdevscore = -1\n        early_stop_count = 0\n        stop_train = False\n\n        # Preparing data\n        logging.info('prepare data')\n        trainTxt, trainImg, devTxt, devImg, testTxt, testImg = \\\n            self.prepare_data(self.train['sentfeat'], self.train['imgfeat'],\n                              self.valid['sentfeat'], self.valid['imgfeat'],\n                              self.test['sentfeat'], self.test['imgfeat'])\n\n        # Training\n        while not stop_train and self.nepoch <= self.maxepoch:\n            logging.info('start epoch')\n            self.trainepoch(trainTxt, trainImg, devTxt, devImg, nepoches=1)\n            logging.info('Epoch {0} finished'.format(self.nepoch))\n\n            results = {'i2t': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},\n                       't2i': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},\n                       'dev': bestdevscore}\n            score = 0\n            for i in range(5):\n                devTxt_i = 
devTxt[i*5000:(i+1)*5000]\n                devImg_i = devImg[i*5000:(i+1)*5000]\n                # Compute dev ranks img2txt\n                r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(devImg_i,\n                                                             devTxt_i)\n                results['i2t']['r1'] += r1_i2t / 5\n                results['i2t']['r5'] += r5_i2t / 5\n                results['i2t']['r10'] += r10_i2t / 5\n                results['i2t']['medr'] += medr_i2t / 5\n                logging.info(\"Image to text: {0}, {1}, {2}, {3}\"\n                             .format(r1_i2t, r5_i2t, r10_i2t, medr_i2t))\n                # Compute dev ranks txt2img\n                r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(devImg_i,\n                                                             devTxt_i)\n                results['t2i']['r1'] += r1_t2i / 5\n                results['t2i']['r5'] += r5_t2i / 5\n                results['t2i']['r10'] += r10_t2i / 5\n                results['t2i']['medr'] += medr_t2i / 5\n                logging.info(\"Text to Image: {0}, {1}, {2}, {3}\"\n                             .format(r1_t2i, r5_t2i, r10_t2i, medr_t2i))\n                score += (r1_i2t + r5_i2t + r10_i2t +\n                          r1_t2i + r5_t2i + r10_t2i) / 5\n\n            logging.info(\"Dev mean Text to Image: {0}, {1}, {2}, {3}\".format(\n                        results['t2i']['r1'], results['t2i']['r5'],\n                        results['t2i']['r10'], results['t2i']['medr']))\n            logging.info(\"Dev mean Image to text: {0}, {1}, {2}, {3}\".format(\n                        results['i2t']['r1'], results['i2t']['r5'],\n                        results['i2t']['r10'], results['i2t']['medr']))\n\n            # early stop on Pearson\n            if score > bestdevscore:\n                bestdevscore = score\n                bestmodel = copy.deepcopy(self.model)\n            elif self.early_stop:\n                if early_stop_count >= 3:\n                    
stop_train = True\n                early_stop_count += 1\n        self.model = bestmodel\n\n        # Compute test for the 5 splits\n        results = {'i2t': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},\n                   't2i': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},\n                   'dev': bestdevscore}\n        for i in range(5):\n            testTxt_i = testTxt[i*5000:(i+1)*5000]\n            testImg_i = testImg[i*5000:(i+1)*5000]\n            # Compute test ranks img2txt\n            r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(testImg_i, testTxt_i)\n            results['i2t']['r1'] += r1_i2t / 5\n            results['i2t']['r5'] += r5_i2t / 5\n            results['i2t']['r10'] += r10_i2t / 5\n            results['i2t']['medr'] += medr_i2t / 5\n            # Compute test ranks txt2img\n            r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(testImg_i, testTxt_i)\n            results['t2i']['r1'] += r1_t2i / 5\n            results['t2i']['r5'] += r5_t2i / 5\n            results['t2i']['r10'] += r10_t2i / 5\n            results['t2i']['medr'] += medr_t2i / 5\n\n        return bestdevscore, results['i2t']['r1'], results['i2t']['r5'], \\\n                             results['i2t']['r10'], results['i2t']['medr'], \\\n                             results['t2i']['r1'], results['t2i']['r5'], \\\n                             results['t2i']['r10'], results['t2i']['medr']\n\n    def trainepoch(self, trainTxt, trainImg, devTxt, devImg, nepoches=1):\n        self.model.train()\n        for _ in range(self.nepoch, self.nepoch + nepoches):\n            permutation = list(np.random.permutation(len(trainTxt)))\n            all_costs = []\n            for i in range(0, len(trainTxt), self.batch_size):\n                # forward\n                if i % (self.batch_size*500) == 0 and i > 0:\n                    logging.info('samples : {0}'.format(i))\n                    r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(devImg,\n                                                     
            devTxt)\n                    logging.info(\"Image to text: {0}, {1}, {2}, {3}\".format(\n                        r1_i2t, r5_i2t, r10_i2t, medr_i2t))\n                    # Compute test ranks txt2img\n                    r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(devImg,\n                                                                 devTxt)\n                    logging.info(\"Text to Image: {0}, {1}, {2}, {3}\".format(\n                        r1_t2i, r5_t2i, r10_t2i, medr_t2i))\n                idx = torch.LongTensor(permutation[i:i + self.batch_size])\n                imgbatch = Variable(trainImg.index_select(0, idx)).cuda()\n                sentbatch = Variable(trainTxt.index_select(0, idx)).cuda()\n\n                idximgc = np.random.choice(permutation[:i] +\n                                           permutation[i + self.batch_size:],\n                                           self.ncontrast*idx.size(0))\n                idxsentc = np.random.choice(permutation[:i] +\n                                            permutation[i + self.batch_size:],\n                                            self.ncontrast*idx.size(0))\n                idximgc = torch.LongTensor(idximgc)\n                idxsentc = torch.LongTensor(idxsentc)\n                # Get indexes for contrastive images and sentences\n                imgcbatch = Variable(trainImg.index_select(0, idximgc)).view(\n                    -1, self.ncontrast, self.imgdim).cuda()\n                sentcbatch = Variable(trainTxt.index_select(0, idxsentc)).view(\n                    -1, self.ncontrast, self.sentdim).cuda()\n\n                anchor1, anchor2, img_sentc, sent_imgc = self.model(\n                    imgbatch, sentbatch, imgcbatch, sentcbatch)\n                # loss\n                loss = self.loss_fn(anchor1, anchor2, img_sentc, sent_imgc)\n                all_costs.append(loss.data.item())\n                # backward\n                self.optimizer.zero_grad()\n                
loss.backward()\n                # Update parameters\n                self.optimizer.step()\n        self.nepoch += nepoches\n\n    def t2i(self, images, captions):\n        \"\"\"\n        Images: (5N, imgdim) matrix of images\n        Captions: (5N, sentdim) matrix of captions\n        \"\"\"\n        with torch.no_grad():\n            # Project images and captions\n            img_embed, sent_embed = [], []\n            for i in range(0, len(images), self.batch_size):\n                img_embed.append(self.model.proj_image(\n                    Variable(images[i:i + self.batch_size])))\n                sent_embed.append(self.model.proj_sentence(\n                    Variable(captions[i:i + self.batch_size])))\n            img_embed = torch.cat(img_embed, 0).data\n            sent_embed = torch.cat(sent_embed, 0).data\n\n            npts = int(img_embed.size(0) / 5)\n            idxs = torch.cuda.LongTensor(range(0, len(img_embed), 5))\n            ims = img_embed.index_select(0, idxs)\n\n            ranks = np.zeros(5 * npts)\n            for index in range(npts):\n\n                # Get query captions\n                queries = sent_embed[5*index: 5*index + 5]\n\n                # Compute scores\n                scores = torch.mm(queries, ims.transpose(0, 1)).cpu().numpy()\n                inds = np.zeros(scores.shape)\n                for i in range(len(inds)):\n                    inds[i] = np.argsort(scores[i])[::-1]\n                    ranks[5 * index + i] = np.where(inds[i] == index)[0][0]\n\n            # Compute metrics\n            r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)\n            r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)\n            r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)\n            medr = np.floor(np.median(ranks)) + 1\n            return (r1, r5, r10, medr)\n\n    def i2t(self, images, captions):\n        \"\"\"\n        Images: (5N, imgdim) matrix of images\n        Captions: (5N, sentdim) matrix of 
captions\n        \"\"\"\n        with torch.no_grad():\n            # Project images and captions\n            img_embed, sent_embed = [], []\n            for i in range(0, len(images), self.batch_size):\n                img_embed.append(self.model.proj_image(\n                    Variable(images[i:i + self.batch_size])))\n                sent_embed.append(self.model.proj_sentence(\n                    Variable(captions[i:i + self.batch_size])))\n            img_embed = torch.cat(img_embed, 0).data\n            sent_embed = torch.cat(sent_embed, 0).data\n\n            npts = int(img_embed.size(0) / 5)\n            index_list = []\n\n            ranks = np.zeros(npts)\n            for index in range(npts):\n\n                # Get query image\n                query_img = img_embed[5 * index]\n\n                # Compute scores\n                scores = torch.mm(query_img.view(1, -1),\n                                  sent_embed.transpose(0, 1)).view(-1)\n                scores = scores.cpu().numpy()\n                inds = np.argsort(scores)[::-1]\n                index_list.append(inds[0])\n\n                # Score\n                rank = 1e20\n                for i in range(5*index, 5*index + 5, 1):\n                    tmp = np.where(inds == i)[0][0]\n                    if tmp < rank:\n                        rank = tmp\n                ranks[index] = rank\n\n            # Compute metrics\n            r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)\n            r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)\n            r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)\n            medr = np.floor(np.median(ranks)) + 1\n            return (r1, r5, r10, medr)\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/tools/relatedness.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n\"\"\"\nSemantic Relatedness (supervised) with Pytorch\n\"\"\"\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport copy\nimport numpy as np\n\nimport torch\nfrom torch import nn\nimport torch.optim as optim\n\nfrom scipy.stats import pearsonr\n\n\nclass RelatednessPytorch(object):\n    # Can be used for SICK-Relatedness, and STS14\n    def __init__(self, train, valid, test, devscores, config):\n        # fix seed\n        np.random.seed(config['seed'])\n        torch.manual_seed(config['seed'])\n        assert torch.cuda.is_available(), 'torch.cuda required for Relatedness'\n        torch.cuda.manual_seed(config['seed'])\n\n        self.train = train\n        self.valid = valid\n        self.test = test\n        self.devscores = devscores\n\n        self.inputdim = train['X'].shape[1]\n        self.nclasses = config['nclasses']\n        self.seed = config['seed']\n        self.l2reg = 0.\n        self.batch_size = 64\n        self.maxepoch = 1000\n        self.early_stop = True\n\n        self.model = nn.Sequential(\n            nn.Linear(self.inputdim, self.nclasses),\n            nn.Softmax(dim=-1),\n        )\n        self.loss_fn = nn.MSELoss()\n\n        if torch.cuda.is_available():\n            self.model = self.model.cuda()\n            self.loss_fn = self.loss_fn.cuda()\n\n        self.loss_fn.size_average = False\n        self.optimizer = optim.Adam(self.model.parameters(),\n                                    weight_decay=self.l2reg)\n\n    def prepare_data(self, trainX, trainy, devX, devy, testX, testy):\n        # Transform probs to log-probs for KL-divergence\n        trainX = torch.from_numpy(trainX).float().cuda()\n        trainy = torch.from_numpy(trainy).float().cuda()\n        devX = 
torch.from_numpy(devX).float().cuda()\n        devy = torch.from_numpy(devy).float().cuda()\n        testX = torch.from_numpy(testX).float().cuda()\n        testY = torch.from_numpy(testy).float().cuda()\n\n        return trainX, trainy, devX, devy, testX, testy\n\n    def run(self):\n        self.nepoch = 0\n        bestpr = -1\n        early_stop_count = 0\n        r = np.arange(1, 6)\n        stop_train = False\n\n        # Preparing data\n        trainX, trainy, devX, devy, testX, testy = self.prepare_data(\n            self.train['X'], self.train['y'],\n            self.valid['X'], self.valid['y'],\n            self.test['X'], self.test['y'])\n\n        # Training\n        while not stop_train and self.nepoch <= self.maxepoch:\n            self.trainepoch(trainX, trainy, nepoches=50)\n            yhat = np.dot(self.predict_proba(devX), r)\n            pr = pearsonr(yhat, self.devscores)[0]\n            pr = 0 if pr != pr else pr  # if NaN bc std=0\n            # early stop on Pearson\n            if pr > bestpr:\n                bestpr = pr\n                bestmodel = copy.deepcopy(self.model)\n            elif self.early_stop:\n                if early_stop_count >= 3:\n                    stop_train = True\n                early_stop_count += 1\n        self.model = bestmodel\n\n        yhat = np.dot(self.predict_proba(testX), r)\n\n        return bestpr, yhat\n\n    def trainepoch(self, X, y, nepoches=1):\n        self.model.train()\n        for _ in range(self.nepoch, self.nepoch + nepoches):\n            permutation = np.random.permutation(len(X))\n            all_costs = []\n            for i in range(0, len(X), self.batch_size):\n                # forward\n                idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().cuda()\n                Xbatch = X[idx]\n                ybatch = y[idx]\n                output = self.model(Xbatch)\n                # loss\n                loss = self.loss_fn(output, ybatch)\n                
all_costs.append(loss.item())\n                # backward\n                self.optimizer.zero_grad()\n                loss.backward()\n                # Update parameters\n                self.optimizer.step()\n        self.nepoch += nepoches\n\n    def predict_proba(self, devX):\n        self.model.eval()\n        probas = []\n        with torch.no_grad():\n            for i in range(0, len(devX), self.batch_size):\n                Xbatch = devX[i:i + self.batch_size]\n                if len(probas) == 0:\n                    probas = self.model(Xbatch).data.cpu().numpy()\n                else:\n                    probas = np.concatenate((probas, self.model(Xbatch).data.cpu().numpy()), axis=0)\n        return probas\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/tools/validation.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n\"\"\"\nValidation and classification\n(train)            :  inner-kfold classifier\n(train, test)      :  kfold classifier\n(train, dev, test) :  split classifier\n\n\"\"\"\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport logging\nimport numpy as np\nfrom senteval.tools.classifier import MLP\n\nimport sklearn\nassert(sklearn.__version__ >= \"0.18.0\"), \\\n    \"need to update sklearn to version >= 0.18.0\"\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import StratifiedKFold\n\n\ndef get_classif_name(classifier_config, usepytorch):\n    if not usepytorch:\n        modelname = 'sklearn-LogReg'\n    else:\n        nhid = classifier_config['nhid']\n        optim = 'adam' if 'optim' not in classifier_config else classifier_config['optim']\n        bs = 64 if 'batch_size' not in classifier_config else classifier_config['batch_size']\n        modelname = 'pytorch-MLP-nhid%s-%s-bs%s' % (nhid, optim, bs)\n    return modelname\n\n# Pytorch version\nclass InnerKFoldClassifier(object):\n    \"\"\"\n    (train) split classifier : InnerKfold.\n    \"\"\"\n    def __init__(self, X, y, config):\n        self.X = X\n        self.y = y\n        self.featdim = X.shape[1]\n        self.nclasses = config['nclasses']\n        self.seed = config['seed']\n        self.devresults = []\n        self.testresults = []\n        self.usepytorch = config['usepytorch']\n        self.classifier_config = config['classifier']\n        self.modelname = get_classif_name(self.classifier_config, self.usepytorch)\n\n        self.k = 5 if 'kfold' not in config else config['kfold']\n\n    def run(self):\n        logging.info('Training {0} with (inner) {1}-fold cross-validation'\n                     .format(self.modelname, 
self.k))\n\n        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \\\n               [2**t for t in range(-2, 4, 1)]\n        skf = StratifiedKFold(n_splits=self.k, shuffle=True, random_state=1111)\n        innerskf = StratifiedKFold(n_splits=self.k, shuffle=True,\n                                   random_state=1111)\n        count = 0\n        for train_idx, test_idx in skf.split(self.X, self.y):\n            count += 1\n            X_train, X_test = self.X[train_idx], self.X[test_idx]\n            y_train, y_test = self.y[train_idx], self.y[test_idx]\n            scores = []\n            for reg in regs:\n                regscores = []\n                for inner_train_idx, inner_test_idx in innerskf.split(X_train, y_train):\n                    X_in_train, X_in_test = X_train[inner_train_idx], X_train[inner_test_idx]\n                    y_in_train, y_in_test = y_train[inner_train_idx], y_train[inner_test_idx]\n                    if self.usepytorch:\n                        clf = MLP(self.classifier_config, inputdim=self.featdim,\n                                  nclasses=self.nclasses, l2reg=reg,\n                                  seed=self.seed)\n                        clf.fit(X_in_train, y_in_train,\n                                validation_data=(X_in_test, y_in_test))\n                    else:\n                        clf = LogisticRegression(C=reg, random_state=self.seed)\n                        clf.fit(X_in_train, y_in_train)\n                    regscores.append(clf.score(X_in_test, y_in_test))\n                scores.append(round(100*np.mean(regscores), 2))\n            optreg = regs[np.argmax(scores)]\n            logging.info('Best param found at split {0}: l2reg = {1} \\\n                with score {2}'.format(count, optreg, np.max(scores)))\n            self.devresults.append(np.max(scores))\n\n            if self.usepytorch:\n                clf = MLP(self.classifier_config, inputdim=self.featdim,\n                          
nclasses=self.nclasses, l2reg=optreg,\n                          seed=self.seed)\n\n                clf.fit(X_train, y_train, validation_split=0.05)\n            else:\n                clf = LogisticRegression(C=optreg, random_state=self.seed)\n                clf.fit(X_train, y_train)\n\n            self.testresults.append(round(100*clf.score(X_test, y_test), 2))\n\n        devaccuracy = round(np.mean(self.devresults), 2)\n        testaccuracy = round(np.mean(self.testresults), 2)\n        return devaccuracy, testaccuracy\n\n\nclass KFoldClassifier(object):\n    \"\"\"\n    (train, test) split classifier : cross-validation on train.\n    \"\"\"\n    def __init__(self, train, test, config):\n        self.train = train\n        self.test = test\n        self.featdim = self.train['X'].shape[1]\n        self.nclasses = config['nclasses']\n        self.seed = config['seed']\n        self.usepytorch = config['usepytorch']\n        self.classifier_config = config['classifier']\n        self.modelname = get_classif_name(self.classifier_config, self.usepytorch)\n\n        self.k = 5 if 'kfold' not in config else config['kfold']\n\n    def run(self):\n        # cross-validation\n        logging.info('Training {0} with {1}-fold cross-validation'\n                     .format(self.modelname, self.k))\n        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \\\n               [2**t for t in range(-1, 6, 1)]\n        skf = StratifiedKFold(n_splits=self.k, shuffle=True,\n                              random_state=self.seed)\n        scores = []\n\n        for reg in regs:\n            scanscores = []\n            for train_idx, test_idx in skf.split(self.train['X'],\n                                                 self.train['y']):\n                # Split data\n                X_train, y_train = self.train['X'][train_idx], self.train['y'][train_idx]\n\n                X_test, y_test = self.train['X'][test_idx], self.train['y'][test_idx]\n\n                # Train 
classifier\n                if self.usepytorch:\n                    clf = MLP(self.classifier_config, inputdim=self.featdim,\n                              nclasses=self.nclasses, l2reg=reg,\n                              seed=self.seed)\n                    clf.fit(X_train, y_train, validation_data=(X_test, y_test))\n                else:\n                    clf = LogisticRegression(C=reg, random_state=self.seed)\n                    clf.fit(X_train, y_train)\n                score = clf.score(X_test, y_test)\n                scanscores.append(score)\n            # Append mean score\n            scores.append(round(100*np.mean(scanscores), 2))\n\n        # evaluation\n        logging.info([('reg:' + str(regs[idx]), scores[idx])\n                      for idx in range(len(scores))])\n        optreg = regs[np.argmax(scores)]\n        devaccuracy = np.max(scores)\n        logging.info('Cross-validation : best param found is reg = {0} \\\n            with score {1}'.format(optreg, devaccuracy))\n\n        logging.info('Evaluating...')\n        if self.usepytorch:\n            clf = MLP(self.classifier_config, inputdim=self.featdim,\n                      nclasses=self.nclasses, l2reg=optreg,\n                      seed=self.seed)\n            clf.fit(self.train['X'], self.train['y'], validation_split=0.05)\n        else:\n            clf = LogisticRegression(C=optreg, random_state=self.seed)\n            clf.fit(self.train['X'], self.train['y'])\n        yhat = clf.predict(self.test['X'])\n\n        testaccuracy = clf.score(self.test['X'], self.test['y'])\n        testaccuracy = round(100*testaccuracy, 2)\n\n        return devaccuracy, testaccuracy, yhat\n\n\nclass SplitClassifier(object):\n    \"\"\"\n    (train, valid, test) split classifier.\n    \"\"\"\n    def __init__(self, X, y, config):\n        self.X = X\n        self.y = y\n        self.nclasses = config['nclasses']\n        self.featdim = self.X['train'].shape[1]\n        self.seed = config['seed']\n     
   self.usepytorch = config['usepytorch']\n        self.classifier_config = config['classifier']\n        self.cudaEfficient = False if 'cudaEfficient' not in config else \\\n            config['cudaEfficient']\n        self.modelname = get_classif_name(self.classifier_config, self.usepytorch)\n        self.noreg = False if 'noreg' not in config else config['noreg']\n        self.config = config\n\n    def run(self):\n        logging.info('Training {0} with standard validation..'\n                     .format(self.modelname))\n        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \\\n               [2**t for t in range(-2, 4, 1)]\n        if self.noreg:\n            regs = [1e-9 if self.usepytorch else 1e9]\n        scores = []\n        for reg in regs:\n            if self.usepytorch:\n                clf = MLP(self.classifier_config, inputdim=self.featdim,\n                          nclasses=self.nclasses, l2reg=reg,\n                          seed=self.seed, cudaEfficient=self.cudaEfficient)\n\n                # TODO: Find a hack for reducing nb epoches in SNLI\n                clf.fit(self.X['train'], self.y['train'],\n                        validation_data=(self.X['valid'], self.y['valid']))\n            else:\n                clf = LogisticRegression(C=reg, random_state=self.seed)\n                clf.fit(self.X['train'], self.y['train'])\n            scores.append(round(100*clf.score(self.X['valid'],\n                                self.y['valid']), 2))\n        logging.info([('reg:'+str(regs[idx]), scores[idx])\n                      for idx in range(len(scores))])\n        optreg = regs[np.argmax(scores)]\n        devaccuracy = np.max(scores)\n        logging.info('Validation : best param found is reg = {0} with score \\\n            {1}'.format(optreg, devaccuracy))\n        clf = LogisticRegression(C=optreg, random_state=self.seed)\n        logging.info('Evaluating...')\n        if self.usepytorch:\n            clf = 
MLP(self.classifier_config, inputdim=self.featdim,\n                      nclasses=self.nclasses, l2reg=optreg,\n                      seed=self.seed, cudaEfficient=self.cudaEfficient)\n\n            # TODO: Find a hack for reducing nb epoches in SNLI\n            clf.fit(self.X['train'], self.y['train'],\n                    validation_data=(self.X['valid'], self.y['valid']))\n        else:\n            clf = LogisticRegression(C=optreg, random_state=self.seed)\n            clf.fit(self.X['train'], self.y['train'])\n\n        testaccuracy = clf.score(self.X['test'], self.y['test'])\n        testaccuracy = round(100*testaccuracy, 2)\n        return devaccuracy, testaccuracy\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/trec.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\n'''\nTREC question-type classification\n'''\n\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport os\nimport io\nimport logging\nimport numpy as np\n\nfrom senteval.tools.validation import KFoldClassifier\n\n\nclass TRECEval(object):\n    def __init__(self, task_path, seed=1111):\n        logging.info('***** Transfer task : TREC *****\\n\\n')\n        self.seed = seed\n        self.train = self.loadFile(os.path.join(task_path, 'train_5500.label'))\n        self.test = self.loadFile(os.path.join(task_path, 'TREC_10.label'))\n\n    def do_prepare(self, params, prepare):\n        samples = self.train['X'] + self.test['X']\n        return prepare(params, samples)\n\n    def loadFile(self, fpath):\n        trec_data = {'X': [], 'y': []}\n        tgt2idx = {'ABBR': 0, 'DESC': 1, 'ENTY': 2,\n                   'HUM': 3, 'LOC': 4, 'NUM': 5}\n        with io.open(fpath, 'r', encoding='latin-1') as f:\n            for line in f:\n                target, sample = line.strip().split(':', 1)\n                sample = sample.split(' ', 1)[1].split()\n                assert target in tgt2idx, target\n                trec_data['X'].append(sample)\n                trec_data['y'].append(tgt2idx[target])\n        return trec_data\n\n    def run(self, params, batcher):\n        train_embeddings, test_embeddings = [], []\n\n        # Sort to reduce padding\n        sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']),\n                                     key=lambda z: (len(z[0]), z[1]))\n        train_samples = [x for (x, y) in sorted_corpus_train]\n        train_labels = [y for (x, y) in sorted_corpus_train]\n\n        sorted_corpus_test = sorted(zip(self.test['X'], self.test['y']),\n                                    key=lambda z: 
(len(z[0]), z[1]))\n        test_samples = [x for (x, y) in sorted_corpus_test]\n        test_labels = [y for (x, y) in sorted_corpus_test]\n\n        # Get train embeddings\n        for ii in range(0, len(train_labels), params.batch_size):\n            batch = train_samples[ii:ii + params.batch_size]\n            embeddings = batcher(params, batch)\n            train_embeddings.append(embeddings)\n        train_embeddings = np.vstack(train_embeddings)\n        logging.info('Computed train embeddings')\n\n        # Get test embeddings\n        for ii in range(0, len(test_labels), params.batch_size):\n            batch = test_samples[ii:ii + params.batch_size]\n            embeddings = batcher(params, batch)\n            test_embeddings.append(embeddings)\n        test_embeddings = np.vstack(test_embeddings)\n        logging.info('Computed test embeddings')\n\n        config_classifier = {'nclasses': 6, 'seed': self.seed,\n                             'usepytorch': params.usepytorch,\n                             'classifier': params.classifier,\n                             'kfold': params.kfold}\n        clf = KFoldClassifier({'X': train_embeddings,\n                               'y': np.array(train_labels)},\n                              {'X': test_embeddings,\n                               'y': np.array(test_labels)},\n                              config_classifier)\n        devacc, testacc, _ = clf.run()\n        logging.debug('\\nDev acc : {0} Test acc : {1} \\\n            for TREC\\n'.format(devacc, testacc))\n        return {'devacc': devacc, 'acc': testacc,\n                'ndev': len(self.train['X']), 'ntest': len(self.test['X'])}\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/senteval/utils.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\nfrom __future__ import absolute_import, division, unicode_literals\n\nimport numpy as np\nimport re\nimport inspect\nfrom torch import optim\n\n\ndef create_dictionary(sentences):\n    words = {}\n    for s in sentences:\n        for word in s:\n            if word in words:\n                words[word] += 1\n            else:\n                words[word] = 1\n    words['<s>'] = 1e9 + 4\n    words['</s>'] = 1e9 + 3\n    words['<p>'] = 1e9 + 2\n    # words['<UNK>'] = 1e9 + 1\n    sorted_words = sorted(words.items(), key=lambda x: -x[1])  # inverse sort\n    id2word = []\n    word2id = {}\n    for i, (w, _) in enumerate(sorted_words):\n        id2word.append(w)\n        word2id[w] = i\n\n    return id2word, word2id\n\n\ndef cosine(u, v):\n    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))\n\n\nclass dotdict(dict):\n    \"\"\" dot.notation access to dictionary attributes \"\"\"\n    __getattr__ = dict.get\n    __setattr__ = dict.__setitem__\n    __delattr__ = dict.__delitem__\n\n\ndef get_optimizer(s):\n    \"\"\"\n    Parse optimizer parameters.\n    Input should be of the form:\n        - \"sgd,lr=0.01\"\n        - \"adagrad,lr=0.1,lr_decay=0.05\"\n    \"\"\"\n    if \",\" in s:\n        method = s[:s.find(',')]\n        optim_params = {}\n        for x in s[s.find(',') + 1:].split(','):\n            split = x.split('=')\n            assert len(split) == 2\n            assert re.match(\"^[+-]?(\\d+(\\.\\d*)?|\\.\\d+)$\", split[1]) is not None\n            optim_params[split[0]] = float(split[1])\n    else:\n        method = s\n        optim_params = {}\n\n    if method == 'adadelta':\n        optim_fn = optim.Adadelta\n    elif method == 'adagrad':\n        optim_fn = optim.Adagrad\n    elif method == 'adam':\n        optim_fn = 
optim.Adam\n    elif method == 'adamax':\n        optim_fn = optim.Adamax\n    elif method == 'asgd':\n        optim_fn = optim.ASGD\n    elif method == 'rmsprop':\n        optim_fn = optim.RMSprop\n    elif method == 'rprop':\n        optim_fn = optim.Rprop\n    elif method == 'sgd':\n        optim_fn = optim.SGD\n        assert 'lr' in optim_params\n    else:\n        raise Exception('Unknown optimization method: \"%s\"' % method)\n\n    # check that we give good parameters to the optimizer\n    expected_args = inspect.getfullargspec(optim_fn.__init__)[0]\n    assert expected_args[:2] == ['self', 'params']\n    if not all(k in expected_args[2:] for k in optim_params.keys()):\n        raise Exception('Unexpected parameters: expected \"%s\", got \"%s\"' % (\n            str(expected_args[2:]), str(optim_params.keys())))\n\n    return optim_fn, optim_params\n"
  },
  {
    "path": "utils_nlp/eval/SentEval/setup.py",
    "content": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n#\n\nimport io\nfrom setuptools import setup, find_packages\n\nwith io.open('./README.md', encoding='utf-8') as f:\n    readme = f.read()\n\nsetup(\n    name='SentEval',\n    version='0.1.0',\n    url='https://github.com/facebookresearch/SentEval',\n    packages=find_packages(exclude=['examples']),\n    license='Attribution-NonCommercial 4.0 International',\n    long_description=readme,\n)\n"
  },
  {
    "path": "utils_nlp/eval/__init__.py",
    "content": "from .rouge.compute_rouge import compute_rouge_perl, compute_rouge_python\n"
  },
  {
    "path": "utils_nlp/eval/classification.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Utilities functions for computing general model evaluation metrics.\"\"\"\n\nfrom sklearn.metrics import (\n    accuracy_score,\n    precision_score,\n    recall_score,\n    f1_score,\n    confusion_matrix,\n)\nfrom numpy import corrcoef\n\nfrom matplotlib import pyplot\nimport seaborn as sn\nimport numpy as np\nimport pandas as pd\n\n\ndef eval_classification(actual, predicted, round_decimals=4):\n    \"\"\"Returns common classification evaluation metrics.\n    Args:\n        actual (1d array-like): Array of actual values.\n        predicted (1d array-like): Array of predicted values.\n        round_decimals (int, optional): Number of decimal places. Defaults to 4.\n    Returns:\n        dict: A dictionary of evaluation metrics.\n    \"\"\"\n    return {\n        \"accuracy\": accuracy_score(actual, predicted).round(round_decimals),\n        \"precision\": list(precision_score(actual, predicted, average=None).round(round_decimals)),\n        \"recall\": list(recall_score(actual, predicted, average=None).round(round_decimals)),\n        \"f1\": list(f1_score(actual, predicted, average=None).round(round_decimals)),\n    }\n\n\ndef compute_correlation_coefficients(x, y=None):\n    \"\"\"\n    Compute Pearson product-moment correlation coefficients.\n\n    Args:\n        x: array_like\n            A 1-D or 2-D array containing multiple variables and observations.\n            Each row of `x` represents a variable, and each column a single\n            observation of all those variables.\n\n        y: array_like, optional\n            An additional set of variables and observations. 
`y` has the same\n            shape as `x`.\n\n    Returns:\n        pd.DataFrame : A pandas dataframe from the correlation coefficient matrix of the variables.\n    \"\"\"\n    return pd.DataFrame(corrcoef(x, y))\n\n\ndef plot_confusion_matrix(\n    y_true,\n    y_pred,\n    labels,\n    normalize=False,\n    title=\"Confusion matrix\",\n    plot_size=(8, 5),\n    font_scale=1.1,\n):\n    \"\"\"Function that prints out a graphical representation of confusion matrix using Seaborn Heatmap\n\n    Args:\n        y_true (1d array-like): True labels from dataset\n        y_pred (1d array-like): Predicted labels from the models\n        labels: A list of labels\n        normalize (Bool, optional): Boolean to Set Row Normalization for Confusion Matrix\n        title (String, optional): String that is the title of the plot\n        plot_size (tuple, optional): Tuple of Plot Dimensions Default \"(8, 5)\"\n        font_scale (float, optional): float type scale factor for font within plot\n    \"\"\"\n    conf_matrix = np.array(confusion_matrix(y_true, y_pred))\n    if normalize:\n        conf_matrix = np.round(\n            conf_matrix.astype(\"float\") / conf_matrix.sum(axis=1)[:, np.newaxis], 3\n        )\n    conf_dataframe = pd.DataFrame(conf_matrix, labels, labels)\n    fig, ax = pyplot.subplots(figsize=plot_size)\n    sn.set(font_scale=font_scale)\n    ax.set_title(title)\n    ax = sn.heatmap(conf_dataframe, cmap=\"Blues\", annot=True, annot_kws={\"size\": 16}, fmt=\"g\")\n    ax.set(xlabel=\"Predicted Labels\", ylabel=\"True Labels\")\n"
  },
  {
    "path": "utils_nlp/eval/evaluate_squad.py",
    "content": "\"\"\" Official evaluation script for v1.1 of the SQuAD dataset. \"\"\"\n\n# Original source:\n# https://github.com/allenai/bi-att-flow/blob/498c8026d92a8bcf0286e2d216d092d444d02d76/squad/evaluate-v1.1.py\n\nfrom __future__ import print_function\nfrom collections import Counter\nimport string\nimport re\nimport argparse\nimport json\nimport sys\n\n\ndef normalize_answer(s):\n    \"\"\"Lower text and remove punctuation, articles and extra whitespace.\"\"\"\n\n    def remove_articles(text):\n        return re.sub(r\"\\b(a|an|the)\\b\", \" \", text)\n\n    def white_space_fix(text):\n        return \" \".join(text.split())\n\n    def remove_punc(text):\n        exclude = set(string.punctuation)\n        return \"\".join(ch for ch in text if ch not in exclude)\n\n    def lower(text):\n        return text.lower()\n\n    return white_space_fix(remove_articles(remove_punc(lower(s))))\n\n\ndef f1_score(prediction, ground_truth):\n    prediction_tokens = normalize_answer(prediction).split()\n    ground_truth_tokens = normalize_answer(ground_truth).split()\n    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)\n    num_same = sum(common.values())\n    if num_same == 0:\n        return 0\n    precision = 1.0 * num_same / len(prediction_tokens)\n    recall = 1.0 * num_same / len(ground_truth_tokens)\n    f1 = (2 * precision * recall) / (precision + recall)\n    return f1\n\n\ndef exact_match_score(prediction, ground_truth):\n    return normalize_answer(prediction) == normalize_answer(ground_truth)\n\n\ndef metric_max_over_ground_truths(metric_fn, prediction, ground_truths):\n    scores_for_ground_truths = []\n    for ground_truth in ground_truths:\n        score = metric_fn(prediction, ground_truth)\n        scores_for_ground_truths.append(score)\n    return max(scores_for_ground_truths)\n\n\ndef evaluate(dataset, predictions):\n    f1 = exact_match = total = 0\n    for article in dataset:\n        for paragraph in article[\"paragraphs\"]:\n    
        for qa in paragraph[\"qas\"]:\n                total += 1\n                if qa[\"id\"] not in predictions:\n                    message = \"Unanswered question \" + qa[\"id\"] + \" will receive score 0.\"\n                    print(message, file=sys.stderr)\n                    continue\n                ground_truths = list(map(lambda x: x[\"text\"], qa[\"answers\"]))\n                prediction = predictions[qa[\"id\"]]\n                exact_match += metric_max_over_ground_truths(\n                    exact_match_score, prediction, ground_truths\n                )\n                f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)\n\n    exact_match = 100.0 * exact_match / total\n    f1 = 100.0 * f1 / total\n\n    return {\"exact_match\": exact_match, \"f1\": f1}\n\n\nif __name__ == \"__main__\":\n    expected_version = \"1.1\"\n    parser = argparse.ArgumentParser(description=\"Evaluation for SQuAD \" + expected_version)\n    parser.add_argument(\"dataset_file\", help=\"Dataset file\")\n    parser.add_argument(\"prediction_file\", help=\"Prediction File\")\n    args = parser.parse_args()\n    with open(args.dataset_file) as dataset_file:\n        dataset_json = json.load(dataset_file)\n        if dataset_json[\"version\"] != expected_version:\n            print(\n                \"Evaluation expects v-\"\n                + expected_version\n                + \", but got dataset with v-\"\n                + dataset_json[\"version\"],\n                file=sys.stderr,\n            )\n        dataset = dataset_json[\"data\"]\n    with open(args.prediction_file) as prediction_file:\n        predictions = json.load(prediction_file)\n    print(json.dumps(evaluate(dataset, predictions)))\n"
  },
  {
    "path": "utils_nlp/eval/evaluate_summarization.py",
    "content": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport os\nfrom random import random, seed\n\nfrom bertsum.others.utils import test_rouge\n\n\ndef get_rouge(predictions, targets, temp_dir, random_seed=42):\n    \"\"\"\n    function to get the rouge metric for the prediction and the reference.\n\n    Args:\n        predictions (list of strings): Predictions to be compared.\n        targets (list of strings): References\n        temp_dir (str): Path where temporary folders are created to host the files\n            generated by ROUGE application.\n        random_seed (int, optional): Random seed. Defaults to 42.\n\n    Returns:\n        dictionary: rouge metric\n\n    \"\"\"\n\n    def _write_list_to_file(list_items, filename):\n        with open(filename, \"w\") as filehandle:\n            # for cnt, line in enumerate(filehandle):\n            for item in list_items:\n                filehandle.write(\"%s\\n\" % item)\n\n    seed(random_seed)\n    random_number = random()\n    os.makedirs(temp_dir, exist_ok=True)\n    candidate_path = os.path.join(temp_dir, \"candidate\" + str(random_number))\n    gold_path = os.path.join(temp_dir, \"gold\" + str(random_number))\n    _write_list_to_file(predictions, candidate_path)\n    _write_list_to_file(targets, gold_path)\n    rouge = test_rouge(temp_dir, candidate_path, gold_path)\n    return rouge\n"
  },
  {
    "path": "utils_nlp/eval/question_answering.py",
    "content": "\"\"\" Official evaluation script for SQuAD version 2.0.\n    Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0\n\"\"\"\n\nimport collections\nimport json\nimport re\nimport string\n\n\ndef get_raw_scores(qa_ids, actuals, preds):\n    \"\"\"\n        Computes exact match and F1 scores without applying any unanswerable probability threshold.\n\n        Args:\n            qa_ids (list): Unique ids corresponding to the answers in `actuals`.\n            actuals (list): List of ground truth answers.\n            preds (dict): Dictionary with qa_id as keys and predicted answers as values.\n\n        Returns:\n            tuple: (exact_match, f1)\n\n    \"\"\"\n    # Helper functions\n    def _normalize_answer(s):\n        \"\"\"Lower text and remove punctuation, articles and extra whitespace.\"\"\"\n\n        def remove_articles(text):\n            regex = re.compile(r\"\\b(a|an|the)\\b\", re.UNICODE)\n            return re.sub(regex, \" \", text)\n\n        def white_space_fix(text):\n            return \" \".join(text.split())\n\n        def remove_punc(text):\n            exclude = set(string.punctuation)\n            return \"\".join(ch for ch in text if ch not in exclude)\n\n        def lower(text):\n            return text.lower()\n\n        return white_space_fix(remove_articles(remove_punc(lower(s))))\n\n    def _get_tokens(s):\n        \"\"\"Normalizes text and returns white-space tokenized tokens. 
\"\"\"\n        if not s:\n            return []\n        return _normalize_answer(s).split()\n\n    def _compute_exact(a_gold, a_pred):\n        \"\"\"Compute the exact match between two sentences after normalization.\n\n        Returns:\n            int: 1 if two sentences match exactly after normalization,\n                0 otherwise.\n        \"\"\"\n        return int(_normalize_answer(a_gold) == _normalize_answer(a_pred))\n\n    def _compute_f1(a_gold, a_pred):\n        \"\"\"\n            Compute F1 score based on token overlapping between two\n            sentences.\n        \"\"\"\n        gold_toks = _get_tokens(a_gold)\n        pred_toks = _get_tokens(a_pred)\n        common = collections.Counter(gold_toks) & collections.Counter(pred_toks)\n        num_same = sum(common.values())\n        if len(gold_toks) == 0 or len(pred_toks) == 0:\n            # If either is no-answer, then F1 is 1 if they agree, 0 otherwise\n            return int(gold_toks == pred_toks)\n        if num_same == 0:\n            return 0\n        precision = 1.0 * num_same / len(pred_toks)\n        recall = 1.0 * num_same / len(gold_toks)\n        f1 = (2 * precision * recall) / (precision + recall)\n        return f1\n\n    # Helper functions end\n\n    exact_scores = {}\n    f1_scores = {}\n\n    for qid, gold_answers in zip(qa_ids, actuals):\n        if not gold_answers:\n            # For unanswerable questions, only correct answer is empty string\n            gold_answers = [\"\"]\n        if qid not in preds:\n            print(\"Missing prediction for %s\" % qid)\n            continue\n        a_pred = preds[qid]\n        # Take max over all gold answers\n        if isinstance(gold_answers, str):\n            gold_answers = [gold_answers]\n\n        exact_scores[qid] = max(_compute_exact(a, a_pred) for a in gold_answers)\n        f1_scores[qid] = max(_compute_f1(a, a_pred) for a in gold_answers)\n    return exact_scores, f1_scores\n\n\ndef find_best_thresh(preds, scores, 
na_probs, qid_to_has_ans, unanswerable_exists=False):\n    \"\"\"\n    Find the best threshold to determine a question is impossible to answer.\n\n    Args:\n        preds (dict): Dictionary with qa_id as keys and predicted answers as values.\n        scores (dict): Dictionary with qa_id as keys and raw evaluation scores (exact_match or\n            f1) as values.\n        na_probs (dict): Dictionary with qa_id as keys and unanswerable probabilities as values.\n        qid_to_has_ans (dict): Dictionary with qa_id as keys boolean values indicating if the\n            question has answer as values.\n        unanswerable_exists (bool, optional): Whether there is unanswerable questions in the data.\n            Defaults to False.\n\n    Returns:\n        tuple: score after applying best threshold, best threshold, (score for answerable\n            questions after applying best threshold, if unanswerable_exists=True)\n    \"\"\"\n    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])\n    # If na_prob > threshold, the question is considered as unanswerable by the prediction.\n    # Initially, the threshold is 0. All questions are considered as unanswerable by the\n    # predictions. So cur_score is the number of actual unanswerable questions (i.e. correctly\n    # predicted as unanswerable in the data.\n    cur_score = num_no_ans\n    best_score = cur_score\n    best_thresh = 0.0\n\n    # Sorted in ascending order\n    qid_list = sorted(na_probs, key=lambda k: na_probs[k])\n    for i, qid in enumerate(qid_list):\n        # When using the cur_na_prob as threshold, all predictions with na_prob > na_prob_cur are\n        # considered as unanswerable. Current question is considered answerable.\n        if qid not in scores:\n            continue\n        if qid_to_has_ans[qid]:\n            # Current question has ground truth answer, the prediction is correct. 
The raw score\n            # is added to cur_score\n            diff = scores[qid]\n        else:\n            # Current question doesn't have ground truth answer.\n            if preds[qid]:\n                # Prediction is not empty, incorrect. cur_score -= 1\n                diff = -1\n            else:\n                # Prediction is empty, correct, the original score 1 from num_no_ans is preserved.\n                diff = 0\n        cur_score += diff\n        if cur_score > best_score:\n            # When cur_score > best_score, the threshold can increase so that more questions are\n            # considered as answerable and fewer questions are considered as unanswerable.\n            # Imagine a PDF with two humps with some overlapping, the x axis is the na_prob. The\n            # hump on the left is answerable questions and the hump on the right is unanswerable\n            # questions.\n            # At some point, the number of actual answerable questions decreases, and we got more\n            # penalty from considering unanswerable questions as answerable than the score added\n            # from actual answerable questions, we will not change the threshold anymore and the\n            # optimal threshold is found.\n            best_score = cur_score\n            best_thresh = na_probs[qid]\n\n    if not unanswerable_exists:\n        return 100.0 * best_score / len(scores), best_thresh\n    else:\n        has_ans_score, has_ans_cnt = 0, 0\n        for qid in qid_list:\n            if not qid_to_has_ans[qid]:\n                continue\n            has_ans_cnt += 1\n\n            if qid not in scores:\n                continue\n            has_ans_score += scores[qid]\n\n        return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt\n\n\ndef find_all_best_thresh(\n    main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans, unanswerable_exists=False\n):\n    \"\"\"\n    Update raw evaluation scores by finding the 
best threshold to determine a question is\n    impossible to answer.\n\n    Args:\n        main_eval (dict): Dictionary with raw evaluation scores without apply any threshold.\n        preds (dict): Dictionary with qa_id as keys and predicted answers as values.\n        exact_raw (dict): Dictionary with qa_id as keys and raw exact_match scores as values.\n        f1_raw (dict): Dictionary with qa_id as keys and raw f1 scores as values.\n        na_probs (dict): Dictionary with qa_id as keys and unanswerable probabilities as values.\n        qid_to_has_ans (dict): Dictionary with qa_id as keys boolean values indicating if the\n            question has answer as values.\n        unanswerable_exists (bool, optional): Whether there is unanswerable questions in the data.\n            Defaults to False.\n\n    Returns:\n        dict: Updated `main_eval` with scores after applying best threshold and best threshold\n            for each score.\n    \"\"\"\n    all_exact = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans, unanswerable_exists)\n    all_f1 = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans, unanswerable_exists)\n    main_eval[\"best_exact\"] = all_exact[0]\n    main_eval[\"best_exact_thresh\"] = all_exact[1]\n    main_eval[\"best_f1\"] = all_f1[0]\n    main_eval[\"best_f1_thresh\"] = all_f1[1]\n\n    if unanswerable_exists:\n        main_eval[\"has_ans_exact\"] = all_exact[2]\n        main_eval[\"has_ans_f1\"] = all_f1[2]\n\n\ndef evaluate_qa(\n    actual_dataset, preds, na_probs=None, na_prob_thresh=0, unanswerable_exists=False, out_file=None\n):\n    \"\"\"\n    Evaluate question answering prediction results against ground truth answers.\n\n    Args:\n        Evaluates question answering model performance.\n\n        Args:\n            actual_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`): Input question answering\n                dataset with ground truth answers.\n            preds (dict): The key of the dictionary is the qa_id 
in the original\n                :class:`utils_nlp.dataset.pytorch.QADataset`. The values of the dictionary are\n                the predicted answer texts in string type.\n            na_probs (dict, optional): Dictionary of qa_id and unanswerable probability pairs.\n                If None, unanswerable probabilities are all set to zero. Defaults to None.\n            na_prob_thresh (float, optional): Probability threshold to predict a question to be\n                unanswerable. For an unanswerable question, if `na_probs` > `na_prob_thresh`,\n                the prediction is considered as correct. Otherwise, the prediction is considered as\n                incorrect. Defaults to 0.\n            out_file (str, optional): Path of the file to save the evaluation results to.\n                Defaults to None.\n\n        Returns:\n            dict: A dictionary with exact_match and f1 values.\n    \"\"\"\n\n    # Helper functions\n    def _apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):\n        \"\"\"Update the input scores by applying unanswerable probability threshold.\"\"\"\n\n        new_scores = {}\n        for qid, s in scores.items():\n            pred_na = na_probs[qid] > na_prob_thresh\n            if pred_na:\n                new_scores[qid] = float(not qid_to_has_ans[qid])\n            else:\n                new_scores[qid] = s\n        return new_scores\n\n    def _make_eval_dict(exact_scores, f1_scores, qid_list=None):\n        \"\"\"Create a dictionary of evaluation results.\"\"\"\n        if not qid_list:\n            total = len(exact_scores)\n            return collections.OrderedDict(\n                [\n                    (\"exact\", 100.0 * sum(exact_scores.values()) / total),\n                    (\"f1\", 100.0 * sum(f1_scores.values()) / total),\n                    (\"total\", total),\n                ]\n            )\n        else:\n            total = len(qid_list)\n            return collections.OrderedDict(\n   
             [\n                    (\"exact\", 100.0 * sum(exact_scores[k] for k in qid_list) / total),\n                    (\"f1\", 100.0 * sum(f1_scores[k] for k in qid_list) / total),\n                    (\"total\", total),\n                ]\n            )\n\n    def _merge_eval(main_eval, new_eval, prefix):\n        \"\"\"Merge multiple evaluation result dictionaries.\"\"\"\n        for k in new_eval:\n            main_eval[\"%s_%s\" % (prefix, k)] = new_eval[k]\n\n    # Helper functions end\n\n    if na_probs is None:\n        na_probs_available = False\n        na_probs = {k: 0.0 for k in preds}\n    else:\n        na_probs_available = True\n\n    qa_ids = [item.qa_id for item in actual_dataset]\n    actuals = [item.answer_text for item in actual_dataset]\n\n    qid_to_has_ans = {qa_id: bool(ans) for (qa_id, ans) in zip(qa_ids, actuals)}\n    has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]\n    no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]\n    exact_raw, f1_raw = get_raw_scores(qa_ids, actuals, preds)\n    exact_thresh = _apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, na_prob_thresh)\n    f1_thresh = _apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, na_prob_thresh)\n    out_eval = _make_eval_dict(exact_thresh, f1_thresh)\n    if has_ans_qids:\n        has_ans_eval = _make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)\n        _merge_eval(out_eval, has_ans_eval, \"HasAns\")\n    if no_ans_qids:\n        no_ans_eval = _make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)\n        _merge_eval(out_eval, no_ans_eval, \"NoAns\")\n\n    if na_probs_available:\n        find_all_best_thresh(\n            out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans, unanswerable_exists\n        )\n\n    if out_file:\n        with open(out_file, \"w\") as f:\n            json.dump(out_eval, f)\n    else:\n        print(json.dumps(out_eval, indent=2))\n    return out_eval\n"
  },
  {
    "path": "utils_nlp/eval/rouge/compute_rouge.py",
    "content": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport os\nimport shutil\nimport time\nimport tempfile\n\nfrom pyrouge import Rouge155\nfrom rouge import Rouge\nfrom .rouge_ext import RougeExt\n\n\ndef compute_rouge_perl(cand, ref, is_input_files=False, verbose=False):\n    \"\"\"\n    Computes ROUGE scores using the python wrapper\n    (https://github.com/bheinzerling/pyrouge) of perl ROUGE package.\n\n    Args:\n        cand (list or str): If `is_input_files` is `False`, `cand` is a list of strings\n            containing predicted summaries. If `is_input_files` is `True`, `cand` is the path\n            to the file containing the predicted summaries.\n        ref (list or str): If `is_input_files` is `False`, `ref` is a list of strings\n            containing reference summaries. If `is_input_files` is `True`, `ref` is the path\n            to the file containing the reference summaries.\n        is_input_files (bool, optional): If True, inputs are file names. Otherwise, inputs are lists\n            of predicted and reference summaries. Defaults to False.\n        verbose (bool, optional): If True, print out all rouge scores. 
Defaults to False.\n\n    Returns:\n        dict: Dictionary of ROUGE scores.\n\n    \"\"\"\n\n    temp_dir = tempfile.mkdtemp()\n\n    if is_input_files:\n        candidates = [line.strip() for line in open(cand, encoding=\"utf-8\")]\n        references = [line.strip() for line in open(ref, encoding=\"utf-8\")]\n    else:\n        candidates = cand\n        references = ref\n\n    print(\"Number of candidates: {}\".format(len(candidates)))\n    print(\"Number of references: {}\".format(len(references)))\n    assert len(candidates) == len(references)\n\n    cnt = len(candidates)\n    current_time = time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.localtime())\n    tmp_dir = os.path.join(temp_dir, \"rouge-tmp-{}\".format(current_time))\n\n    tmp_dir_candidate = tmp_dir + \"/candidate/\"\n    tmp_dir_reference = tmp_dir + \"/reference/\"\n\n    os.makedirs(tmp_dir_candidate, exist_ok=True)\n    os.makedirs(tmp_dir_reference, exist_ok=True)\n\n    try:\n        for i in range(cnt):\n            if len(references[i]) < 1:\n                continue\n            with open(tmp_dir_candidate + \"/cand.{}.txt\".format(i), \"w\", encoding=\"utf-8\") as f:\n                f.write(candidates[i])\n            with open(tmp_dir_reference + \"/ref.{}.txt\".format(i), \"w\", encoding=\"utf-8\") as f:\n                f.write(references[i])\n        r = Rouge155()\n        r.model_dir = tmp_dir_reference\n        r.system_dir = tmp_dir_candidate\n        r.model_filename_pattern = \"ref.#ID#.txt\"\n        r.system_filename_pattern = r\"cand.(\\d+).txt\"\n        rouge_results = r.convert_and_evaluate()\n        if verbose:\n            print(rouge_results)\n        results_dict = r.output_to_dict(rouge_results)\n    finally:\n        if os.path.isdir(tmp_dir):\n            shutil.rmtree(tmp_dir)\n    return results_dict\n\n\ndef compute_rouge_python(cand, ref, is_input_files=False, language=\"en\"):\n    \"\"\"\n    Computes ROUGE scores using the python package 
(https://pypi.org/project/py-rouge/).\n\n    Args:\n        cand (list or str): If `is_input_files` is `False`, `cand` is a list of strings\n            containing predicted summaries. If `is_input_files` is `True`, `cand` is the path\n            to the file containing the predicted summaries.\n        ref (list or str): If `is_input_files` is `False`, `ref` is a list of strings\n            containing reference summaries. If `is_input_files` is `True`, `ref` is the path\n            to the file containing the reference summaries.\n        is_input_files (bool, optional): If True, inputs are file names. Otherwise, inputs are\n            lists of predicted and reference summaries. Defaults to False.\n        language (str, optional): Language of the input text. Supported values are \"en\" and\n            \"hi\". Defaults to \"en\".\n\n    Returns:\n        dict: Dictionary of ROUGE scores.\n\n    \"\"\"\n    supported_langauges = [\"en\", \"hi\"]\n    if language not in supported_langauges:\n        raise Exception(\n            \"Language {0} is not supported. 
Supported languages are: {1}.\".format(\n                language, supported_langauges\n            )\n        )\n\n    if is_input_files:\n        candidates = [line.strip() for line in open(cand, encoding=\"utf-8\")]\n        references = [line.strip() for line in open(ref, encoding=\"utf-8\")]\n    else:\n        candidates = cand\n        references = ref\n\n    print(\"Number of candidates: {}\".format(len(candidates)))\n    print(\"Number of references: {}\".format(len(references)))\n    assert len(candidates) == len(references)\n\n    if language == \"en\":\n        evaluator = Rouge(\n            metrics=[\"rouge-n\", \"rouge-l\"], max_n=2, limit_length=False, apply_avg=True\n        )\n    else:\n        evaluator = RougeExt(\n            metrics=[\"rouge-n\", \"rouge-l\"],\n            max_n=2,\n            limit_length=False,\n            apply_avg=True,\n            language=language,\n        )\n\n    scores = evaluator.get_scores(candidates, [[it] for it in references])\n\n    return scores\n"
  },
  {
    "path": "utils_nlp/eval/rouge/rouge_ext.py",
    "content": "# This script is adopted from https://github.com/Diego999/py-rouge/blob/master/rouge/rouge.py\n# to compute ROUGE scores for non-English languages.\n\n# Currently, the script supports Hindi.\n# Additional language support can be added by adding language specific\n# 1) sentence splitter (SENTENCE_SPLIT_DICT or the sentence_split_func argument)\n# 2) word tokenizer (WORD_TOKENIZE_DICT or the word_tokenize_func argument)\n# 3) pattern of characters to remove (REMOVE_CHAR_PATTERN_DICT or the remove_char_pattern\n#    argument)\n# 4) stemmer (STEMMER_DICT or the stemming_func argument), this is optional since\n#    stemming is not applicable to all languages\n# 5) word splitter (WORD_SPLIT_DICT or the word_split_func_argument)\n\n# Major changes made to the original rouge.py include:\n# 1) Don't remove non-English or non-numeric characters\n# 2) Removed the ensure_compatibility argument as we don't need to reproduce the results of\n#    the original perl script that only supports English.\n\n\nimport re\nimport string\nimport itertools\nimport collections\n\nfrom indicnlp.tokenize import sentence_tokenize, indic_tokenize\nfrom ...language_utils.hi.hindi_stemmer import hi_stem\nfrom rouge import Rouge\n\n\nclass RougeExt(Rouge):\n    DEFAULT_METRICS = {\"rouge-n\"}\n    DEFAULT_N = 1\n    STATS = [\"f\", \"p\", \"r\"]\n    AVAILABLE_METRICS = {\"rouge-n\", \"rouge-l\", \"rouge-w\"}\n    AVAILABLE_LENGTH_LIMIT_TYPES = {\"words\", \"bytes\"}\n\n    SENTENCE_SPLIT_DICT = {\"hi\": sentence_tokenize.sentence_split}\n    WORD_TOKENIZE_DICT = {\"hi\": indic_tokenize.trivial_tokenize}\n    REMOVE_CHAR_PATTERN_DICT = {\n        \"hi\": re.compile(r\"([\" + string.punctuation + r\"\\u0964\\u0965\" + r\"])\")\n    }\n    STEMMER_DICT = {\"hi\": hi_stem}\n    WORD_SPLIT_DICT = {}\n\n    # REMOVE_CHAR_PATTERN = re.compile('[^A-Za-z0-9]')\n\n    # Hack to not tokenize \"cannot\" to \"can not\" and consider them different as in the\n    # official ROUGE script\n    # 
KEEP_CANNOT_IN_ONE_WORD = re.compile('cannot')\n    # KEEP_CANNOT_IN_ONE_WORD_REVERSED = re.compile('_cannot_')\n\n    # WORDNET_KEY_VALUE = {}\n    # WORDNET_DB_FILEPATH = 'wordnet_key_value.txt'\n    # WORDNET_DB_FILEPATH_SPECIAL_CASE = 'wordnet_key_value_special_cases.txt'\n    # WORDNET_DB_DELIMITER = '|'\n    # STEMMER = None\n\n    def __init__(\n        self,\n        language,\n        metrics=None,\n        max_n=None,\n        limit_length=True,\n        length_limit=665,\n        length_limit_type=\"bytes\",\n        apply_avg=True,\n        apply_best=False,\n        stemming=True,\n        alpha=0.5,\n        weight_factor=1.0,\n        sentence_split_func=None,\n        word_tokenize_func=None,\n        remove_char_pattern=None,\n        stemming_func=None,\n        word_split_func=None,\n    ):\n        \"\"\"\n        Handle the ROUGE score computation as in the official perl script.\n\n        Note 1: Small differences might happen if the resampling of the perl script is not\n                high enough (as the average depends on this).\n        Note 2: Stemming of the official Porter Stemmer of the ROUGE perl script is slightly\n                different from the Porter one implemented in NLTK. However, special cases of\n                DUC 2004 have been treated.\n                The solution would be to rewrite the whole perl stemming in python from\n                the original script\n\n        Args:\n            language: language of the text to be evaluated, e.g. \"hi\".\n            metrics: What ROUGE score to compute. Available: ROUGE-N, ROUGE-L, ROUGE-W.\n                Default: ROUGE-N\n            max_n: N-grams for ROUGE-N if specified. Default:1\n            limit_length: If the summaries must be truncated. Default:True\n            length_limit: Number of the truncation where the unit is expressed in length_limit_type.\n                Default:665 (bytes)\n            length_limit_type: Unit of length_limit. Available: words, bytes. 
Default: 'bytes'\n            apply_avg: If we should average the score of multiple samples. Default: True. If\n                apply_avg & apply_best = False, then each ROUGE scores are independent\n            apply_best: Take the best instead of the average. Default: False, then each ROUGE\n                scores are independent\n            stemming: Apply stemming to summaries. Default: True\n            alpha: Alpha used to compute f1 score: P*R/((1-a)*P + a*R). Default:0.5\n            weight_factor: Weight factor to be used for ROUGE-W. Official rouge score defines\n                it at 1.2. Default: 1.0\n            sentence_split_func (function, optional): Language specific function for splitting\n                sentences. Defaults to None.\n            word_tokenize_func (function, optional): Language specific function for tokenizing text.\n                Defaults to None.\n            remove_char_pattern (_sre.SRE_Pattern, optional): Language specific regular expression\n                pattern for removing special characters, e.g. punctuations. Defaults to None.\n            stemming_func (function, optional): Language specific stemmer. Defaults to None.\n            word_split_func (function, optional): Language specific word splitter. Only needed if\n            the language words are not separated by space, e.g. Chinese. Defaults to None.\n\n        Raises:\n            ValueError: raises exception if metric is not among AVAILABLE_METRICS\n            ValueError: raises exception if length_limit_type is not among\n                AVAILABLE_LENGTH_LIMIT_TYPES\n            ValueError: raises exception if weight_factor <= 0\n        \"\"\"\n        supported_langauges = [\"hi\"]\n        if language not in supported_langauges and not all(\n            [sentence_split_func, word_tokenize_func, remove_char_pattern]\n        ):\n            raise Exception(\n                \"Language {0} is not supported. Supported languages are: {1}. 
Provide language \"\n                \"speicifc sentence_split_func, word_tokenize_func, remove_char_pattern, \"\n                \"stemming_func(optional), and word_split_func (if words are not separated by \"\n                \"space) to use this class\".format(language, supported_langauges)\n            )\n        self.metrics = metrics[:] if metrics is not None else RougeExt.DEFAULT_METRICS\n        for m in self.metrics:\n            if m not in RougeExt.AVAILABLE_METRICS:\n                raise ValueError(\"Unknown metric '{}'\".format(m))\n\n        self.max_n = max_n if \"rouge-n\" in self.metrics else None\n        # Add all rouge-n metrics\n        if self.max_n is not None:\n            index_rouge_n = self.metrics.index(\"rouge-n\")\n            del self.metrics[index_rouge_n]\n            self.metrics += [\"rouge-{}\".format(n) for n in range(1, self.max_n + 1)]\n        self.metrics = set(self.metrics)\n\n        self.limit_length = limit_length\n        if self.limit_length:\n            if length_limit_type not in RougeExt.AVAILABLE_LENGTH_LIMIT_TYPES:\n                raise ValueError(\"Unknown length_limit_type '{}'\".format(length_limit_type))\n\n        self.length_limit = length_limit\n        if self.length_limit == 0:\n            self.limit_length = False\n        self.length_limit_type = length_limit_type\n        self.stemming = stemming\n\n        self.apply_avg = apply_avg\n        self.apply_best = apply_best\n        self.alpha = alpha\n        self.weight_factor = weight_factor\n        if self.weight_factor <= 0:\n            raise ValueError(\"ROUGE-W weight factor must greater than 0.\")\n\n        self.language = language\n        if sentence_split_func is None:\n            self.sentence_split = RougeExt.SENTENCE_SPLIT_DICT[self.language]\n        else:\n            self.sentence_split = sentence_split_func\n        if word_tokenize_func is None:\n            self.word_tokenize = RougeExt.WORD_TOKENIZE_DICT[self.language]\n       
 else:\n            self.word_tokenize = word_tokenize_func\n        if remove_char_pattern is None:\n            self.remove_char_pattern = RougeExt.REMOVE_CHAR_PATTERN_DICT[self.language]\n        else:\n            self.remove_char_pattern = remove_char_pattern\n        if self.language not in RougeExt.STEMMER_DICT.keys() and stemming_func is None:\n            self.stemmer = None\n            warnings.warn(\"Language-specific stemmer is not available. Skipping stemming.\")\n        elif stemming_func is None:\n            self.stemmer = RougeExt.STEMMER_DICT[self.language]\n        else:\n            self.stemmer = stemming_func\n\n        if self.language not in RougeExt.WORD_SPLIT_DICT.keys() and word_split_func is None:\n            self.word_split = None\n        elif word_split_func is None:\n            self.word_split = RougeExt.WORD_SPLIT_DICT[self.language]\n        else:\n            self.word_split = word_split_func\n\n    def tokenize_text(self, text):\n        \"\"\"\n        Tokenize text in the specific language\n\n        Args:\n          text: The string text to tokenize\n          language: Language of the text\n\n        Returns:\n          List of tokens of text\n        \"\"\"\n        return self.word_tokenize(text, self.language)\n\n    def split_into_sentences(self, text):\n        \"\"\"\n        Split text into sentences, using specified language.\n\n        Args:\n          text: The string text to tokenize\n          language: Language of the text\n\n        Returns:\n          List of tokens of text\n        \"\"\"\n\n        return self.sentence_split(text, self.language)\n\n    def stem_tokens(self, tokens):\n        \"\"\"\n        Stem each token of tokens\n\n        Args:\n          tokens: List of tokens to stem\n\n        Returns:\n          List of final stems\n        \"\"\"\n        for i, token in enumerate(tokens):\n            tokens[i] = self.stemmer(token)\n\n        return tokens\n\n    def _split_into_words(self, 
sentences):\n        \"\"\"\n        Splits multiple sentences into words and flattens the result\n\n        Args:\n          sentences: list of string\n\n        Returns:\n          A list of words (split by white space)\n        \"\"\"\n        # Modified from https://github.com/pltrdy/seq2seq/blob/master/seq2seq/metrics/rouge.py\n        if self.word_split is None:\n            return list(itertools.chain(*[_.split() for _ in sentences]))\n        else:\n            return list(itertools.chain(*[self.word_split(_) for _ in sentences]))\n\n    def _get_word_ngrams_and_length(self, n, sentences):\n        \"\"\"\n        Calculates word n-grams for multiple sentences.\n\n        Args:\n          n: wich n-grams to calculate\n          sentences: list of string\n\n        Returns:\n          A set of n-grams, their frequency and #n-grams in sentences\n        \"\"\"\n        # Modified from https://github.com/pltrdy/seq2seq/blob/master/seq2seq/metrics/rouge.py\n        assert len(sentences) > 0\n        assert n > 0\n\n        tokens = self._split_into_words(sentences)\n        return self._get_ngrams(n, tokens), tokens, len(tokens) - (n - 1)\n\n    def _get_unigrams(self, sentences):\n        \"\"\"\n        Calcualtes uni-grams.\n\n        Args:\n          sentences: list of string\n\n        Returns:\n          A set of n-grams and their freqneucy\n        \"\"\"\n        assert len(sentences) > 0\n\n        tokens = self._split_into_words(sentences)\n        unigram_set = collections.defaultdict(int)\n        for token in tokens:\n            unigram_set[token] += 1\n        return unigram_set, len(tokens)\n\n    def _compute_ngrams(self, evaluated_sentences, reference_sentences, n):\n        \"\"\"\n        Computes n-grams overlap of two text collections of sentences.\n        Source: http://research.microsoft.com/en-us/um/people/cyl/download/\n        papers/rouge-working-note-v1.3.1.pdf\n\n        Args:\n          evaluated_sentences: The sentences that 
have been picked by the\n                               summarizer\n          reference_sentences: The sentences from the referene set\n          n: Size of ngram\n\n        Returns:\n          Number of n-grams for evaluated_sentences, reference_sentences and intersection of both.\n          intersection of both count multiple of occurences in n-grams match several times\n\n        Raises:\n          ValueError: raises exception if a param has len <= 0\n        \"\"\"\n        # Modified from https://github.com/pltrdy/seq2seq/blob/master/seq2seq/metrics/rouge.py\n        if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:\n            raise ValueError(\"Collections must contain at least 1 sentence.\")\n\n        evaluated_ngrams, _, evaluated_count = self._get_word_ngrams_and_length(\n            n, evaluated_sentences\n        )\n        reference_ngrams, _, reference_count = self._get_word_ngrams_and_length(\n            n, reference_sentences\n        )\n\n        # Gets the overlapping ngrams between evaluated and reference\n        overlapping_ngrams = set(evaluated_ngrams.keys()).intersection(set(reference_ngrams.keys()))\n        overlapping_count = 0\n        for ngram in overlapping_ngrams:\n            overlapping_count += min(evaluated_ngrams[ngram], reference_ngrams[ngram])\n\n        return evaluated_count, reference_count, overlapping_count\n\n    def _compute_ngrams_lcs(self, evaluated_sentences, reference_sentences, weight_factor=1.0):\n        \"\"\"\n        Computes ROUGE-L (summary level) of two text collections of sentences.\n        http://research.microsoft.com/en-us/um/people/cyl/download/papers/\n        rouge-working-note-v1.3.1.pdf\n        Args:\n          evaluated_sentences: The sentences that have been picked by the summarizer\n          reference_sentence: One of the sentences in the reference summaries\n          weight_factor: Weight factor to be used for WLCS (1.0 by default if LCS)\n        Returns:\n          
Number of LCS n-grams for evaluated_sentences, reference_sentences and intersection\n              of both.\n          intersection of both count multiple of occurences in n-grams match several times\n        Raises:\n          ValueError: raises exception if a param has len <= 0\n        \"\"\"\n\n        def _lcs(x, y):\n            m = len(x)\n            n = len(y)\n            vals = collections.defaultdict(int)\n            dirs = collections.defaultdict(int)\n\n            for i in range(1, m + 1):\n                for j in range(1, n + 1):\n                    if x[i - 1] == y[j - 1]:\n                        vals[i, j] = vals[i - 1, j - 1] + 1\n                        dirs[i, j] = \"|\"\n                    elif vals[i - 1, j] >= vals[i, j - 1]:\n                        vals[i, j] = vals[i - 1, j]\n                        dirs[i, j] = \"^\"\n                    else:\n                        vals[i, j] = vals[i, j - 1]\n                        dirs[i, j] = \"<\"\n\n            return vals, dirs\n\n        def _wlcs(x, y, weight_factor):\n            m = len(x)\n            n = len(y)\n            vals = collections.defaultdict(float)\n            dirs = collections.defaultdict(int)\n            lengths = collections.defaultdict(int)\n\n            for i in range(1, m + 1):\n                for j in range(1, n + 1):\n                    if x[i - 1] == y[j - 1]:\n                        length_tmp = lengths[i - 1, j - 1]\n                        vals[i, j] = (\n                            vals[i - 1, j - 1]\n                            + (length_tmp + 1) ** weight_factor\n                            - length_tmp ** weight_factor\n                        )\n                        dirs[i, j] = \"|\"\n                        lengths[i, j] = length_tmp + 1\n                    elif vals[i - 1, j] >= vals[i, j - 1]:\n                        vals[i, j] = vals[i - 1, j]\n                        dirs[i, j] = \"^\"\n                        lengths[i, j] = 0\n        
            else:\n                        vals[i, j] = vals[i, j - 1]\n                        dirs[i, j] = \"<\"\n                        lengths[i, j] = 0\n\n            return vals, dirs\n\n        def _mark_lcs(mask, dirs, m, n):\n            while m != 0 and n != 0:\n                if dirs[m, n] == \"|\":\n                    m -= 1\n                    n -= 1\n                    mask[m] = 1\n                elif dirs[m, n] == \"^\":\n                    m -= 1\n                elif dirs[m, n] == \"<\":\n                    n -= 1\n                else:\n                    raise UnboundLocalError(\"Illegal move\")\n\n            return mask\n\n        if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:\n            raise ValueError(\"Collections must contain at least 1 sentence.\")\n\n        evaluated_unigrams_dict, evaluated_count = self._get_unigrams(evaluated_sentences)\n        reference_unigrams_dict, reference_count = self._get_unigrams(reference_sentences)\n\n        # Has to use weight factor for WLCS\n        use_WLCS = weight_factor != 1.0\n        if use_WLCS:\n            evaluated_count = evaluated_count ** weight_factor\n            reference_count = 0\n\n        overlapping_count = 0.0\n        for reference_sentence in reference_sentences:\n            reference_sentence_tokens = reference_sentence.split()\n            if use_WLCS:\n                reference_count += len(reference_sentence_tokens) ** weight_factor\n            hit_mask = [0 for _ in range(len(reference_sentence_tokens))]\n\n            for evaluated_sentence in evaluated_sentences:\n                evaluated_sentence_tokens = evaluated_sentence.split()\n\n                if use_WLCS:\n                    _, lcs_dirs = _wlcs(\n                        reference_sentence_tokens, evaluated_sentence_tokens, weight_factor\n                    )\n                else:\n                    _, lcs_dirs = _lcs(reference_sentence_tokens, evaluated_sentence_tokens)\n    
            _mark_lcs(\n                    hit_mask,\n                    lcs_dirs,\n                    len(reference_sentence_tokens),\n                    len(evaluated_sentence_tokens),\n                )\n\n            overlapping_count_length = 0\n            for ref_token_id, val in enumerate(hit_mask):\n                if val == 1:\n                    token = reference_sentence_tokens[ref_token_id]\n                    if evaluated_unigrams_dict[token] > 0 and reference_unigrams_dict[token] > 0:\n                        evaluated_unigrams_dict[token] -= 1\n                        reference_unigrams_dict[ref_token_id] -= 1\n\n                        if use_WLCS:\n                            overlapping_count_length += 1\n                            if (\n                                ref_token_id + 1 < len(hit_mask) and hit_mask[ref_token_id + 1] == 0\n                            ) or ref_token_id + 1 == len(hit_mask):\n                                overlapping_count += overlapping_count_length ** weight_factor\n                                overlapping_count_length = 0\n                        else:\n                            overlapping_count += 1\n\n        if use_WLCS:\n            reference_count = reference_count ** weight_factor\n\n        return evaluated_count, reference_count, overlapping_count\n\n    def _preprocess_summary_as_a_whole(self, summary):\n        \"\"\"\n        Preprocessing (truncate text if enable, tokenization, stemming if enable, lowering)\n        of a summary as a whole\n\n        Args:\n          summary: string of the summary\n\n        Returns:\n          Return the preprocessed summary (string)\n        \"\"\"\n        sentences = self.split_into_sentences(summary)\n\n        # Truncate\n        if self.limit_length:\n            # By words\n            if self.length_limit_type == \"words\":\n                summary = \" \".join(sentences)\n                all_tokens = summary.split()  # Counting as in the perls 
script\n                summary = \" \".join(all_tokens[: self.length_limit])\n\n            # By bytes\n            elif self.length_limit_type == \"bytes\":\n                summary = \"\"\n                current_len = 0\n                for sentence in sentences:\n                    sentence = sentence.strip()\n                    sentence_len = len(sentence)\n\n                    if current_len + sentence_len < self.length_limit:\n                        if current_len != 0:\n                            summary += \" \"\n                        summary += sentence\n                        current_len += sentence_len\n                    else:\n                        if current_len > 0:\n                            summary += \" \"\n                        summary += sentence[: self.length_limit - current_len]\n                        break\n        else:\n            summary = \" \".join(sentences)\n\n        # summary = Rouge.REMOVE_CHAR_PATTERN.sub(' ', summary.lower()).strip()\n        summary = self.remove_char_pattern.sub(\" \", summary.lower()).strip()\n\n        # # Preprocess. 
Hack: because official ROUGE script bring \"cannot\" as \"cannot\" and\n        #   \"can not\" as \"can not\",\n        # # we have to hack nltk tokenizer to not transform \"cannot/can not\" to \"can not\"\n        # if self.ensure_compatibility:\n        #     tokens = self.tokenize_text(Rouge.KEEP_CANNOT_IN_ONE_WORD.sub('_cannot_', summary))\n        # else:\n        #     tokens = self.tokenize_text(Rouge.REMOVE_CHAR_PATTERN.sub(' ', summary))\n\n        # if self.stemming:\n        #     self.stem_tokens(tokens) # stemming in-place\n\n        # if self.ensure_compatibility:\n        #     preprocessed_summary = [Rouge.KEEP_CANNOT_IN_ONE_WORD_REVERSED.sub(\n        #         'cannot', ' '.join(tokens))]\n        # else:\n        #     preprocessed_summary = [' '.join(tokens)]\n\n        # return preprocessed_summary\n\n        tokens = self.tokenize_text(summary)\n        if self.stemming:\n            self.stem_tokens(tokens)  # stemming in-place\n        summary = [\" \".join(tokens)]\n\n        return summary\n\n    def _preprocess_summary_per_sentence(self, summary):\n        \"\"\"\n        Preprocessing (truncate text if enable, tokenization, stemming if enable, lowering)\n        of a summary by sentences\n\n        Args:\n          summary: string of the summary\n\n        Returns:\n          Return the preprocessed summary (string)\n        \"\"\"\n        sentences = self.split_into_sentences(summary)\n\n        # Truncate\n        if self.limit_length:\n            final_sentences = []\n            current_len = 0\n            # By words\n            if self.length_limit_type == \"words\":\n                for sentence in sentences:\n                    tokens = sentence.strip().split()\n                    tokens_len = len(tokens)\n                    if current_len + tokens_len < self.length_limit:\n                        sentence = \" \".join(tokens)\n                        final_sentences.append(sentence)\n                        current_len += 
tokens_len\n                    else:\n                        sentence = \" \".join(tokens[: self.length_limit - current_len])\n                        final_sentences.append(sentence)\n                        break\n            # By bytes\n            elif self.length_limit_type == \"bytes\":\n                for sentence in sentences:\n                    sentence = sentence.strip()\n                    sentence_len = len(sentence)\n                    if current_len + sentence_len < self.length_limit:\n                        final_sentences.append(sentence)\n                        current_len += sentence_len\n                    else:\n                        sentence = sentence[: self.length_limit - current_len]\n                        final_sentences.append(sentence)\n                        break\n            sentences = final_sentences\n\n        final_sentences = []\n        for sentence in sentences:\n            # sentence = Rouge.REMOVE_CHAR_PATTERN.sub(' ', sentence.lower()).strip()\n            sentence = self.remove_char_pattern.sub(\" \", sentence.lower()).strip()\n\n            #     # Preprocess. 
Hack: because official ROUGE script bring \"cannot\" as \"cannot\"\n            #       and \"can not\" as \"can not\",\n            #     # we have to hack nltk tokenizer to not transform \"cannot/can not\" to \"can not\"\n            #     if self.ensure_compatibility:\n            #         tokens = self.tokenize_text(Rouge.KEEP_CANNOT_IN_ONE_WORD.sub(\n            #             '_cannot_', sentence))\n            #     else:\n            #         tokens = self.tokenize_text(Rouge.REMOVE_CHAR_PATTERN.sub(' ', sentence))\n\n            #     if self.stemming:\n            #         self.stem_tokens(tokens) # stemming in-place\n\n            #     if self.ensure_compatibility:\n            #         sentence = Rouge.KEEP_CANNOT_IN_ONE_WORD_REVERSED.sub(\n            #             'cannot', ' '.join(tokens)\n            #         )\n            #     else:\n            #         sentence = ' '.join(tokens)\n\n            tokens = self.tokenize_text(sentence)\n            if self.stemming:\n                self.stem_tokens(tokens)  # stemming in-place\n            sentence = \" \".join(tokens)\n            final_sentences.append(sentence)\n\n        return final_sentences\n"
  },
  {
    "path": "utils_nlp/eval/senteval.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Utilities for evaluating sentence embeddings.\"\"\"\n\n\nclass SentEvalConfig:\n    \"\"\"Object to store static properties of senteval experiments\n\n    Attributes:\n        model_params (dict): model parameters that stay consistent across all runs\n        senteval_params (dict): senteval parameters that stay consistent across all runs\n\n    \"\"\"\n\n    def __init__(self, model_params, senteval_params):\n        \"\"\"Summary\n\n        Args:\n            model_params (dict): model parameters that stay consistent across all runs\n            senteval_params (dict): senteval parameters that stay consistent across all runs\n        \"\"\"\n        self.model_params = model_params\n        self.senteval_params = senteval_params\n\n    @property\n    def model_params(self):\n        return self._model_params\n\n    @model_params.setter\n    def model_params(self, model_params):\n        self._model_params = model_params\n\n    def append_senteval_params(self, params):\n        \"\"\"Util to append any params to senteval_params after initialization\"\"\"\n        self.senteval_params = dict(self.senteval_params, **params)\n\n        classifying_tasks = {\n            \"MR\",\n            \"CR\",\n            \"SUBJ\",\n            \"MPQA\",\n            \"SST2\",\n            \"SST5\",\n            \"TREC\",\n            \"SICKEntailment\",\n            \"SNLI\",\n            \"MRPC\",\n        }\n\n        if any(t in classifying_tasks for t in self.transfer_tasks):\n            try:\n                a = \"classifier\" in self.senteval_params\n                if not a:\n                    raise ValueError(\"Include param['classifier'] to run task {}\".format(t))\n                else:\n                    b = (\n                        set(\"nhid\", \"optim\", \"batch_size\", \"tenacity\", \"epoch_size\")\n                        in 
self.senteval_params[\"classifier\"].keys()\n                    )\n                    if not b:\n                        raise ValueError(\n                            \"Include nhid, optim, batch_size, tenacity, and epoch_size params to \"\n                            \"run task {}\".format(t)\n                        )\n            except ValueError as ve:\n                print(ve)\n"
  },
  {
    "path": "utils_nlp/interpreter/Interpreter.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Utilities that enables you to explain every hidden state in your model\"\"\"\n\nimport torch\nfrom torch import nn\nfrom torch import optim\nfrom tqdm import tqdm\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef calculate_regularization(sampled_x, Phi, reduced_axes=None, device=None):\n    \"\"\" Calculate the variance that is used for Interpreter\n\n    Args:\n        sampled_x (list of torch.FloatTensor):\n            A list of sampled input embeddings $x$, each $x$ is of shape\n            ``[length, dimension]``. All the $x$s can have different length,\n            but should have the same dimension. Sampled number should be\n            higher to get a good estimation.\n        reduced_axes (list of ints, Optional):\n            The axes that is variable in Phi (e.g., the sentence length axis).\n            We will reduce these axes by mean along them.\n\n    Returns:\n        torch.FloatTensor: The regularization term calculated\n\n    \"\"\"\n    sample_num = len(sampled_x)\n    sample_s = []\n    for n in range(sample_num):\n        x = sampled_x[n]\n        if device is not None:\n            x = x.to(device)\n        s = Phi(x)\n        if reduced_axes is not None:\n            for axis in reduced_axes:\n                assert axis < len(s.shape)\n                s = s.mean(dim=axis, keepdim=True)\n        sample_s.append(s.tolist())\n    sample_s = np.array(sample_s)\n    return np.std(sample_s, axis=0)\n\n\nclass Interpreter(nn.Module):\n    \"\"\" Interpreter for interpreting one instance. 
The method is from\n    paper `Towards a Deep and Unified Understanding of Deep Neural\n    Models in NLP <http://proceedings.mlr.press/v97/guan19a/guan19a.pdf>`_\n\n    It will minimize the loss in Eqn.(7):\n\n        $L(sigma) = (||Phi(embed + epsilon) - Phi(embed)||_2^2)\n        // (regularization^2) - rate * log(sigma)$\n\n    In our implementation, we use reparameterization trick to represent\n    epsilon ~ N(0, sigma^2 I), i.e. epsilon = scale * ratio * noise.\n    Where noise ~ N(0, 1), scale is a hyper-parameter that controls the\n    maximum value of sigma^2, and ratio in (0, 1) is the learnable parameter.\n\n    \"\"\"\n\n    def __init__(self, x, Phi, scale=0.5, rate=0.1, regularization=None, words=None):\n        \"\"\" Initialize an interpreter class.\n\n        Args:\n            x (torch.FloatTensor): Of shape ``[length, dimension]``.\n                The $x$ we studied. i.e. The input word embeddings.\n            Phi (function):\n                The $Phi$ we studied. A function whose input is x (the first\n                parameter) and returns a hidden state (of type\n                ``torch.FloatTensor``, of any shape)\n            scale (float):\n                The maximum size of sigma. A hyper-parameter in\n                reparameterization trick. The recommended value is\n                10 * Std[word_embedding_weight], where word_embedding_weight\n                is the word embedding weight in the model interpreted. Larger\n                scale will give more salient result, Default: 0.5.\n            rate (float):\n                A hyper-parameter that balance the MLE Loss and Maximum\n                Entropy Loss. Larger rate will result in larger information\n                loss. Default: 0.1.\n            regularization (Torch.FloatTensor or np.ndarray):\n                The regularization term, should be of the same shape as\n                (or broadcastable to) the output of Phi. 
If None is given,\n                method will use the output to regularize itself.\n                Default: None.\n            words (List[Str]):\n                The input sentence, used for visualizing. If None is given,\n                method will not show the words.\n\n        \"\"\"\n        super(Interpreter, self).__init__()\n        self.s = x.size(0)\n        self.d = x.size(1)\n        self.ratio = nn.Parameter(torch.randn(self.s, 1), requires_grad=True)\n\n        self.scale = scale\n        self.rate = rate\n        self.x = x\n        self.Phi = Phi\n\n        self.regular = regularization\n        if self.regular is not None:\n            self.regular = nn.Parameter(torch.tensor(self.regular).to(x), requires_grad=False)\n        self.words = words\n        if self.words is not None:\n            assert self.s == len(\n                words\n            ), \"the length of x should be of the same with the lengh of words\"\n\n    def forward(self):\n        \"\"\" Calculate loss:\n\n            $L(sigma) = (||Phi(embed + epsilon) - Phi(embed)||_2^2)\n            // (regularization^2) - rate * log(sigma)$\n\n        Returns:\n            torch.FloatTensor: a scalar, the target loss.\n\n        \"\"\"\n        ratios = torch.sigmoid(self.ratio)  # S * 1\n        x = self.x + 0.0  # S * D\n        x_tilde = x + ratios * torch.randn(self.s, self.d).to(x.device) * self.scale  # S * D\n        s = self.Phi(x)  # D or S * D\n        s_tilde = self.Phi(x_tilde)\n        loss = (s_tilde - s) ** 2\n        if self.regular is not None:\n            loss = torch.mean(loss / self.regular ** 2)\n        else:\n            loss = torch.mean(loss) / torch.mean(s ** 2)\n\n        return loss - torch.mean(torch.log(ratios)) * self.rate\n\n    def optimize(self, iteration=5000, lr=0.01, show_progress=False):\n        \"\"\" Optimize the loss function\n\n        Args:\n            iteration (int): Total optimizing iteration\n            lr (float): Learning rate\n        
    show_progress (bool): Whether to show the learn progress\n\n        \"\"\"\n        minLoss = None\n        state_dict = None\n        optimizer = optim.Adam(self.parameters(), lr=lr)\n        self.train()\n        func = (lambda x: x) if not show_progress else tqdm\n        for _ in func(range(iteration)):\n            optimizer.zero_grad()\n            loss = self()\n            loss.backward()\n            optimizer.step()\n            if minLoss is None or minLoss > loss:\n                state_dict = {k: self.state_dict()[k] + 0.0 for k in self.state_dict().keys()}\n                minLoss = loss\n        self.eval()\n        self.load_state_dict(state_dict)\n\n    def get_sigma(self):\n        \"\"\" Calculate and return the sigma\n\n        Returns:\n            np.ndarray: of shape ``[seqLen]``, the ``sigma``.\n\n        \"\"\"\n        ratios = torch.sigmoid(self.ratio)  # S * 1\n        return ratios.detach().cpu().numpy()[:, 0] * self.scale\n\n    def visualize(self):\n        \"\"\" Visualize the information loss of every word.\n        \"\"\"\n        sigma_ = self.get_sigma()\n        _, ax = plt.subplots()\n        im = ax.imshow([sigma_], cmap=\"GnBu_r\")\n        ax.set_xticks(range(self.s))\n        ax.set_xticklabels(self.words)\n        ax.set_yticks([0])\n        ax.set_yticklabels([\"\"])\n        plt.colorbar(im, orientation=\"horizontal\")\n        plt.tight_layout()\n        plt.show()\n"
  },
  {
    "path": "utils_nlp/interpreter/README.md",
    "content": "# Towards a Deep and Unified Understanding of Deep Neural Models in NLP\n\nThis submodule contains a tool for explaining hidden states of models. It is an implementation of the paper [*Towards a Deep and Unified Understanding of Deep Neural Models in NLP*](http://proceedings.mlr.press/v97/guan19a/guan19a.pdf)\n\n\n## How to use\n\nWe provide a notebook tutorial [here](../../examples/interpret_NLP_models/understand_models.ipynb) to help you get started quickly. The main class needed is the `Interpreter` in [Interpreter.py](Interpreter.py). Given any input word embeddings and a forward function $\\Phi$ that transforms the word embeddings $\\bf x$ to a hidden state $\\bf s$, the Interpreter helps understand how much each input word contributes to the hidden state. Suppose the $\\Phi$, the input $\\bf x$ and the input words are defined as:\n```\nimport torch\n\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n\nx = torch.randn(5,256) / 100\nx = x.to(device)\nwords = ['1','2','3','4','5']\n\ndef Phi(x):\n    W = torch.tensor([10., 20., 5., -20., -10.]).to(device)\n    return W @ x\n```\n\nTo explain a certain hidden state, we also need to get its variance for regularization. We provide a simple tool in `Interpreter.py` for calculating regularization. You just need to provide your sampled x as a list and your Phi. 
An example is shown below:\n\n```\nfrom Interpreter import calculate_regularization\n\n# here we sample input x using random for simplicity\nsampled_x = [torch.randn(5,256) / 100 for _ in range(100)]\n\nregularization = calculate_regularization(sampled_x, Phi, device=device)\n```\n\nTo explain this case, we need to initialize an `Interpreter` class, and pass $\\bf x$, regularization and $\\Phi$ to it (we also need to set the hyper-parameter scale to a reasonable value: 10 * Std[embedding] is recommended):\n```\nfrom Interpreter import Interpreter\n\ninterpreter = Interpreter(x=x, Phi=Phi, regularization=regularization, scale=10 * 0.1, words=words).to(device)\n```\nThen, we need the interpreter to optimize itself by minimizing the loss function in the paper.\n```\ninterpreter.optimize(iteration=5000, lr=0.5, show_progress=True)\n```\nAfter optimizing, we can get the best sigma:\n```\ninterpreter.get_sigma()\n```\nThe result will be something like:\n```\narray([0.00315634, 0.00181308, 0.00633237, 0.00174878, 0.0030807 ], dtype=float32)\n```\nEach sigma indicates how much the corresponding input word can change without changing the hidden state too much. The smaller the sigma is, the more this input word contributes to the hidden state.\n\nNow, we can get the explanation by calling the visualize function:\n```\ninterpreter.visualize()\n```\nThen, we can get the results below:\n\n![](https://nlpbp.blob.core.windows.net/images/result.png)\n\nwhich means that the second and fourth words are most important to $\\Phi$, which is reasonable because their weights have the largest magnitudes.\n\n## Explain a certain layer in any saved PyTorch model\n\nWe provide an example on how to use our method to explain a saved PyTorch model (*pre-trained BERT model in our case*) [here](../../examples/interpret_NLP_models/understand_models.ipynb). 
\n> NOTE: This result may not be consistent with the result in the paper because we use the pre-trained BERT model directly for simplicity, while the BERT model we use in paper is fine-tuned on a specific dataset like SST-2.\n"
  },
  {
    "path": "utils_nlp/interpreter/__init__.py",
    "content": ""
  },
  {
    "path": "utils_nlp/language_utils/hi/hindi_stemmer.py",
    "content": "#! /usr/bin/env python3.1\n# Script was downloaded from https://research.variancia.com/hindi_stemmer/\n\"\"\" Lightweight Hindi stemmer\nCopyright © 2010 Luís Gomes <luismsgomes@gmail.com>.\n\nImplementation of algorithm described in\n\n    A Lightweight Stemmer for Hindi\n    Ananthakrishnan Ramanathan and Durgesh D Rao\n    http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf\n\n    @conference{ramanathan2003lightweight,\n      title={{A lightweight stemmer for Hindi}},\n      author={Ramanathan, A. and Rao, D.},\n      booktitle={Workshop on Computational Linguistics for South-Asian Languages, EACL},\n      year={2003}\n    }\n\nPorted from HindiStemmer.java, part of of Lucene.\n\"\"\"\n\nsuffixes = {\n    1: [\"ो\", \"े\", \"ू\", \"ु\", \"ी\", \"ि\", \"ा\"],\n    2: [\n        \"कर\",\n        \"ाओ\",\n        \"िए\",\n        \"ाई\",\n        \"ाए\",\n        \"ने\",\n        \"नी\",\n        \"ना\",\n        \"ते\",\n        \"ीं\",\n        \"ती\",\n        \"ता\",\n        \"ाँ\",\n        \"ां\",\n        \"ों\",\n        \"ें\",\n    ],\n    3: [\n        \"ाकर\",\n        \"ाइए\",\n        \"ाईं\",\n        \"ाया\",\n        \"ेगी\",\n        \"ेगा\",\n        \"ोगी\",\n        \"ोगे\",\n        \"ाने\",\n        \"ाना\",\n        \"ाते\",\n        \"ाती\",\n        \"ाता\",\n        \"तीं\",\n        \"ाओं\",\n        \"ाएं\",\n        \"ुओं\",\n        \"ुएं\",\n        \"ुआं\",\n    ],\n    4: [\n        \"ाएगी\",\n        \"ाएगा\",\n        \"ाओगी\",\n        \"ाओगे\",\n        \"एंगी\",\n        \"ेंगी\",\n        \"एंगे\",\n        \"ेंगे\",\n        \"ूंगी\",\n        \"ूंगा\",\n        \"ातीं\",\n        \"नाओं\",\n        \"नाएं\",\n        \"ताओं\",\n        \"ताएं\",\n        \"ियाँ\",\n        \"ियों\",\n        \"ियां\",\n    ],\n    5: [\"ाएंगी\", \"ाएंगे\", \"ाऊंगी\", \"ाऊंगा\", \"ाइयाँ\", \"ाइयों\", \"ाइयां\"],\n}\n\n\ndef hi_stem(word):\n    for L in 5, 4, 3, 2, 1:\n        if len(word) > L + 1:\n      
      for suf in suffixes[L]:\n                if word.endswith(suf):\n                    return word[:-L]\n    return word\n\n\nif __name__ == \"__main__\":\n    import sys\n\n    if len(sys.argv) != 1:\n        sys.exit(\"{} takes no arguments\".format(sys.argv[0]))\n    for line in sys.stdin:\n        print(*[hi_stem(word) for word in line.split()])\n"
  },
  {
    "path": "utils_nlp/models/README.md",
    "content": "# Models\nThe models submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new natural language processing systems. A description of which algorithms are used in each scenario can be found in [this table](../../README.md#content).\n\n## Summary\n\nThe following table summarizes each submodule.\n\n|Submodule|Description|\n|---|---|\n|[bert](./bert/README.md)| This submodule includes the BERT-based models for sequence classification, token classification, and sequence encoding.|\n|[gensen](./gensen/README.md)| This submodule includes a distributed PyTorch implementation based on [Horovod](https://github.com/horovod/horovod) of [learning general purpose distributed sentence representations via large scale multi-task learning](https://arxiv.org/abs/1804.00079) by refactoring https://github.com/Maluuba/gensen|\n|[pretrained embeddings](./pretrained_embeddings) | This submodule provides utilities to download and extract pretrained word embeddings trained with Word2Vec, GloVe, and fastText methods.|\n|[pytorch_modules](./pytorch_modules/README.md)| This submodule provides PyTorch modules such as a Gated Recurrent Unit with peepholes. |\n|[xlnet](./xlnet/README.md)| This submodule includes the XLNet-based model for sequence classification.|\n"
  },
  {
    "path": "utils_nlp/models/bert/README.md",
    "content": "# BERT-based Classes\n\nThis folder contains utility functions and classes based on the implementation of [Transformers](https://github.com/huggingface/transformers). \n\n## Summary\n\nThe following table summarizes each Python script.\n\n|Script|Description|\n|---|---|\n|[common.py](common.py)| This script includes <ul><li>the languages supported by BERT-based classes</li><li> tokenization for text classification, named entity recognition, and encoding</li> <li>utilities to load data, etc.</li></ul>|\n|[sequence_classification.py](sequence_classification.py)| An implementation of sequence classification based on fine-tuning BERT. It is commonly used for text classification.|\n|[sequence_classification_distributed.py](sequence_classification_distributed.py) | A distributed implementation of sequence classification based on fine-tuning BERT. [Horovod](https://github.com/horovod/horovod) is the underlying distributed training framework.|\n|[sequence_encoding.py](sequence_encoding.py)| An implementation of sequence encoding based on BERT. Both pretrained and fine-tuned BERT models can be used. The hidden states from the loaded BERT model for the input sequence are used in the computation of the encoding. It provides mean, max and class pooling strategies. It is commonly used in upstream tasks for sentence similarity. |\n|[token_classification.py](token_classification.py) |  An implementation of token classification based on fine-tuning BERT. It is commonly used for named entity recognition. |\n"
  },
  {
    "path": "utils_nlp/models/bert/__init__.py",
    "content": ""
  },
  {
    "path": "utils_nlp/models/bert/common.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\n# This script reuses some code from\n# https://github.com/huggingface/transformers/blob/master/examples\n# /run_glue.py\n\nimport csv\nimport linecache\nimport subprocess\nimport warnings\nfrom collections.abc import Iterable\nfrom enum import Enum\n\nimport torch\nfrom pytorch_pretrained_bert.tokenization import BertTokenizer\nfrom torch.utils.data import (\n    DataLoader,\n    Dataset,\n    RandomSampler,\n    SequentialSampler,\n    TensorDataset,\n    ConcatDataset,\n)\nfrom tqdm import tqdm\n\n# Max supported sequence length\nBERT_MAX_LEN = 512\n\n\nclass Language(str, Enum):\n    \"\"\"An enumeration of the supported pretrained models and languages.\"\"\"\n\n    ENGLISH: str = \"bert-base-uncased\"\n    ENGLISHCASED: str = \"bert-base-cased\"\n    ENGLISHLARGE: str = \"bert-large-uncased\"\n    ENGLISHLARGECASED: str = \"bert-large-cased\"\n    ENGLISHLARGEWWM: str = \"bert-large-uncased-whole-word-masking\"\n    ENGLISHLARGECASEDWWM: str = \"bert-large-cased-whole-word-masking\"\n    CHINESE: str = \"bert-base-chinese\"\n    MULTILINGUAL: str = \"bert-base-multilingual-cased\"\n\n\nclass Tokenizer:\n    def __init__(self, language=Language.ENGLISH, to_lower=False, cache_dir=\".\"):\n        \"\"\"Initializes the underlying pretrained BERT tokenizer.\n\n        Args:\n            language (Language, optional): The pretrained model's language.\n                                           Defaults to Language.ENGLISH.\n            cache_dir (str, optional): Location of BERT's cache directory.\n                Defaults to \".\".\n        \"\"\"\n        self.tokenizer = BertTokenizer.from_pretrained(\n            language, do_lower_case=to_lower, cache_dir=cache_dir\n        )\n        self.language = language\n\n    def tokenize(self, text):\n        \"\"\"Tokenizes a list of documents using a BERT tokenizer\n\n        Args:\n            text 
(list): List of strings (one sequence) or\n                tuples (two sequences).\n\n        Returns:\n            [list]: List of lists. Each sublist contains WordPiece tokens\n                of the input sequence(s).\n        \"\"\"\n        if isinstance(text[0], str):\n            return [self.tokenizer.tokenize(x) for x in tqdm(text)]\n        else:\n            return [[self.tokenizer.tokenize(x) for x in sentences] for sentences in tqdm(text)]\n\n    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):\n        \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n        # This is a simple heuristic which will always truncate the longer\n        # sequence one token at a time. This makes more sense than\n        # truncating an equal percent of tokens from each, since if one\n        # sequence is very short then each token that's truncated likely\n        # contains more information than a longer sequence.\n\n        if not tokens_b:\n            max_length += 1\n\n        while True:\n            total_length = len(tokens_a) + len(tokens_b)\n            if total_length <= max_length:\n                break\n            if len(tokens_a) > len(tokens_b):\n                tokens_a.pop()\n            else:\n                tokens_b.pop()\n\n        tokens_a.append(\"[SEP]\")\n\n        if tokens_b:\n            tokens_b.append(\"[SEP]\")\n\n        return [tokens_a, tokens_b]\n\n    def preprocess_classification_tokens(self, tokens, max_len=BERT_MAX_LEN):\n        \"\"\"Preprocessing of input tokens:\n            - add BERT sentence markers ([CLS] and [SEP])\n            - map tokens to token indices in the BERT vocabulary\n            - pad and truncate sequences\n            - create an input_mask\n            - create token type ids, aka. 
segment ids\n\n        Args:\n            tokens (list): List of token lists to preprocess.\n            max_len (int, optional): Maximum number of tokens\n                            (documents will be truncated or padded).\n                            Defaults to 512.\n        Returns:\n            tuple: A tuple containing the following three lists\n                list of preprocesssed token lists\n                list of input mask lists\n                list of token type id lists\n        \"\"\"\n        if max_len > BERT_MAX_LEN:\n            print(\"setting max_len to max allowed tokens: {}\".format(BERT_MAX_LEN))\n            max_len = BERT_MAX_LEN\n\n        if isinstance(tokens[0][0], str):\n            tokens = [x[0 : max_len - 2] + [\"[SEP]\"] for x in tokens]\n            token_type_ids = None\n        else:\n            # get tokens for each sentence [[t00, t01, ...] [t10, t11,... ]]\n            tokens = [\n                self._truncate_seq_pair(sentence[0], sentence[1], max_len - 3)\n                for sentence in tokens\n            ]\n\n            # construct token_type_ids\n            # [[0, 0, 0, 0, ... 0, 1, 1, 1, ... 
1], [0, 0, 0, ..., 1, 1, ]\n            token_type_ids = [\n                [[i] * len(sentence) for i, sentence in enumerate(example)] for example in tokens\n            ]\n            # merge sentences\n            tokens = [[token for sentence in example for token in sentence] for example in tokens]\n            # prefix with [0] for [CLS]\n            token_type_ids = [\n                [0] + [i for sentence in example for i in sentence] for example in token_type_ids\n            ]\n            # pad sequence\n            token_type_ids = [x + [0] * (max_len - len(x)) for x in token_type_ids]\n\n        tokens = [[\"[CLS]\"] + x for x in tokens]\n        # convert tokens to indices\n        tokens = [self.tokenizer.convert_tokens_to_ids(x) for x in tokens]\n        # pad sequence\n        tokens = [x + [0] * (max_len - len(x)) for x in tokens]\n        # create input mask\n        input_mask = [[min(1, x) for x in y] for y in tokens]\n        return tokens, input_mask, token_type_ids\n\n    def preprocess_encoder_tokens(self, tokens, max_len=BERT_MAX_LEN):\n        \"\"\"Preprocessing of input tokens:\n            - add BERT sentence markers ([CLS] and [SEP])\n            - map tokens to token indices in the BERT vocabulary\n            - pad and truncate sequences\n            - create an input_mask\n            - create token type ids, aka. 
segment ids\n\n        Args:\n            tokens (list): List of token lists to preprocess.\n            max_len (int, optional): Maximum number of tokens\n                            (documents will be truncated or padded).\n                            Defaults to 512.\n        Returns:\n            tuple: A tuple containing the following four lists\n                list of preprocesssed token lists\n                list of input id lists\n                list of input mask lists\n                list of token type id lists\n        \"\"\"\n        if max_len > BERT_MAX_LEN:\n            print(\"setting max_len to max allowed tokens: {}\".format(BERT_MAX_LEN))\n            max_len = BERT_MAX_LEN\n\n        if isinstance(tokens[0][0], str):\n            tokens = [x[0 : max_len - 2] + [\"[SEP]\"] for x in tokens]\n            token_type_ids = None\n        else:\n            # get tokens for each sentence [[t00, t01, ...] [t10, t11,... ]]\n            tokens = [\n                self._truncate_seq_pair(sentence[0], sentence[1], max_len - 3)\n                for sentence in tokens\n            ]\n\n            # construct token_type_ids\n            # [[0, 0, 0, 0, ... 0, 1, 1, 1, ... 
1], [0, 0, 0, ..., 1, 1, ]\n            token_type_ids = [\n                [[i] * len(sentence) for i, sentence in enumerate(example)] for example in tokens\n            ]\n            # merge sentences\n            tokens = [[token for sentence in example for token in sentence] for example in tokens]\n            # prefix with [0] for [CLS]\n            token_type_ids = [\n                [0] + [i for sentence in example for i in sentence] for example in token_type_ids\n            ]\n            # pad sequence\n            token_type_ids = [x + [0] * (max_len - len(x)) for x in token_type_ids]\n\n        tokens = [[\"[CLS]\"] + x for x in tokens]\n        # convert tokens to indices\n        input_ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokens]\n        # pad sequence\n        input_ids = [x + [0] * (max_len - len(x)) for x in input_ids]\n        # create input mask\n        input_mask = [[min(1, x) for x in y] for y in input_ids]\n        return tokens, input_ids, input_mask, token_type_ids\n\n    def tokenize_ner(\n        self, text, max_len=BERT_MAX_LEN, labels=None, label_map=None, trailing_piece_tag=\"X\"\n    ):\n        \"\"\"\n        Tokenize and preprocesses input word lists, involving the following steps\n            0. WordPiece tokenization.\n            1. Convert string tokens to token ids.\n            2. Convert input labels to label ids, if labels and label_map are\n                provided.\n            3. If a word is tokenized into multiple pieces of tokens by the\n                WordPiece tokenizer, label the extra tokens with\n                trailing_piece_tag.\n            4. Pad or truncate input text according to max_seq_length\n            5. Create input_mask for masking out padded tokens.\n\n        Args:\n            text (list): List of lists. Each sublist is a list of words in an\n                input sentence.\n            max_len (int, optional): Maximum length of the list of\n                tokens. 
Lists longer than this are truncated and shorter\n                ones are padded with \"O\"s. Default value is BERT_MAX_LEN=512.\n            labels (list, optional): List of word label lists. Each sublist\n                contains labels corresponding to the input word list. The lengths\n                of the label list and word list must be the same. Default\n                value is None.\n            label_map (dict, optional): Dictionary for mapping original token\n                labels (which may be string type) to integers. Default value\n                is None.\n            trailing_piece_tag (str, optional): Tag used to label trailing\n                word pieces. For example, \"criticize\" is broken into \"critic\"\n                and \"##ize\", \"critic\" preserves its original label and \"##ize\"\n                is labeled as trailing_piece_tag. Default value is \"X\".\n\n        Returns:\n            tuple: A tuple containing the following four lists.\n                1. input_ids_all: List of lists. Each sublist contains\n                    numerical values, i.e. token ids, corresponding to the\n                    tokens in the input text data.\n                2. input_mask_all: List of lists. Each sublist\n                    contains the attention mask of the input token id list,\n                    1 for input tokens and 0 for padded tokens, so that\n                    padded tokens are not attended to.\n                3. trailing_token_mask: List of lists. Each sublist is\n                    a boolean list, True for the first word piece of each\n                    original word, False for the trailing word pieces,\n                    e.g. \"##ize\". This mask is useful for removing the\n                    predictions on trailing word pieces, so that each\n                    original word in the input text has a unique predicted\n                    label.\n                4. 
label_ids_all: List of lists of numerical labels,\n                    each sublist contains token labels of a input\n                    sentence/paragraph, if labels is provided. If the `labels`\n                    argument is not provided, the value of this is None.\n        \"\"\"\n\n        def _is_iterable_but_not_string(obj):\n            return isinstance(obj, Iterable) and not isinstance(obj, str)\n\n        if max_len > BERT_MAX_LEN:\n            warnings.warn(\"setting max_len to max allowed tokens: {}\".format(BERT_MAX_LEN))\n            max_len = BERT_MAX_LEN\n\n        if not _is_iterable_but_not_string(text):\n            # The input text must be an non-string Iterable\n            raise ValueError(\"Input text must be an iterable and not a string.\")\n        else:\n            # If the input text is a single list of words, convert it to\n            # list of lists for later iteration\n            if not _is_iterable_but_not_string(text[0]):\n                text = [text]\n        if labels is not None:\n            if not _is_iterable_but_not_string(labels):\n                raise ValueError(\"labels must be an iterable and not a string.\")\n            else:\n                if not _is_iterable_but_not_string(labels[0]):\n                    labels = [labels]\n\n        label_available = True\n        if labels is None:\n            label_available = False\n            # create an artificial label list for creating trailing token mask\n            labels = [[\"O\"] * len(t) for t in text]\n\n        input_ids_all = []\n        input_mask_all = []\n        label_ids_all = []\n        trailing_token_mask_all = []\n        for t, t_labels in zip(text, labels):\n\n            if len(t) != len(t_labels):\n                raise ValueError(\n                    \"The number of words is {0}, but the number of labels is {1}.\".format(\n                        len(t), len(t_labels)\n                    )\n                )\n\n            new_labels = []\n 
           new_tokens = []\n            if label_available:\n                for word, tag in zip(t, t_labels):\n                    sub_words = self.tokenizer.tokenize(word)\n                    for count, sub_word in enumerate(sub_words):\n                        if count > 0:\n                            tag = trailing_piece_tag\n                        new_labels.append(tag)\n                        new_tokens.append(sub_word)\n            else:\n                for word in t:\n                    sub_words = self.tokenizer.tokenize(word)\n                    for count, sub_word in enumerate(sub_words):\n                        if count > 0:\n                            tag = trailing_piece_tag\n                        else:\n                            tag = \"O\"\n                        new_labels.append(tag)\n                        new_tokens.append(sub_word)\n\n            if len(new_tokens) > max_len:\n                new_tokens = new_tokens[:max_len]\n                new_labels = new_labels[:max_len]\n            input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens)\n\n            # The mask has 1 for real tokens and 0 for padding tokens.\n            # Only real tokens are attended to.\n            input_mask = [1.0] * len(input_ids)\n\n            # Zero-pad up to the max sequence length.\n            padding = [0.0] * (max_len - len(input_ids))\n            label_padding = [\"O\"] * (max_len - len(input_ids))\n\n            input_ids += padding\n            input_mask += padding\n            new_labels += label_padding\n\n            trailing_token_mask_all.append(\n                [True if label != trailing_piece_tag else False for label in new_labels]\n            )\n\n            if label_map:\n                label_ids = [label_map[label] for label in new_labels]\n            else:\n                label_ids = new_labels\n\n            input_ids_all.append(input_ids)\n            input_mask_all.append(input_mask)\n            
label_ids_all.append(label_ids)\n\n        if label_available:\n            return (input_ids_all, input_mask_all, trailing_token_mask_all, label_ids_all)\n        else:\n            return input_ids_all, input_mask_all, trailing_token_mask_all, None\n\n\ndef create_data_loader(\n    input_ids, input_mask, label_ids=None, sample_method=\"random\", batch_size=32\n):\n    \"\"\"\n    Create a dataloader for sampling and serving data batches.\n\n    Args:\n        input_ids (list): List of lists. Each sublist contains numerical\n            values, i.e. token ids, corresponding to the tokens in the input\n            text data.\n        input_mask (list): List of lists. Each sublist contains the attention\n            mask of the input token id list, 1 for input tokens and 0 for\n            padded tokens, so that padded tokens are not attended to.\n        label_ids (list, optional): List of lists of numerical labels,\n            each sublist contains token labels of a input\n            sentence/paragraph. Default value is None.\n        sample_method (str, optional): Order of data sampling. Accepted\n            values are \"random\", \"sequential\". Default value is \"random\".\n        batch_size (int, optional): Number of samples used in each training\n            iteration. 
Default value is 32.\n\n    Returns:\n        DataLoader: A Pytorch Dataloader containing the input_ids tensor,\n            input_mask tensor, and label_ids (if provided) tensor.\n\n    \"\"\"\n    input_ids_tensor = torch.tensor(input_ids, dtype=torch.long)\n    input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)\n\n    if label_ids:\n        label_ids_tensor = torch.tensor(label_ids, dtype=torch.long)\n        tensor_data = TensorDataset(input_ids_tensor, input_mask_tensor, label_ids_tensor)\n    else:\n        tensor_data = TensorDataset(input_ids_tensor, input_mask_tensor)\n\n    if sample_method == \"random\":\n        sampler = RandomSampler(tensor_data)\n    elif sample_method == \"sequential\":\n        sampler = SequentialSampler(tensor_data)\n    else:\n        raise ValueError(\n            \"Invalid sample_method value, accepted values are: \" \"random and sequential.\"\n        )\n\n    dataloader = DataLoader(tensor_data, sampler=sampler, batch_size=batch_size)\n\n    return dataloader\n\n\nclass TextDataset(Dataset):\n    \"\"\"\n    Characterizes a dataset for PyTorch which can be used to load a file containing multiple rows\n    where each row is a training example. The format of each line in the file is assumed to be\n    tokens, mask and label.\n    \"\"\"\n\n    def __init__(self, filename):\n        \"\"\"\n        Initialization. We set the filename and number of lines in the file.\n        Args:\n            filename(str): Name of the file.\n        \"\"\"\n        self._filename = filename\n        self._total_data = (\n            int(subprocess.check_output(\"wc -l \" + filename, shell=True).split()[0]) - 1\n        )\n\n    def __len__(self):\n        \"\"\"Denotes the total number of samples in the file.\"\"\"\n        return self._total_data\n\n    @staticmethod\n    def _cast(row):\n        return [int(x.strip()) for x in row]\n\n    def __getitem__(self, index):\n        \"\"\"\n        Generates one sample of data. 
We assume that the last column is label here. We use\n        linecache to load files lazily.\n\n        Args:\n            index(int): Index of the test case.\n\n        Returns(list, list, int): Returns the tokens, mask and label for a single item.\n\n        \"\"\"\n        line = linecache.getline(self._filename, index + 1)\n        row = next(csv.reader([line]))\n\n        tokens = self._cast(row[0][1:-1].split(\",\"))\n        mask = self._cast(row[1][1:-1].split(\",\"))\n\n        data = {\n            \"token_ids\": torch.tensor(tokens, dtype=torch.long),\n            \"input_mask\": torch.tensor(mask, dtype=torch.long),\n            \"labels\": torch.tensor(int(row[2]), dtype=torch.long),\n        }\n\n        return data\n\n\ndef get_dataset_multiple_files(files):\n    \"\"\" Get dataset from multiple files\n\n    Args:\n        files(list): List of paths to the files.\n\n    Returns:\n\n        torch.utils.data.Dataset : A combined dataset of all files in the directory.\n\n    \"\"\"\n    datasets = [TextDataset(x) for x in files]\n    return ConcatDataset(datasets)\n"
  },
  {
    "path": "utils_nlp/models/bert/sequence_classification.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\nfrom collections import namedtuple\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset\nfrom pytorch_pretrained_bert.modeling import BertForSequenceClassification\nfrom pytorch_pretrained_bert.optimization import BertAdam\nfrom tqdm import tqdm\n\nfrom utils_nlp.models.bert.common import Language\nfrom utils_nlp.common.pytorch_utils import (\n    get_device,\n    parallelize_model,\n    move_model_to_device,\n)\n\nfrom cached_property import cached_property\n\n\nclass BERTSequenceClassifier:\n    \"\"\"BERT-based sequence classifier\"\"\"\n\n    def __init__(self, language=Language.ENGLISH, num_labels=2, cache_dir=\".\"):\n        \"\"\"Initializes the classifier and the underlying pretrained model.\n\n        Args:\n            language (Language, optional): The pretrained model's language.\n                                           Defaults to Language.ENGLISH.\n            num_labels (int, optional): The number of unique labels in the\n                training data. 
Defaults to 2.\n            cache_dir (str, optional): Location of BERT's cache directory.\n                Defaults to \".\".\n        \"\"\"\n        if num_labels < 2:\n            raise ValueError(\"Number of labels should be at least 2.\")\n\n        self.language = language\n        self.num_labels = num_labels\n        self.cache_dir = cache_dir\n\n        # create classifier\n        self.model = BertForSequenceClassification.from_pretrained(\n            language, cache_dir=cache_dir, num_labels=num_labels\n        )\n        self.has_cuda = self.cuda\n\n    @cached_property\n    def cuda(self):\n        \"\"\" cache the output of torch.cuda.is_available() \"\"\"\n\n        self.has_cuda = torch.cuda.is_available()\n        return self.has_cuda\n\n    def fit(\n        self,\n        token_ids,\n        input_mask,\n        labels,\n        token_type_ids=None,\n        num_gpus=None,\n        num_epochs=1,\n        batch_size=32,\n        lr=2e-5,\n        warmup_proportion=None,\n        verbose=True,\n    ):\n        \"\"\"Fine-tunes the BERT classifier using the given training data.\n\n        Args:\n            token_ids (list): List of training token id lists.\n            input_mask (list): List of input mask lists.\n            labels (list): List of training labels.\n            token_type_ids (list, optional): List of lists. Each sublist\n                contains segment ids indicating if the token belongs to\n                the first sentence(0) or second sentence(1). Only needed\n                for two-sentence tasks.\n            num_gpus (int, optional): The number of gpus to use.\n                                      If None is specified, all available GPUs\n                                      will be used. Defaults to None.\n            num_epochs (int, optional): Number of training epochs.\n                Defaults to 1.\n            batch_size (int, optional): Training batch size. 
Defaults to 32.\n            lr (float): Learning rate of the Adam optimizer. Defaults to 2e-5.\n            warmup_proportion (float, optional): Proportion of training to\n                perform linear learning rate warmup for. E.g., 0.1 = 10% of\n                training. Defaults to None.\n            verbose (bool, optional): If True, shows the training progress and\n                loss values. Defaults to True.\n        \"\"\"\n\n        device, num_gpus = get_device(num_gpus)\n\n        self.model = move_model_to_device(self.model, device)\n        self.model = parallelize_model(self.model, device, num_gpus=num_gpus)\n\n        token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)\n        input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)\n        labels_tensor = torch.tensor(labels, dtype=torch.long)\n\n        if token_type_ids:\n            token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)\n            train_dataset = TensorDataset(\n                token_ids_tensor,\n                input_mask_tensor,\n                token_type_ids_tensor,\n                labels_tensor,\n            )\n        else:\n            train_dataset = TensorDataset(\n                token_ids_tensor, input_mask_tensor, labels_tensor\n            )\n        train_sampler = RandomSampler(train_dataset)\n\n        train_dataloader = DataLoader(\n            train_dataset, sampler=train_sampler, batch_size=batch_size\n        )\n        # define optimizer and model parameters\n        param_optimizer = list(self.model.named_parameters())\n        no_decay = [\"bias\", \"LayerNorm.bias\", \"LayerNorm.weight\"]\n        optimizer_grouped_parameters = [\n            {\n                \"params\": [\n                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)\n                ],\n                \"weight_decay\": 0.01,\n            },\n            {\n                \"params\": [\n                    p for n, p in 
param_optimizer if any(nd in n for nd in no_decay)\n                ],\n                \"weight_decay\": 0.0,\n            },\n        ]\n\n        num_batches = len(train_dataloader)\n        num_train_optimization_steps = num_batches * num_epochs\n\n        if warmup_proportion is None:\n            opt = BertAdam(optimizer_grouped_parameters, lr=lr)\n        else:\n            opt = BertAdam(\n                optimizer_grouped_parameters,\n                lr=lr,\n                t_total=num_train_optimization_steps,\n                warmup=warmup_proportion,\n            )\n\n        # define loss function\n        loss_func = nn.CrossEntropyLoss().to(device)\n\n        # train\n        self.model.train()  # training mode\n\n        for epoch in range(num_epochs):\n            training_loss = 0\n            for i, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")):\n                if token_type_ids:\n                    x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(\n                        t.to(device) for t in batch\n                    )\n                else:\n                    token_type_ids_batch = None\n                    x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch)\n\n                opt.zero_grad()\n\n                y_h = self.model(\n                    input_ids=x_batch,\n                    token_type_ids=token_type_ids_batch,\n                    attention_mask=mask_batch,\n                    labels=None,\n                )\n                loss = loss_func(y_h, y_batch).mean()\n\n                training_loss += loss.item()\n\n                loss.backward()\n                opt.step()\n                if verbose:\n                    if i % ((num_batches // 10) + 1) == 0:\n                        print(\n                            \"epoch:{}/{}; batch:{}->{}/{}; avg loss:{:.6f}\".format(\n                                epoch + 1,\n                                num_epochs,\n                   
             i + 1,\n                                min(i + 1 + num_batches // 10, num_batches),\n                                num_batches,\n                                training_loss / (i + 1),\n                            )\n                        )\n        # empty cache\n        del [x_batch, y_batch, mask_batch, token_type_ids_batch]\n        torch.cuda.empty_cache()\n\n    def predict(\n        self,\n        token_ids,\n        input_mask,\n        token_type_ids=None,\n        num_gpus=None,\n        batch_size=32,\n        probabilities=False,\n    ):\n        \"\"\"Scores the given dataset and returns the predicted classes.\n\n        Args:\n            token_ids (list): List of training token lists.\n            input_mask (list): List of input mask lists.\n            token_type_ids (list, optional): List of lists. Each sublist\n                contains segment ids indicating if the token belongs to\n                the first sentence(0) or second sentence(1). Only needed\n                for two-sentence tasks.\n            num_gpus (int, optional): The number of gpus to use.\n                                      If None is specified, all available GPUs\n                                      will be used. Defaults to None.\n            batch_size (int, optional): Scoring batch size. Defaults to 32.\n            probabilities (bool, optional):\n                If True, the predicted probability distribution\n                is also returned. 
Defaults to False.\n        Returns:\n            1darray, namedtuple(1darray, ndarray): Predicted classes or\n                (classes, probabilities) if probabilities is True.\n        \"\"\"\n        device, num_gpus = get_device(num_gpus)\n        self.model = move_model_to_device(self.model, device)\n        self.model = parallelize_model(self.model, device, num_gpus)\n\n        # score\n        self.model.eval()\n\n        token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)\n        input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)\n\n        if token_type_ids:\n            token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)\n            test_dataset = TensorDataset(\n                token_ids_tensor, input_mask_tensor, token_type_ids_tensor\n            )\n        else:\n            test_dataset = TensorDataset(token_ids_tensor, input_mask_tensor)\n\n        test_sampler = SequentialSampler(test_dataset)\n        test_dataloader = DataLoader(\n            test_dataset, sampler=test_sampler, batch_size=batch_size\n        )\n\n        preds = []\n        for i, batch in enumerate(tqdm(test_dataloader, desc=\"Iteration\")):\n            if token_type_ids:\n                x_batch, mask_batch, token_type_ids_batch = tuple(\n                    t.to(device) for t in batch\n                )\n            else:\n                token_type_ids_batch = None\n                x_batch, mask_batch = tuple(t.to(device) for t in batch)\n\n            with torch.no_grad():\n                p_batch = self.model(\n                    input_ids=x_batch,\n                    token_type_ids=token_type_ids_batch,\n                    attention_mask=mask_batch,\n                    labels=None,\n                )\n            preds.append(p_batch.cpu())\n\n        preds = np.concatenate(preds)\n\n        if probabilities:\n            return namedtuple(\"Predictions\", \"classes probabilities\")(\n                preds.argmax(axis=1), 
nn.Softmax(dim=1)(torch.Tensor(preds)).numpy()\n            )\n        else:\n            return preds.argmax(axis=1)\n"
  },
  {
    "path": "utils_nlp/models/bert/sequence_classification_distributed.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# This script reuses some code from\n# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py\n\nimport os\nimport warnings\n\nimport numpy as np\nimport torch.nn as nn\nimport torch.utils.data\nfrom pytorch_pretrained_bert.modeling import BertForSequenceClassification\nfrom pytorch_pretrained_bert.optimization import BertAdam\nfrom tqdm import tqdm\n\nfrom utils_nlp.common.pytorch_utils import (\n    get_device,\n    move_model_to_device,\n    parallelize_model,\n)\nfrom utils_nlp.models.bert.common import Language\n\ntry:\n    import horovod.torch as hvd\nexcept ImportError:\n    raise warnings.warn(\"No Horovod found! Can't do distributed training..\")\n\n\nclass BERTSequenceClassifier:\n    \"\"\"BERT-based sequence classifier\"\"\"\n\n    def __init__(\n        self,\n        language=Language.ENGLISH,\n        num_labels=2,\n        cache_dir=\".\",\n        use_distributed=False,\n    ):\n\n        \"\"\"\n\n        Args:\n            language: Language passed to pre-trained BERT model to pick the appropriate\n                model\n            num_labels: number of unique labels in train dataset\n            cache_dir: cache_dir to load pre-trained BERT model. 
Defaults to \".\"\n        \"\"\"\n        if num_labels < 2:\n            raise ValueError(\"Number of labels should be at least 2.\")\n\n        self.language = language\n        self.num_labels = num_labels\n        self.cache_dir = cache_dir\n        self.use_distributed = use_distributed\n\n        # create classifier\n        self.model = BertForSequenceClassification.from_pretrained(\n            language.value, cache_dir=cache_dir, num_labels=num_labels\n        )\n\n        # define optimizer and model parameters\n        param_optimizer = list(self.model.named_parameters())\n        no_decay = [\"bias\", \"LayerNorm.bias\", \"LayerNorm.weight\"]\n        optimizer_grouped_parameters = [\n            {\n                \"params\": [\n                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)\n                ],\n                \"weight_decay\": 0.01,\n            },\n            {\n                \"params\": [\n                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)\n                ]\n            },\n        ]\n        self.optimizer_params = optimizer_grouped_parameters\n        self.name_parameters = self.model.named_parameters()\n        self.state_dict = self.model.state_dict()\n\n        if use_distributed:\n            hvd.init()\n            if torch.cuda.is_available():\n                torch.cuda.set_device(hvd.local_rank())\n            else:\n                warnings.warn(\"No GPU available! Using CPU.\")\n\n    def create_optimizer(\n        self,\n        num_train_optimization_steps,\n        lr=2e-5,\n        fp16_allreduce=False,\n        warmup_proportion=None,\n    ):\n\n        \"\"\"\n        Method to create an BERT Optimizer based on the inputs from the user.\n\n        Args:\n            num_train_optimization_steps(int): Number of optimization steps.\n            lr (float): learning rate of the adam optimizer. 
defaults to 2e-5.\n            warmup_proportion (float, optional): proportion of training to\n                perform linear learning rate warmup for. e.g., 0.1 = 10% of\n                training. defaults to none.\n            fp16_allreduce(bool, optional)L if true, use fp16 compression\n                during allreduce.\n\n        Returns:\n            pytorch_pretrained_bert.optimization.BertAdam  : A BertAdam optimizer with\n                user specified config.\n\n        \"\"\"\n        if self.use_distributed:\n            lr = lr * hvd.size()\n\n        if warmup_proportion is None:\n            optimizer = BertAdam(self.optimizer_params, lr=lr)\n        else:\n            optimizer = BertAdam(\n                self.optimizer_params,\n                lr=lr,\n                t_total=num_train_optimization_steps,\n                warmup=warmup_proportion,\n            )\n\n        if self.use_distributed:\n            compression = (\n                hvd.Compression.fp16 if fp16_allreduce else hvd.Compression.none\n            )\n            optimizer = hvd.DistributedOptimizer(\n                optimizer,\n                named_parameters=self.model.named_parameters(),\n                compression=compression,\n            )\n\n        return optimizer\n\n    def create_data_loader(self, dataset, batch_size=32, mode=\"train\", **kwargs):\n        \"\"\"\n        Method to create a data loader for a given Tensor dataset.\n\n        Args:\n            mode(str): Mode for creating data loader. 
Could be train or test.\n            dataset(torch.utils.data.Dataset): A Tensor dataset.\n            batch_size(int): Batch size.\n\n        Returns:\n            torch.utils.data.DataLoader: A torch data loader to the given dataset.\n\n        \"\"\"\n\n        if mode == \"test\":\n            sampler = torch.utils.data.sampler.SequentialSampler(dataset)\n        elif self.use_distributed:\n            sampler = torch.utils.data.distributed.DistributedSampler(\n                dataset, num_replicas=hvd.size(), rank=hvd.rank()\n            )\n        else:\n            sampler = torch.utils.data.RandomSampler(dataset)\n\n        data_loader = torch.utils.data.DataLoader(\n            dataset, batch_size=batch_size, sampler=sampler, **kwargs\n        )\n\n        return data_loader\n\n    def save_model(self):\n        \"\"\"\n        Method to save the trained model.\n        #ToDo: Works for English Language now. Multiple language support needs to\n        # be added.\n\n        \"\"\"\n        # Save the model to the outputs directory for capture\n        output_dir = \"outputs\"\n        os.makedirs(output_dir, exist_ok=True)\n\n        # Save a trained model, configuration and tokenizer\n        model_to_save = (\n            self.model.module if hasattr(self.model, \"module\") else self.model\n        )\n\n        # If we save using the predefined names, we can load using `from_pretrained`\n        output_model_file = \"outputs/bert-large-uncased\"\n        output_config_file = \"outputs/bert_config.json\"\n\n        torch.save(model_to_save.state_dict(), output_model_file)\n        model_to_save.config.to_json_file(output_config_file)\n\n    def fit(\n        self,\n        train_loader,\n        epoch,\n        bert_optimizer=None,\n        num_epochs=1,\n        num_gpus=None,\n        lr=2e-5,\n        warmup_proportion=None,\n        fp16_allreduce=False,\n        num_train_optimization_steps=10,\n    ):\n        \"\"\"\n        Method to fine-tune the 
bert classifier using the given training data\n\n        Args:\n            train_loader(torch.DataLoader): Torch Dataloader created from Torch Dataset\n            epoch(int): Current epoch number of training.\n            bert_optimizer(optimizer): optimizer can be BERTAdam for local and\n                Dsitributed if Horovod\n            num_epochs(int): the number of epochs to run\n            num_gpus(int): the number of gpus. If None is specified, all available GPUs\n                will be used.\n            lr (float): learning rate of the adam optimizer. defaults to 2e-5.\n            warmup_proportion (float, optional): proportion of training to\n                perform linear learning rate warmup for. e.g., 0.1 = 10% of\n                training. defaults to none.\n            fp16_allreduce(bool): if true, use fp16 compression during allreduce\n            num_train_optimization_steps: number of steps the optimizer should take.\n        \"\"\"\n\n        device, num_gpus = get_device(num_gpus)\n\n        self.model = move_model_to_device(self.model, device)\n        self.model = parallelize_model(self.model, device, num_gpus=num_gpus)\n        if bert_optimizer is None:\n            bert_optimizer = self.create_optimizer(\n                num_train_optimization_steps=num_train_optimization_steps,\n                lr=lr,\n                warmup_proportion=warmup_proportion,\n                fp16_allreduce=fp16_allreduce,\n            )\n\n        if self.use_distributed:\n            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)\n\n        loss_func = nn.CrossEntropyLoss().to(device)\n\n        # train\n        self.model.train()  # training mode\n\n        token_type_ids_batch = None\n\n        num_print = 1000\n        for batch_idx, data in enumerate(train_loader):\n\n            x_batch = data[\"token_ids\"]\n            x_batch = x_batch.cuda()\n\n            y_batch = data[\"labels\"]\n            y_batch = y_batch.cuda()\n\n      
      mask_batch = data[\"input_mask\"]\n            mask_batch = mask_batch.cuda()\n\n            if \"token_type_ids\" in data and data[\"token_type_ids\"] is not None:\n                token_type_ids_batch = data[\"token_type_ids\"]\n                token_type_ids_batch = token_type_ids_batch.cuda()\n\n            bert_optimizer.zero_grad()\n\n            y_h = self.model(\n                input_ids=x_batch,\n                token_type_ids=token_type_ids_batch,\n                attention_mask=mask_batch,\n                labels=None,\n            )\n\n            loss = loss_func(y_h, y_batch).mean()\n            loss.backward()\n\n            bert_optimizer.synchronize()\n            bert_optimizer.step()\n\n            if batch_idx % num_print == 0:\n                print(\n                    \"Train Epoch: {}/{} ({:.0f}%) \\t Batch:{} \\tLoss: {:.6f}\".format(\n                        epoch,\n                        num_epochs,\n                        100.0 * batch_idx / len(train_loader),\n                        batch_idx + 1,\n                        loss.item(),\n                    )\n                )\n\n        del [x_batch, y_batch, mask_batch, token_type_ids_batch]\n        torch.cuda.empty_cache()\n\n    def predict(self, test_loader, num_gpus=None, probabilities=False):\n        \"\"\"\n\n        Method to predict the results on the test loader. Only evaluates for\n            non distributed workload on the head node in a distributed setup.\n\n        Args:\n            test_loader(torch Dataloader): Torch Dataloader created from Torch Dataset\n            num_gpus (int, optional): The number of gpus to use.\n                                      If None is specified, all available GPUs\n                                      will be used. Defaults to None.\n            probabilities (bool, optional):\n                If True, the predicted probability distribution\n                is also returned. 
Defaults to False.\n\n        Returns:\n            1darray, dict(1darray, 1darray, ndarray): Predicted classes and\n                target labels or a dictionary with classes, target labels,\n                probabilities) if probabilities is True.\n        \"\"\"\n        device, num_gpus = get_device(num_gpus)\n        self.model = move_model_to_device(self.model, device)\n        self.model = parallelize_model(self.model, device, num_gpus=num_gpus)\n\n        # score\n        self.model.eval()\n\n        preds = []\n        test_labels = []\n        for i, data in enumerate(tqdm(test_loader, desc=\"Iteration\")):\n            x_batch = data[\"token_ids\"]\n            x_batch = x_batch.cuda()\n\n            mask_batch = data[\"input_mask\"]\n            mask_batch = mask_batch.cuda()\n\n            y_batch = data[\"labels\"]\n\n            token_type_ids_batch = None\n            if \"token_type_ids\" in data and data[\"token_type_ids\"] is not None:\n                token_type_ids_batch = data[\"token_type_ids\"]\n                token_type_ids_batch = token_type_ids_batch.cuda()\n\n            with torch.no_grad():\n                p_batch = self.model(\n                    input_ids=x_batch,\n                    token_type_ids=token_type_ids_batch,\n                    attention_mask=mask_batch,\n                    labels=None,\n                )\n            preds.append(p_batch.cpu())\n            test_labels.append(y_batch)\n\n        preds = np.concatenate(preds)\n        test_labels = np.concatenate(test_labels)\n\n        if probabilities:\n            return {\n                \"Predictions\": preds.argmax(axis=1),\n                \"Target\": test_labels,\n                \"classes probabilities\": nn.Softmax(dim=1)(torch.Tensor(preds)).numpy(),\n            }\n        else:\n            return preds.argmax(axis=1), test_labels\n"
  },
  {
    "path": "utils_nlp/models/bert/sequence_encoding.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# This script reuses code from:\n# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples\n# /extract_features.py, with necessary modifications.\n\nfrom enum import Enum\n\nimport numpy as np\nimport pandas as pd\nimport torch\nfrom cached_property import cached_property\nfrom pytorch_pretrained_bert.modeling import BertModel\nfrom torch.utils.data import DataLoader, SequentialSampler, TensorDataset\n\nfrom utils_nlp.common.pytorch_utils import (\n    get_device,\n    move_model_to_device,\n    parallelize_model,\n)\nfrom utils_nlp.models.bert.common import Language, Tokenizer\n\n\nclass PoolingStrategy(str, Enum):\n    \"\"\"Enumerate pooling strategies\"\"\"\n\n    MAX: str = \"max\"\n    MEAN: str = \"mean\"\n    CLS: str = \"cls\"\n\n\nclass BERTSentenceEncoder:\n    \"\"\"BERT-based sentence encoder\"\"\"\n\n    def __init__(\n        self,\n        bert_model=None,\n        tokenizer=None,\n        language=Language.ENGLISH,\n        num_gpus=None,\n        cache_dir=\".\",\n        to_lower=True,\n        max_len=512,\n        layer_index=-1,\n        pooling_strategy=PoolingStrategy.MEAN,\n    ):\n        \"\"\"Initialize the encoder's underlying model and tokenizer\n\n        Args:\n            bert_model: BERT model to use for encoding.\n                Defaults to pretrained BertModel.\n            tokenizer: Tokenizer to use for preprocessing.\n                Defaults to pretrained BERT tokenizer.\n            language: The pretrained model's language. Defaults to Language.ENGLISH.\n            num_gpus: The number of gpus to use. Defaults to None, which forces all\n                available GPUs to be used.\n            cache_dir: Location of BERT's cache directory. Defaults to \".\"\n            to_lower: True to lowercase before tokenization. 
Defaults to False.\n            max_len: Maximum number of tokens.\n            layer_index: The layer from which to extract features.\n                         Defaults to the last layer; can also be a list of integers\n                         for experimentation.\n            pooling_strategy: Pooling strategy to aggregate token embeddings into\n                sentence embedding.\n        \"\"\"\n        self.model = (\n            bert_model.model.bert\n            if bert_model\n            else BertModel.from_pretrained(language, cache_dir=cache_dir)\n        )\n        self.tokenizer = (\n            tokenizer\n            if tokenizer\n            else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir)\n        )\n        self.num_gpus = num_gpus\n        self.max_len = max_len\n        self.layer_index = layer_index\n        self.pooling_strategy = pooling_strategy\n        self.has_cuda = self.cuda\n\n    @property\n    def layer_index(self):\n        return self._layer_index\n\n    @layer_index.setter\n    def layer_index(self, layer_index):\n        if isinstance(layer_index, int):\n            self._layer_index = [layer_index]\n        else:\n            self.layer_index = layer_index\n\n    @cached_property\n    def cuda(self):\n        \"\"\" cache the output of torch.cuda.is_available() \"\"\"\n\n        self.has_cuda = torch.cuda.is_available()\n        return self.has_cuda\n\n    @property\n    def pooling_strategy(self):\n        return self._pooling_strategy\n\n    @pooling_strategy.setter\n    def pooling_strategy(self, pooling_strategy):\n        self._pooling_strategy = pooling_strategy\n\n    def get_hidden_states(self, text, batch_size=32):\n        \"\"\"Extract the hidden states from the pretrained model\n\n        Args:\n            text: List of documents to extract features from.\n            batch_size: Batch size, defaults to 32.\n\n        Returns:\n            pd.DataFrame with columns:\n                text_index (int), 
token (str), layer_index (int), values (list[float]).\n        \"\"\"\n        device, num_gpus = get_device(self.num_gpus)\n        self.model = move_model_to_device(self.model, device)\n        self.model = parallelize_model(self.model, device, self.num_gpus)\n\n        self.model.eval()\n\n        tokens = self.tokenizer.tokenize(text)\n\n        (\n            tokens,\n            input_ids,\n            input_mask,\n            input_type_ids,\n        ) = self.tokenizer.preprocess_encoder_tokens(tokens, max_len=self.max_len)\n\n        input_ids = torch.tensor(input_ids, dtype=torch.long, device=device)\n        input_mask = torch.tensor(input_mask, dtype=torch.long, device=device)\n        input_type_ids = torch.arange(\n            input_ids.size(0), dtype=torch.long, device=device\n        )\n\n        eval_data = TensorDataset(input_ids, input_mask, input_type_ids)\n        eval_dataloader = DataLoader(\n            eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size\n        )\n\n        hidden_states = {\"text_index\": [], \"token\": [], \"layer_index\": [], \"values\": []}\n        for (\n            input_ids_tensor,\n            input_mask_tensor,\n            example_indices_tensor,\n        ) in eval_dataloader:\n            with torch.no_grad():\n                all_encoder_layers, _ = self.model(\n                    input_ids_tensor,\n                    token_type_ids=None,\n                    attention_mask=input_mask_tensor,\n                )\n                self.embedding_dim = all_encoder_layers[0].size()[-1]\n\n            for b, example_index in enumerate(example_indices_tensor):\n                for (i, token) in enumerate(tokens[example_index.item()]):\n                    for (j, layer_index) in enumerate(self.layer_index):\n                        layer_output = (\n                            all_encoder_layers[int(layer_index)].detach().cpu().numpy()\n                        )\n                        
layer_output = layer_output[b]\n                        hidden_states[\"text_index\"].append(example_index.item())\n                        hidden_states[\"token\"].append(token)\n                        hidden_states[\"layer_index\"].append(layer_index)\n                        hidden_states[\"values\"].append(\n                            [round(x.item(), 6) for x in layer_output[i]]\n                        )\n\n            # empty cache\n            del [input_ids_tensor, input_mask_tensor, example_indices_tensor]\n            torch.cuda.empty_cache()\n\n        # empty cache\n        del [input_ids, input_mask, input_type_ids]\n        torch.cuda.empty_cache()\n\n        return pd.DataFrame.from_dict(hidden_states)\n\n    def pool(self, df):\n        \"\"\"Pooling to aggregate token-wise embeddings to sentence embeddings\n\n        Args:\n            df: pd.DataFrame with columns text_index (int), token (str),\n                layer_index (int), values (list[float])\n\n        Returns:\n            pd.DataFrame grouped by text index and layer index\n        \"\"\"\n\n        def max_pool(x):\n            values = np.array(\n                [\n                    np.reshape(np.array(x.values[i]), self.embedding_dim)\n                    for i in range(x.values.shape[0])\n                ]\n            )\n            m, _ = torch.max(torch.tensor(values, dtype=torch.float), 0)\n            return m.numpy()\n\n        def mean_pool(x):\n            values = np.array(\n                [\n                    np.reshape(np.array(x.values[i]), self.embedding_dim)\n                    for i in range(x.values.shape[0])\n                ]\n            )\n            return torch.mean(torch.tensor(values, dtype=torch.float), 0).numpy()\n\n        def cls_pool(x):\n            values = np.array(\n                [\n                    np.reshape(np.array(x.values[i]), self.embedding_dim)\n                    for i in range(x.values.shape[0])\n                ]\n           
 )\n            return values[0]\n\n        try:\n            if self.pooling_strategy == \"max\":\n                pool_func = max_pool\n            elif self.pooling_strategy == \"mean\":\n                pool_func = mean_pool\n            elif self.pooling_strategy == \"cls\":\n                pool_func = cls_pool\n            else:\n                raise ValueError(\"Please enter valid pooling strategy\")\n        except ValueError as ve:\n            print(ve)\n\n        return (\n            df.groupby([\"text_index\", \"layer_index\"])[\"values\"]\n            .apply(lambda x: pool_func(x))\n            .reset_index()\n        )\n\n    def encode(self, text, batch_size=32, as_numpy=False):\n        \"\"\"Computes sentence encodings\n\n        Args:\n            text: List of documents to encode.\n            batch_size: Batch size, defaults to 32.\n        \"\"\"\n        df = self.get_hidden_states(text, batch_size)\n        pooled = self.pool(df)\n\n        if as_numpy:\n            return np.array(pooled[\"values\"].tolist())\n        else:\n            return pooled\n"
  },
  {
    "path": "utils_nlp/models/bert/token_classification.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\n# This script reuses some code from\n# https://github.com/huggingface/transformers/blob/master/examples\n# /run_glue.py\n\nfrom collections import namedtuple\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\nfrom pytorch_pretrained_bert.modeling import BertForTokenClassification\nfrom pytorch_pretrained_bert.optimization import BertAdam\nfrom tqdm import tqdm, trange\n\nfrom utils_nlp.models.bert.common import Language, create_data_loader\nfrom utils_nlp.common.pytorch_utils import get_device, move_model_to_device\n\nfrom cached_property import cached_property\n\n\nclass BERTTokenClassifier:\n    \"\"\"BERT-based token classifier.\"\"\"\n\n    def __init__(self, language=Language.ENGLISH, num_labels=2, cache_dir=\".\"):\n\n        \"\"\"\n        Initializes the classifier and the underlying pre-trained model.\n\n        Args:\n            language (Language, optional): The pre-trained model's language.\n                The value of this argument determines which BERT model is\n                used:\n                    Language.ENGLISH: \"bert-base-uncased\"\n                    Language.ENGLISHCASED: \"bert-base-cased\"\n                    Language.ENGLISHLARGE: \"bert-large-uncased\"\n                    Language.ENGLISHLARGECASED: \"bert-large-cased\"\n                    Language.CHINESE: \"bert-base-chinese\"\n                    Language.MULTILINGUAL: \"bert-base-multilingual-cased\"\n                Defaults to Language.ENGLISH.\n            num_labels (int, optional): The number of unique labels in the\n                data. 
Defaults to 2.\n            cache_dir (str, optional): Location of BERT's cache directory.\n                Defaults to \".\".\n        \"\"\"\n\n        if num_labels < 2:\n            raise ValueError(\"Number of labels should be at least 2.\")\n\n        self.language = language\n        self.num_labels = num_labels\n        self.cache_dir = cache_dir\n\n        self.model = BertForTokenClassification.from_pretrained(\n            language, cache_dir=cache_dir, num_labels=num_labels\n        )\n        self.has_cuda = self.cuda\n\n    @cached_property\n    def cuda(self):\n        \"\"\" Caches the output of torch.cuda.is_available() \"\"\"\n\n        self.has_cuda = torch.cuda.is_available()\n        return self.has_cuda\n\n    def _get_optimizer(self, learning_rate, num_train_optimization_steps, warmup_proportion):\n        \"\"\"\n        Initializes the optimizer and configure parameters to apply weight\n        decay on.\n        \"\"\"\n        param_optimizer = list(self.model.named_parameters())\n        no_decay_params = [\"bias\", \"LayerNorm.bias\", \"LayerNorm.weight\"]\n        params_weight_decay = 0.01\n        optimizer_grouped_parameters = [\n            {\n                \"params\": [\n                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay_params)\n                ],\n                \"weight_decay\": params_weight_decay,\n            },\n            {\n                \"params\": [p for n, p in param_optimizer if any(nd in n for nd in no_decay_params)],\n                \"weight_decay\": 0.0,\n            },\n        ]\n\n        if warmup_proportion is None:\n            optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate)\n        else:\n            optimizer = BertAdam(\n                optimizer_grouped_parameters,\n                lr=learning_rate,\n                t_total=num_train_optimization_steps,\n                warmup=warmup_proportion,\n            )\n\n        return 
optimizer\n\n    def fit(\n        self,\n        token_ids,\n        input_mask,\n        labels,\n        num_gpus=None,\n        num_epochs=1,\n        batch_size=32,\n        learning_rate=2e-5,\n        warmup_proportion=None,\n    ):\n        \"\"\"\n        Fine-tunes the BERT classifier using the given training data.\n\n        Args:\n            token_ids (list): List of lists. Each sublist contains\n                numerical token ids corresponding to the tokens in the input\n                text data.\n            input_mask (list): List of lists. Each sublist contains\n                the attention mask of the input token id list. 1 for input\n                tokens and 0 for padded tokens, so that padded tokens are\n                not attended to.\n            labels (list): List of lists, each sublist contains numerical\n                token labels of an input sentence/paragraph.\n            num_gpus (int, optional): The number of GPUs to use.\n                If None, all available GPUs will be used. Defaults to None.\n            num_epochs (int, optional): Number of training epochs.\n                Defaults to 1.\n            batch_size (int, optional): Training batch size. Defaults to 32.\n            learning_rate (float, optional): learning rate of the BertAdam\n                optimizer. Defaults to 2e-5.\n            warmup_proportion (float, optional): Proportion of training to\n                perform linear learning rate warmup for. E.g., 0.1 = 10% of\n                training. 
Defaults to None.\n        \"\"\"\n\n        train_dataloader = create_data_loader(\n            input_ids=token_ids,\n            input_mask=input_mask,\n            label_ids=labels,\n            sample_method=\"random\",\n            batch_size=batch_size,\n        )\n\n        device, num_gpus = get_device(num_gpus)\n\n        self.model = move_model_to_device(self.model, device, num_gpus)\n\n        if num_gpus is None:\n            num_gpus_used = torch.cuda.device_count()\n        else:\n            num_gpus_used = min(num_gpus, torch.cuda.device_count())\n\n        num_train_optimization_steps = max((int(len(token_ids) / batch_size) * num_epochs), 1)\n        optimizer = self._get_optimizer(\n            learning_rate=learning_rate,\n            num_train_optimization_steps=num_train_optimization_steps,\n            warmup_proportion=warmup_proportion,\n        )\n\n        self.model.train()\n        for _ in trange(int(num_epochs), desc=\"Epoch\"):\n            tr_loss = 0\n            nb_tr_steps = 0\n            for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\", mininterval=30)):\n                batch = tuple(t.to(device) for t in batch)\n                b_token_ids, b_input_mask, b_label_ids = batch\n\n                loss = self.model(\n                    input_ids=b_token_ids, attention_mask=b_input_mask, labels=b_label_ids\n                )\n\n                if num_gpus_used > 1:\n                    # mean() to average on multi-gpu.\n                    loss = loss.mean()\n                # Accumulate parameter gradients\n                loss.backward()\n\n                tr_loss += loss.item()\n                nb_tr_steps += 1\n\n                # Update parameters based on current gradients\n                optimizer.step()\n                # Reset parameter gradients to zero\n                optimizer.zero_grad()\n\n            train_loss = tr_loss / nb_tr_steps\n            print(\"Train loss: 
{}\".format(train_loss))\n\n            torch.cuda.empty_cache()\n\n    def predict(\n        self, token_ids, input_mask, labels=None, batch_size=32, num_gpus=None, probabilities=False\n    ):\n        \"\"\"\n        Predict token labels on the testing data.\n\n        Args:\n            token_ids (list): List of lists. Each sublist contains\n                numerical token ids corresponding to the tokens in the input\n                text data.\n            input_mask (list): List of lists. Each sublist contains\n                the attention mask of the input token list, 1 for input\n                tokens and 0 for padded tokens, so that padded tokens are\n                not attended to.\n            labels (list, optional): List of lists. Each sublist contains\n                numerical token labels of an input sentence/paragraph.\n                If provided, it's used to compute the evaluation loss.\n                Default value is None.\n            batch_size (int, optional): Testing batch size. Defaults to 32.\n            num_gpus (int, optional): The number of GPUs to use.\n                If None, all available GPUs will be used. Defaults to None.\n\n        Returns:\n            list or namedtuple(list, ndarray): List of lists of predicted\n                token labels or ([token labels], probabilities) if\n                probabilities is True. The probabilities output is an n x m\n                array, where n is the size of the testing data and m is the\n                number of tokens in each input sublist. 
The probability\n                values are the softmax probability of the predicted class.\n        \"\"\"\n        test_dataloader = create_data_loader(\n            input_ids=token_ids,\n            input_mask=input_mask,\n            label_ids=labels,\n            batch_size=batch_size,\n            sample_method=\"sequential\",\n        )\n        device, num_gpus = get_device(num_gpus)\n\n        self.model = move_model_to_device(self.model, device, num_gpus)\n\n        self.model.eval()\n        eval_loss = 0\n        nb_eval_steps = 0\n        for step, batch in enumerate(tqdm(test_dataloader, desc=\"Iteration\", mininterval=10)):\n            batch = tuple(t.to(device) for t in batch)\n            true_label_available = False\n            if labels:\n                b_input_ids, b_input_mask, b_labels = batch\n                true_label_available = True\n            else:\n                b_input_ids, b_input_mask = batch\n\n            with torch.no_grad():\n                logits = self.model(b_input_ids, attention_mask=b_input_mask)\n                if true_label_available:\n                    active_loss = b_input_mask.view(-1) == 1\n                    active_logits = logits.view(-1, self.num_labels)[active_loss]\n                    active_labels = b_labels.view(-1)[active_loss]\n                    loss_fct = nn.CrossEntropyLoss()\n                    tmp_eval_loss = loss_fct(active_logits, active_labels)\n\n                    eval_loss += tmp_eval_loss.mean().item()\n\n            logits = logits.detach().cpu()\n\n            if step == 0:\n                logits_all = logits.numpy()\n            else:\n                logits_all = np.append(logits_all, logits, axis=0)\n\n            nb_eval_steps += 1\n\n        predictions = [list(p) for p in np.argmax(logits_all, axis=2)]\n\n        if true_label_available:\n            validation_loss = eval_loss / nb_eval_steps\n            print(\"Evaluation loss: {}\".format(validation_loss))\n\n        if 
probabilities:\n            return namedtuple(\"Predictions\", \"classes probabilities\")(\n                predictions, np.max(nn.Softmax(dim=2)(torch.Tensor(logits_all)).numpy(), 2)\n            )\n        else:\n            return predictions\n\n\ndef create_label_map(label_list, trailing_piece_tag=\"X\"):\n    label_map = {label: i for i, label in enumerate(label_list)}\n\n    if trailing_piece_tag not in label_list:\n        label_map[trailing_piece_tag] = len(label_list)\n\n    return label_map\n\n\ndef postprocess_token_labels(\n    labels, input_mask, label_map=None, remove_trailing_word_pieces=False, trailing_token_mask=None\n):\n    \"\"\"\n    Postprocesses token classification output:\n        1) Removes predictions on padded tokens.\n        2) If label_map is provided, maps predicted numerical labels\n            back to original labels.\n        3) If remove_trailing_word_pieces is True and trailing_token_mask\n            is provided, remove the predicted labels on trailing word pieces\n            generated by WordPiece tokenizer.\n\n    Args:\n        labels (list): List of lists of predicted token labels.\n        input_mask (list): List of lists. Each sublist contains the attention\n            mask of the input token list, 1 for input tokens and 0\n            for padded tokens.\n        label_map (dict, optional): A dictionary mapping original labels\n            (which may be string type) to numerical label ids. If\n            provided, it's used to map predicted numerical labels back to\n            original labels. Default value is None.\n        remove_trailing_word_pieces (bool, optional): Whether to remove\n            predicted labels of trailing word pieces generated by WordPiece\n            tokenizer. For example, \"criticize\" is broken into \"critic\" and\n            \"##ize\". After removing predicted label for \"##ize\",\n            the predicted label for \"critic\" is assigned to the original word\n            \"criticize\". 
Default value is False.\n        trailing_token_mask (list, optional): list of boolean values, True for\n            the first word piece of each original word, False for trailing\n            word pieces, e.g. ##ize. If remove_trailing_word_pieces is\n            True, this mask is used to remove the predicted labels on\n            trailing word pieces, so that each original word in the input\n            text has a unique predicted label.\n    \"\"\"\n    if label_map:\n        reversed_label_map = {v: k for k, v in label_map.items()}\n        labels_org = [[reversed_label_map[l_i] for l_i in l] for l in labels]\n    else:\n        labels_org = labels\n\n    labels_org_no_padding = [\n        [label for label, mask in zip(label_list, mask_list) if mask == 1]\n        for label_list, mask_list in zip(labels_org, input_mask)\n    ]\n\n    if remove_trailing_word_pieces and trailing_token_mask:\n        # Remove the padded values in trailing_token_mask first\n        token_mask_no_padding = [\n            [token for token, padding in zip(t_mask, p_mask) if padding == 1]\n            for t_mask, p_mask in zip(trailing_token_mask, input_mask)\n        ]\n\n        labels_no_trailing_pieces = [\n            [label for label, mask in zip(label_list, mask_list) if mask]\n            for label_list, mask_list in zip(labels_org_no_padding, token_mask_no_padding)\n        ]\n        return labels_no_trailing_pieces\n    else:\n        return labels_org_no_padding\n"
  },
  {
    "path": "utils_nlp/models/gensen/README.md",
    "content": "# GenSen\n\nLearning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning\n\nSandeep Subramanian, Adam Trischler, Yoshua Bengio & Christopher Pal\n\nICLR 2018\n\n\n### About\n\nGenSen is a technique to learn general purpose, fixed-length representations of sentences via multi-task training. These representations are useful for transfer and low-resource learning. For details please refer to ICLR [paper](https://openreview.net/forum?id=B18WgG-CZ&noteId=B18WgG-CZ).\n\n### Code\n\nWe provide a distributed PyTorch with Horovod implementation of the paper along with pre-trained models as well as code to evaluate these models on a variety of transfer learning benchmarks.\nThis code is based on the gibhub codebase from [Maluuba](https://github.com/Maluuba/gensen), but we have refactored the code in the following aspects:\n1. Support a distributed PyTorch with Horovod\n2. Clean and refactor the original code in a more structured form\n3. Change the training file (`train.py`) from non-stopping to stop when the validation loss reaches to the local minimum\n4. Update the code from Python 2.7 to 3+ and PyTorch from 0.2 or 0.3 to 1.0.1\n5. Add some necessary comments\n6. Add some code for training on AzureML platform\n7. Fix the bug on when setting the batch size to 1, the training raises an error\n### Requirements\n\n- Python 3+\n- PyTorch 1.0.1\n- nltk\n- h5py\n- numpy\n- scikit-learn\n\n### Reference\n\n```\n@article{subramanian2018learning,\ntitle={Learning general purpose distributed sentence representations via large scale multi-task learning},\nauthor={Subramanian, Sandeep and Trischler, Adam and Bengio, Yoshua and Pal, Christopher J},\njournal={arXiv preprint arXiv:1804.00079},\nyear={2018}\n}\n```\n"
  },
  {
    "path": "utils_nlp/models/gensen/__init__.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nSNLI_CLEAN_PATH = \"clean/snli_1.0\"\n"
  },
  {
    "path": "utils_nlp/models/gensen/create_gensen_model.py",
    "content": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Creates a GenSen model from a MultiSeq2Seq model.\"\"\"\nimport os\nimport pickle\n\nimport torch\n\n\ndef create_multiseq2seq_model(\n    trained_model_folder,\n    save_folder,\n    save_name,\n    trained_model_name=\"best_model.model\",\n):\n\n    \"\"\"\n    Method that creates a GenSen model from a MultiSeq2Seq model.\n\n    Args:\n        trained_model_folder (str): Path to the folder containing a saved model\n        save_folder (str): Path to save the encoder\n        save_name (str): Name of the model\n        trained_model_name (str, optional): Loaded model as the input\n\n    Returns: None\n\n    \"\"\"\n\n    model = torch.load(\n        open(os.path.join(trained_model_folder, trained_model_name), \"rb\")\n    )\n    # model.copy() prevents raising the error.\n    for item in model.copy().keys():\n        if not item.startswith(\"module.encoder\") and not item.startswith(\n            \"module.src_embedding\"\n        ):\n            model.pop(item)\n\n    for item in model.copy().keys():\n        model[item.replace(\"module.\", \"\")] = model[item]\n\n    for item in model.copy().keys():\n        if item.startswith(\"module.\"):\n            del model[item]\n\n    torch.save(model, os.path.join(save_folder, \"%s.model\" % save_name))\n    # Add 'rb'.\n    model_vocab = pickle.load(\n        open(os.path.join(trained_model_folder, \"src_vocab.pkl\"), \"rb\")\n    )\n    pickle.dump(\n        model_vocab,\n        open(os.path.join(save_folder, \"%s_vocab.pkl\" % save_name), \"wb\"),\n    )\n\n\n# Original source: https://github.com/Maluuba/gensen\n"
  },
  {
    "path": "utils_nlp/models/gensen/gensen.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport h5py\nfrom sklearn.linear_model import LinearRegression\nimport nltk\nimport numpy as np\nimport pickle\nimport os\nimport copy\nimport logging\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import Variable\nfrom torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence\n\n\nclass Encoder(nn.Module):\n    \"\"\"GenSen Encoder.\n    \n    Original source in https://github.com/Maluuba/gensen\n    \"\"\"\n\n    def __init__(\n        self, vocab_size, embedding_dim, hidden_dim, num_layers, rnn_type=\"GRU\"\n    ):\n        \"\"\"Initialize all the parameters.\n\n        Args:\n            vocab_size (int): Size of the vocabulary.\n            embedding_dim (int): the size of each embedding vector\n            hidden_dim (int): the size of each hidden vector\n            num_layers (int): the number of layers.\n            rnn_type (str): Type of RNN.\n        \"\"\"\n        super(Encoder, self).__init__()\n        self.rnn_type = rnn_type\n        rnn = getattr(nn, rnn_type)\n        self.src_embedding = nn.Embedding(\n            num_embeddings=vocab_size, embedding_dim=embedding_dim\n        )\n\n        self.encoder = rnn(\n            input_size=embedding_dim,\n            hidden_size=hidden_dim,\n            num_layers=num_layers,\n            batch_first=True,\n            bidirectional=True,\n        )\n\n    def set_pretrained_embeddings(self, embedding_matrix):\n        \"\"\"Set embedding weights.\n\n        Args:\n            embedding_matrix(torch.Tensor): Embedding matrix.\n\n        \"\"\"\n        if embedding_matrix.shape[0] != self.src_embedding.weight.size(\n            0\n        ) or embedding_matrix.shape[1] != self.src_embedding.weight.size(1):\n            logging.info(\n                \"\"\"\n                Warning pretrained embedding shape mismatch %d x %d\n                expected %d x %d\"\"\"\n    
            % (\n                    embedding_matrix.shape[0],\n                    embedding_matrix.shape[1],\n                    self.src_embedding.weight.size(0),\n                    self.src_embedding.weight.size(1),\n                )\n            )\n            self.src_embedding = nn.Embedding(\n                embedding_matrix.shape[0], embedding_matrix.shape[1]\n            )\n            self.src_vocab_size = embedding_matrix.shape[0]\n            self.src_emb_dim = embedding_matrix.shape[1]\n\n        try:\n            self.src_embedding.weight.data.set_(\n                torch.from_numpy(embedding_matrix)\n            )\n        except BaseException:\n            self.src_embedding.weight.data.set_(\n                torch.from_numpy(embedding_matrix).cuda()\n            )\n\n        self.src_embedding.cuda()\n\n    def forward(self, input, lengths, return_all=False, pool=\"last\"):\n        \"\"\"Propogate input through the encoder.\n\n        Args:\n            input(torch.Tensor): Embedding matrix\n            lengths (torch.Tensor): list of sequences lengths of each batch element.\n            return_all (bool): Return all or only the last hidden state.\n            pool(str): Type of getting hidden state.\n\n        Returns:\n            torch.Tensor: Return last or all hidden states.\n\n        \"\"\"\n        embedding = self.src_embedding(input)\n        src_emb = pack_padded_sequence(embedding, lengths, batch_first=True)\n        if self.rnn_type == \"LSTM\":\n            h, (h_t, _) = self.encoder(src_emb)\n        else:\n            h, h_t = self.encoder(src_emb)\n\n        # Get hidden state via max-pooling or h_t\n        if pool == \"last\":\n            h_t = torch.cat((h_t[-1], h_t[-2]), 1)\n        elif pool == \"max\":\n            h_tmp, _ = pad_packed_sequence(h, batch_first=True)\n            h_t = torch.max(h_tmp, 1)[0].squeeze()\n        else:\n            raise ValueError(\"Pool %s is not valid \" % (pool))\n\n        # Return 
all or only the last hidden state\n        if return_all:\n            h, _ = pad_packed_sequence(h, batch_first=True)\n            return h, h_t\n        else:\n            return h_t\n\n\nclass GenSen(nn.Module):\n    \"\"\"A wrapper class for multiple GenSen models.\n    \n    Original source in https://github.com/Maluuba/gensen\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        super(GenSen, self).__init__()\n        self.gensen_models = args\n\n    def vocab_expansion(self, task_vocab):\n        \"\"\"Expand the model's vocabulary with pretrained word embeddings.\n\n        Args:\n            task_vocab(list): Vocabulary for each task.\n\n        \"\"\"\n        for model in self.gensen_models:\n            model.vocab_expansion(task_vocab)\n\n    def get_representation(\n        self,\n        sentences,\n        pool=\"last\",\n        tokenize=False,\n        return_numpy=True,\n        add_start_end=True,\n    ):\n        \"\"\" Get model representations.\n\n        Args:\n            sentences(list): Sentences to get embeddings.\n            pool(str): Type of getting hidden state.\n            tokenize(bool): To tokenize or not.\n            return_numpy(bool): To return a numpy array or not.\n            add_start_end(bool): To add start and end notation (<s> </s>) to each\n            sentence or not.\n\n        Returns:\n            torch.Tensor : Return last or all hidden states.\n\n        \"\"\"\n        representations = [\n            model.get_representation(\n                sentences,\n                pool=pool,\n                tokenize=tokenize,\n                return_numpy=return_numpy,\n                add_start_end=add_start_end,\n            )\n            for model in self.gensen_models\n        ]\n        if return_numpy:\n            return (\n                np.concatenate([x[0] for x in representations], axis=2),\n                np.concatenate([x[1] for x in representations], axis=1),\n            )\n        else:\n  
          return (\n                torch.cat([x[0] for x in representations], 2),\n                torch.cat([x[1] for x in representations], 1),\n            )\n\n\nclass GenSenSingle(nn.Module):\n    \"\"\"GenSen Wrapper.\n    \n    Original source in https://github.com/Maluuba/gensen\n    \"\"\"\n\n    def __init__(\n        self,\n        model_folder,\n        filename_prefix,\n        pretrained_emb,\n        cuda=False,\n        rnn_type=\"GRU\",\n    ):\n        \"\"\" Initialize params.\n\n        Args:\n            model_folder(str): Folder where the model resides.\n            filename_prefix(str): Prefix for the model file.\n            pretrained_emb(torch.Tensor): Pretrained Embedding vector.\n            cuda(bool): Use Cuda or not.\n            rnn_type(str): Type of RNN.\n        \"\"\"\n\n        super(GenSenSingle, self).__init__()\n        self.model_folder = model_folder\n        self.filename_prefix = filename_prefix\n        self.pretrained_emb = pretrained_emb\n        self.cuda = cuda\n        self.rnn_type = rnn_type\n        self._load_params()\n        self.vocab_expanded = False\n\n    def _load_params(self):\n        \"\"\"Load pretrained params.\"\"\"\n        # Read vocab pickle files\n        open(\n            os.path.join(\n                self.model_folder, \"%s_vocab.pkl\" % self.filename_prefix\n            ),\n            \"rb\",\n        )\n        model_vocab = pickle.load(\n            open(\n                os.path.join(\n                    self.model_folder, \"%s_vocab.pkl\" % self.filename_prefix\n                ),\n                \"rb\",\n            ),\n            encoding=\"latin1\",\n        )\n\n        # Word to index mappings\n        self.word2id = model_vocab[\"word2id\"]\n        self.id2word = model_vocab[\"id2word\"]\n        self.task_word2id = self.word2id\n        self.id2word = self.id2word\n\n        encoder_model = torch.load(\n            os.path.join(\n                self.model_folder, 
\"%s.model\" % (self.filename_prefix)\n            )\n        )\n\n        # Initialize encoders\n        self.encoder = Encoder(\n            vocab_size=encoder_model[\"src_embedding.weight\"].size(0),\n            embedding_dim=encoder_model[\"src_embedding.weight\"].size(1),\n            hidden_dim=encoder_model[\"encoder.weight_hh_l0\"].size(1),\n            num_layers=1 if len(encoder_model) < 10 else 2,\n            rnn_type=self.rnn_type,\n        )\n\n        # Load pretrained sentence encoder weights\n        self.encoder.load_state_dict(encoder_model)\n\n        # Set encoders in eval model.\n        self.encoder.eval()\n\n        # Store the initial word embeddings somewhere to re-train vocab expansion multiple times.\n        self.model_embedding_matrix = copy.deepcopy(\n            self.encoder.src_embedding.weight.data.cpu().numpy()\n        )\n\n        # Move encoder to GPU if self.cuda\n        if self.cuda:\n            self.encoder = self.encoder.cuda()\n\n    def first_expansion(self):\n        \"\"\" Training linear regression model for the first time.\"\"\"\n\n        # Read pre-trained word embedding h5 file\n        logging.info(\"Loading pretrained word embeddings\")\n        pretrained_embeddings = h5py.File(self.pretrained_emb)\n        pretrained_embedding_matrix = pretrained_embeddings[\"embedding\"].value\n        pretrain_vocab = pretrained_embeddings[\"words_flatten\"].value.split(\n            \"\\n\"\n        )\n        pretrain_word2id = {\n            word: ind for ind, word in enumerate(pretrain_vocab)\n        }\n\n        # Set up training data for vocabulary expansion\n        model_train = []\n        pretrain_train = []\n\n        for word in pretrain_word2id:\n            if word in self.word2id:\n                model_train.append(\n                    self.model_embedding_matrix[self.word2id[word]]\n                )\n                pretrain_train.append(\n                    
pretrained_embedding_matrix[pretrain_word2id[word]]\n                )\n\n        logging.info(\"Training vocab expansion on model\")\n        lreg = LinearRegression()\n        lreg.fit(pretrain_train, model_train)\n        self.lreg = lreg\n        self.pretrain_word2id = pretrain_word2id\n        self.pretrained_embedding_matrix = pretrained_embedding_matrix\n\n    def vocab_expansion(self, task_vocab):\n        \"\"\" Expand the model's vocabulary with pretrained word embeddings.\n\n        Args:\n            task_vocab(list): Vocabulary for each task.\n        \"\"\"\n\n        self.task_word2id = {\"<s>\": 0, \"<pad>\": 1, \"</s>\": 2, \"<unk>\": 3}\n\n        self.task_id2word = {0: \"<s>\", 1: \"<pad>\", 2: \"</s>\", 3: \"<unk>\"}\n\n        ctr = 4\n        for idx, word in enumerate(task_vocab):\n            if word not in self.task_word2id:\n                self.task_word2id[word] = ctr\n                self.task_id2word[ctr] = word\n                ctr += 1\n\n        if not self.vocab_expanded:\n            self.first_expansion()\n\n        # Expand vocabulary using the linear regression model\n        task_embeddings = []\n        oov_pretrain = 0\n        oov_task = 0\n\n        for word in self.task_id2word.values():\n            if word in self.word2id:\n                task_embeddings.append(\n                    self.model_embedding_matrix[self.word2id[word]]\n                )\n            elif word in self.pretrain_word2id:\n                oov_task += 1\n                task_embeddings.append(\n                    self.lreg.predict(\n                        self.pretrained_embedding_matrix[\n                            self.pretrain_word2id[word]\n                        ].reshape(1, -1)\n                    )\n                    .squeeze()\n                    .astype(np.float32)\n                )\n            else:\n                oov_pretrain += 1\n                oov_task += 1\n                task_embeddings.append(\n                   
 self.model_embedding_matrix[self.word2id[\"<unk>\"]]\n                )\n\n        logging.info(\"Found %d task OOVs \" % (oov_task))\n        logging.info(\"Found %d pretrain OOVs \" % (oov_pretrain))\n        task_embeddings = np.stack(task_embeddings)\n        self.encoder.set_pretrained_embeddings(task_embeddings)\n        self.vocab_expanded = True\n\n        # Move encoder to GPU if self.cuda\n        if self.cuda:\n            self.encoder = self.encoder.cuda()\n\n    def get_minibatch(self, sentences, tokenize=False, add_start_end=True):\n        \"\"\"Prepare minibatch.\n\n        Args:\n            sentences(list): Sentences to get embeddings.\n            tokenize(bool): To tokenize or not.\n            add_start_end(bool): To add start and end notation (<s> </s>) to each\n            sentence or not.\n\n        Returns:\n            dict: A dictionary with sentences, lengths and sentence embeddings.\n\n        \"\"\"\n        if tokenize:\n            sentences = [\n                nltk.word_tokenize(sentence) for sentence in sentences\n            ]\n        else:\n            sentences = [sentence.split() for sentence in sentences]\n\n        if add_start_end:\n            sentences = [\n                [\"<s>\"] + sentence + [\"</s>\"] for sentence in sentences\n            ]\n\n        lens = [len(sentence) for sentence in sentences]\n        sorted_idx = np.argsort(lens)[::-1]\n        sorted_sentences = [sentences[idx] for idx in sorted_idx]\n        rev = np.argsort(sorted_idx)\n        sorted_lens = [len(sentence) for sentence in sorted_sentences]\n        max_len = max(sorted_lens)\n\n        sentences = [\n            [\n                self.task_word2id[w]\n                if w in self.task_word2id\n                else self.task_word2id[\"<unk>\"]\n                for w in sentence\n            ]\n            + [self.task_word2id[\"<pad>\"]] * (max_len - len(sentence))\n            for sentence in sorted_sentences\n        ]\n\n        with 
torch.no_grad():\n            sentences = Variable(torch.LongTensor(sentences))\n            rev = Variable(torch.LongTensor(rev))\n        lengths = sorted_lens\n\n        if self.cuda:\n            sentences = sentences.cuda()\n            rev = rev.cuda()\n\n        return {\"sentences\": sentences, \"lengths\": lengths, \"rev\": rev}\n\n    def get_representation(\n        self,\n        sentences,\n        pool=\"last\",\n        tokenize=False,\n        return_numpy=True,\n        add_start_end=True,\n    ):\n        \"\"\"Get model representations.\n\n        Args:\n            sentences(list): Sentences to get embeddings.\n            pool(str): Type of getting hidden state.\n            tokenize(bool): To tokenize or not.\n            return_numpy(bool): To return a numpy array or not.\n            add_start_end(bool): To add start and end notation (<s> </s>) to each\n            sentence or not.\n\n        Returns:\n            torch.Tensor : Return last or all hidden states.\n\n        \"\"\"\n\n        minibatch = self.get_minibatch(\n            sentences, tokenize=tokenize, add_start_end=add_start_end\n        )\n        h, h_t = self.encoder(\n            input=minibatch[\"sentences\"],\n            lengths=minibatch[\"lengths\"],\n            return_all=True,\n            pool=pool,\n        )\n        h = h.index_select(0, minibatch[\"rev\"])\n        h_t = h_t.index_select(0, minibatch[\"rev\"])\n        if return_numpy:\n            return h.data.cpu().numpy(), h_t.data.cpu().numpy()\n        else:\n            return h, h_t\n\n\n"
  },
  {
    "path": "utils_nlp/models/gensen/multi_task_model.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Parent model for Multitask Training.\"\"\"\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence\n\nfrom utils_nlp.models.pytorch_modules.conditional_gru import ConditionalGRU\n\n\nclass MultitaskModel(nn.Module):\n    \"\"\"A Multi Task Sequence to Sequence (Seq2Seq) model with GRUs.\n\n    Auxiliary NLI task trained jointly as well.\n    Ref: Multi-Task Sequence to Sequence Learning\n    https://arxiv.org/pdf/1511.06114.pdf\n    \"\"\"\n\n    def __init__(\n        self,\n        src_emb_dim,\n        trg_emb_dim,\n        src_vocab_size,\n        trg_vocab_size,\n        src_hidden_dim,\n        trg_hidden_dim,\n        pad_token_src,\n        pad_token_trg,\n        num_tasks,\n        bidirectional=False,\n        nlayers_src=1,\n        dropout=0.0,\n        paired_tasks=None,\n    ):\n        \"\"\"Initialize Seq2Seq Model.\"\"\"\n        super(MultitaskModel, self).__init__()\n        self.src_vocab_size = src_vocab_size\n        self.trg_vocab_size = trg_vocab_size\n        self.src_emb_dim = src_emb_dim\n        self.trg_emb_dim = trg_emb_dim\n        self.src_hidden_dim = src_hidden_dim\n        self.trg_hidden_dim = trg_hidden_dim\n        self.bidirectional = bidirectional\n        self.nlayers_src = nlayers_src\n        self.dropout = dropout\n        self.num_tasks = num_tasks\n        self.paired_tasks = paired_tasks\n        self.num_directions = 2 if bidirectional else 1\n        self.pad_token_src = pad_token_src\n        self.pad_token_trg = pad_token_trg\n        self.src_hidden_dim = (\n            src_hidden_dim // 2 if self.bidirectional else src_hidden_dim\n        )\n        self.decoder = ConditionalGRU\n\n        self.src_embedding = nn.Embedding(\n            src_vocab_size, src_emb_dim, self.pad_token_src\n        )\n\n        
self.encoder = nn.GRU(\n            src_emb_dim,\n            self.src_hidden_dim,\n            self.nlayers_src,\n            bidirectional=bidirectional,\n            batch_first=True,\n            dropout=self.dropout,\n        )\n\n        self.enc_drp = nn.Dropout(self.dropout)\n\n        self.trg_embedding = nn.ModuleList(\n            [\n                nn.Embedding(trg_vocab_size, trg_emb_dim, self.pad_token_trg)\n                for task in range(self.num_tasks)\n            ]\n        )\n\n        self.decoders = nn.ModuleList(\n            [\n                self.decoder(trg_emb_dim, trg_hidden_dim, dropout=self.dropout)\n                for task in range(self.num_tasks)\n            ]\n        )\n\n        self.decoder2vocab = nn.ModuleList(\n            [\n                nn.Linear(trg_hidden_dim, trg_vocab_size)\n                for task in range(self.num_tasks)\n            ]\n        )\n\n        self.nli_decoder = nn.Sequential(\n            nn.Dropout(0.3),\n            nn.Linear(4 * src_hidden_dim, 512),\n            nn.ReLU(),\n            nn.Linear(512, 3),\n        )\n\n        self.init_weights()\n\n    def init_weights(self):\n        \"\"\"Initialize weights.\"\"\"\n        initrange = 0.1\n        self.src_embedding.weight.data.uniform_(-initrange, initrange)\n        for module in self.trg_embedding:\n            module.weight.data.uniform_(-initrange, initrange)\n        for module in self.decoder2vocab:\n            module.bias.data.fill_(0)\n\n    def set_pretrained_embeddings(self, embedding_matrix):\n        \"\"\"Set embedding weights.\"\"\"\n        if embedding_matrix.shape[0] != self.src_embedding.weight.size(\n            0\n        ) or embedding_matrix.shape[1] != self.src_embedding.weight.size(1):\n            self.src_embedding = nn.Embedding(\n                embedding_matrix.shape[0], embedding_matrix.shape[1]\n            )\n            self.src_vocab_size = embedding_matrix.shape[0]\n            self.src_emb_dim = 
embedding_matrix.shape[1]\n\n        try:\n            self.src_embedding.weight.data.set_(\n                torch.from_numpy(embedding_matrix)\n            )\n        except BaseException:\n            self.src_embedding.weight.data.set_(\n                torch.from_numpy(embedding_matrix).cuda()\n            )\n\n        self.src_embedding.cuda()\n\n    def forward(\n        self, minibatch, task_idx, return_hidden=False, paired_trg=None\n    ):\n        \"\"\"Propogate input through the network.\n\n        Seq2Seq:\n        inputs: minibatch['input_src'], minibatch['input_trg']\n        input_src       - batch size x source sequence length\n        input_trg       - batch size x target sequence length\n        src_lengths     - batch size (list)\n        paired_trg      - batch size x target sequence length or None\n        returns: decoder_logit (pre-softmax over words)\n        decoder_logit   - batch size x target sequence length x target vocab size\n\n        NLI:\n        sent1           - batch size x source sequence length\n        sent2           - batch size x target sequence length\n        sent1_lengths   - batch size (list)\n        sent2_lengths   - batch size (list)\n        rev_sent1       - batch size (LongTensor)\n        rev_sent2       - batch size (LongTensor)\n        returns: class_logits (pre-softmax over NLI classes)\n        decoder_logit   - batch size x 3\n        \"\"\"\n        if minibatch[\"type\"] == \"nli\":\n            sent1_emb = self.src_embedding(minibatch[\"sent1\"])\n            sent2_emb = self.src_embedding(minibatch[\"sent2\"])\n\n            sent1_lengths = minibatch[\"sent1_lens\"].data.view(-1).tolist()\n            sent1_emb = pack_padded_sequence(\n                sent1_emb, sent1_lengths, batch_first=True\n            )\n            sent1, sent1_h = self.encoder(sent1_emb)\n\n            sent2_lengths = minibatch[\"sent2_lens\"].data.view(-1).tolist()\n            sent2_emb = pack_padded_sequence(\n                
sent2_emb, sent2_lengths, batch_first=True\n            )\n            sent2, sent2_h = self.encoder(sent2_emb)\n\n            if self.bidirectional:\n                sent1_h = torch.cat((sent1_h[-1], sent1_h[-2]), 1)\n                sent2_h = torch.cat((sent2_h[-1], sent2_h[-2]), 1)\n            else:\n                sent1_h = sent1_h[-1]\n                sent2_h = sent2_h[-1]\n\n            sent1_h = sent1_h.index_select(0, minibatch[\"rev_sent1\"])\n            sent2_h = sent2_h.index_select(0, minibatch[\"rev_sent2\"])\n\n            features = torch.cat(\n                (\n                    sent1_h,\n                    sent2_h,\n                    torch.abs(sent1_h - sent2_h),\n                    sent1_h * sent2_h,\n                ),\n                1,\n            )\n\n            if return_hidden:\n                return sent1_h, sent2_h, self.nli_decoder(features)\n            else:\n                return self.nli_decoder(features)\n\n        else:\n            src_emb = self.src_embedding(minibatch[\"input_src\"])\n            trg_emb = self.trg_embedding[task_idx](minibatch[\"input_trg\"])\n            src_lengths = minibatch[\"src_lens\"].data.view(-1).tolist()\n            src_emb = pack_padded_sequence(\n                src_emb, src_lengths, batch_first=True\n            )\n\n            _, src_h_t = self.encoder(src_emb)\n\n            if self.bidirectional:\n                h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)\n            else:\n                h_t = src_h_t[-1]\n\n            h_t = h_t.unsqueeze(0)\n            h_t = self.enc_drp(h_t)\n\n            # Debug with squeeze on error.\n            trg_h, _ = self.decoders[task_idx](\n                trg_emb,\n                h_t.view(-1, self.trg_hidden_dim),\n                h_t.view(-1, self.trg_hidden_dim),\n            )\n\n            trg_h_reshape = trg_h.contiguous().view(\n                trg_h.size(0) * trg_h.size(1), trg_h.size(2)\n            )\n\n            
decoder_logit = self.decoder2vocab[task_idx](trg_h_reshape)\n            decoder_logit = decoder_logit.view(\n                trg_h.size(0), trg_h.size(1), decoder_logit.size(1)\n            )\n\n            if (\n                self.paired_tasks is not None\n                and task_idx in self.paired_tasks\n                and paired_trg is not None\n            ):\n                other_task_idx = self.paired_tasks[task_idx]\n                trg_emb_2 = self.trg_embedding[other_task_idx](paired_trg)\n\n                trg_h_2, _ = self.decoders[other_task_idx](\n                    trg_emb_2, h_t.squeeze(), h_t.squeeze()\n                )\n\n                trg_h_reshape_2 = trg_h_2.contiguous().view(\n                    trg_h_2.size(0) * trg_h_2.size(1), trg_h_2.size(2)\n                )\n\n                decoder_logit_2 = self.decoder2vocab[other_task_idx](\n                    trg_h_reshape_2\n                )\n                decoder_logit_2 = decoder_logit_2.view(\n                    trg_h_2.size(0), trg_h_2.size(1), decoder_logit_2.size(1)\n                )\n                if return_hidden:\n                    return decoder_logit, decoder_logit_2, h_t\n                else:\n                    return decoder_logit, decoder_logit_2\n\n            if return_hidden:\n                return decoder_logit, h_t\n            else:\n                return decoder_logit\n\n    def decode(self, logits):\n        \"\"\"Return probability distribution over words.\"\"\"\n        logits_reshape = logits.view(-1, logits.size(2))\n        word_probs = F.softmax(logits_reshape)\n        word_probs = word_probs.view(\n            logits.size(0), logits.size(1), logits.size(2)\n        )\n        return word_probs\n\n    def get_hidden(self, input_src, src_lengths, strategy=\"last\"):\n        \"\"\"Return the encoder hidden state.\"\"\"\n        src_emb = self.src_embedding(input_src)\n        src_lengths = src_lengths.data.view(-1).tolist()\n        src_emb = 
pack_padded_sequence(src_emb, src_lengths, batch_first=True)\n        src_h, src_h_t = self.encoder(src_emb)\n        if strategy == \"last\":\n            if self.bidirectional:\n                h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)\n            else:\n                h_t = src_h_t[-1]\n        else:\n            src_h, _ = pad_packed_sequence(src_h, batch_first=True)\n            h_t = torch.max(src_h, 1)[0].squeeze()\n\n        return src_h, h_t\n\n\n# Original source: https://github.com/Maluuba/gensen\n"
  },
  {
    "path": "utils_nlp/models/gensen/preprocess_utils.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport shutil\nfrom utils_nlp.models.gensen import SNLI_CLEAN_PATH\n\n\ndef _preprocess(split_map, data_path, column_names):\n    \"\"\"\n    Method to save the tokens for each split in a snli_1.0_{split}.txt.clean file,\n    with the sentence pairs and scores tab-separated and the tokens separated by a\n    single space.\n\n    Args:\n        split_map(dict) : A dictionary containing train, test and dev\n        tokenized dataframes.\n        data_path(str): Path to the data folder.\n        column_names(list): List of column names for the new columns created.\n\n    \"\"\"\n\n    for file_split, df in split_map.items():\n        base_txt_path = os.path.join(\n            data_path, SNLI_CLEAN_PATH, \"snli_1.0_{}.txt\".format(file_split)\n        )\n\n        df[column_names[0]] = df[\"sentence1_tokens\"].apply(\n            lambda x: \" \" \"\".join(x)\n        )\n        df[column_names[1]] = df[\"sentence2_tokens\"].apply(\n            lambda x: \" \" \"\".join(x)\n        )\n        df[column_names[0]].to_csv(\n            \"{}.s1.tok\".format(base_txt_path),\n            sep=\" \",\n            header=False,\n            index=False,\n        )\n        df[column_names[1]].to_csv(\n            \"{}.s2.tok\".format(base_txt_path),\n            sep=\" \",\n            header=False,\n            index=False,\n        )\n        df[column_names[2]].to_csv(\n            \"{}.lab\".format(base_txt_path), sep=\" \", header=False, index=False\n        )\n        df_clean = df[column_names]\n        df_clean.to_csv(\n            \"{}.clean\".format(base_txt_path),\n            sep=\"\\t\",\n            header=False,\n            index=False,\n        )\n        # remove rows with blank scores\n        df_noblank = df_clean.loc[df_clean[column_names[2]] != \"-\"].copy()\n        df_noblank.to_csv(\n            
\"{}.clean.noblank\".format(base_txt_path),\n            sep=\"\\t\",\n            header=False,\n            index=False,\n        )\n\n\ndef _split_and_cleanup(split_map, data_path):\n    \"\"\"\n    Method that removes quotations from .tok files and saves the tokenized sentence\n    and labels separately, in the form snli_1.0_{split}.txt.s1.tok or snli_1.0_{\n    split}.txt.s2.tok or snli_1.0_{split}.txt.lab.\n\n    Args:\n        split_map(dict) : A dictionary containing train, test and dev\n        tokenized dataframes.\n        data_path(str): Path to the data folder.\n\n    \"\"\"\n\n    for file_split in split_map.keys():\n\n        s1_tok_path = os.path.join(\n            data_path,\n            SNLI_CLEAN_PATH,\n            \"snli_1.0_{}.txt.s1.tok\".format(file_split),\n        )\n        s2_tok_path = os.path.join(\n            data_path,\n            SNLI_CLEAN_PATH,\n            \"snli_1.0_{}.txt.s2.tok\".format(file_split),\n        )\n        with open(s1_tok_path, \"r\") as fin, open(\n            \"{}.tmp\".format(s1_tok_path), \"w\"\n        ) as tmp:\n            for line in fin:\n                s = line.replace('\"', \"\")\n                tmp.write(s)\n        with open(s2_tok_path, \"r\") as fin, open(\n            \"{}.tmp\".format(s2_tok_path), \"w\"\n        ) as tmp:\n            for line in fin:\n                s = line.replace('\"', \"\")\n                tmp.write(s)\n        shutil.move(\"{}.tmp\".format(s1_tok_path), s1_tok_path)\n        shutil.move(\"{}.tmp\".format(s2_tok_path), s2_tok_path)\n\n\ndef gensen_preprocess(train_tok, dev_tok, test_tok, data_path):\n    \"\"\"\n    Method to preprocess the train, validation and test datasets according to Gensen\n    models requirements.\n\n    Args:\n        train_tok(pd.Dataframe): Tokenized training dataframe.\n        dev_tok(pd.Dataframe): Tokenized validation dataframe.\n        test_tok(pd.Dataframe): Tokenized test dataframe.\n        data_path(str): Path to the data 
folder.\n\n    Returns:\n        str: Path to the processed dataset for GenSen.\n\n    \"\"\"\n\n    split_map = {}\n\n    if train_tok is not None:\n        split_map[\"train\"] = train_tok\n    if dev_tok is not None:\n        split_map[\"dev\"] = dev_tok\n    if test_tok is not None:\n        split_map[\"test\"] = test_tok\n\n    column_names = [\"s1.tok\", \"s2.tok\", \"score\"]\n\n    if not os.path.exists(os.path.join(data_path, SNLI_CLEAN_PATH)):\n        os.makedirs(os.path.join(data_path, SNLI_CLEAN_PATH), exist_ok=True)\n\n    _preprocess(split_map, data_path, column_names)\n    _split_and_cleanup(split_map, data_path)\n\n    return os.path.join(data_path, SNLI_CLEAN_PATH)\n"
  },
  {
    "path": "utils_nlp/models/gensen/utils.py",
    "content": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Minibatching utilities.\"\"\"\nimport itertools\nimport operator\nimport os\nimport pickle\n\nimport numpy as np\nimport torch\nfrom sklearn.utils import shuffle\nfrom torch.autograd import Variable\n\n# Change to python3+.\n# from itertools import zip\n\n\nclass DataIterator(object):\n    \"\"\"Data Iterator.\"\"\"\n\n    @staticmethod\n    def _trim_vocab(vocab, vocab_size):\n        \"\"\"Discard start, end, pad and unk tokens if already present.\n\n        Args:\n            vocab(list): Vocabulary.\n            vocab_size(int): The size of the vocabulary.\n\n        Returns:\n            word2id(list): Word to index list.\n            id2word(list): Index to word list.\n        \"\"\"\n        if \"<s>\" in vocab:\n            del vocab[\"<s>\"]\n        if \"<pad>\" in vocab:\n            del vocab[\"<pad>\"]\n        if \"</s>\" in vocab:\n            del vocab[\"</s>\"]\n        if \"<unk>\" in vocab:\n            del vocab[\"<unk>\"]\n\n        word2id = {\"<s>\": 0, \"<pad>\": 1, \"</s>\": 2, \"<unk>\": 3}\n\n        id2word = {0: \"<s>\", 1: \"<pad>\", 2: \"</s>\", 3: \"<unk>\"}\n\n        sorted_word2id = sorted(\n            vocab.items(), key=operator.itemgetter(1), reverse=True\n        )\n\n        if vocab_size != -1:\n            sorted_words = [x[0] for x in sorted_word2id[:vocab_size]]\n        else:\n            sorted_words = [x[0] for x in sorted_word2id]\n\n        for ind, word in enumerate(sorted_words):\n            word2id[word] = ind + 4\n\n        for ind, word in enumerate(sorted_words):\n            id2word[ind + 4] = word\n\n        return word2id, id2word\n\n    def construct_vocab(\n        self, sentences, vocab_size, lowercase=False, charlevel=False\n    ):\n        \"\"\"Create vocabulary.\n\n        Args:\n            sentences(list): The list of sentences.\n    
        vocab_size(int): The size of vocabulary.\n            lowercase(bool): If lowercase the sentences.\n            charlevel(bool): If need to split the sentence with space.\n\n        Returns:\n            word2id(list): Word to index list.\n            id2word(list): Index to word list.\n        \"\"\"\n        vocab = {}\n        for sentence in sentences:\n            if isinstance(sentence, str):\n                if lowercase:\n                    sentence = sentence.lower()\n                if not charlevel:\n                    sentence = sentence.split()\n            for word in sentence:\n                if word not in vocab:\n                    vocab[word] = 1\n                else:\n                    vocab[word] += 1\n        word2id, id2word = self._trim_vocab(vocab, vocab_size)\n        return word2id, id2word\n\n\nclass BufferedDataIterator(DataIterator):\n    \"\"\"Multi Parallel corpus data iterator.\"\"\"\n\n    def __init__(\n        self,\n        src,\n        trg,\n        src_vocab_size,\n        trg_vocab_size,\n        tasknames,\n        save_dir,\n        buffer_size=1e6,\n        lowercase=False,\n        seed=0,\n    ):\n        \"\"\"Initialize params.\n\n        Args:\n            src(list): source dataset.\n            trg(list): target dataset.\n            src_vocab_size(int): The size of source vocab.\n            trg_vocab_size(int): The size of target vocab.\n            tasknames(list): The list of task names.\n            save_dir(str): The saving dir.\n            buffer_size(float): Buffer size.\n            lowercase(bool): if lowercase the data.\n        \"\"\"\n        self.seed = seed\n        self.fname_src = src\n        self.fname_trg = trg\n        self.src_vocab_size = src_vocab_size\n        self.trg_vocab_size = trg_vocab_size\n        self.tasknames = tasknames\n        self.save_dir = save_dir\n        self.buffer_size = buffer_size\n        self.lowercase = lowercase\n\n        # Open a list of file 
pointers to all the files.\n        self.f_src = [\n            open(fname, \"r\", encoding=\"utf-8\") for fname in self.fname_src\n        ]\n        self.f_trg = [\n            open(fname, \"r\", encoding=\"utf-8\") for fname in self.fname_trg\n        ]\n\n        # Initialize dictionaries that contain sentences & word mapping dicts\n        self.src = [\n            {\"data\": [], \"word2id\": None, \"id2word\": None}\n            for i in range(len(self.fname_src))\n        ]\n        self.trg = [\n            {\"data\": [], \"word2id\": None, \"id2word\": None}\n            for i in range(len(self.fname_trg))\n        ]\n        self.build_vocab()\n\n        \"\"\"Reset file pointers to the start after reading the file to\n        build vocabularies.\"\"\"\n        for idx in range(len(self.src)):\n            self._reset_filepointer(idx)\n        for idx in range(len(self.src)):\n            self.fetch_buffer(idx)\n\n    def _reset_filepointer(self, idx):\n        \"\"\"Reset file pointer.\n\n        Args:\n            idx(int): Index used to reset file pointer.\n\n        \"\"\"\n        self.f_src[idx] = open(self.fname_src[idx], \"r\", encoding=\"utf-8\")\n        self.f_trg[idx] = open(self.fname_trg[idx], \"r\", encoding=\"utf-8\")\n\n    def fetch_buffer(self, idx, reset=True):\n        \"\"\"Fetch sentences from the file into the buffer.\n\n        Args:\n            idx(int): Index used to fetch the sentences.\n            reset(bool): If need to reset the contents of the current buffer.\n\n        \"\"\"\n        # Reset the contents of the current buffer.\n        if reset:\n            self.src[idx][\"data\"] = []\n            self.trg[idx][\"data\"] = []\n\n        # Populate buffer\n        for src, trg in zip(self.f_src[idx], self.f_trg[idx]):\n            if len(self.src[idx][\"data\"]) == self.buffer_size:\n                break\n            if self.lowercase:\n                self.src[idx][\"data\"].append(src.lower().split())\n              
  self.trg[idx][\"data\"].append(trg.lower().split())\n\n            else:\n                self.src[idx][\"data\"].append(src.split())\n                self.trg[idx][\"data\"].append(trg.split())\n\n        # Sort sentences by decreasing length (hacky bucketing)\n        self.src[idx][\"data\"], self.trg[idx][\"data\"] = zip(\n            *sorted(\n                zip(self.src[idx][\"data\"], self.trg[idx][\"data\"]),\n                key=lambda x: len(x[0]),\n                reverse=True,\n            )\n        )\n\n        \"\"\"If buffer isn't full after reading the contents of the file,\n        cycle around. \"\"\"\n        if len(self.src[idx][\"data\"]) < self.buffer_size:\n            assert len(self.src[idx][\"data\"]) == len(self.trg[idx][\"data\"])\n            # Cast things to list to avoid issue with calling .append above\n            self.src[idx][\"data\"] = list(self.src[idx][\"data\"])\n            self.trg[idx][\"data\"] = list(self.trg[idx][\"data\"])\n            self._reset_filepointer(idx)\n            self.fetch_buffer(idx, reset=False)\n\n    def build_vocab(self):\n        \"\"\"Build a memory efficient vocab.\"\"\"\n        # Construct common source vocab.\n        # Check if save directory exists.\n        if not os.path.exists(self.save_dir):\n            raise ValueError(\"Could not find save dir : %s\" % self.save_dir)\n\n        # Check if a cached vocab file exists.\n        if os.path.exists(os.path.join(self.save_dir, \"src_vocab.pkl\")):\n            vocab = pickle.load(\n                open(os.path.join(self.save_dir, \"src_vocab.pkl\"), \"rb\")\n            )\n            word2id, id2word = vocab[\"word2id\"], vocab[\"id2word\"]\n        # If not, compute the vocab from scratch and store a cache.\n        else:\n            word2id, id2word = self.construct_vocab(\n                itertools.chain.from_iterable(self.f_src),\n                self.src_vocab_size,\n                self.lowercase,\n            )\n            
pickle.dump(\n                {\"word2id\": word2id, \"id2word\": id2word},\n                open(os.path.join(self.save_dir, \"src_vocab.pkl\"), \"wb\"),\n            )\n        for corpus in self.src:\n            corpus[\"word2id\"], corpus[\"id2word\"] = word2id, id2word\n\n        # Do the same for the target vocabulary.\n        if os.path.exists(os.path.join(self.save_dir, \"trg_vocab.pkl\")):\n            vocab = pickle.load(\n                open(os.path.join(self.save_dir, \"trg_vocab.pkl\"), \"rb\")\n            )\n            for idx, (corpus, fname) in enumerate(zip(self.trg, self.f_trg)):\n                word2id, id2word = (\n                    vocab[self.tasknames[idx]][\"word2id\"],\n                    vocab[self.tasknames[idx]][\"id2word\"],\n                )\n                corpus[\"word2id\"], corpus[\"id2word\"] = word2id, id2word\n        else:\n            trg_vocab_dump = {}\n            for idx, (corpus, fname) in enumerate(zip(self.trg, self.f_trg)):\n                word2id, id2word = self.construct_vocab(\n                    fname, self.trg_vocab_size, self.lowercase\n                )\n                corpus[\"word2id\"], corpus[\"id2word\"] = word2id, id2word\n                trg_vocab_dump[self.tasknames[idx]] = {}\n                trg_vocab_dump[self.tasknames[idx]][\"word2id\"] = word2id\n                trg_vocab_dump[self.tasknames[idx]][\"id2word\"] = id2word\n\n            pickle.dump(\n                trg_vocab_dump,\n                open(os.path.join(self.save_dir, \"trg_vocab.pkl\"), \"wb\"),\n            )\n\n    def shuffle_dataset(self, idx):\n        \"\"\"Shuffle current buffer.\"\"\"\n        self.src[idx][\"data\"], self.trg[idx][\"data\"] = shuffle(\n            self.src[idx][\"data\"],\n            self.trg[idx][\"data\"],\n            random_state=self.seed,\n        )\n\n    def get_parallel_minibatch(\n        self, corpus_idx, index, batch_size, max_len_src, max_len_trg\n    ):\n        \"\"\"Prepare 
minibatch.\n\n        Args:\n            corpus_idx(int): Corpus Index.\n            index(int): Index.\n            batch_size(int): Batch Size.\n            max_len_src(int): Max length for resource.\n            max_len_trg(int): Max length ofr target.\n\n        Returns: minibatch of src-trg pairs(dict).\n\n        \"\"\"\n        src_lines = [\n            [\"<s>\"] + line[: max_len_src - 2] + [\"</s>\"]\n            for line in self.src[corpus_idx][\"data\"][\n                index : index + batch_size\n            ]\n        ]\n\n        trg_lines = [\n            [\"<s>\"] + line[: max_len_trg - 2] + [\"</s>\"]\n            for line in self.trg[corpus_idx][\"data\"][\n                index : index + batch_size\n            ]\n        ]\n\n        \"\"\"Sort sentences by decreasing length within a minibatch for\n        `torch.nn.utils.packed_padded_sequence`\"\"\"\n        src_lens = [len(line) for line in src_lines]\n        sorted_indices = np.argsort(src_lens)[::-1]\n\n        sorted_src_lines = [src_lines[idx] for idx in sorted_indices]\n        sorted_trg_lines = [trg_lines[idx] for idx in sorted_indices]\n\n        sorted_src_lens = [len(line) for line in sorted_src_lines]\n        sorted_trg_lens = [len(line) for line in sorted_trg_lines]\n\n        max_src_len = max(sorted_src_lens)\n        max_trg_len = max(sorted_trg_lens)\n\n        # Map words to indices\n        input_lines_src = [\n            [\n                self.src[corpus_idx][\"word2id\"][w]\n                if w in self.src[corpus_idx][\"word2id\"]\n                else self.src[corpus_idx][\"word2id\"][\"<unk>\"]\n                for w in line\n            ]\n            + [self.src[corpus_idx][\"word2id\"][\"<pad>\"]]\n            * (max_src_len - len(line))\n            for line in sorted_src_lines\n        ]\n\n        input_lines_trg = [\n            [\n                self.trg[corpus_idx][\"word2id\"][w]\n                if w in self.trg[corpus_idx][\"word2id\"]\n                
else self.trg[corpus_idx][\"word2id\"][\"<unk>\"]\n                for w in line[:-1]\n            ]\n            + [self.trg[corpus_idx][\"word2id\"][\"<pad>\"]]\n            * (max_trg_len - len(line))\n            for line in sorted_trg_lines\n        ]\n\n        output_lines_trg = [\n            [\n                self.trg[corpus_idx][\"word2id\"][w]\n                if w in self.trg[corpus_idx][\"word2id\"]\n                else self.trg[corpus_idx][\"word2id\"][\"<unk>\"]\n                for w in line[1:]\n            ]\n            + [self.trg[corpus_idx][\"word2id\"][\"<pad>\"]]\n            * (max_trg_len - len(line))\n            for line in sorted_trg_lines\n        ]\n\n        # Cast lists to torch tensors\n        input_lines_src = Variable(torch.LongTensor(input_lines_src)).cuda()\n        input_lines_trg = Variable(torch.LongTensor(input_lines_trg)).cuda()\n        output_lines_trg = Variable(torch.LongTensor(output_lines_trg)).cuda()\n        sorted_src_lens = (\n            Variable(torch.LongTensor(sorted_src_lens), volatile=True)\n            .squeeze()\n            .cuda()\n        )\n\n        # Return minibatch of src-trg pairs\n        return {\n            \"input_src\": input_lines_src,\n            \"input_trg\": input_lines_trg,\n            \"output_trg\": output_lines_trg,\n            \"src_lens\": sorted_src_lens,\n            \"type\": \"seq2seq\",\n        }\n\n\nclass NLIIterator(DataIterator):\n    \"\"\"Data iterator for tokenized NLI datasets.\"\"\"\n\n    def __init__(\n        self, train, dev, test, vocab_size, lowercase=True, vocab=None, seed=0\n    ):\n        \"\"\"Initialize params.\n\n        Each of train/dev/test is a tab-separate file of the form\n        premise \\t hypothesis \\t label.\n\n        Args:\n            train(torch.Tensor): Training dataset.\n            dev(torch.Tensor): Validation dataset.\n            test(torch.Tensor): Testing dataset.\n            vocab_size(int): The size of the vocabulary.\n 
           lowercase(bool): If lowercase the dataset.\n            vocab(Union[bytes,str): The list of the vocabulary.\n        \"\"\"\n        self.seed = seed\n        self.train = train\n        self.dev = dev\n        self.test = test\n        self.vocab_size = vocab_size\n        self.lowercase = lowercase\n        self.vocab = vocab\n        self.train_lines = [\n            line.strip().lower().split(\"\\t\")\n            for line in open(self.train, encoding=\"utf-8\")\n        ]\n        self.dev_lines = [\n            line.strip().lower().split(\"\\t\")\n            for line in open(self.dev, encoding=\"utf-8\")\n        ]\n        self.test_lines = [\n            line.strip().lower().split(\"\\t\")\n            for line in open(self.test, encoding=\"utf-8\")\n        ]\n\n        if self.vocab is not None:\n            # binary mode doesn't take an encoding argument\n            self.vocab = pickle.load(open(self.vocab, \"rb\"))\n            self.word2id = self.vocab[\"word2id\"]\n            self.id2word = self.vocab[\"id2word\"]\n            self.vocab_size = len(self.word2id)\n        else:\n            self.word2id, self.id2word = self.construct_vocab(\n                [x[0] for x in self.train_lines]\n                + [x[1] for x in self.train_lines],\n                self.vocab_size,\n                lowercase=self.lowercase,\n            )\n\n        # Label text to class mapping.\n        self.text2label = {\"entailment\": 0, \"neutral\": 1, \"contradiction\": 2}\n\n        self.shuffle_dataset()\n\n    def shuffle_dataset(self):\n        \"\"\"Shuffle training data.\"\"\"\n        self.train_lines = shuffle(self.train_lines, random_state=self.seed)\n\n    def get_parallel_minibatch(self, index, batch_size, sent_type=\"train\"):\n        \"\"\"Prepare minibatch.\n\n        Args:\n            index(int): The index for line.\n            batch_size(int): Batch size.\n            sent_type(str): Type of dataset.\n\n        Returns:\n            
dict for batch training.\n        \"\"\"\n        if sent_type == \"train\":\n            lines = self.train_lines\n        elif sent_type == \"dev\":\n            lines = self.dev_lines\n        else:\n            lines = self.test_lines\n\n        sent1 = [\n            [\"<s>\"] + line[0].split() + [\"</s>\"]\n            for line in lines[index : index + batch_size]\n        ]\n\n        sent2 = [\n            [\"<s>\"] + line[1].split() + [\"</s>\"]\n            for line in lines[index : index + batch_size]\n        ]\n\n        labels = [\n            self.text2label[line[2]]\n            for line in lines[index : index + batch_size]\n        ]\n\n        sent1_lens = [len(line) for line in sent1]\n        sorted_sent1_indices = np.argsort(sent1_lens)[::-1]\n        sorted_sent1_lines = [sent1[idx] for idx in sorted_sent1_indices]\n        rev_sent1 = np.argsort(sorted_sent1_indices)\n\n        sent2_lens = [len(line) for line in sent2]\n        sorted_sent2_indices = np.argsort(sent2_lens)[::-1]\n        sorted_sent2_lines = [sent2[idx] for idx in sorted_sent2_indices]\n        rev_sent2 = np.argsort(sorted_sent2_indices)\n\n        sorted_sent1_lens = [len(line) for line in sorted_sent1_lines]\n        sorted_sent2_lens = [len(line) for line in sorted_sent2_lines]\n\n        max_sent1_len = max(sorted_sent1_lens)\n        max_sent2_len = max(sorted_sent2_lens)\n\n        sent1 = [\n            [\n                self.word2id[w] if w in self.word2id else self.word2id[\"<unk>\"]\n                for w in line\n            ]\n            + [self.word2id[\"<pad>\"]] * (max_sent1_len - len(line))\n            for line in sorted_sent1_lines\n        ]\n\n        sent2 = [\n            [\n                self.word2id[w] if w in self.word2id else self.word2id[\"<unk>\"]\n                for w in line\n            ]\n            + [self.word2id[\"<pad>\"]] * (max_sent2_len - len(line))\n            for line in sorted_sent2_lines\n        ]\n\n        sent1 = 
Variable(torch.LongTensor(sent1)).cuda()\n        sent2 = Variable(torch.LongTensor(sent2)).cuda()\n        labels = Variable(torch.LongTensor(labels)).cuda()\n        sent1_lens = (\n            Variable(torch.LongTensor(sorted_sent1_lens), requires_grad=False)\n            .squeeze()\n            .cuda()\n        )\n        sent2_lens = (\n            Variable(torch.LongTensor(sorted_sent2_lens), requires_grad=False)\n            .squeeze()\n            .cuda()\n        )\n        rev_sent1 = (\n            Variable(torch.LongTensor(rev_sent1), requires_grad=False)\n            .squeeze()\n            .cuda()\n        )\n        rev_sent2 = (\n            Variable(torch.LongTensor(rev_sent2), requires_grad=False)\n            .squeeze()\n            .cuda()\n        )\n\n        return {\n            \"sent1\": sent1,\n            \"sent2\": sent2,\n            \"sent1_lens\": sent1_lens,\n            \"sent2_lens\": sent2_lens,\n            \"rev_sent1\": rev_sent1,\n            \"rev_sent2\": rev_sent2,\n            \"labels\": labels,\n            \"type\": \"nli\",\n        }\n\n\ndef get_validation_minibatch(\n    src, trg, index, batch_size, src_word2id, trg_word2id\n):\n    \"\"\"Prepare minibatch.\n\n    Args:\n        src(list): source data.\n        trg(list): target data.\n        index(int): index for the file.\n        batch_size(int): batch size.\n        src_word2id(list): Word to index for source.\n        trg_word2id(list): Word to index for target.\n\n    Returns:\n        Dict for seq2seq model.\n    \"\"\"\n    src_lines = [\n        [\"<s>\"] + line + [\"</s>\"] for line in src[index : index + batch_size]\n    ]\n\n    trg_lines = [\n        [\"<s>\"] + line + [\"</s>\"] for line in trg[index : index + batch_size]\n    ]\n\n    src_lens = [len(line) for line in src_lines]\n    sorted_indices = np.argsort(src_lens)[::-1]\n\n    sorted_src_lines = [src_lines[idx] for idx in sorted_indices]\n    sorted_trg_lines = [trg_lines[idx] for idx in 
sorted_indices]\n\n    sorted_src_lens = [len(line) for line in sorted_src_lines]\n    sorted_trg_lens = [len(line) for line in sorted_trg_lines]\n\n    max_src_len = max(sorted_src_lens)\n    max_trg_len = max(sorted_trg_lens)\n\n    input_lines_src = [\n        [src_word2id[w] if w in src else src_word2id[\"<unk>\"] for w in line]\n        + [src_word2id[\"<pad>\"]] * (max_src_len - len(line))\n        for line in sorted_src_lines\n    ]\n\n    input_lines_trg = [\n        [\n            trg_word2id[w] if w in trg_word2id else trg_word2id[\"<unk>\"]\n            for w in line[:-1]\n        ]\n        + [trg_word2id[\"<pad>\"]] * (max_trg_len - len(line))\n        for line in sorted_trg_lines\n    ]\n\n    output_lines_trg = [\n        [\n            trg_word2id[w] if w in trg_word2id else trg_word2id[\"<unk>\"]\n            for w in line[1:]\n        ]\n        + [trg_word2id[\"<pad>\"]] * (max_trg_len - len(line))\n        for line in sorted_trg_lines\n    ]\n    # For pytroch 0.4\n    with torch.no_grad():\n        input_lines_src = Variable(torch.LongTensor(input_lines_src)).cuda()\n        input_lines_trg = Variable(torch.LongTensor(input_lines_trg)).cuda()\n        output_lines_trg = Variable(torch.LongTensor(output_lines_trg)).cuda()\n        # sorted_src_lens = Variable(\n        #     torch.LongTensor(sorted_src_lens)\n        # ).squeeze().cuda()\n        sorted_src_lens = (\n            Variable(torch.LongTensor(sorted_src_lens))\n            .view(len(sorted_src_lens))\n            .cuda()\n        )\n    return {\n        \"input_src\": input_lines_src,\n        \"input_trg\": input_lines_trg,\n        \"output_trg\": output_lines_trg,\n        \"src_lens\": sorted_src_lens,\n        \"type\": \"seq2seq\",\n    }\n\n\ndef compute_validation_loss(\n    config, model, train_iterator, criterion, task_idx, lowercase=False\n):\n    \"\"\"Compute validation loss for a task.\n\n    Args:\n        config(dict): configuration list.\n        
model(MultitaskModel): model.\n        train_iterator(BufferedDataIterator): Multi Parallel corpus data iterator.\n        criterion(nn.CrossEntropyLoss): criterion function for loss.\n        task_idx(int): Task index.\n        lowercase(bool): If lowercase the data.\n\n    Returns: float as the mean of the loss.\n\n    \"\"\"\n    val_src = config[\"data\"][\"paths\"][task_idx][\"val_src\"]\n    val_trg = config[\"data\"][\"paths\"][task_idx][\"val_trg\"]\n\n    if lowercase:\n        val_src = [\n            line.strip().lower().split()\n            for line in open(val_src, \"r\", encoding=\"utf-8\")\n        ]\n        val_trg = [\n            line.strip().lower().split()\n            for line in open(val_trg, \"r\", encoding=\"utf-8\")\n        ]\n    else:\n        val_src = [\n            line.strip().split()\n            for line in open(val_src, \"r\", encoding=\"utf-8\")\n        ]\n        val_trg = [\n            line.strip().split()\n            for line in open(val_trg, \"r\", encoding=\"utf-8\")\n        ]\n\n    batch_size = config[\"training\"][\"batch_size\"]\n    losses = []\n    for j in range(0, len(val_src), batch_size):\n        minibatch = get_validation_minibatch(\n            val_src,\n            val_trg,\n            j,\n            batch_size,\n            train_iterator.src[task_idx][\"word2id\"],\n            train_iterator.trg[task_idx][\"word2id\"],\n        )\n        decoder_logit = model(minibatch, task_idx)\n\n        loss = criterion(\n            decoder_logit.contiguous().view(-1, decoder_logit.size(2)),\n            minibatch[\"output_trg\"].contiguous().view(-1),\n        )\n\n        # losses.append(loss.data[0])\n        losses.append(loss.item())\n\n    return np.mean(losses)\n\n\n# Original source: https://github.com/Maluuba/gensen\n"
  },
  {
    "path": "utils_nlp/models/glove/Makefile",
    "content": "CC = gcc\n#For older gcc, use -O3 or -O2 instead of -Ofast\n# CFLAGS = -lm -pthread -Ofast -march=native -funroll-loops -Wno-unused-result\nCFLAGS = -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic\nBUILDDIR := build\nSRCDIR := src\n\nall: dir glove shuffle cooccur vocab_count\n\ndir :\n\tmkdir -p $(BUILDDIR)\nglove : $(SRCDIR)/glove.c\n\t$(CC) $(SRCDIR)/glove.c -o $(BUILDDIR)/glove $(CFLAGS)\nshuffle : $(SRCDIR)/shuffle.c\n\t$(CC) $(SRCDIR)/shuffle.c -o $(BUILDDIR)/shuffle $(CFLAGS)\ncooccur : $(SRCDIR)/cooccur.c\n\t$(CC) $(SRCDIR)/cooccur.c -o $(BUILDDIR)/cooccur $(CFLAGS)\nvocab_count : $(SRCDIR)/vocab_count.c\n\t$(CC) $(SRCDIR)/vocab_count.c -o $(BUILDDIR)/vocab_count $(CFLAGS)\n\nclean:\n\trm -rf glove shuffle cooccur vocab_count build\n"
  },
  {
    "path": "utils_nlp/models/glove/README.md",
    "content": "## GloVe: Global Vectors for Word Representation\n\n\n| nearest neighbors of <br/> <em>frog</em> | Litoria             |  Leptodactylidae | Rana | Eleutherodactylus |\n| --- | ------------------------------- | ------------------- | ---------------- | ------------------- |\n| Pictures | <img src=\"http://nlp.stanford.edu/projects/glove/images/litoria.jpg\"></img> | <img src=\"http://nlp.stanford.edu/projects/glove/images/leptodactylidae.jpg\"></img> | <img src=\"http://nlp.stanford.edu/projects/glove/images/rana.jpg\"></img> | <img src=\"http://nlp.stanford.edu/projects/glove/images/eleutherodactylus.jpg\"></img> |\n\n| Comparisons | man -> woman             |  city -> zip | comparative -> superlative |\n| --- | ------------------------|-------------------------|-------------------------|\n| GloVe Geometry | <img src=\"http://nlp.stanford.edu/projects/glove/images/man_woman_small.jpg\"></img>  | <img src=\"http://nlp.stanford.edu/projects/glove/images/city_zip_small.jpg\"></img> | <img src=\"http://nlp.stanford.edu/projects/glove/images/comparative_superlative_small.jpg\"></img> |\n\nWe provide an implementation of the GloVe model for learning word representations, and describe how to download web-dataset vectors or train your own. See the [project page](http://nlp.stanford.edu/projects/glove/) or the [paper](http://nlp.stanford.edu/pubs/glove.pdf) for more information on glove vectors.\n\n## Download pre-trained word vectors\nThe links below contain word vectors obtained from the respective corpora. If you want word vectors trained on massive web datasets, you need only download one of these text files! Pre-trained word vectors are made available under the <a href=\"http://opendatacommons.org/licenses/pddl/\">Public Domain Dedication and License</a>. 
\n<div class=\"entry\">\n<ul style=\"padding-left:0px; margin-top:0px; margin-bottom:0px\">\n  <li> Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download): <a href=\"http://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip\">glove.42B.300d.zip</a> </li>\n  <li> Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download): <a href=\"http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip\">glove.840B.300d.zip</a> </li>\n  <li> Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 300d vectors, 822 MB download): <a href=\"http://nlp.stanford.edu/data/wordvecs/glove.6B.zip\">glove.6B.zip</a> </li>\n  <li> Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors, 1.42 GB download): <a href=\"http://nlp.stanford.edu/data/wordvecs/glove.twitter.27B.zip\">glove.twitter.27B.zip</a> </li>\n</ul>\n</div>\n\n## Train word vectors on a new corpus\n\n<img src=\"https://travis-ci.org/stanfordnlp/GloVe.svg?branch=master\"></img>\n\nIf the web datasets above don't match the semantics of your end use case, you can train word vectors on your own corpus.\n\n    $ git clone http://github.com/stanfordnlp/glove\n    $ cd glove && make\n    $ ./demo.sh\n\nThe demo.sh script downloads a small corpus, consisting of the first 100M characters of Wikipedia. It collects unigram counts, constructs and shuffles cooccurrence data, and trains a simple version of the GloVe model. It also runs a word analogy evaluation script in Python to verify word vector quality. More details about training on your own corpus can be found by reading [demo.sh](https://github.com/stanfordnlp/GloVe/blob/master/demo.sh) or the [src/README.md](https://github.com/stanfordnlp/GloVe/tree/master/src).\n\n### License\nAll work contained in this package is licensed under the Apache License, Version 2.0. See the included LICENSE file.\n"
  },
  {
    "path": "utils_nlp/models/glove/demo.sh",
    "content": "#!/bin/bash\nset -e\n\n# Makes programs, downloads sample data, trains a GloVe model, and then evaluates it.\n# One optional argument can specify the language used for eval script: matlab, octave or [default] python\n\nmake\nif [ ! -e text8 ]; then\n  if hash wget 2>/dev/null; then\n    wget http://mattmahoney.net/dc/text8.zip\n  else\n    curl -O http://mattmahoney.net/dc/text8.zip\n  fi\n  unzip text8.zip\n  rm text8.zip\nfi\n\nCORPUS=text8\nVOCAB_FILE=vocab.txt\nCOOCCURRENCE_FILE=cooccurrence.bin\nCOOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin\nBUILDDIR=build\nSAVE_FILE=vectors\nVERBOSE=2\nMEMORY=4.0\nVOCAB_MIN_COUNT=5\nVECTOR_SIZE=50\nMAX_ITER=15\nWINDOW_SIZE=15\nBINARY=2\nNUM_THREADS=8\nX_MAX=10\n\necho\necho \"$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE\"\n$BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE\necho \"$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE\"\n$BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE\necho \"$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE\"\n$BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE\necho \"$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE\"\n$BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE\nif [ \"$CORPUS\" = 'text8' ]; then\n   if [ \"$1\" = 'matlab' ]; then\n       matlab -nodisplay -nodesktop -nojvm -nosplash < 
./eval/matlab/read_and_evaluate.m 1>&2 \n   elif [ \"$1\" = 'octave' ]; then\n       octave < ./eval/octave/read_and_evaluate_octave.m 1>&2\n   else\n       echo \"$ python eval/python/evaluate.py\"\n       python eval/python/evaluate.py\n   fi\nfi\n"
  },
  {
    "path": "utils_nlp/models/glove/src/README.md",
    "content": "### Package Contents\n\nTo train your own GloVe vectors, first you'll need to prepare your corpus as a single text file with all words separated by one or more spaces or tabs. If your corpus has multiple documents, the documents (only) should be separated by new line characters. Cooccurrence contexts for words do not extend past newline characters. Once you create your corpus, you can train GloVe vectors using the following 4 tools. An example is included in `demo.sh`, which you can modify as necessary.\n\nThe four main tools in this package are:\n\n#### 1) vocab_count\nThis tool requires an input corpus that should already consist of whitespace-separated tokens. Use something like the [Stanford Tokenizer](https://nlp.stanford.edu/software/tokenizer.html) first on raw text. From the corpus, it constructs unigram counts from a corpus, and optionally thresholds the resulting vocabulary based on total vocabulary size or minimum frequency count.\n\n#### 2) cooccur\nConstructs word-word cooccurrence statistics from a corpus. The user should supply a vocabulary file, as produced by `vocab_count`, and may specify a variety of parameters, as described by running `./build/cooccur`.\n\n#### 3) shuffle\nShuffles the binary file of cooccurrence statistics produced by `cooccur`. For large files, the file is automatically split into chunks, each of which is shuffled and stored on disk before being merged and shuffled together. The user may specify a number of parameters, as described by running `./build/shuffle`.\n\n#### 4) glove\nTrain the GloVe model on the specified cooccurrence data, which typically will be the output of the `shuffle` tool. The user should supply a vocabulary file, as given by `vocab_count`, and may specify a number of other parameters, which are described by running `./build/glove`.\n"
  },
  {
    "path": "utils_nlp/models/glove/src/cooccur.c",
    "content": "//  Tool to calculate word-word cooccurrence statistics\n//\n//  Copyright (c) 2014, 2018 The Board of Trustees of\n//  The Leland Stanford Junior University. All Rights Reserved.\n//\n//  Licensed under the Apache License, Version 2.0 (the \"License\");\n//  you may not use this file except in compliance with the License.\n//  You may obtain a copy of the License at\n//\n//      http://www.apache.org/licenses/LICENSE-2.0\n//\n//  Unless required by applicable law or agreed to in writing, software\n//  distributed under the License is distributed on an \"AS IS\" BASIS,\n//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n//  See the License for the specific language governing permissions and\n//  limitations under the License.\n//\n//\n//  For more information, bug reports, fixes, contact:\n//    Jeffrey Pennington (jpennin@stanford.edu)\n//    Christopher Manning (manning@cs.stanford.edu)\n//    https://github.com/stanfordnlp/GloVe/\n//    GlobalVectors@googlegroups.com\n//    http://nlp.stanford.edu/projects/glove/\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <math.h>\n\n#define MAX_STRING_LENGTH 1000\n#define TSIZE 1048576\n#define SEED 1159241\n\n#define HASHFN bitwisehash\n\ntypedef double real;\n\ntypedef struct cooccur_rec {\n    int word1;\n    int word2;\n    real val;\n} CREC;\n\ntypedef struct cooccur_rec_id {\n    int word1;\n    int word2;\n    real val;\n    int id;\n} CRECID;\n\ntypedef struct hashrec {\n    char        *word;\n    long long id;\n    struct hashrec *next;\n} HASHREC;\n\nint verbose = 2; // 0, 1, or 2\nlong long max_product; // Cutoff for product of word frequency ranks below which cooccurrence counts will be stored in a compressed full array\nlong long overflow_length; // Number of cooccurrence records whose product exceeds max_product to store in memory before writing to disk\nint window_size = 15; // default context window size\nint symmetric = 1; // 0: 
asymmetric, 1: symmetric\nreal memory_limit = 3; // soft limit, in gigabytes, used to estimate optimal array sizes\nint distance_weighting = 1; // Flag to control the distance weighting of cooccurrence counts\nchar *vocab_file, *file_head;\n\n/* Efficient string comparison */\nint scmp( char *s1, char *s2 ) {\n    while (*s1 != '\\0' && *s1 == *s2) {s1++; s2++;}\n    return(*s1 - *s2);\n}\n\n/* Move-to-front hashing and hash function from Hugh Williams, http://www.seg.rmit.edu.au/code/zwh-ipl/ */\n\n/* Simple bitwise hash function */\nunsigned int bitwisehash(char *word, int tsize, unsigned int seed) {\n    char c;\n    unsigned int h;\n    h = seed;\n    for (; (c =* word) != '\\0'; word++) h ^= ((h << 5) + c + (h >> 2));\n    return((unsigned int)((h&0x7fffffff) % tsize));\n}\n\n/* Create hash table, initialise pointers to NULL */\nHASHREC ** inithashtable() {\n    int i;\n    HASHREC **ht;\n    ht = (HASHREC **) malloc( sizeof(HASHREC *) * TSIZE );\n    for (i = 0; i < TSIZE; i++) ht[i] = (HASHREC *) NULL;\n    return(ht);\n}\n\n/* Search hash table for given string, return record if found, else NULL */\nHASHREC *hashsearch(HASHREC **ht, char *w) {\n    HASHREC     *htmp, *hprv;\n    unsigned int hval = HASHFN(w, TSIZE, SEED);\n    for (hprv = NULL, htmp=ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);\n    if ( htmp != NULL && hprv!=NULL ) { // move to front on access\n        hprv->next = htmp->next;\n        htmp->next = ht[hval];\n        ht[hval] = htmp;\n    }\n    return(htmp);\n}\n\n/* Insert string in hash table, check for duplicates which should be absent */\nvoid hashinsert(HASHREC **ht, char *w, long long id) {\n    HASHREC     *htmp, *hprv;\n    unsigned int hval = HASHFN(w, TSIZE, SEED);\n    for (hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);\n    if (htmp == NULL) {\n        htmp = (HASHREC *) malloc(sizeof(HASHREC));\n        htmp->word = (char *) 
malloc(strlen(w) + 1);\n        strcpy(htmp->word, w);\n        htmp->id = id;\n        htmp->next = NULL;\n        if (hprv == NULL) ht[hval] = htmp;\n        else hprv->next = htmp;\n    }\n    else fprintf(stderr, \"Error, duplicate entry located: %s.\\n\",htmp->word);\n    return;\n}\n\n/* Read word from input stream. Return 1 when encounter '\\n' or EOF (but separate from word), 0 otherwise.\n   Words can be separated by space(s), tab(s), or newline(s). Carriage return characters are just ignored.\n   (Okay for Windows, but not for Mac OS 9-. Ignored even if by themselves or in words.)\n   A newline is taken as indicating a new document (contexts won't cross newline).\n   Argument word array is assumed to be of size MAX_STRING_LENGTH.\n   words will be truncated if too long. They are truncated with some care so that they\n   cannot truncate in the middle of a utf-8 character, but\n   still little to no harm will be done for other encodings like iso-8859-1.\n   (This function appears identically copied in vocab_count.c and cooccur.c.)\n */\nint get_word(char *word, FILE *fin) {\n    int i = 0, ch;\n    for ( ; ; ) {\n        ch = fgetc(fin);\n        if (ch == '\\r') continue;\n        if (i == 0 && ((ch == '\\n') || (ch == EOF))) {\n            word[i] = 0;\n            return 1;\n        }\n        if (i == 0 && ((ch == ' ') || (ch == '\\t'))) continue; // skip leading space\n        if ((ch == EOF) || (ch == ' ') || (ch == '\\t') || (ch == '\\n')) {\n            if (ch == '\\n') ungetc(ch, fin); // return the newline next time as document ender\n            break;\n        }\n        if (i < MAX_STRING_LENGTH - 1)\n          word[i++] = ch; // don't allow words to exceed MAX_STRING_LENGTH\n    }\n    word[i] = 0; //null terminate\n    // avoid truncation destroying a multibyte UTF-8 char except if only thing on line (so the i > x tests won't overwrite word[0])\n    // see https://en.wikipedia.org/wiki/UTF-8#Description\n    if (i == MAX_STRING_LENGTH - 1 && 
(word[i-1] & 0x80) == 0x80) {\n        if ((word[i-1] & 0xC0) == 0xC0) {\n            word[i-1] = '\\0';\n        } else if (i > 2 && (word[i-2] & 0xE0) == 0xE0) {\n            word[i-2] = '\\0';\n        } else if (i > 3 && (word[i-3] & 0xF8) == 0xF0) {\n            word[i-3] = '\\0';\n        }\n    }\n    return 0;\n}\n\n/* Write sorted chunk of cooccurrence records to file, accumulating duplicate entries */\nint write_chunk(CREC *cr, long long length, FILE *fout) {\n    if (length == 0) return 0;\n\n    long long a = 0;\n    CREC old = cr[a];\n    \n    for (a = 1; a < length; a++) {\n        if (cr[a].word1 == old.word1 && cr[a].word2 == old.word2) {\n            old.val += cr[a].val;\n            continue;\n        }\n        fwrite(&old, sizeof(CREC), 1, fout);\n        old = cr[a];\n    }\n    fwrite(&old, sizeof(CREC), 1, fout);\n    return 0;\n}\n\n/* Check if two cooccurrence records are for the same two words, used for qsort */\nint compare_crec(const void *a, const void *b) {\n    int c;\n    if ( (c = ((CREC *) a)->word1 - ((CREC *) b)->word1) != 0) return c;\n    else return (((CREC *) a)->word2 - ((CREC *) b)->word2);\n    \n}\n\n/* Check if two cooccurrence records are for the same two words */\nint compare_crecid(CRECID a, CRECID b) {\n    int c;\n    if ( (c = a.word1 - b.word1) != 0) return c;\n    else return a.word2 - b.word2;\n}\n\n/* Swap two entries of priority queue */\nvoid swap_entry(CRECID *pq, int i, int j) {\n    CRECID temp = pq[i];\n    pq[i] = pq[j];\n    pq[j] = temp;\n}\n\n/* Insert entry into priority queue */\nvoid insert(CRECID *pq, CRECID new, int size) {\n    int j = size - 1, p;\n    pq[j] = new;\n    while ( (p=(j-1)/2) >= 0 ) {\n        if (compare_crecid(pq[p],pq[j]) > 0) {swap_entry(pq,p,j); j = p;}\n        else break;\n    }\n}\n\n/* Delete entry from priority queue */\nvoid delete(CRECID *pq, int size) {\n    int j, p = 0;\n    pq[p] = pq[size - 1];\n    while ( (j = 2*p+1) < size - 1 ) {\n        if (j == size - 2) 
{\n            if (compare_crecid(pq[p],pq[j]) > 0) swap_entry(pq,p,j);\n            return;\n        }\n        else {\n            if (compare_crecid(pq[j], pq[j+1]) < 0) {\n                if (compare_crecid(pq[p],pq[j]) > 0) {swap_entry(pq,p,j); p = j;}\n                else return;\n            }\n            else {\n                if (compare_crecid(pq[p],pq[j+1]) > 0) {swap_entry(pq,p,j+1); p = j + 1;}\n                else return;\n            }\n        }\n    }\n}\n\n/* Write top node of priority queue to file, accumulating duplicate entries */\nint merge_write(CRECID new, CRECID *old, FILE *fout) {\n    if (new.word1 == old->word1 && new.word2 == old->word2) {\n        old->val += new.val;\n        return 0; // Indicates duplicate entry\n    }\n    fwrite(old, sizeof(CREC), 1, fout);\n    *old = new;\n    return 1; // Actually wrote to file\n}\n\n/* Merge [num] sorted files of cooccurrence records */\nint merge_files(int num) {\n    int i, size;\n    long long counter = 0;\n    CRECID *pq, new, old;\n    char filename[200];\n    FILE **fid, *fout;\n    fid = malloc(sizeof(FILE) * num);\n    pq = malloc(sizeof(CRECID) * num);\n    fout = stdout;\n    if (verbose > 1) fprintf(stderr, \"Merging cooccurrence files: processed 0 lines.\");\n    \n    /* Open all files and add first entry of each to priority queue */\n    for (i = 0; i < num; i++) {\n        sprintf(filename,\"%s_%04d.bin\",file_head,i);\n        fid[i] = fopen(filename,\"rb\");\n        if (fid[i] == NULL) {fprintf(stderr, \"Unable to open file %s.\\n\",filename); return 1;}\n        fread(&new, sizeof(CREC), 1, fid[i]);\n        new.id = i;\n        insert(pq,new,i+1);\n    }\n    \n    /* Pop top node, save it in old to see if the next entry is a duplicate */\n    size = num;\n    old = pq[0];\n    i = pq[0].id;\n    delete(pq, size);\n    fread(&new, sizeof(CREC), 1, fid[i]);\n    if (feof(fid[i])) size--;\n    else {\n        new.id = i;\n        insert(pq, new, size);\n    }\n    \n    
/* Repeatedly pop top node and fill priority queue until files have reached EOF */\n    while (size > 0) {\n        counter += merge_write(pq[0], &old, fout); // Only count the lines written to file, not duplicates\n        if ((counter%100000) == 0) if (verbose > 1) fprintf(stderr,\"\\033[39G%lld lines.\",counter);\n        i = pq[0].id;\n        delete(pq, size);\n        fread(&new, sizeof(CREC), 1, fid[i]);\n        if (feof(fid[i])) size--;\n        else {\n            new.id = i;\n            insert(pq, new, size);\n        }\n    }\n    fwrite(&old, sizeof(CREC), 1, fout);\n    fprintf(stderr,\"\\033[0GMerging cooccurrence files: processed %lld lines.\\n\",++counter);\n    for (i=0;i<num;i++) {\n        sprintf(filename,\"%s_%04d.bin\",file_head,i);\n        remove(filename);\n    }\n    fprintf(stderr,\"\\n\");\n    return 0;\n}\n\n/* Collect word-word cooccurrence counts from input stream */\nint get_cooccurrence() {\n    int flag, x, y, fidcounter = 1;\n    long long a, j = 0, k, id, counter = 0, ind = 0, vocab_size, w1, w2, *lookup, *history;\n    char format[20], filename[200], str[MAX_STRING_LENGTH + 1];\n    FILE *fid, *foverflow;\n    real *bigram_table, r;\n    HASHREC *htmp, **vocab_hash = inithashtable();\n    CREC *cr = malloc(sizeof(CREC) * (overflow_length + 1));\n    history = malloc(sizeof(long long) * window_size);\n    \n    fprintf(stderr, \"COUNTING COOCCURRENCES\\n\");\n    if (verbose > 0) {\n        fprintf(stderr, \"window size: %d\\n\", window_size);\n        if (symmetric == 0) fprintf(stderr, \"context: asymmetric\\n\");\n        else fprintf(stderr, \"context: symmetric\\n\");\n    }\n    if (verbose > 1) fprintf(stderr, \"max product: %lld\\n\", max_product);\n    if (verbose > 1) fprintf(stderr, \"overflow length: %lld\\n\", overflow_length);\n    sprintf(format,\"%%%ds %%lld\", MAX_STRING_LENGTH); // Format to read from vocab file, which has (irrelevant) frequency data\n    if (verbose > 1) fprintf(stderr, \"Reading vocab from 
file \\\"%s\\\"...\", vocab_file);\n    fid = fopen(vocab_file,\"r\");\n    if (fid == NULL) {fprintf(stderr,\"Unable to open vocab file %s.\\n\",vocab_file); return 1;}\n    while (fscanf(fid, format, str, &id) != EOF) hashinsert(vocab_hash, str, ++j); // Here id is not used: inserting vocab words into hash table with their frequency rank, j\n    fclose(fid);\n    vocab_size = j;\n    j = 0;\n    if (verbose > 1) fprintf(stderr, \"loaded %lld words.\\nBuilding lookup table...\", vocab_size);\n    \n    /* Build auxiliary lookup table used to index into bigram_table */\n    lookup = (long long *)calloc( vocab_size + 1, sizeof(long long) );\n    if (lookup == NULL) {\n        fprintf(stderr, \"Couldn't allocate memory!\");\n        return 1;\n    }\n    lookup[0] = 1;\n    for (a = 1; a <= vocab_size; a++) {\n        if ((lookup[a] = max_product / a) < vocab_size) lookup[a] += lookup[a-1];\n        else lookup[a] = lookup[a-1] + vocab_size;\n    }\n    if (verbose > 1) fprintf(stderr, \"table contains %lld elements.\\n\",lookup[a-1]);\n    \n    /* Allocate memory for full array which will store all cooccurrence counts for words whose product of frequency ranks is less than max_product */\n    bigram_table = (real *)calloc( lookup[a-1] , sizeof(real) );\n    if (bigram_table == NULL) {\n        fprintf(stderr, \"Couldn't allocate memory!\");\n        return 1;\n    }\n    \n    fid = stdin;\n    // sprintf(format,\"%%%ds\",MAX_STRING_LENGTH);\n    sprintf(filename,\"%s_%04d.bin\", file_head, fidcounter);\n    foverflow = fopen(filename,\"wb\");\n    if (verbose > 1) fprintf(stderr,\"Processing token: 0\");\n    \n    /* For each token in input stream, calculate a weighted cooccurrence sum within window_size */\n    while (1) {\n        if (ind >= overflow_length - window_size) { // If overflow buffer is (almost) full, sort it and write it to temporary file\n            qsort(cr, ind, sizeof(CREC), compare_crec);\n            write_chunk(cr,ind,foverflow);\n          
  fclose(foverflow);\n            fidcounter++;\n            sprintf(filename,\"%s_%04d.bin\",file_head,fidcounter);\n            foverflow = fopen(filename,\"wb\");\n            ind = 0;\n        }\n        flag = get_word(str, fid);\n        if (verbose > 2) fprintf(stderr, \"Maybe processing token: %s\\n\", str);\n        if (flag == 1) {\n            // Newline, reset line index (j); maybe eof.\n            if (feof(fid)) {\n                if (verbose > 2) fprintf(stderr, \"Not getting coocurs as at eof\\n\");\n                break;\n            }\n            j = 0;\n            if (verbose > 2) fprintf(stderr, \"Not getting coocurs as at newline\\n\");\n            continue;\n        }\n        counter++;\n        if ((counter%100000) == 0) if (verbose > 1) fprintf(stderr,\"\\033[19G%lld\",counter);\n        htmp = hashsearch(vocab_hash, str);\n        if (htmp == NULL) {\n            if (verbose > 2) fprintf(stderr, \"Not getting coocurs as word not in vocab\\n\");\n            continue; // Skip out-of-vocabulary words\n        }\n        w2 = htmp->id; // Target word (frequency rank)\n        for (k = j - 1; k >= ( (j > window_size) ? j - window_size : 0 ); k--) { // Iterate over all words to the left of target word, but not past beginning of line\n            w1 = history[k % window_size]; // Context word (frequency rank)\n            if (verbose > 2) fprintf(stderr, \"Adding cooccur between words %lld and %lld.\\n\", w1, w2);\n            if ( w1 < max_product/w2 ) { // Product is small enough to store in a full array\n                bigram_table[lookup[w1-1] + w2 - 2] += distance_weighting ? 1.0/((real)(j-k)) : 1.0; // Weight by inverse of distance between words if needed\n                if (symmetric > 0) bigram_table[lookup[w2-1] + w1 - 2] += distance_weighting ? 
1.0/((real)(j-k)) : 1.0; // If symmetric context is used, exchange roles of w2 and w1 (ie look at right context too)\n            }\n            else { // Product is too big, data is likely to be sparse. Store these entries in a temporary buffer to be sorted, merged (accumulated), and written to file when it gets full.\n                cr[ind].word1 = w1;\n                cr[ind].word2 = w2;\n                cr[ind].val = distance_weighting ? 1.0/((real)(j-k)) : 1.0;\n                ind++; // Keep track of how full temporary buffer is\n                if (symmetric > 0) { // Symmetric context\n                    cr[ind].word1 = w2;\n                    cr[ind].word2 = w1;\n                    cr[ind].val = distance_weighting ? 1.0/((real)(j-k)) : 1.0;\n                    ind++;\n                }\n            }\n        }\n        history[j % window_size] = w2; // Target word is stored in circular buffer to become context word in the future\n        j++;\n    }\n    \n    /* Write out temp buffer for the final time (it may not be full) */\n    if (verbose > 1) fprintf(stderr,\"\\033[0GProcessed %lld tokens.\\n\",counter);\n    qsort(cr, ind, sizeof(CREC), compare_crec);\n    write_chunk(cr,ind,foverflow);\n    sprintf(filename,\"%s_0000.bin\",file_head);\n    \n    /* Write out full bigram_table, skipping zeros */\n    if (verbose > 1) fprintf(stderr, \"Writing cooccurrences to disk\");\n    fid = fopen(filename,\"wb\");\n    j = 1e6;\n    for (x = 1; x <= vocab_size; x++) {\n        if ( (long long) (0.75*log(vocab_size / x)) < j) {\n            j = (long long) (0.75*log(vocab_size / x));\n            if (verbose > 1) fprintf(stderr,\".\");\n        } // log's to make it look (sort of) pretty\n        for (y = 1; y <= (lookup[x] - lookup[x-1]); y++) {\n            if ((r = bigram_table[lookup[x-1] - 2 + y]) != 0) {\n                fwrite(&x, sizeof(int), 1, fid);\n                fwrite(&y, sizeof(int), 1, fid);\n                fwrite(&r, sizeof(real), 1, 
fid);\n            }\n        }\n    }\n    \n    if (verbose > 1) fprintf(stderr,\"%d files in total.\\n\",fidcounter + 1);\n    fclose(fid);\n    fclose(foverflow);\n    free(cr);\n    free(lookup);\n    free(bigram_table);\n    free(vocab_hash);\n    return merge_files(fidcounter + 1); // Merge the sorted temporary files\n}\n\nint find_arg(char *str, int argc, char **argv) {\n    int i;\n    for (i = 1; i < argc; i++) {\n        if (!scmp(str, argv[i])) {\n            if (i == argc - 1) {\n                printf(\"No argument given for %s\\n\", str);\n                exit(1);\n            }\n            return i;\n        }\n    }\n    return -1;\n}\n\nint main(int argc, char **argv) {\n    int i;\n    real rlimit, n = 1e5;\n    vocab_file = malloc(sizeof(char) * MAX_STRING_LENGTH);\n    file_head = malloc(sizeof(char) * MAX_STRING_LENGTH);\n    \n    if (argc == 1) {\n        printf(\"Tool to calculate word-word cooccurrence statistics\\n\");\n        printf(\"Author: Jeffrey Pennington (jpennin@stanford.edu)\\n\\n\");\n        printf(\"Usage options:\\n\");\n        printf(\"\\t-verbose <int>\\n\");\n        printf(\"\\t\\tSet verbosity: 0, 1, 2 (default), or 3\\n\");\n        printf(\"\\t-symmetric <int>\\n\");\n        printf(\"\\t\\tIf <int> = 0, only use left context; if <int> = 1 (default), use left and right\\n\");\n        printf(\"\\t-window-size <int>\\n\");\n        printf(\"\\t\\tNumber of context words to the left (and to the right, if symmetric = 1); default 15\\n\");\n        printf(\"\\t-vocab-file <file>\\n\");\n        printf(\"\\t\\tFile containing vocabulary (truncated unigram counts, produced by 'vocab_count'); default vocab.txt\\n\");\n        printf(\"\\t-memory <float>\\n\");\n        printf(\"\\t\\tSoft limit for memory consumption, in GB -- based on simple heuristic, so not extremely accurate; default 4.0\\n\");\n        printf(\"\\t-max-product <int>\\n\");\n        printf(\"\\t\\tLimit the size of dense cooccurrence array by 
specifying the max product <int> of the frequency counts of the two cooccurring words.\\n\\t\\tThis value overrides that which is automatically produced by '-memory'. Typically only needs adjustment for use with very large corpora.\\n\");\n        printf(\"\\t-overflow-length <int>\\n\");\n        printf(\"\\t\\tLimit to length <int> the sparse overflow array, which buffers cooccurrence data that does not fit in the dense array, before writing to disk. \\n\\t\\tThis value overrides that which is automatically produced by '-memory'. Typically only needs adjustment for use with very large corpora.\\n\");\n        printf(\"\\t-overflow-file <file>\\n\");\n        printf(\"\\t\\tFilename, excluding extension, for temporary files; default overflow\\n\");\n        printf(\"\\t-distance-weighting <int>\\n\");\n        printf(\"\\t\\tIf <int> = 0, do not weight cooccurrence count by distance between words; if <int> = 1 (default), weight the cooccurrence count by inverse of distance between words\\n\");\n\n        printf(\"\\nExample usage:\\n\");\n        printf(\"./cooccur -verbose 2 -symmetric 0 -window-size 10 -vocab-file vocab.txt -memory 8.0 -overflow-file tempoverflow < corpus.txt > cooccurrences.bin\\n\\n\");\n        return 0;\n    }\n\n    if ((i = find_arg((char *)\"-verbose\", argc, argv)) > 0) verbose = atoi(argv[i + 1]);\n    if ((i = find_arg((char *)\"-symmetric\", argc, argv)) > 0) symmetric = atoi(argv[i + 1]);\n    if ((i = find_arg((char *)\"-window-size\", argc, argv)) > 0) window_size = atoi(argv[i + 1]);\n    if ((i = find_arg((char *)\"-vocab-file\", argc, argv)) > 0) strcpy(vocab_file, argv[i + 1]);\n    else strcpy(vocab_file, (char *)\"vocab.txt\");\n    if ((i = find_arg((char *)\"-overflow-file\", argc, argv)) > 0) strcpy(file_head, argv[i + 1]);\n    else strcpy(file_head, (char *)\"overflow\");\n    if ((i = find_arg((char *)\"-memory\", argc, argv)) > 0) memory_limit = atof(argv[i + 1]);\n    if ((i = find_arg((char *)\"-distance-weighting\", 
argc, argv)) > 0)  distance_weighting = atoi(argv[i + 1]);\n    \n    /* The memory_limit determines a limit on the number of elements in bigram_table and the overflow buffer */\n    /* Estimate the maximum value that max_product can take so that this limit is still satisfied */\n    rlimit = 0.85 * (real)memory_limit * 1073741824/(sizeof(CREC));\n    while (fabs(rlimit - n * (log(n) + 0.1544313298)) > 1e-3) n = rlimit / (log(n) + 0.1544313298);\n    max_product = (long long) n;\n    overflow_length = (long long) rlimit/6; // 0.85 + 1/6 ~= 1\n    \n    /* Override estimates by specifying limits explicitly on the command line */\n    if ((i = find_arg((char *)\"-max-product\", argc, argv)) > 0) max_product = atoll(argv[i + 1]);\n    if ((i = find_arg((char *)\"-overflow-length\", argc, argv)) > 0) overflow_length = atoll(argv[i + 1]);\n    \n    return get_cooccurrence();\n}\n\n"
  },
  {
    "path": "utils_nlp/models/glove/src/glove.c",
    "content": "//  GloVe: Global Vectors for Word Representation\n//\n//  Copyright (c) 2014 The Board of Trustees of\n//  The Leland Stanford Junior University. All Rights Reserved.\n//\n//  Licensed under the Apache License, Version 2.0 (the \"License\");\n//  you may not use this file except in compliance with the License.\n//  You may obtain a copy of the License at\n//\n//      http://www.apache.org/licenses/LICENSE-2.0\n//\n//  Unless required by applicable law or agreed to in writing, software\n//  distributed under the License is distributed on an \"AS IS\" BASIS,\n//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n//  See the License for the specific language governing permissions and\n//  limitations under the License.\n//\n//\n//  For more information, bug reports, fixes, contact:\n//    Jeffrey Pennington (jpennin@stanford.edu)\n//    GlobalVectors@googlegroups.com\n//    http://nlp.stanford.edu/projects/glove/\n\n\n#include <stdint.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <math.h>\n#include <pthread.h>\n#include <time.h>\n\n#define _FILE_OFFSET_BITS 64\n#define MAX_STRING_LENGTH 1000\n\ntypedef double real;\n\ntypedef struct cooccur_rec {\n    int word1;\n    int word2;\n    real val;\n} CREC;\n\nint write_header=0; //0=no, 1=yes; writes vocab_size/vector_size as first line for use with some libraries, such as gensim.\nint verbose = 2; // 0, 1, or 2\nint use_unk_vec = 1; // 0 or 1\nint num_threads = 8; // pthreads\nint num_iter = 25; // Number of full passes through cooccurrence matrix\nint vector_size = 50; // Word vector size\nint save_gradsq = 0; // By default don't save squared gradient values\nint use_binary = 0; // 0: save as text files; 1: save as binary; 2: both. For binary, save both word and context word vectors.\nint model = 2; // For text file output only. 0: concatenate word and context vectors (and biases) i.e. 
save everything; 1: Just save word vectors (no bias); 2: Save (word + context word) vectors (no biases)\nint checkpoint_every = 0; // checkpoint the model for every checkpoint_every iterations. Do nothing if checkpoint_every <= 0\nreal eta = 0.05; // Initial learning rate\nreal alpha = 0.75, x_max = 100.0; // Weighting function parameters, not extremely sensitive to corpus, though may need adjustment for very small or very large corpora\nreal *W, *gradsq, *cost;\nlong long num_lines, *lines_per_thread, vocab_size;\nchar *vocab_file, *input_file, *save_W_file, *save_gradsq_file;\n\n/* Efficient string comparison */\nint scmp( char *s1, char *s2 ) {\n    while (*s1 != '\\0' && *s1 == *s2) {s1++; s2++;}\n    return(*s1 - *s2);\n}\n\nvoid initialize_parameters() {\n    long long a, b;\n    vector_size++; // Temporarily increment to allocate space for bias\n\n    /* Allocate space for word vectors and context word vectors, and correspodning gradsq */\n    a = posix_memalign((void **)&W, 128, 2 * vocab_size * vector_size * sizeof(real)); // Might perform better than malloc\n    if (W == NULL) {\n        fprintf(stderr, \"Error allocating memory for W\\n\");\n        exit(1);\n    }\n    a = posix_memalign((void **)&gradsq, 128, 2 * vocab_size * vector_size * sizeof(real)); // Might perform better than malloc\n    if (gradsq == NULL) {\n        fprintf(stderr, \"Error allocating memory for gradsq\\n\");\n        exit(1);\n    }\n    for (b = 0; b < vector_size; b++) {\n        for (a = 0; a < 2 * vocab_size; a++) {\n            W[a * vector_size + b] = (rand() / (real)RAND_MAX - 0.5) / vector_size;\n        }\n    }\n    for (b = 0; b < vector_size; b++) {\n        for (a = 0; a < 2 * vocab_size; a++) {\n            gradsq[a * vector_size + b] = 1.0; // So initial value of eta is equal to initial learning rate\n        }\n    }\n    vector_size--;\n}\n\ninline real check_nan(real update) {\n    if (isnan(update) || isinf(update)) {\n        fprintf(stderr,\"\\ncaught NaN 
in update\");\n        return 0.;\n    } else {\n        return update;\n    }\n}\n\n/* Train the GloVe model */\nvoid *glove_thread(void *vid) {\n    long long a, b ,l1, l2;\n    long long id = *(long long*)vid;\n    CREC cr;\n    real diff, fdiff, temp1, temp2;\n    FILE *fin;\n    fin = fopen(input_file, \"rb\");\n    fseeko(fin, (num_lines / num_threads * id) * (sizeof(CREC)), SEEK_SET); //Threads spaced roughly equally throughout file\n    cost[id] = 0;\n    \n    real* W_updates1 = (real*)malloc(vector_size * sizeof(real));\n    real* W_updates2 = (real*)malloc(vector_size * sizeof(real));\n    for (a = 0; a < lines_per_thread[id]; a++) {\n        fread(&cr, sizeof(CREC), 1, fin);\n        if (feof(fin)) break;\n        if (cr.word1 < 1 || cr.word2 < 1) { continue; }\n        \n        /* Get location of words in W & gradsq */\n        l1 = (cr.word1 - 1LL) * (vector_size + 1); // cr word indices start at 1\n        l2 = ((cr.word2 - 1LL) + vocab_size) * (vector_size + 1); // shift by vocab_size to get separate vectors for context words\n        \n        /* Calculate cost, save diff for gradients */\n        diff = 0;\n        for (b = 0; b < vector_size; b++) diff += W[b + l1] * W[b + l2]; // dot product of word and context word vector\n        diff += W[vector_size + l1] + W[vector_size + l2] - log(cr.val); // add separate bias for each word\n        fdiff = (cr.val > x_max) ? diff : pow(cr.val / x_max, alpha) * diff; // multiply weighting function (f) with diff\n\n        // Check for NaN and inf() in the diffs.\n        if (isnan(diff) || isnan(fdiff) || isinf(diff) || isinf(fdiff)) {\n            fprintf(stderr,\"Caught NaN in diff for kdiff for thread. 
Skipping update\");\n            continue;\n        }\n\n        cost[id] += 0.5 * fdiff * diff; // weighted squared error\n        \n        /* Adaptive gradient updates */\n        fdiff *= eta; // for ease in calculating gradient\n        real W_updates1_sum = 0;\n        real W_updates2_sum = 0;\n        for (b = 0; b < vector_size; b++) {\n            // learning rate times gradient for word vectors\n            temp1 = fdiff * W[b + l2];\n            temp2 = fdiff * W[b + l1];\n            // adaptive updates\n            W_updates1[b] = temp1 / sqrt(gradsq[b + l1]);\n            W_updates2[b] = temp2 / sqrt(gradsq[b + l2]);\n            W_updates1_sum += W_updates1[b];\n            W_updates2_sum += W_updates2[b];\n            gradsq[b + l1] += temp1 * temp1;\n            gradsq[b + l2] += temp2 * temp2;\n        }\n        if (!isnan(W_updates1_sum) && !isinf(W_updates1_sum) && !isnan(W_updates2_sum) && !isinf(W_updates2_sum)) {\n            for (b = 0; b < vector_size; b++) {\n                W[b + l1] -= W_updates1[b];\n                W[b + l2] -= W_updates2[b];\n            }\n        }\n\n        // updates for bias terms\n        W[vector_size + l1] -= check_nan(fdiff / sqrt(gradsq[vector_size + l1]));\n        W[vector_size + l2] -= check_nan(fdiff / sqrt(gradsq[vector_size + l2]));\n        fdiff *= fdiff;\n        gradsq[vector_size + l1] += fdiff;\n        gradsq[vector_size + l2] += fdiff;\n        \n    }\n    free(W_updates1);\n    free(W_updates2);\n    \n    fclose(fin);\n    pthread_exit(NULL);\n}\n\n/* Save params to file */\nint save_params(int nb_iter) {\n    /*\n     * nb_iter is the number of iteration (= a full pass through the cooccurrence matrix).\n     *   nb_iter > 0 => checkpointing the intermediate parameters, so nb_iter is in the filename of output file.\n     *   else        => saving the final paramters, so nb_iter is ignored.\n     */\n\n    long long a, b;\n    char format[20];\n    char output_file[MAX_STRING_LENGTH], 
output_file_gsq[MAX_STRING_LENGTH];\n    char *word = malloc(sizeof(char) * MAX_STRING_LENGTH + 1);\n    FILE *fid, *fout, *fgs;\n    \n    if (use_binary > 0) { // Save parameters in binary file\n        if (nb_iter <= 0)\n            sprintf(output_file,\"%s.bin\",save_W_file);\n        else\n            sprintf(output_file,\"%s.%03d.bin\",save_W_file,nb_iter);\n\n        fout = fopen(output_file,\"wb\");\n        if (fout == NULL) {fprintf(stderr, \"Unable to open file %s.\\n\",save_W_file); return 1;}\n        for (a = 0; a < 2 * (long long)vocab_size * (vector_size + 1); a++) fwrite(&W[a], sizeof(real), 1,fout);\n        fclose(fout);\n        if (save_gradsq > 0) {\n            if (nb_iter <= 0)\n                sprintf(output_file_gsq,\"%s.bin\",save_gradsq_file);\n            else\n                sprintf(output_file_gsq,\"%s.%03d.bin\",save_gradsq_file,nb_iter);\n\n            fgs = fopen(output_file_gsq,\"wb\");\n            if (fgs == NULL) {fprintf(stderr, \"Unable to open file %s.\\n\",save_gradsq_file); return 1;}\n            for (a = 0; a < 2 * (long long)vocab_size * (vector_size + 1); a++) fwrite(&gradsq[a], sizeof(real), 1,fgs);\n            fclose(fgs);\n        }\n    }\n    if (use_binary != 1) { // Save parameters in text file\n        if (nb_iter <= 0)\n            sprintf(output_file,\"%s.txt\",save_W_file);\n        else\n            sprintf(output_file,\"%s.%03d.txt\",save_W_file,nb_iter);\n        if (save_gradsq > 0) {\n            if (nb_iter <= 0)\n                sprintf(output_file_gsq,\"%s.txt\",save_gradsq_file);\n            else\n                sprintf(output_file_gsq,\"%s.%03d.txt\",save_gradsq_file,nb_iter);\n\n            fgs = fopen(output_file_gsq,\"wb\");\n            if (fgs == NULL) {fprintf(stderr, \"Unable to open file %s.\\n\",save_gradsq_file); return 1;}\n        }\n        fout = fopen(output_file,\"wb\");\n        if (fout == NULL) {fprintf(stderr, \"Unable to open file %s.\\n\",save_W_file); return 1;}\n        
fid = fopen(vocab_file, \"r\");\n        sprintf(format,\"%%%ds\",MAX_STRING_LENGTH);\n        if (fid == NULL) {fprintf(stderr, \"Unable to open file %s.\\n\",vocab_file); return 1;}\n        if (write_header) fprintf(fout, \"%lld %d\\n\", vocab_size, vector_size);\n        for (a = 0; a < vocab_size; a++) {\n            if (fscanf(fid,format,word) == 0) return 1;\n            // input vocab cannot contain special <unk> keyword\n            if (strcmp(word, \"<unk>\") == 0) return 1;\n            fprintf(fout, \"%s\",word);\n            if (model == 0) { // Save all parameters (including bias)\n                for (b = 0; b < (vector_size + 1); b++) fprintf(fout,\" %lf\", W[a * (vector_size + 1) + b]);\n                for (b = 0; b < (vector_size + 1); b++) fprintf(fout,\" %lf\", W[(vocab_size + a) * (vector_size + 1) + b]);\n            }\n            if (model == 1) // Save only \"word\" vectors (without bias)\n                for (b = 0; b < vector_size; b++) fprintf(fout,\" %lf\", W[a * (vector_size + 1) + b]);\n            if (model == 2) // Save \"word + context word\" vectors (without bias)\n                for (b = 0; b < vector_size; b++) fprintf(fout,\" %lf\", W[a * (vector_size + 1) + b] + W[(vocab_size + a) * (vector_size + 1) + b]);\n            fprintf(fout,\"\\n\");\n            if (save_gradsq > 0) { // Save gradsq\n                fprintf(fgs, \"%s\",word);\n                for (b = 0; b < (vector_size + 1); b++) fprintf(fgs,\" %lf\", gradsq[a * (vector_size + 1) + b]);\n                for (b = 0; b < (vector_size + 1); b++) fprintf(fgs,\" %lf\", gradsq[(vocab_size + a) * (vector_size + 1) + b]);\n                fprintf(fgs,\"\\n\");\n            }\n            if (fscanf(fid,format,word) == 0) return 1; // Eat irrelevant frequency entry\n        }\n\n        if (use_unk_vec) {\n            real* unk_vec = (real*)calloc((vector_size + 1), sizeof(real));\n            real* unk_context = (real*)calloc((vector_size + 1), sizeof(real));\n           
 word = \"<unk>\";\n\n            int num_rare_words = vocab_size < 100 ? vocab_size : 100;\n\n            for (a = vocab_size - num_rare_words; a < vocab_size; a++) {\n                for (b = 0; b < (vector_size + 1); b++) {\n                    unk_vec[b] += W[a * (vector_size + 1) + b] / num_rare_words;\n                    unk_context[b] += W[(vocab_size + a) * (vector_size + 1) + b] / num_rare_words;\n                }\n            }\n\n            fprintf(fout, \"%s\",word);\n            if (model == 0) { // Save all parameters (including bias)\n                for (b = 0; b < (vector_size + 1); b++) fprintf(fout,\" %lf\", unk_vec[b]);\n                for (b = 0; b < (vector_size + 1); b++) fprintf(fout,\" %lf\", unk_context[b]);\n            }\n            if (model == 1) // Save only \"word\" vectors (without bias)\n                for (b = 0; b < vector_size; b++) fprintf(fout,\" %lf\", unk_vec[b]);\n            if (model == 2) // Save \"word + context word\" vectors (without bias)\n                for (b = 0; b < vector_size; b++) fprintf(fout,\" %lf\", unk_vec[b] + unk_context[b]);\n            fprintf(fout,\"\\n\");\n\n            free(unk_vec);\n            free(unk_context);\n        }\n\n        fclose(fid);\n        fclose(fout);\n        if (save_gradsq > 0) fclose(fgs);\n    }\n    return 0;\n}\n\n/* Train model */\nint train_glove() {\n    long long a, file_size;\n    int save_params_return_code;\n    int b;\n    FILE *fin;\n    real total_cost = 0;\n\n    fprintf(stderr, \"TRAINING MODEL\\n\");\n    \n    fin = fopen(input_file, \"rb\");\n    if (fin == NULL) {fprintf(stderr,\"Unable to open cooccurrence file %s.\\n\",input_file); return 1;}\n    fseeko(fin, 0, SEEK_END);\n    file_size = ftello(fin);\n    num_lines = file_size/(sizeof(CREC)); // Assuming the file isn't corrupt and consists only of CREC's\n    fclose(fin);\n    fprintf(stderr,\"Read %lld lines.\\n\", num_lines);\n    if (verbose > 1) fprintf(stderr,\"Initializing 
parameters...\");\n    initialize_parameters();\n    if (verbose > 1) fprintf(stderr,\"done.\\n\");\n    if (verbose > 0) fprintf(stderr,\"vector size: %d\\n\", vector_size);\n    if (verbose > 0) fprintf(stderr,\"vocab size: %lld\\n\", vocab_size);\n    if (verbose > 0) fprintf(stderr,\"x_max: %lf\\n\", x_max);\n    if (verbose > 0) fprintf(stderr,\"alpha: %lf\\n\", alpha);\n    pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));\n    lines_per_thread = (long long *) malloc(num_threads * sizeof(long long));\n    \n    time_t rawtime;\n    struct tm *info;\n    char time_buffer[80];\n    // Lock-free asynchronous SGD\n    for (b = 0; b < num_iter; b++) {\n        total_cost = 0;\n        for (a = 0; a < num_threads - 1; a++) lines_per_thread[a] = num_lines / num_threads;\n        lines_per_thread[a] = num_lines / num_threads + num_lines % num_threads;\n        long long *thread_ids = (long long*)malloc(sizeof(long long) * num_threads);\n        for (a = 0; a < num_threads; a++) thread_ids[a] = a;\n        for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, glove_thread, (void *)&thread_ids[a]);\n        for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);\n        for (a = 0; a < num_threads; a++) total_cost += cost[a];\n        free(thread_ids);\n\n        time(&rawtime);\n        info = localtime(&rawtime);\n        strftime(time_buffer,80,\"%x - %I:%M.%S%p\", info);\n        fprintf(stderr, \"%s, iter: %03d, cost: %lf\\n\", time_buffer,  b+1, total_cost/num_lines);\n\n        if (checkpoint_every > 0 && (b + 1) % checkpoint_every == 0) {\n            fprintf(stderr,\"    saving itermediate parameters for iter %03d...\", b+1);\n            save_params_return_code = save_params(b+1);\n            if (save_params_return_code != 0)\n                return save_params_return_code;\n            fprintf(stderr,\"done.\\n\");\n        }\n\n    }\n    free(pt);\n    free(lines_per_thread);\n    return save_params(0);\n}\n\nint 
find_arg(char *str, int argc, char **argv) {\n    int i;\n    for (i = 1; i < argc; i++) {\n        if (!scmp(str, argv[i])) {\n            if (i == argc - 1) {\n                printf(\"No argument given for %s\\n\", str);\n                exit(1);\n            }\n            return i;\n        }\n    }\n    return -1;\n}\n\nint main(int argc, char **argv) {\n    int i;\n    FILE *fid;\n    vocab_file = malloc(sizeof(char) * MAX_STRING_LENGTH);\n    input_file = malloc(sizeof(char) * MAX_STRING_LENGTH);\n    save_W_file = malloc(sizeof(char) * MAX_STRING_LENGTH);\n    save_gradsq_file = malloc(sizeof(char) * MAX_STRING_LENGTH);\n    int result = 0;\n    \n    if (argc == 1) {\n        printf(\"GloVe: Global Vectors for Word Representation, v0.2\\n\");\n        printf(\"Author: Jeffrey Pennington (jpennin@stanford.edu)\\n\\n\");\n        printf(\"Usage options:\\n\");\n        printf(\"\\t-verbose <int>\\n\");\n        printf(\"\\t\\tSet verbosity: 0, 1, or 2 (default)\\n\");\n        printf(\"\\t-write-header <int>\\n\");\n        printf(\"\\t\\tIf 1, write vocab_size/vector_size as first line. 
Do nothing if 0 (default).\\n\");\n        printf(\"\\t-vector-size <int>\\n\");\n        printf(\"\\t\\tDimension of word vector representations (excluding bias term); default 50\\n\");\n        printf(\"\\t-threads <int>\\n\");\n        printf(\"\\t\\tNumber of threads; default 8\\n\");\n        printf(\"\\t-iter <int>\\n\");\n        printf(\"\\t\\tNumber of training iterations; default 25\\n\");\n        printf(\"\\t-eta <float>\\n\");\n        printf(\"\\t\\tInitial learning rate; default 0.05\\n\");\n        printf(\"\\t-alpha <float>\\n\");\n        printf(\"\\t\\tParameter in exponent of weighting function; default 0.75\\n\");\n        printf(\"\\t-x-max <float>\\n\");\n        printf(\"\\t\\tParameter specifying cutoff in weighting function; default 100.0\\n\");\n        printf(\"\\t-binary <int>\\n\");\n        printf(\"\\t\\tSave output in binary format (0: text, 1: binary, 2: both); default 0\\n\");\n        printf(\"\\t-model <int>\\n\");\n        printf(\"\\t\\tModel for word vector output (for text output only); default 2\\n\");\n        printf(\"\\t\\t   0: output all data, for both word and context word vectors, including bias terms\\n\");\n        printf(\"\\t\\t   1: output word vectors, excluding bias terms\\n\");\n        printf(\"\\t\\t   2: output word vectors + context word vectors, excluding bias terms\\n\");\n        printf(\"\\t-input-file <file>\\n\");\n        printf(\"\\t\\tBinary input file of shuffled cooccurrence data (produced by 'cooccur' and 'shuffle'); default cooccurrence.shuf.bin\\n\");\n        printf(\"\\t-vocab-file <file>\\n\");\n        printf(\"\\t\\tFile containing vocabulary (truncated unigram counts, produced by 'vocab_count'); default vocab.txt\\n\");\n        printf(\"\\t-save-file <file>\\n\");\n        printf(\"\\t\\tFilename, excluding extension, for word vector output; default vectors\\n\");\n        printf(\"\\t-gradsq-file <file>\\n\");\n        printf(\"\\t\\tFilename, excluding extension, for squared 
gradient output; default gradsq\\n\");\n        printf(\"\\t-save-gradsq <int>\\n\");\n        printf(\"\\t\\tSave accumulated squared gradients; default 0 (off); ignored if gradsq-file is specified\\n\");\n        printf(\"\\t-checkpoint-every <int>\\n\");\n        printf(\"\\t\\tCheckpoint a  model every <int> iterations; default 0 (off)\\n\");\n        printf(\"\\nExample usage:\\n\");\n        printf(\"./glove -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file vectors -gradsq-file gradsq -verbose 2 -vector-size 100 -threads 16 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 2 -model 2\\n\\n\");\n        result = 0;\n    } else {\n        if ((i = find_arg((char *)\"-write-header\", argc, argv)) > 0) write_header = atoi(argv[i + 1]);\n        if ((i = find_arg((char *)\"-verbose\", argc, argv)) > 0) verbose = atoi(argv[i + 1]);\n        if ((i = find_arg((char *)\"-vector-size\", argc, argv)) > 0) vector_size = atoi(argv[i + 1]);\n        if ((i = find_arg((char *)\"-iter\", argc, argv)) > 0) num_iter = atoi(argv[i + 1]);\n        if ((i = find_arg((char *)\"-threads\", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);\n        cost = malloc(sizeof(real) * num_threads);\n        if ((i = find_arg((char *)\"-alpha\", argc, argv)) > 0) alpha = atof(argv[i + 1]);\n        if ((i = find_arg((char *)\"-x-max\", argc, argv)) > 0) x_max = atof(argv[i + 1]);\n        if ((i = find_arg((char *)\"-eta\", argc, argv)) > 0) eta = atof(argv[i + 1]);\n        if ((i = find_arg((char *)\"-binary\", argc, argv)) > 0) use_binary = atoi(argv[i + 1]);\n        if ((i = find_arg((char *)\"-model\", argc, argv)) > 0) model = atoi(argv[i + 1]);\n        if (model != 0 && model != 1) model = 2;\n        if ((i = find_arg((char *)\"-save-gradsq\", argc, argv)) > 0) save_gradsq = atoi(argv[i + 1]);\n        if ((i = find_arg((char *)\"-vocab-file\", argc, argv)) > 0) strcpy(vocab_file, argv[i + 1]);\n        else strcpy(vocab_file, (char *)\"vocab.txt\");\n        if ((i = 
find_arg((char *)\"-save-file\", argc, argv)) > 0) strcpy(save_W_file, argv[i + 1]);\n        else strcpy(save_W_file, (char *)\"vectors\");\n        if ((i = find_arg((char *)\"-gradsq-file\", argc, argv)) > 0) {\n            strcpy(save_gradsq_file, argv[i + 1]);\n            save_gradsq = 1;\n        }\n        else if (save_gradsq > 0) strcpy(save_gradsq_file, (char *)\"gradsq\");\n        if ((i = find_arg((char *)\"-input-file\", argc, argv)) > 0) strcpy(input_file, argv[i + 1]);\n        else strcpy(input_file, (char *)\"cooccurrence.shuf.bin\");\n        if ((i = find_arg((char *)\"-checkpoint-every\", argc, argv)) > 0) checkpoint_every = atoi(argv[i + 1]);\n        \n        vocab_size = 0;\n        fid = fopen(vocab_file, \"r\");\n        if (fid == NULL) {fprintf(stderr, \"Unable to open vocab file %s.\\n\",vocab_file); return 1;}\n        while ((i = getc(fid)) != EOF) if (i == '\\n') vocab_size++; // Count number of entries in vocab_file\n        fclose(fid);\n\n        result = train_glove();\n        free(cost);\n    }\n    free(vocab_file);\n    free(input_file);\n    free(save_W_file);\n    free(save_gradsq_file);\n    return result;\n}\n"
  },
  {
    "path": "utils_nlp/models/glove/src/shuffle.c",
    "content": "//  Tool to shuffle entries of word-word cooccurrence files\n//\n//  Copyright (c) 2014 The Board of Trustees of\n//  The Leland Stanford Junior University. All Rights Reserved.\n//\n//  Licensed under the Apache License, Version 2.0 (the \"License\");\n//  you may not use this file except in compliance with the License.\n//  You may obtain a copy of the License at\n//\n//      http://www.apache.org/licenses/LICENSE-2.0\n//\n//  Unless required by applicable law or agreed to in writing, software\n//  distributed under the License is distributed on an \"AS IS\" BASIS,\n//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n//  See the License for the specific language governing permissions and\n//  limitations under the License.\n//\n//\n//  For more information, bug reports, fixes, contact:\n//    Jeffrey Pennington (jpennin@stanford.edu)\n//    GlobalVectors@googlegroups.com\n//    http://nlp.stanford.edu/projects/glove/\n\n#include <stdio.h>\n#include <string.h>\n#include <stdlib.h>\n\n#define MAX_STRING_LENGTH 1000\n\nstatic const long LRAND_MAX = ((long) RAND_MAX + 2) * (long)RAND_MAX;\ntypedef double real;\n\ntypedef struct cooccur_rec {\n    int word1;\n    int word2;\n    real val;\n} CREC;\n\nint verbose = 2; // 0, 1, or 2\nlong long array_size = 2000000; // size of chunks to shuffle individually\nchar *file_head; // temporary file string\nreal memory_limit = 2.0; // soft limit, in gigabytes\n\n/* Efficient string comparison */\nint scmp( char *s1, char *s2 ) {\n    while (*s1 != '\\0' && *s1 == *s2) {s1++; s2++;}\n    return(*s1 - *s2);\n}\n\n\n/* Generate uniformly distributed random long ints */\nstatic long rand_long(long n) {\n    long limit = LRAND_MAX - LRAND_MAX % n;\n    long rnd;\n    do {\n        rnd = ((long)RAND_MAX + 1) * (long)rand() + (long)rand();\n    } while (rnd >= limit);\n    return rnd % n;\n}\n\n/* Write contents of array to binary file */\nint write_chunk(CREC *array, long size, FILE *fout) 
{\n    long i = 0;\n    for (i = 0; i < size; i++) fwrite(&array[i], sizeof(CREC), 1, fout);\n    return 0;\n}\n\n/* Fisher-Yates shuffle */\nvoid shuffle(CREC *array, long n) {\n    long i, j;\n    CREC tmp;\n    for (i = n - 1; i > 0; i--) {\n        j = rand_long(i + 1);\n        tmp = array[j];\n        array[j] = array[i];\n        array[i] = tmp;\n    }\n}\n\n/* Merge shuffled temporary files; doesn't necessarily produce a perfect shuffle, but good enough */\nint shuffle_merge(int num) {\n    long i, j, k, l = 0;\n    int fidcounter = 0;\n    CREC *array;\n    char filename[MAX_STRING_LENGTH];\n    FILE **fid, *fout = stdout;\n    \n    array = malloc(sizeof(CREC) * array_size);\n    fid = malloc(sizeof(FILE) * num);\n    for (fidcounter = 0; fidcounter < num; fidcounter++) { //num = number of temporary files to merge\n        sprintf(filename,\"%s_%04d.bin\",file_head, fidcounter);\n        fid[fidcounter] = fopen(filename, \"rb\");\n        if (fid[fidcounter] == NULL) {\n            fprintf(stderr, \"Unable to open file %s.\\n\",filename);\n            return 1;\n        }\n    }\n    if (verbose > 0) fprintf(stderr, \"Merging temp files: processed %ld lines.\", l);\n    \n    while (1) { //Loop until EOF in all files\n        i = 0;\n        //Read at most array_size values into array, roughly array_size/num from each temp file\n        for (j = 0; j < num; j++) {\n            if (feof(fid[j])) continue;\n            for (k = 0; k < array_size / num; k++){\n                fread(&array[i], sizeof(CREC), 1, fid[j]);\n                if (feof(fid[j])) break;\n                i++;\n            }\n        }\n        if (i == 0) break;\n        l += i;\n        shuffle(array, i-1); // Shuffles lines between temp files\n        write_chunk(array,i,fout);\n        if (verbose > 0) fprintf(stderr, \"\\033[31G%ld lines.\", l);\n    }\n    fprintf(stderr, \"\\033[0GMerging temp files: processed %ld lines.\", l);\n    for (fidcounter = 0; fidcounter < num; 
fidcounter++) {\n        fclose(fid[fidcounter]);\n        sprintf(filename,\"%s_%04d.bin\",file_head, fidcounter);\n        remove(filename);\n    }\n    fprintf(stderr, \"\\n\\n\");\n    free(array);\n    return 0;\n}\n\n/* Shuffle large input stream by splitting into chunks */\nint shuffle_by_chunks() {\n    long i = 0, l = 0;\n    int fidcounter = 0;\n    char filename[MAX_STRING_LENGTH];\n    CREC *array;\n    FILE *fin = stdin, *fid;\n    array = malloc(sizeof(CREC) * array_size);\n    \n    fprintf(stderr,\"SHUFFLING COOCCURRENCES\\n\");\n    if (verbose > 0) fprintf(stderr,\"array size: %lld\\n\", array_size);\n    sprintf(filename,\"%s_%04d.bin\",file_head, fidcounter);\n    fid = fopen(filename,\"w\");\n    if (fid == NULL) {\n        fprintf(stderr, \"Unable to open file %s.\\n\",filename);\n        return 1;\n    }\n    if (verbose > 1) fprintf(stderr, \"Shuffling by chunks: processed 0 lines.\");\n    \n    while (1) { //Continue until EOF\n        if (i >= array_size) {// If array is full, shuffle it and save to temporary file\n            shuffle(array, i-2);\n            l += i;\n            if (verbose > 1) fprintf(stderr, \"\\033[22Gprocessed %ld lines.\", l);\n            write_chunk(array,i,fid);\n            fclose(fid);\n            fidcounter++;\n            sprintf(filename,\"%s_%04d.bin\",file_head, fidcounter);\n            fid = fopen(filename,\"w\");\n            if (fid == NULL) {\n                fprintf(stderr, \"Unable to open file %s.\\n\",filename);\n                return 1;\n            }\n            i = 0;\n        }\n        fread(&array[i], sizeof(CREC), 1, fin);\n        if (feof(fin)) break;\n        i++;\n    }\n    shuffle(array, i-2); //Last chunk may be smaller than array_size\n    write_chunk(array,i,fid);\n    l += i;\n    if (verbose > 1) fprintf(stderr, \"\\033[22Gprocessed %ld lines.\\n\", l);\n    if (verbose > 1) fprintf(stderr, \"Wrote %d temporary file(s).\\n\", fidcounter + 1);\n    fclose(fid);\n    
free(array);\n    return shuffle_merge(fidcounter + 1); // Merge and shuffle together temporary files\n}\n\nint find_arg(char *str, int argc, char **argv) {\n    int i;\n    for (i = 1; i < argc; i++) {\n        if (!scmp(str, argv[i])) {\n            if (i == argc - 1) {\n                printf(\"No argument given for %s\\n\", str);\n                exit(1);\n            }\n            return i;\n        }\n    }\n    return -1;\n}\n\nint main(int argc, char **argv) {\n    int i;\n    file_head = malloc(sizeof(char) * MAX_STRING_LENGTH);\n    \n    if (argc == 1) {\n        printf(\"Tool to shuffle entries of word-word cooccurrence files\\n\");\n        printf(\"Author: Jeffrey Pennington (jpennin@stanford.edu)\\n\\n\");\n        printf(\"Usage options:\\n\");\n        printf(\"\\t-verbose <int>\\n\");\n        printf(\"\\t\\tSet verbosity: 0, 1, or 2 (default)\\n\");\n        printf(\"\\t-memory <float>\\n\");\n        printf(\"\\t\\tSoft limit for memory consumption, in GB; default 2.0\\n\");\n        printf(\"\\t-array-size <int>\\n\");\n        printf(\"\\t\\tLimit to length <int> the buffer which stores chunks of data to shuffle before writing to disk. 
\\n\\t\\tThis value overrides that which is automatically produced by '-memory'.\\n\");\n        printf(\"\\t-temp-file <file>\\n\");\n        printf(\"\\t\\tFilename, excluding extension, for temporary files; default temp_shuffle\\n\");\n        \n        printf(\"\\nExample usage: (assuming 'cooccurrence.bin' has been produced by 'cooccur')\\n\");\n        printf(\"./shuffle -verbose 2 -memory 8.0 < cooccurrence.bin > cooccurrence.shuf.bin\\n\");\n        return 0;\n    }\n   \n    if ((i = find_arg((char *)\"-verbose\", argc, argv)) > 0) verbose = atoi(argv[i + 1]);\n    if ((i = find_arg((char *)\"-temp-file\", argc, argv)) > 0) strcpy(file_head, argv[i + 1]);\n    else strcpy(file_head, (char *)\"temp_shuffle\");\n    if ((i = find_arg((char *)\"-memory\", argc, argv)) > 0) memory_limit = atof(argv[i + 1]);\n    array_size = (long long) (0.95 * (real)memory_limit * 1073741824/(sizeof(CREC)));\n    if ((i = find_arg((char *)\"-array-size\", argc, argv)) > 0) array_size = atoll(argv[i + 1]);\n    return shuffle_by_chunks();\n}\n\n"
  },
  {
    "path": "utils_nlp/models/glove/src/vocab_count.c",
    "content": "//  Tool to extract unigram counts\n//\n//  GloVe: Global Vectors for Word Representation\n//  Copyright (c) 2014 The Board of Trustees of\n//  The Leland Stanford Junior University. All Rights Reserved.\n//\n//  Licensed under the Apache License, Version 2.0 (the \"License\");\n//  you may not use this file except in compliance with the License.\n//  You may obtain a copy of the License at\n//\n//      http://www.apache.org/licenses/LICENSE-2.0\n//\n//  Unless required by applicable law or agreed to in writing, software\n//  distributed under the License is distributed on an \"AS IS\" BASIS,\n//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n//  See the License for the specific language governing permissions and\n//  limitations under the License.\n//\n//\n//  For more information, bug reports, fixes, contact:\n//    Jeffrey Pennington (jpennin@stanford.edu)\n//    Christopher Manning (manning@cs.stanford.edu)\n//    https://github.com/stanfordnlp/GloVe/\n//    GlobalVectors@googlegroups.com\n//    http://nlp.stanford.edu/projects/glove/\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n\n#define MAX_STRING_LENGTH 1000\n#define TSIZE   1048576\n#define SEED    1159241\n\n#define HASHFN  bitwisehash\n\ntypedef struct vocabulary {\n    char *word;\n    long long count;\n} VOCAB;\n\ntypedef struct hashrec {\n    char *word;\n    long long count;\n    struct hashrec *next;\n} HASHREC;\n\nint verbose = 2; // 0, 1, or 2\nlong long min_count = 1; // min occurrences for inclusion in vocab\nlong long max_vocab = 0; // max_vocab = 0 for no limit\n\n\n/* Efficient string comparison */\nint scmp( char *s1, char *s2 ) {\n    while (*s1 != '\\0' && *s1 == *s2) {s1++; s2++;}\n    return *s1 - *s2;\n}\n\n\n/* Vocab frequency comparison; break ties alphabetically */\nint CompareVocabTie(const void *a, const void *b) {\n    long long c;\n    if ( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 
1 : -1 );\n    else return (scmp(((VOCAB *) a)->word,((VOCAB *) b)->word));\n    \n}\n\n/* Vocab frequency comparison; no tie-breaker */\nint CompareVocab(const void *a, const void *b) {\n    long long c;\n    if ( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 1 : -1 );\n    else return 0;\n}\n\n/* Move-to-front hashing and hash function from Hugh Williams, http://www.seg.rmit.edu.au/code/zwh-ipl/ */\n\n/* Simple bitwise hash function */\nunsigned int bitwisehash(char *word, int tsize, unsigned int seed) {\n    char c;\n    unsigned int h;\n    h = seed;\n    for ( ; (c = *word) != '\\0'; word++) h ^= ((h << 5) + c + (h >> 2));\n    return (unsigned int)((h & 0x7fffffff) % tsize);\n}\n\n/* Create hash table, initialise pointers to NULL */\nHASHREC ** inithashtable() {\n    int i;\n    HASHREC **ht;\n    ht = (HASHREC **) malloc( sizeof(HASHREC *) * TSIZE );\n    for (i = 0; i < TSIZE; i++) ht[i] = (HASHREC *) NULL;\n    return ht;\n}\n\n/* Search hash table for given string, insert if not found */\nvoid hashinsert(HASHREC **ht, char *w) {\n    HASHREC     *htmp, *hprv;\n    unsigned int hval = HASHFN(w, TSIZE, SEED);\n    \n    for (hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);\n    if (htmp == NULL) {\n        htmp = (HASHREC *) malloc( sizeof(HASHREC) );\n        htmp->word = (char *) malloc( strlen(w) + 1 );\n        strcpy(htmp->word, w);\n        htmp->count = 1;\n        htmp->next = NULL;\n        if ( hprv==NULL )\n            ht[hval] = htmp;\n        else\n            hprv->next = htmp;\n    }\n    else {\n        /* new records are not moved to front */\n        htmp->count++;\n        if (hprv != NULL) {\n            /* move to front on access */\n            hprv->next = htmp->next;\n            htmp->next = ht[hval];\n            ht[hval] = htmp;\n        }\n    }\n    return;\n}\n\n/* Read word from input stream. 
Return 1 when encounter '\\n' or EOF (but separate from word), 0 otherwise.\n   Words can be separated by space(s), tab(s), or newline(s). Carriage return characters are just ignored.\n   (Okay for Windows, but not for Mac OS 9-. Ignored even if by themselves or in words.)\n   A newline is taken as indicating a new document (contexts won't cross newline).\n   Argument word array is assumed to be of size MAX_STRING_LENGTH.\n   words will be truncated if too long. They are truncated with some care so that they\n   cannot truncate in the middle of a utf-8 character, but\n   still little to no harm will be done for other encodings like iso-8859-1.\n   (This function appears identically copied in vocab_count.c and cooccur.c.)\n */\nint get_word(char *word, FILE *fin) {\n    int i = 0, ch;\n    for ( ; ; ) {\n        ch = fgetc(fin);\n        if (ch == '\\r') continue;\n        if (i == 0 && ((ch == '\\n') || (ch == EOF))) {\n            word[i] = 0;\n            return 1;\n        }\n        if (i == 0 && ((ch == ' ') || (ch == '\\t'))) continue; // skip leading space\n        if ((ch == EOF) || (ch == ' ') || (ch == '\\t') || (ch == '\\n')) {\n            if (ch == '\\n') ungetc(ch, fin); // return the newline next time as document ender\n            break;\n        }\n        if (i < MAX_STRING_LENGTH - 1)\n          word[i++] = ch; // don't allow words to exceed MAX_STRING_LENGTH\n    }\n    word[i] = 0; //null terminate\n    // avoid truncation destroying a multibyte UTF-8 char except if only thing on line (so the i > x tests won't overwrite word[0])\n    // see https://en.wikipedia.org/wiki/UTF-8#Description\n    if (i == MAX_STRING_LENGTH - 1 && (word[i-1] & 0x80) == 0x80) {\n        if ((word[i-1] & 0xC0) == 0xC0) {\n            word[i-1] = '\\0';\n        } else if (i > 2 && (word[i-2] & 0xE0) == 0xE0) {\n            word[i-2] = '\\0';\n        } else if (i > 3 && (word[i-3] & 0xF8) == 0xF0) {\n            word[i-3] = '\\0';\n        }\n    }\n    return 
0;\n}\n\nint get_counts() {\n    long long i = 0, j = 0, vocab_size = 12500;\n    // char format[20];\n    char str[MAX_STRING_LENGTH + 1];\n    HASHREC **vocab_hash = inithashtable();\n    HASHREC *htmp;\n    VOCAB *vocab;\n    FILE *fid = stdin;\n    \n    fprintf(stderr, \"BUILDING VOCABULARY\\n\");\n    if (verbose > 1) fprintf(stderr, \"Processed %lld tokens.\", i);\n    // sprintf(format,\"%%%ds\",MAX_STRING_LENGTH);\n    while ( ! feof(fid)) {\n        // Insert all tokens into hashtable\n        int nl = get_word(str, fid);\n        if (nl) continue; // just a newline marker or feof\n        if (strcmp(str, \"<unk>\") == 0) {\n            fprintf(stderr, \"\\nError, <unk> vector found in corpus.\\nPlease remove <unk>s from your corpus (e.g. cat text8 | sed -e 's/<unk>/<raw_unk>/g' > text8.new)\");\n            return 1;\n        }\n        hashinsert(vocab_hash, str);\n        if (((++i)%100000) == 0) if (verbose > 1) fprintf(stderr,\"\\033[11G%lld tokens.\", i);\n    }\n    if (verbose > 1) fprintf(stderr, \"\\033[0GProcessed %lld tokens.\\n\", i);\n    vocab = malloc(sizeof(VOCAB) * vocab_size);\n    for (i = 0; i < TSIZE; i++) { // Migrate vocab to array\n        htmp = vocab_hash[i];\n        while (htmp != NULL) {\n            vocab[j].word = htmp->word;\n            vocab[j].count = htmp->count;\n            j++;\n            if (j>=vocab_size) {\n                vocab_size += 2500;\n                vocab = (VOCAB *)realloc(vocab, sizeof(VOCAB) * vocab_size);\n            }\n            htmp = htmp->next;\n        }\n    }\n    if (verbose > 1) fprintf(stderr, \"Counted %lld unique words.\\n\", j);\n    if (max_vocab > 0 && max_vocab < j)\n        // If the vocabulary exceeds limit, first sort full vocab by frequency without alphabetical tie-breaks.\n        // This results in pseudo-random ordering for words with same frequency, so that when truncated, the words span whole alphabet\n        qsort(vocab, j, sizeof(VOCAB), CompareVocab);\n    else 
max_vocab = j;\n    qsort(vocab, max_vocab, sizeof(VOCAB), CompareVocabTie); //After (possibly) truncating, sort (possibly again), breaking ties alphabetically\n    \n    for (i = 0; i < max_vocab; i++) {\n        if (vocab[i].count < min_count) { // If a minimum frequency cutoff exists, truncate vocabulary\n            if (verbose > 0) fprintf(stderr, \"Truncating vocabulary at min count %lld.\\n\",min_count);\n            break;\n        }\n        printf(\"%s %lld\\n\",vocab[i].word,vocab[i].count);\n    }\n    \n    if (i == max_vocab && max_vocab < j) if (verbose > 0) fprintf(stderr, \"Truncating vocabulary at size %lld.\\n\", max_vocab);\n    fprintf(stderr, \"Using vocabulary of size %lld.\\n\\n\", i);\n    return 0;\n}\n\nint find_arg(char *str, int argc, char **argv) {\n    int i;\n    for (i = 1; i < argc; i++) {\n        if (!scmp(str, argv[i])) {\n            if (i == argc - 1) {\n                printf(\"No argument given for %s\\n\", str);\n                exit(1);\n            }\n            return i;\n        }\n    }\n    return -1;\n}\n\nint main(int argc, char **argv) {\n    int i;\n    if (argc == 1) {\n        printf(\"Simple tool to extract unigram counts\\n\");\n        printf(\"Author: Jeffrey Pennington (jpennin@stanford.edu)\\n\\n\");\n        printf(\"Usage options:\\n\");\n        printf(\"\\t-verbose <int>\\n\");\n        printf(\"\\t\\tSet verbosity: 0, 1, or 2 (default)\\n\");\n        printf(\"\\t-max-vocab <int>\\n\");\n        printf(\"\\t\\tUpper bound on vocabulary size, i.e. keep the <int> most frequent words. 
The minimum frequency words are randomly sampled so as to obtain an even distribution over the alphabet.\\n\");\n        printf(\"\\t-min-count <int>\\n\");\n        printf(\"\\t\\tLower limit such that words which occur fewer than <int> times are discarded.\\n\");\n        printf(\"\\nExample usage:\\n\");\n        printf(\"./vocab_count -verbose 2 -max-vocab 100000 -min-count 10 < corpus.txt > vocab.txt\\n\");\n        return 0;\n    }\n    \n    if ((i = find_arg((char *)\"-verbose\", argc, argv)) > 0) verbose = atoi(argv[i + 1]);\n    if ((i = find_arg((char *)\"-max-vocab\", argc, argv)) > 0) max_vocab = atoll(argv[i + 1]);\n    if ((i = find_arg((char *)\"-min-count\", argc, argv)) > 0) min_count = atoll(argv[i + 1]);\n    return get_counts();\n}\n\n"
  },
  {
    "path": "utils_nlp/models/pretrained_embeddings/README.md",
    "content": "# Pretrained Embeddings\nThe pretrained embeddings submodule contains utility functions that help users quickly load and extract various types of pretrained embeddings such as fastText, GloVe, Word2Vec, etc.\n"
  },
  {
    "path": "utils_nlp/models/pretrained_embeddings/__init__.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nWORD2VEC_URL = \"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\"\nFASTTEXT_EN_URL = (\n    \"https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip\"\n)\nGLOVE_URL = \"http://nlp.stanford.edu/data/glove.840B.300d.zip\"\n"
  },
  {
    "path": "utils_nlp/models/pretrained_embeddings/fasttext.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Functions to help users load and extract fastText pretrained embeddings.\"\"\"\n\nimport os\nimport zipfile\n\nfrom gensim.models.fasttext import load_facebook_model\n\nfrom utils_nlp.dataset.url_utils import maybe_download\nfrom utils_nlp.models.pretrained_embeddings import FASTTEXT_EN_URL\n\n\ndef _extract_fasttext_vectors(zip_path, dest_path=\".\"):\n    \"\"\" Extracts fastText embeddings from zip file.\n\n    Args:\n        zip_path(str): Path to the downloaded compressed zip file.\n        dest_path(str): Final destination directory path to the extracted zip file.\n        Picks the current working directory by default.\n\n    Returns:\n        str: Returns the absolute path to the extracted folder.\n    \"\"\"\n\n    if os.path.exists(zip_path):\n        with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n            zip_ref.extractall(path=dest_path)\n    else:\n        raise Exception(\"Zipped file not found!\")\n\n    os.remove(zip_path)\n    return dest_path\n\n\ndef _download_fasttext_vectors(download_dir, file_name=\"wiki.simple.zip\"):\n    \"\"\" Downloads pre-trained word vectors for English, trained on Wikipedia using\n    fastText. 
You can directly download the vectors from here:\n    https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip\n\n    For the full version of pre-trained word vectors, change the url for\n    FASTTEXT_EN_URL to https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip\n    in __init__.py\n\n    Args:\n        download_dir (str): File path to download the file\n        file_name (str) : File name given by default but can be changed by the user.\n\n    Returns:\n        str: file_path to the downloaded vectors.\n    \"\"\"\n\n    return maybe_download(\n        FASTTEXT_EN_URL, filename=file_name, work_directory=download_dir\n    )\n\n\ndef _maybe_download_and_extract(dest_path, file_name):\n    \"\"\" Downloads and extracts fastText vectors if they don’t already exist\n\n    Args:\n        dest_path(str): Final path where the vectors will be extracted.\n        file_name(str): File name of the fastText vector file.\n\n    Returns:\n        str: File path to the fastText vector file.\n    \"\"\"\n\n    dir_path = os.path.join(dest_path, \"fastText\")\n    file_path = os.path.join(dir_path, file_name)\n\n    if not os.path.exists(file_path):\n        if not os.path.exists(dir_path):\n            os.makedirs(dir_path)\n        zip_path = _download_fasttext_vectors(dir_path)\n        _extract_fasttext_vectors(zip_path, dir_path)\n    else:\n        print(\"Vector file already exists. No changes made.\")\n\n    return file_path\n\n\ndef load_pretrained_vectors(dest_path, file_name=\"wiki.simple.bin\"):\n    \"\"\" Method that loads fastText vectors. 
Downloads if it doesn't exist.\n\n    Args:\n        file_name(str): Name of the fastText file.\n        dest_path(str): Path to the directory where fastText vectors exist or will be\n        downloaded.\n\n    Returns:\n        gensim.models.fasttext.load_facebook_model: Loaded word2vectors\n\n    \"\"\"\n\n    file_path = _maybe_download_and_extract(dest_path, file_name)\n    model = load_facebook_model(file_path)\n    return model\n"
  },
  {
    "path": "utils_nlp/models/pretrained_embeddings/glove.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Functions to help users load and extract GloVe pretrained embeddings.\"\"\"\n\nimport os\nimport zipfile\n\nfrom gensim.models import KeyedVectors\nfrom gensim.scripts.glove2word2vec import glove2word2vec\nfrom gensim.test.utils import get_tmpfile\n\nfrom utils_nlp.dataset.url_utils import maybe_download\nfrom utils_nlp.models.pretrained_embeddings import GLOVE_URL\n\n\ndef _extract_glove_vectors(zip_path, dest_path=\".\"):\n    \"\"\" Extracts gloVe embeddings from zip file.\n\n    Args:\n        zip_path(str): Path to the downloaded compressed zip file.\n        dest_path(str): Final destination directory path to the extracted zip file.\n        Picks the current working directory by default.\n\n    Returns:\n        str: Returns the absolute path to the extracted folder.\n    \"\"\"\n\n    if os.path.exists(zip_path):\n        with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n            zip_ref.extractall(path=dest_path)\n    else:\n        raise Exception(\"Zipped file not found!\")\n\n    os.remove(zip_path)\n    return dest_path\n\n\ndef _download_glove_vectors(download_dir, file_name=\"glove.840B.300d.zip\"):\n    \"\"\" Downloads gloVe word vectors trained on Common Crawl corpus. 
You can\n    directly download the vectors from here:\n    http://nlp.stanford.edu/data/glove.840B.300d.zip\n\n    Args:\n        download_dir (str): File path to download the file\n        file_name (str) : File name given by default but can be changed by the user.\n\n    Returns:\n        str: file_path to the downloaded vectors.\n    \"\"\"\n\n    return maybe_download(\n        GLOVE_URL, filename=file_name, work_directory=download_dir\n    )\n\n\ndef _maybe_download_and_extract(dest_path, file_name):\n    \"\"\" Downloads and extracts gloVe vectors if they don’t already exist\n\n    Args:\n        dest_path(str): Final path where the vectors will be extracted.\n        file_name(str): File name of the gloVe vector file.\n\n    Returns:\n        str: File path to the gloVe vector file.\n    \"\"\"\n\n    dir_path = os.path.join(dest_path, \"gloVe\")\n    file_path = os.path.join(dir_path, file_name)\n\n    if not os.path.exists(file_path):\n        if not os.path.exists(dir_path):\n            os.makedirs(dir_path)\n        filepath = _download_glove_vectors(dir_path)\n        _extract_glove_vectors(filepath, dir_path)\n    else:\n        print(\"Vector file already exists. No changes made.\")\n\n    return file_path\n\n\ndef download_and_extract(dir_path, file_name=\"glove.840B.300d.txt\"):\n    \"\"\" Downloads and extracts gloVe vectors if they don’t already exist\n\n    Args:\n        dir_path(str): Final path where the vectors will be extracted.\n        file_name(str): File name of the gloVe vector file.\n\n    Returns:\n        str: File path to the gloVe vector file.\n    \"\"\"\n\n    return _maybe_download_and_extract(dir_path, file_name)\n\n\ndef load_pretrained_vectors(\n    dir_path, file_name=\"glove.840B.300d.txt\", limit=None\n):\n    \"\"\" Method that loads gloVe vectors. 
Downloads if it doesn't exist.\n\n    Args:\n        file_name(str): Name of the gloVe file.\n        dir_path(str): Path to the directory where gloVe vectors exist or will be\n        downloaded.\n        limit(int): Number of word vectors that is loaded from gensim. This option\n        allows us to save RAM space and avoid memory errors.\n\n    Returns:\n        gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word2vectors\n    \"\"\"\n\n    file_path = _maybe_download_and_extract(dir_path, file_name)\n    tmp_file = get_tmpfile(\"test_word2vec.txt\")\n\n    # Convert GloVe format to word2vec\n    _ = glove2word2vec(file_path, tmp_file)\n\n    model = KeyedVectors.load_word2vec_format(tmp_file, limit=limit)\n    os.remove(tmp_file)\n\n    return model\n"
  },
  {
    "path": "utils_nlp/models/pretrained_embeddings/word2vec.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Functions to help users load and extract Word2Vec pretrained embeddings.\"\"\"\n\nimport gzip\nimport os\n\nfrom gensim.models.keyedvectors import KeyedVectors\n\nfrom utils_nlp.dataset.url_utils import maybe_download\nfrom utils_nlp.models.pretrained_embeddings import WORD2VEC_URL\n\n\ndef _extract_word2vec_vectors(zip_path, dest_filepath):\n    \"\"\" Extracts word2vec embeddings from bin.gz archive\n\n    Args:\n        zip_path: Path to the downloaded compressed file.\n        dest_filepath: Final destination file path to the extracted zip file.\n    \"\"\"\n\n    if os.path.exists(zip_path):\n        with gzip.GzipFile(zip_path, \"rb\") as f_in, open(\n            dest_filepath, \"wb\"\n        ) as f_out:\n            f_out.writelines(f_in)\n    else:\n        raise Exception(\"Zipped file not found!\")\n\n    os.remove(zip_path)\n\n\ndef _download_word2vec_vectors(\n    download_dir, file_name=\"GoogleNews-vectors-negative300.bin.gz\"\n):\n    \"\"\" Downloads pretrained word vectors trained on GoogleNews corpus. 
You can\n    directly download the vectors from here:\n    https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n\n    Args:\n        download_dir (str): File path to download the file\n        file_name (str) : File name given by default but can be changed by the user.\n\n    Returns:\n        str: file_path to the downloaded vectors.\n    \"\"\"\n\n    return maybe_download(\n        WORD2VEC_URL, filename=file_name, work_directory=download_dir\n    )\n\n\ndef _maybe_download_and_extract(dest_path, file_name):\n    \"\"\" Downloads and extracts Word2vec vectors if they don’t already exist\n\n    Args:\n        dest_path: Path to the directory where the vectors will be extracted.\n        file_name: File name of the word2vec vector file.\n\n    Returns:\n         str: File path to the word2vec vector file.\n    \"\"\"\n\n    dir_path = os.path.join(dest_path, \"word2vec\")\n    file_path = os.path.join(dir_path, file_name)\n\n    if not os.path.exists(file_path):\n        if not os.path.exists(dir_path):\n            os.makedirs(dir_path)\n        filepath = _download_word2vec_vectors(dir_path)\n        _extract_word2vec_vectors(filepath, file_path)\n    else:\n        print(\"Vector file already exists. No changes made.\")\n\n    return file_path\n\n\ndef load_pretrained_vectors(\n    dir_path, file_name=\"GoogleNews-vectors-negative300.bin\", limit=None\n):\n    \"\"\" Method that loads word2vec vectors. Downloads if it doesn't exist.\n\n    Args:\n        file_name(str): Name of the word2vec file.\n        dir_path(str): Path to the directory where word2vec vectors exist or will be\n        downloaded.\n        limit(int): Number of word vectors that is loaded from gensim. 
This option\n        allows us to save RAM space and avoid memory errors.\n\n    Returns:\n        gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word2vectors\n\n    \"\"\"\n    file_path = _maybe_download_and_extract(dir_path, file_name)\n    word2vec_vectors = KeyedVectors.load_word2vec_format(\n        file_path, binary=True, limit=limit\n    )\n\n    return word2vec_vectors\n"
  },
  {
    "path": "utils_nlp/models/pytorch_modules/README.md",
    "content": "# PyTorch Modules\n\nThis folder contains the PyTorch modules that are used across the Git repository.\n\n## Summary\n\nThe following table summarizes each module.\n\n|Module|Description|\n|---|---|\n|[ConditionalGRU](conditional_gru.py)| An implemention of Gated Recurrent Unit (GRU) with peepholes, which was proposed in [Learning Precise Timing with LSTM Recurrent Networks](http://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf) by Gers, F. A., Schraudolph, N. N., and Schmidhuber, J.|\n"
  },
  {
    "path": "utils_nlp/models/pytorch_modules/__init__.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n"
  },
  {
    "path": "utils_nlp/models/pytorch_modules/conditional_gru.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"A Gated Recurrent Unit (GRU) cell with peepholes.\"\"\"\nimport math\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass ConditionalGRU(nn.Module):\n    \"\"\"A Gated Recurrent Unit (GRU) cell with peepholes.\"\"\"\n\n    def __init__(self, input_dim, hidden_dim, dropout=0.0):\n        \"\"\"Initialize params.\n\n        Args:\n            input_dim: Dimension of the input vector.\n            hidden_dim: Dimension of the hidden layer.\n            dropout: Dropout of the network.\n        \"\"\"\n\n        super(ConditionalGRU, self).__init__()\n        self.input_dim = input_dim\n        self.hidden_dim = hidden_dim\n\n        self.input_weights = nn.Linear(self.input_dim, 3 * self.hidden_dim)\n        self.hidden_weights = nn.Linear(self.hidden_dim, 3 * self.hidden_dim)\n        self.peep_weights = nn.Linear(self.hidden_dim, 3 * self.hidden_dim)\n\n        self.reset_parameters()\n\n    def reset_parameters(self):\n        \"\"\"Set params. 
\"\"\"\n        stdv = 1.0 / math.sqrt(self.hidden_dim)\n        for weight in self.parameters():\n            weight.data.uniform_(-stdv, stdv)\n\n    def forward(self, input, hidden, ctx):\n        \"\"\"Propogate input through the layer.\n\n        Args:\n            input: batch size x target sequence length  x embedding dimension.\n            hidden: batch size x hidden dimension.\n            ctx: batch size x source sequence length  x hidden dimension.\n\n        Returns:\n            output(torch.Tensor)  - batch size x target sequence length  x\n            hidden dimension\n            hidden(torch.Tensor)  - (batch size x hidden dimension, batch size x hidden\n            dimension)\n\n        \"\"\"\n\n        def recurrence(input, hidden, ctx):\n            \"\"\"Recurrence helper.\"\"\"\n            input_gate = self.input_weights(input)\n            hidden_gate = self.hidden_weights(hidden)\n            peep_gate = self.peep_weights(ctx)\n            i_r, i_i, i_n = input_gate.chunk(3, 1)\n            h_r, h_i, h_n = hidden_gate.chunk(3, 1)\n            p_r, p_i, p_n = peep_gate.chunk(3, 1)\n            resetgate = F.sigmoid(i_r + h_r + p_r)\n            inputgate = F.sigmoid(i_i + h_i + p_i)\n            newgate = F.tanh(i_n + resetgate * h_n + p_n)\n            hy = newgate + inputgate * (hidden - newgate)\n\n            return hy\n\n        input = input.transpose(0, 1)\n\n        output = []\n        steps = range(input.size(0))\n        for i in steps:\n            hidden = recurrence(input[i], hidden, ctx)\n            if isinstance(hidden, tuple):\n                output.append(hidden[0])\n            else:\n                output.append(hidden)\n\n        output = torch.cat(output, 0).view(input.size(0), *output[0].size())\n        output = output.transpose(0, 1)\n\n        return output, hidden\n\n\n# Original source: https://github.com/Maluuba/gensen\n"
  },
  {
    "path": "utils_nlp/models/transformers/abstractive_summarization_bertsum.py",
    "content": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\n# This script reuses some code from https://github.com/nlpyang/Presumm\n# This script reuses some code from https://github.com/huggingface/transformers/\n# Add to noticefile\n\nimport logging\nimport os\nimport pickle\nfrom collections import namedtuple\n\nimport torch\nfrom torch.utils.data import DataLoader, RandomSampler, SequentialSampler\nfrom torch.utils.data.distributed import DistributedSampler\nfrom tqdm import tqdm\nfrom transformers import AutoTokenizer, BertModel\n\nfrom utils_nlp.common.pytorch_utils import (\n    compute_training_steps,\n    get_amp,\n    get_device,\n    move_model_to_device,\n    parallelize_model,\n)\nfrom utils_nlp.eval import compute_rouge_python\nfrom utils_nlp.models.transformers.bertsum import model_builder\nfrom utils_nlp.models.transformers.bertsum.model_builder import AbsSummarizer\nfrom utils_nlp.models.transformers.bertsum.predictor import build_predictor\nfrom utils_nlp.models.transformers.common import Transformer\n\nMODEL_CLASS = {\"bert-base-uncased\": BertModel}\n\nlogger = logging.getLogger(__name__)\n\n\ndef fit_to_block_size(sequence, block_size, pad_token_id):\n    \"\"\" Adapt the source and target sequences' lengths to the block size.\n    If the sequence is shorter we append padding token to the right of the sequence.\n\n    Args:\n        sequence (list): sequence to be truncated to padded\n        block_size (int): length of the output\n\n    Returns:\n        sequence (list): padded or shortend list\n\n    \"\"\"\n    if len(sequence) > block_size:\n        return sequence[:block_size]\n    else:\n        sequence.extend([pad_token_id] * (block_size - len(sequence)))\n        return sequence\n\n\ndef build_mask(sequence, pad_token_id):\n    \"\"\" Builds the mask. 
The attention mechanism will only attend to positions\n    with value 1.\n\n    Args:\n        sequence (list): sequences for which the mask is built for.\n        pad_token_id (long): padding token id for which the mask is 0.\n\n    Returns:\n        mask (list): sequences of 1s and 0s.\n\n    \"\"\"\n    mask = torch.ones_like(sequence)\n    idx_pad_tokens = sequence == pad_token_id\n    mask[idx_pad_tokens] = 0\n    return mask\n\n\ndef compute_token_type_ids(batch, separator_token_id):\n    \"\"\" Segment embeddings as described in [1]\n    The values {0,1} were found in the repository [2].\n\n    Args:\n        batch (torch.Tensor, size [batch_size, block_size]):\n            Batch of input.\n        separator_token_id: int\n            The value of the token that separates the segments.\n\n    Returns:\n        torch.Tensor, size [batch_size, block_size]): segment embeddings.\n\n    [1] Liu, Yang, and Mirella Lapata. \"Text summarization with pretrained encoders.\"\n        arXiv preprint arXiv:1908.08345 (2019).\n    [2] https://github.com/nlpyang/PreSumm (/src/prepro/data_builder.py, commit fac1217)\n    \"\"\"\n    batch_embeddings = []\n    for sequence in batch:\n        sentence_num = -1\n        embeddings = []\n        for s in sequence:\n            if s == separator_token_id:\n                sentence_num += 1\n            embeddings.append(sentence_num % 2)\n        batch_embeddings.append(embeddings)\n    return torch.tensor(batch_embeddings)\n\n\nclass BertSumAbsProcessor:\n    \"\"\"Class for preprocessing abstractive summarization data for\n        BertSumAbs algorithm.\"\"\"\n\n    def __init__(\n        self,\n        model_name=\"bert-base-uncased\",\n        to_lower=True,\n        cache_dir=\".\",\n        max_src_len=640,\n        max_tgt_len=140,\n    ):\n        \"\"\" Initialize the preprocessor.\n\n        Args:\n            model_name (str, optional): Transformer model name used in preprocessing.\n                check MODEL_CLASS 
for supported models. Defaults to \"bert-base-cased\".\n            to_lower (bool, optional): Whether to convert all letters to lower case\n                during tokenization. This is determined by if a cased model is used.\n                Defaults to True, which corresponds to a uncased model.\n            cache_dir (str, optional): Directory to cache the tokenizer.\n                Defaults to \".\".\n            max_src_len (int, optional): Max number of tokens that be used\n                as input. Defaults to 640.\n            max_tgt_len (int, optional): Max number of tokens that be used\n                as in target. Defaults to 140.\n\n        \"\"\"\n        self.model_name = model_name\n        self.tokenizer = AutoTokenizer.from_pretrained(\n            model_name,\n            do_lower_case=to_lower,\n            cache_dir=cache_dir,\n            output_loading_info=False,\n        )\n\n        self.symbols = {\n            \"BOS\": self.tokenizer.vocab[\"[unused0]\"],\n            \"EOS\": self.tokenizer.vocab[\"[unused1]\"],\n            \"PAD\": self.tokenizer.vocab[\"[PAD]\"],\n            \"EOQ\": self.tokenizer.vocab[\"[unused2]\"],\n        }\n\n        self.sep_token = \"[SEP]\"\n        self.cls_token = \"[CLS]\"\n        self.pad_token = \"[PAD]\"\n        self.tgt_bos = self.symbols[\"BOS\"]\n        self.tgt_eos = self.symbols[\"EOS\"]\n\n        self.max_src_len = max_src_len\n        self.max_tgt_len = max_tgt_len\n\n    @staticmethod\n    def list_supported_models():\n        return list(MODEL_CLASS)\n\n    @property\n    def model_name(self):\n        return self._model_name\n\n    @model_name.setter\n    def model_name(self, value):\n        if value not in self.list_supported_models():\n            raise ValueError(\n                \"Model name {} is not supported by BertSumAbsProcessor. 
\"\n                \"Call 'BertSumAbsProcessor.list_supported_models()' to \"\n                \"get all supported model names.\".format(value)\n            )\n\n        self._model_name = value\n\n    @staticmethod\n    def get_inputs(batch, device, model_name, train_mode=True):\n        \"\"\"\n        Creates an input dictionary given a model name.\n\n        Args:\n            batch (object): A Batch containing input ids, segment ids,\n                masks for the input ids and source text. If train_mode is True, it\n                also contains the target ids and the number of tokens\n                in the target and target text.\n            device (torch.device): A PyTorch device.\n            model_name (bool): Model name used to format the inputs.\n            train_mode (bool, optional): Training mode flag.\n                Defaults to True.\n\n        Returns:\n            dict: Dictionary containing input ids, segment ids, sentence class ids,\n            masks for the input ids. 
Target ids and number of tokens in the target are\n            only returned when train_mode is True.\n        \"\"\"\n\n        if model_name.split(\"-\")[0] in [\"bert\"]:\n            if train_mode:\n                # labels must be the last\n\n                return {\n                    \"src\": batch.src,\n                    \"segs\": batch.segs,\n                    \"mask_src\": batch.mask_src,\n                    \"tgt\": batch.tgt,\n                    \"tgt_num_tokens\": batch.tgt_num_tokens,\n                }\n            else:\n                return {\n                    \"src\": batch.src,\n                    \"segs\": batch.segs,\n                    \"mask_src\": batch.mask_src,\n                }\n        else:\n            raise ValueError(\"Model not supported: {}\".format(model_name))\n\n    def collate(self, data, block_size, device, train_mode=True):\n        \"\"\" Collate formats the data passed to the data loader.\n        In particular we tokenize the data batch after batch to avoid keeping them\n        all in memory.\n\n        Args:\n            data (list of (str, str)): input data to be loaded.\n            block_size (long): size of the encoded data to be passed into the data loader\n            device (torch.device): A PyTorch device.\n            train_mode (bool, optional): Training mode flag.\n                Defaults to True.\n\n        Returns:\n            namedtuple: a nametuple containing input ids, segment ids,\n                masks for the input ids and source text. 
If train_mode is True, it\n                also contains the target ids and the number of tokens\n                in the target and target text.\n        \"\"\"\n        data = [x for x in data if not len(x[\"src\"]) == 0]  # remove empty_files\n        if len(data) == 0:\n            return None\n        stories = [\" \".join(d[\"src\"]) for d in data]\n        if train_mode is True and \"tgt\" in data[0]:\n            summaries = [\" \".join(d[\"tgt\"]) for d in data]\n            encoded_text = [self.preprocess(d[\"src\"], d[\"tgt\"]) for d in data]\n        else:\n            encoded_text = [self.preprocess(d[\"src\"], None) for d in data]\n\n        encoded_stories = torch.tensor(\n            [\n                fit_to_block_size(story, block_size, self.tokenizer.pad_token_id)\n                for story, _ in encoded_text\n            ]\n        )\n        encoder_token_type_ids = compute_token_type_ids(\n            encoded_stories, self.tokenizer.cls_token_id\n        )\n        encoder_mask = build_mask(encoded_stories, self.tokenizer.pad_token_id)\n\n        if train_mode and \"tgt\" in data[0]:\n            encoded_summaries = torch.tensor(\n                [\n                    [self.tgt_bos]\n                    + fit_to_block_size(\n                        summary, block_size - 2, self.tokenizer.pad_token_id\n                    )\n                    + [self.tgt_eos]\n                    for _, summary in encoded_text\n                ]\n            )\n            summary_num_tokens = [\n                encoded_summary.ne(self.tokenizer.pad_token_id).sum()\n                for encoded_summary in encoded_summaries\n            ]\n\n            Batch = namedtuple(\n                \"Batch\",\n                [\n                    \"src\",\n                    \"segs\",\n                    \"mask_src\",\n                    \"tgt\",\n                    \"tgt_num_tokens\",\n                    \"src_str\",\n                    \"tgt_str\",\n           
     ],\n            )\n            batch = Batch(\n                src=encoded_stories.to(device),\n                segs=encoder_token_type_ids.to(device),\n                mask_src=encoder_mask.to(device),\n                tgt_num_tokens=torch.stack(summary_num_tokens).to(device),\n                tgt=encoded_summaries.to(device),\n                src_str=stories,\n                tgt_str=summaries,\n            )\n        else:\n            Batch = namedtuple(\"Batch\", [\"src\", \"segs\", \"mask_src\"])\n            batch = Batch(\n                src=encoded_stories.to(device),\n                segs=encoder_token_type_ids.to(device),\n                mask_src=encoder_mask.to(device),\n            )\n\n        return batch\n\n    def preprocess(self, story_lines, summary_lines=None):\n        \"\"\"preprocess multiple data points\n\n           Args:\n              story_lines (list of strings): List of sentences.\n              targets (list of strings, optional): List of sentences.\n                  Defaults to None, which means it doesn't include summary and is\n                  not training data.\n\n            Returns:\n                If summary_lines is None, return list of list of token ids. 
Otherwise,\n                return a tuple of (list of list of token ids, list of list of token ids).\n\n        \"\"\"\n        story_lines_token_ids = []\n        for line in story_lines:\n            try:\n                if len(line) <= 0:\n                    continue\n                story_lines_token_ids.append(\n                    self.tokenizer.encode(line, max_length=self.max_src_len)\n                )\n            except:\n                print(line)\n                raise\n        story_token_ids = [\n            token for sentence in story_lines_token_ids for token in sentence\n        ]\n        if summary_lines:\n            summary_lines_token_ids = []\n            for line in summary_lines:\n                try:\n                    if len(line) <= 0:\n                        continue\n                    summary_lines_token_ids.append(\n                        self.tokenizer.encode(line, max_length=self.max_tgt_len)\n                    )\n                except:\n                    print(line)\n                    raise\n            summary_token_ids = [\n                token for sentence in summary_lines_token_ids for token in sentence\n            ]\n            return story_token_ids, summary_token_ids\n        else:\n            return story_token_ids, None\n\n\ndef validate(summarizer, validate_dataset):\n    \"\"\" validation function to be used optionally in fine tuning.\n\n    Args:\n        summarizer(BertSumAbs): The summarizer under fine tuning.\n        validate_dataset (SummarizationDataset): dataset for validation.\n\n    Returns:\n        string: A string which contains the rouge score on a subset of\n            the validation dataset.\n\n    \"\"\"\n    TOP_N = 8\n    shortened_dataset = validate_dataset.shorten(TOP_N)\n    reference_summaries = [\n        \" \".join(t).rstrip(\"\\n\") for t in shortened_dataset.get_target()\n    ]\n    generated_summaries = summarizer.predict(\n        shortened_dataset, num_gpus=1, 
batch_size=4\n    )\n    assert len(generated_summaries) == len(reference_summaries)\n    print(\"###################\")\n    print(\"prediction is {}\".format(generated_summaries[0]))\n    print(\"reference is {}\".format(reference_summaries[0]))\n\n    rouge_score = compute_rouge_python(\n        cand=generated_summaries, ref=reference_summaries\n    )\n    return \"rouge score: {}\".format(rouge_score)\n\n\nclass BertSumAbs(Transformer):\n    \"\"\"class which performs abstractive summarization fine tuning and\n        prediction based on BertSumAbs model  \"\"\"\n\n    def __init__(\n        self,\n        processor,\n        model_name=\"bert-base-uncased\",\n        finetune_bert=True,\n        cache_dir=\".\",\n        label_smoothing=0.1,\n        test=False,\n        max_pos_length=768,\n    ):\n        \"\"\"Initialize an object of BertSumAbs.\n\n        Args:\n            processor (BertSumAbsProcessor): A processor with symbols, tokenizers\n                and collate functions that are used in finetuning and prediction.\n            model_name (str, optional:) Name of the pretrained model which is used\n                to initialize the encoder of the BertSumAbs model.\n                check MODEL_CLASS for supported models. Defaults to \"bert-base-uncased\".\n            finetune_bert (bool, option): Whether the bert model in the encoder is\n                finetune or not. Defaults to True.\n            cache_dir (str, optional): Directory to cache the tokenizer.\n                Defaults to \".\".\n            label_smoothing (float, optional): The amount of label smoothing.\n                Value range is [0, 1]. Defaults to 0.1.\n            test (bool, optional): Whether the class is initiated for test or not.\n                It must be True if the class obj is only initialized to load a\n                 checkpoint for test/inferencing.  
Defaults to False.\n            max_pos_length (int, optional): maximum postional embedding length for the\n                input. Defaults to 768.\n        \"\"\"\n        model = MODEL_CLASS[model_name].from_pretrained(\n            model_name, cache_dir=cache_dir, num_labels=0, output_loading_info=False\n        )\n        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)\n\n        if model_name not in self.list_supported_models():\n            raise ValueError(\n                \"Model name {} is not supported by BertSumAbs. \"\n                \"Call 'BertSumAbs.list_supported_models()' to get all supported model \"\n                \"names.\".format(value)\n            )\n\n        self.model_class = MODEL_CLASS[model_name]\n        self.cache_dir = cache_dir\n        self.max_pos_length = max_pos_length\n\n        self.model = AbsSummarizer(\n            temp_dir=cache_dir,\n            finetune_bert=finetune_bert,\n            checkpoint=None,\n            label_smoothing=label_smoothing,\n            symbols=processor.symbols,\n            test=test,\n            max_pos=self.max_pos_length,\n        )\n        self.processor = processor\n        self.optim_bert = None\n        self.optim_dec = None\n\n    @staticmethod\n    def list_supported_models():\n        return list(MODEL_CLASS.keys())\n\n    def fit(\n        self,\n        train_dataset,\n        num_gpus=None,\n        gpu_ids=None,\n        batch_size=4,\n        local_rank=-1,\n        max_steps=5e4,\n        warmup_steps_bert=20000,\n        warmup_steps_dec=10000,\n        learning_rate_bert=0.002,\n        learning_rate_dec=0.2,\n        optimization_method=\"adam\",\n        max_grad_norm=0,\n        beta1=0.9,\n        beta2=0.999,\n        decay_method=\"noam\",\n        gradient_accumulation_steps=1,\n        report_every=10,\n        save_every=1000,\n        verbose=True,\n        seed=None,\n        fp16=False,\n        fp16_opt_level=\"O2\",\n        
world_size=1,\n        rank=0,\n        validation_function=None,\n        checkpoint=None,\n        **kwargs,\n    ):\n        \"\"\"\n        Fine-tune pre-trained transformer models for abstractive summarization.\n\n        Args:\n            train_dataset (SummarizationDataset): Training dataset.\n            num_gpus (int, optional): The number of GPUs to use. If None, all\n                available GPUs will be used. If set to 0 or GPUs are not available,\n                CPU device will be used. Defaults to None.\n            gpu_ids (list): List of GPU IDs to be used.\n                If set to None, the first num_gpus GPUs will be used.\n                Defaults to None.\n            batch_size (int, optional): Number of training examples in each batch.\n                Defaults to 4.\n            local_rank (int, optional): Local_rank for distributed training on GPUs.\n                Local rank means the ranking of the current GPU device on the current\n                node. Defaults to -1, which means non-distributed training.\n            max_steps (int, optional): Maximum number of training steps. Defaults to 5e4.\n            warmup_steps_bert (int, optional): Number of steps taken to increase\n                learning rate from 0 to `learning_rate` for tuning the BERT encoder.\n                Defaults to 2e4.\n            warmup_steps_dec (int, optional): Number of steps taken to increase\n                learning rate from 0 to `learning_rate` for tuning the decoder.\n                Defaults to 1e4.\n            learning_rate_bert (float, optional):  Learning rate of the optimizer\n                for the encoder. Defaults to 0.002.\n            learning_rate_dec (float, optional):  Learning rate of the optimizer\n                for the decoder. Defaults to 0.2.\n            optimization_method (string, optional): Optimization method used in fine\n                tuning. 
Defaults to \"adam\".\n            max_grad_norm (float, optional): Maximum gradient norm for gradient clipping.\n                Defaults to 0.\n            beta1 (float, optional): The exponential decay rate for the first moment\n                estimates. Defaults to 0.9.\n            beta2 (float, optional): The exponential decay rate for the second-moment\n                estimates. This value should be set close to 1.0 on problems with\n                a sparse gradient. Defaults to 0.999.\n            decay_method (string, optional): learning rate decrease method.\n                Defaults to 'noam'.\n            gradient_accumulation_steps (int, optional): Number of batches to accumulate\n                gradients on between each model parameter update. Defaults to 1.\n            report_every (int, optional): The interval by steps to print out the\n                training log. Defaults to 10.\n            save_every (int, optional): The interval by steps to save the finetuned\n                model. Defaults to 1000.\n            verbose (bool, optional): Whether to print out the training log.\n                Defaults to True.\n            seed (int, optional): Random seed used to improve reproducibility.\n                Defaults to None.\n            fp16 (bool, optional): Whether to use mixed precision training.\n                Defaults to False.\n            fp16_opt_level (str, optional): optimization level, refer to\n                 https://nvidia.github.io/apex/amp.html#opt-levels for details.\n                 Value choices are: \"O0\", \"O1\", \"O2\", \"O3\". Defaults to \"O2\".\n            world_size (int, optional): Total number of GPUs that will be used.\n                Defaults to 1.\n            rank (int, optional): Global rank of the current GPU in distributed\n                training. 
It's calculated with the rank of the current node in the\n                cluster/world and the `local_rank` of the device in the current node.\n                See an example in :file: `examples/text_summarization/\n                abstractive_summarization_bertsum_cnndm_distributed_train.py`.\n                Defaults to 0.\n            validation_function (function, optional): function used in fitting to\n                validate the performance. Default to None.\n            checkpoint (str, optional): file path for a checkpoint based on which the\n                training continues. Default to None.\n        \"\"\"\n\n        # get device\n        device, num_gpus = get_device(\n            num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank\n        )\n        # move model to devices\n        print(\"device is {}\".format(device))\n        if checkpoint:\n            checkpoint = torch.load(checkpoint, map_location=\"cpu\")\n            self.model.load_checkpoint(checkpoint[\"model\"])\n        self.model = move_model_to_device(model=self.model, device=device)\n\n        # init optimizer\n        self.optim_bert = model_builder.build_optim_bert(\n            self.model,\n            optim=optimization_method,\n            lr_bert=learning_rate_bert,\n            warmup_steps_bert=warmup_steps_bert,\n            max_grad_norm=max_grad_norm,\n            beta1=beta1,\n            beta2=beta2,\n        )\n        self.optim_dec = model_builder.build_optim_dec(\n            self.model,\n            optim=optimization_method,\n            lr_dec=learning_rate_dec,\n            warmup_steps_dec=warmup_steps_dec,\n            max_grad_norm=max_grad_norm,\n            beta1=beta1,\n            beta2=beta2,\n        )\n\n        optimizers = [self.optim_bert, self.optim_dec]\n\n        self.amp = get_amp(fp16)\n        if self.amp:\n            self.model, optim = self.amp.initialize(\n                self.model, optimizers, opt_level=fp16_opt_level\n            
)\n\n        global_step = 0\n        if checkpoint:\n            if checkpoint[\"optimizers\"]:\n                for i in range(len(optimizers)):\n                    model_builder.load_optimizer_checkpoint(\n                        optimizers[i], checkpoint[\"optimizers\"][i]\n                    )\n            if self.amp and \"amp\" in checkpoint and checkpoint[\"amp\"]:\n                self.amp.load_state_dict(checkpoint[\"amp\"])\n            if \"global_step\" in checkpoint and checkpoint[\"global_step\"]:\n                global_step = checkpoint[\"global_step\"] / world_size\n                print(\"global_step is {}\".format(global_step))\n\n        self.model = parallelize_model(\n            model=self.model,\n            device=device,\n            num_gpus=num_gpus,\n            gpu_ids=gpu_ids,\n            local_rank=local_rank,\n        )\n\n        if local_rank == -1:\n            sampler = RandomSampler(train_dataset)\n        else:\n            sampler = DistributedSampler(\n                train_dataset, num_replicas=world_size, rank=rank\n            )\n\n        def collate_fn(data):\n            return self.processor.collate(\n                data, block_size=self.max_pos_length, device=device\n            )\n\n        train_dataloader = DataLoader(\n            train_dataset, sampler=sampler, batch_size=batch_size, collate_fn=collate_fn\n        )\n\n        # compute the max number of training steps\n        max_steps = compute_training_steps(\n            train_dataloader,\n            max_steps=max_steps,\n            gradient_accumulation_steps=gradient_accumulation_steps,\n        )\n\n        super().fine_tune(\n            train_dataloader=train_dataloader,\n            get_inputs=BertSumAbsProcessor.get_inputs,\n            device=device,\n            num_gpus=num_gpus,\n            max_steps=max_steps,\n            global_step=global_step,\n            max_grad_norm=max_grad_norm,\n            
gradient_accumulation_steps=gradient_accumulation_steps,\n            verbose=verbose,\n            seed=seed,\n            report_every=report_every,\n            save_every=save_every,\n            clip_grad_norm=False,\n            optimizer=optimizers,\n            scheduler=None,\n            fp16=fp16,\n            amp=self.amp,\n            validation_function=validation_function,\n        )\n\n        # release GPU memories\n        self.model.cpu()\n        torch.cuda.empty_cache()\n\n        self.save_model(max_steps)\n\n    def predict(\n        self,\n        test_dataset,\n        num_gpus=None,\n        gpu_ids=None,\n        local_rank=-1,\n        batch_size=16,\n        alpha=0.6,\n        beam_size=5,\n        min_length=15,\n        max_length=150,\n        fp16=False,\n        verbose=True,\n    ):\n        \"\"\"\n        Predict the summarization for the input data iterator.\n\n        Args:\n            test_dataset (SummarizationDataset): Dataset for which the summary\n                to be predicted.\n            num_gpus (int, optional): The number of GPUs used in prediction.\n                Defaults to 1.\n            gpu_ids (list): List of GPU IDs to be used.\n                If set to None, the first num_gpus GPUs will be used.\n                Defaults to None.\n            local_rank (int, optional): Local rank of the device in distributed\n                inferencing. Defaults to -1, which means non-distributed inferencing.\n            batch_size (int, optional): The number of test examples in each batch.\n                Defaults to 16.\n            alpha (float, optional): Length penalty. Defaults to 0.6.\n            beam_size (int, optional): Beam size of beam search. Defaults to 5.\n            min_length (int, optional): Minimum number of tokens in the output sequence.\n                Defaults to 15.\n            max_length (int, optional):  Maximum number of tokens in output\n                sequence. 
Defaults to 150.\n            fp16 (bool, optional): Whether to use half-precision model for prediction.\n                Defaults to False.\n            verbose (bool, optional): Whether to print out the training log.\n                Defaults to True.\n\n        Returns:\n            List of strings which are the summaries\n\n        \"\"\"\n        device, num_gpus = get_device(\n            num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank\n        )\n\n        # move model to devices\n        def this_model_move_callback(model, device):\n            model = move_model_to_device(model, device)\n            return parallelize_model(\n                model, device, num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank\n            )\n\n        if fp16:\n            self.model = self.model.half()\n\n        self.model = move_model_to_device(self.model, device)\n        self.model.eval()\n\n        predictor = build_predictor(\n            self.processor.tokenizer,\n            self.processor.symbols,\n            self.model,\n            alpha=alpha,\n            beam_size=beam_size,\n            min_length=min_length,\n            max_length=max_length,\n        )\n        predictor = this_model_move_callback(predictor, device)\n        self.model = parallelize_model(\n            self.model,\n            device,\n            num_gpus=num_gpus,\n            gpu_ids=gpu_ids,\n            local_rank=local_rank,\n        )\n\n        test_sampler = SequentialSampler(test_dataset)\n\n        def collate_fn(data):\n            return self.processor.collate(\n                data, self.max_pos_length, device, train_mode=False\n            )\n\n        test_dataloader = DataLoader(\n            test_dataset,\n            sampler=test_sampler,\n            batch_size=batch_size,\n            collate_fn=collate_fn,\n        )\n        print(\"dataset length is {}\".format(len(test_dataset)))\n\n        def format_summary(translation):\n            \"\"\" 
Transforms the output of the `from_batch` function\n            into nicely formatted summaries.\n            \"\"\"\n            raw_summary = translation\n            summary = (\n                raw_summary.replace(\"[unused0]\", \"\")\n                .replace(\"[unused3]\", \"\")\n                .replace(\"[CLS]\", \"\")\n                .replace(\"[SEP]\", \"\")\n                .replace(\"[PAD]\", \"\")\n                .replace(\"[unused1]\", \"\")\n                .replace(r\" +\", \" \")\n                .replace(\" [unused2] \", \".\")\n                .replace(\"[unused2]\", \"\")\n                .strip()\n            )\n\n            return summary\n\n        def generate_summary_from_tokenid(preds, pred_score):\n            batch_size = preds.size()[0]  # batch.batch_size\n            translations = []\n            for b in range(batch_size):\n                if len(preds[b]) < 1:\n                    pred_sents = \"\"\n                else:\n                    pred_sents = self.processor.tokenizer.convert_ids_to_tokens(\n                        [int(n) for n in preds[b] if int(n) != 0]\n                    )\n                    pred_sents = \" \".join(pred_sents).replace(\" ##\", \"\")\n                translations.append(pred_sents)\n            return translations\n\n        generated_summaries = []\n\n        for batch in tqdm(\n            test_dataloader, desc=\"Generating summary\", disable=not verbose\n        ):\n            input = self.processor.get_inputs(batch, device, \"bert\", train_mode=False)\n            translations, scores = predictor(**input)\n\n            translations_text = generate_summary_from_tokenid(translations, scores)\n            summaries = [format_summary(t) for t in translations_text]\n            generated_summaries.extend(summaries)\n\n        # release GPU memories\n        self.model.cpu()\n        torch.cuda.empty_cache()\n\n        return generated_summaries\n\n    def save_model(self, global_step=None, 
full_name=None):\n        \"\"\"\n        save the trained model.\n\n        Args:\n            global_step (int, optional): The number of steps that the model has been\n                finetuned for. Defaults to None.\n            full_name (str, optional): File name to save the model's `state_dict()`.\n                If it's None, the model is going to be saved under \"fine_tuned\" folder\n                of the cached directory of the object. Defaults to None.\n        \"\"\"\n        model_to_save = (\n            self.model.module if hasattr(self.model, \"module\") else self.model\n        )  # Take care of distributed/parallel training\n\n        if full_name is None:\n            output_model_dir = os.path.join(self.cache_dir, \"fine_tuned\")\n            os.makedirs(self.cache_dir, exist_ok=True)\n            os.makedirs(output_model_dir, exist_ok=True)\n            full_name = os.path.join(output_model_dir, \"bertsumabs.pt\")\n        else:\n            path, filename = os.path.split(full_name)\n            print(path)\n            os.makedirs(path, exist_ok=True)\n\n        checkpoint = {\n            \"optimizers\": [self.optim_bert.state_dict(), self.optim_dec.state_dict()],\n            \"model\": model_to_save.state_dict(),\n            \"amp\": self.amp.state_dict() if self.amp else None,\n            \"global_step\": global_step,\n            \"max_pos_length\": self.max_pos_length,\n        }\n\n        logger.info(\"Saving model checkpoint to %s\", full_name)\n        try:\n            print(\"saving through pytorch to {}\".format(full_name))\n            torch.save(checkpoint, full_name)\n        except OSError:\n            try:\n                print(\"saving as pickle\")\n                pickle.dump(checkpoint, open(full_name, \"wb\"))\n            except Exception:\n                raise\n        except Exception:\n            raise\n"
  },
  {
    "path": "utils_nlp/models/transformers/abstractive_summarization_seq2seq.py",
    "content": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport os\nimport json\nimport logging\nfrom tqdm import tqdm\nimport random\n\nimport torch\nfrom torch.utils.data import DataLoader, SequentialSampler, Dataset\nfrom torch.utils.data.distributed import DistributedSampler\n\nfrom transformers import RobertaConfig, BertConfig\n\nfrom utils_nlp.models.transformers.common import Transformer\nfrom utils_nlp.common.pytorch_utils import (\n    get_device,\n    move_model_to_device,\n    parallelize_model,\n)\nimport s2s_ft\nfrom s2s_ft.utils import (\n    Seq2seqDatasetForBert,\n    batch_list_to_batch_tensors,\n)\nfrom s2s_ft.modeling import BertForSequenceToSequence\nfrom s2s_ft.modeling import MINILM_PRETRAINED_MODEL_ARCHIVE_MAP\nfrom s2s_ft.modeling import UNILM_PRETRAINED_MODEL_ARCHIVE_MAP\n\nfrom s2s_ft.tokenization_minilm import MinilmTokenizer\nfrom s2s_ft.configuration_minilm import MinilmConfig, MINILM_PRETRAINED_CONFIG_ARCHIVE_MAP \nfrom s2s_ft.tokenization_unilm import UnilmTokenizer\nfrom s2s_ft.configuration_unilm import UnilmConfig, UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP\n\nfrom s2s_ft.config import BertForSeq2SeqConfig\nimport s2s_ft.s2s_loader as seq2seq_loader\nfrom s2s_ft.modeling_decoding import BertForSeq2SeqDecoder\n\nSUPPORTED_BERT_MODELS = [\"bert-large-uncased\", \"bert-base-cased\", \"bert-large-cased\"]\nSUPPORTED_ROBERTA_MODELS = [\"roberta-base\", \"roberta-large\"]\n\n# ROBERTA and XLM_ROBERTA are converted to BERT format by\n# BertForSequenceToSequence.from_pretrained\nMODEL_CLASS = {}\nMODEL_CLASS.update({k: BertForSequenceToSequence for k in SUPPORTED_BERT_MODELS})\nMODEL_CLASS.update({k: BertForSequenceToSequence for k in SUPPORTED_ROBERTA_MODELS})\nMODEL_CLASS.update(\n    {k: BertForSequenceToSequence for k in UNILM_PRETRAINED_MODEL_ARCHIVE_MAP}\n)\nMODEL_CLASS.update(\n    {k: BertForSequenceToSequence for k in MINILM_PRETRAINED_MODEL_ARCHIVE_MAP}\n)\n\nTOKENIZER_CLASS = 
{}\nTOKENIZER_CLASS.update({k: UnilmTokenizer for k in UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP})\nTOKENIZER_CLASS.update({k: MinilmTokenizer for k in MINILM_PRETRAINED_CONFIG_ARCHIVE_MAP})\n\nCONFIG_CLASS = {}\nCONFIG_CLASS.update({k: BertConfig for k in SUPPORTED_BERT_MODELS})\nCONFIG_CLASS.update({k: RobertaConfig for k in SUPPORTED_ROBERTA_MODELS})\nCONFIG_CLASS.update({k: UnilmConfig for k in UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP})\nCONFIG_CLASS.update({k: MinilmConfig for k in MINILM_PRETRAINED_CONFIG_ARCHIVE_MAP})\n\n# XLM_ROBERTA is for multilingual and is WIP in s2s-ft.\n# We can add it when it's finished and validated\n# from transformers.modeling_xlm_roberta import XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP\n# MODEL_CLASS.update({k: BertForSequenceToSequence for k\n# in XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP})\n# CONFIG_CLASS.update({k: XLMRobertaConfig for k in\n# XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP})\n\n\nlogger = logging.getLogger(__name__)\n\n\ndef _get_model_type(model_name):\n    if \"-\".join(model_name.split(\"-\")[:2]) == \"xlm-roberta\":\n        return \"xlm-roberta\"\n    elif model_name.startswith(\"unilm\"):\n        return \"unilm\"\n    elif model_name.startswith(\"minilm\"):\n        return \"minilm\"\n    else:\n        return model_name.split(\"-\")[0]\n\n\ndef detokenize(tk_list):\n    r_list = []\n    for tk in tk_list:\n        if tk.startswith(\"##\") and len(r_list) > 0:\n            r_list[-1] = r_list[-1] + tk[2:]\n        else:\n            r_list.append(tk)\n    return r_list\n\n\nclass S2SAbsSumDataset(Dataset):\n    \"\"\"\n    Dataset containing data processed and ready to be passed to\n    S2SAbstractiveSummarizer.fit and S2SAbstractiveSummarizer.predict.\n    \"\"\"\n\n    def __init__(self, features):\n        self.features = features\n\n    def __getitem__(self, idx):\n        return self.features[idx]\n\n    def __len__(self):\n        return len(self.features)\n\n\nclass S2SAbsSumProcessor:\n    \"\"\"\n    Processor 
with methods for converting input data in different formats\n    to S2SAbsSumDataset for training and testing.\n\n    Args:\n        model_name (str, optional): Name of the model which determines the\n            tokenizer to use. Call `S2SAbsSumProcessor.list_supported_models()`\n            to see all supported model names. Defaults to \"unilm-base-cased\".\n        to_lower (bool, optional): Whether to convert all letters to lower case\n            during tokenization. This is determined by if a cased model is used.\n            Defaults to False, which corresponds to a cased model.\n        cache_dir (str, optional): Directory to cache the tokenizer.\n            Defaults to \".\".\n    \"\"\"\n\n    def __init__(\n        self, model_name=\"unilm-base-cased\", to_lower=False, cache_dir=\".\",\n    ):\n        if \"uncased\" in model_name:\n            to_lower = True\n        self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(\n            model_name, do_lower_case=to_lower, cache_dir=cache_dir\n        )\n        self.cache_dir = cache_dir\n        self._model_name = model_name\n\n    @staticmethod\n    def list_supported_models():\n        return list(MODEL_CLASS)\n\n    @classmethod\n    def get_inputs(cls, batch, device, model_name):\n        \"\"\"\n        Converts a batch of features to model input format,\n        used by Transformer.fine_tune.\n        \"\"\"\n        batch = tuple(t.to(device) for t in batch)\n        inputs = {\n            \"source_ids\": batch[0],\n            \"target_ids\": batch[1],\n            \"pseudo_ids\": batch[2],\n            \"num_source_tokens\": batch[3],\n            \"num_target_tokens\": batch[4],\n        }\n        return inputs\n\n    @staticmethod\n    def create_s2s_dataset(\n        examples,\n        train_mode,\n        tokenizer,\n        output_dir,\n        local_rank=-1,\n        cached_features_file=None,\n        top_n=-1,\n    ):\n        \"\"\"\n        Creates S2SAbsSumDataset from input 
file or list of dictionaries.\n\n        Args:\n            examples (str or list): Input file path or list of dictionaries.\n                The input file should be in the following format:\n                {\"src\": \"abcdefg\", \"tgt\": \"ag\"}\n                {\"src\": \"hijklmn\", \"tgt\": \"hn\"}\n                where the \"src\" field is the input text to summarize and the \"tgt\"\n                field is the summary.\n                The list of dictionaries should be in similar format:\n                [{\"src\": \"abcdefg\", \"tgt\": \"ag\"},\n                {\"src\": \"hijklmn\", \"tgt\": \"hn\"}]\n                The \"tgt\" field is optional if `train_mode` is False.\n            train_mode (bool): Whether the input data is for training or testing.\n                If True, both \"src\" and \"tgt\" fields need to be provided in\n                `examples`.\n                If False, only the \"src\" field is required.\n            tokenizer (tokenizer): Tokenizer used to convert tokens to token ids. The\n                type of the tokenizer depends on the model that will be used.\n            output_dir (str): Directory to save the cached features files.\n            local_rank (int, optional): Local rank of the device in distributed\n                training. Defaults to -1, which means non-distributed training.\n            cached_features_file (str, optional): Path of the cached features file.\n                If provided and the file already exists, it is loaded and used.\n                If provided and the file doesn't exist, processed features are\n                saved to this file.\n                If not provided, processed features are saved to `output_dir`.\n                Defaults to None.\n            top_n (int, optional): The number which specifies how many examples in the\n                beginning that will be used to create the dataset. 
Defaults to -1,\n                which means the whole lists of examples should be used.\n\n        Returns:\n            S2SAbsSumDataset\n\n        \"\"\"\n        if train_mode:\n            cached_features_file_name = \"cached_features_for_training.pt\"\n            shuffle_flag = True\n        else:\n            cached_features_file_name = \"cached_features_for_testing.pt\"\n            shuffle_flag = False\n\n        if not os.path.exists(output_dir):\n            os.mkdir(output_dir)\n\n        if cached_features_file is None:\n            cached_features_file = os.path.join(output_dir, cached_features_file_name)\n            if os.path.exists(cached_features_file):\n                os.remove(cached_features_file)\n        else:\n            if os.path.exists(cached_features_file):\n                print(\"use cached feature file {}\".format(cached_features_file))\n\n        features = load_and_cache_examples(\n            input_examples=examples,\n            tokenizer=tokenizer,\n            cached_features_file=cached_features_file,\n            shuffle=shuffle_flag,\n            local_rank=local_rank,\n            train_mode=train_mode,\n            top_n=top_n,\n        )\n\n        if not train_mode:\n            features = [\n                tokenizer.convert_ids_to_tokens(line[\"source_ids\"]) for line in features\n            ]\n\n            features = sorted(list(enumerate(features)), key=lambda x: -len(x[1]))\n\n        return S2SAbsSumDataset(features)\n\n    def s2s_dataset_from_iterable_sum_ds(\n        self, sum_ds, train_mode, cached_features_file=None, local_rank=-1, top_n=-1,\n    ):\n        \"\"\"\n        Converts IterableSummarizationDataset to S2SAbsSumDataset.\n\n        Args:\n            sum_ds (IterableSummarizationDataset): Input dataset.\n            train_mode (bool): Whether the input data is for training or testing.\n            cached_features_file (str, optional): Path of the cached features file.\n                If 
provided and the file already exists, it is loaded and used.\n                If provided and the file doesn't exist, processed features are\n                saved to this file.\n                If not provided, processed features are saved to cache_dir.\n                Defaults to None.\n            local_rank (int, optional): Local rank of the device in distributed\n                training. Defaults to -1, which means non-distributed training.\n            top_n (int, optional): The number which specifies how many examples in the\n                beginning of the input dataset that will be used to create the dataset.\n                Defaults to -1, which means the whole dataset should be processed.\n\n        Returns:\n            S2SAbsSumDataset\n        \"\"\"\n\n        examples = []\n        if train_mode:\n            for source, target in zip(sum_ds, sum_ds.get_target()):\n                examples.append({\"src\": source, \"tgt\": target})\n        else:\n            for source in sum_ds:\n                examples.append({\"src\": source})\n\n        s2s_dataset = S2SAbsSumProcessor.create_s2s_dataset(\n            examples=examples,\n            train_mode=train_mode,\n            tokenizer=self.tokenizer,\n            output_dir=self.cache_dir,\n            local_rank=local_rank,\n            cached_features_file=cached_features_file,\n            top_n=top_n,\n        )\n\n        return s2s_dataset\n\n    def s2s_dataset_from_sum_ds(\n        self, sum_ds, train_mode, cached_features_file=None, local_rank=-1, top_n=-1\n    ):\n\n        \"\"\"\n        Converts SummarizationDataset to S2SAbsSumDataset.\n\n        Args:\n            sum_ds (SummarizationDataset): Input dataset.\n            train_mode (bool): Whether the input data is for training or testing.\n            cached_features_file (str, optional): Path of the cached features file.\n                If provided and the file already exists, it is loaded and used.\n                If 
provided and the file doesn't exist, processed features are\n                saved to this file.\n                If not provided, processed features are saved to cache_dir.\n                Defaults to None.\n            local_rank (int, optional): Local rank of the device in distributed\n                training. Defaults to -1, which means non-distributed training.\n            top_n (int, optional): The number which specifies how many examples in the\n                beginning of the input dataset that will be used to create the dataset.\n                Defaults to -1, which means the whole dataset should be processed.\n\n        Returns:\n            S2SAbsSumDataset\n        \"\"\"\n        examples = []\n        for item in sum_ds:\n            examples.append(item)\n\n        s2s_dataset = S2SAbsSumProcessor.create_s2s_dataset(\n            examples=examples,\n            train_mode=train_mode,\n            tokenizer=self.tokenizer,\n            output_dir=self.cache_dir,\n            local_rank=local_rank,\n            cached_features_file=cached_features_file,\n            top_n=top_n,\n        )\n\n        return s2s_dataset\n\n    def s2s_dataset_from_json_or_file(\n        self, input_data, train_mode, cached_features_file=None, local_rank=-1, top_n=-1\n    ):\n        \"\"\"\n        Converts input file or list of dictionaries to S2SAbsSumDataset.\n\n        Args:\n            input_data (str or list): Input file path or list of dictionaries.\n                The input file should be in the following format:\n                {\"src\": \"abcdefg\", \"tgt\": \"ag\"}\n                {\"src\": \"hijklmn\", \"tgt\": \"hn\"}\n                where the \"src\" field is the input text to summarize and the \"tgt\"\n                field is the summary.\n                The list of dictionaries should be in similar format:\n                [{\"src\": \"abcdefg\", \"tgt\": \"ag\"},\n                {\"src\": \"hijklmn\", \"tgt\": \"hn\"}]\n                The 
\"tgt\" field is optional if `train_mode` is False.\n            train_mode (bool): Whether the input data is for training or testing.\n            cached_features_file (str, optional): Path of the cached features file.\n                If provided and the file already exists, it is loaded and used.\n                If provided and the file doesn't exist, processed features are\n                saved to this file.\n                If not provided, processed features are saved to cache_dir.\n                Defaults to None.\n            local_rank (int, optional): Local rank of the device in distributed\n                training. Defaults to -1, which means non-distributed training.\n            top_n (int, optional): The number which specifies how many examples in the\n                beginning of the input dataset that will be used to create the dataset.\n                Defaults to -1, which means the whole input data should be processsed.\n\n\n        Returns:\n            S2SAbsSumDataset\n        \"\"\"\n\n        s2s_dataset = S2SAbsSumProcessor.create_s2s_dataset(\n            examples=input_data,\n            train_mode=train_mode,\n            tokenizer=self.tokenizer,\n            output_dir=self.cache_dir,\n            local_rank=local_rank,\n            cached_features_file=cached_features_file,\n            top_n=top_n,\n        )\n\n        return s2s_dataset\n\n\nclass S2SConfig:\n    \"\"\"\n    This class contains some default decoding settings that the users usually\n    don't need to change.\n\n    Args:\n        new_pos_ids (bool, optional): Whether to use new_pos_ids for LMs.\n            Defaults to False.\n        min_len (int, optional): Minimal length of the output.\n            Defaults to 1.\n        ngram_size (int, optional): Size of forbidden duplicate ngrams.\n            Defaults to 3.\n        mode (str, optional): Choose in \"s2s\" (sequence to sequence),\n            \"l2r\" (left to right), and \"both\". 
Defaults to \"s2s\".\n        s2s_special_token (bool, optional): If True, use a special cls token\n            at the beginning of the sequence. Otherwise, use sep token\n            at the beginning of the sequence. Defaults to False.\n        s2s_add_segment (bool, optional): If True, use special segment id for\n            the first token. Otherwise, use the same segment id for the first\n            token and the first sequence. Defaults to False.\n        s2s_share_segment (bool, optional): If `s2s_add_segment=True` and\n            `s2s_share_segment=True`, share segment embeddings for the\n            encoder of S2S. Defaults to False.\n        pos_shift (bool, optional): Whether to use position shift for\n            fine-tuning. Defaults to False.\n        ffn_type (int, optional): Type of the feedforward network. 0: mlp.\n            1: W((Wx+b) elem_prod x). Defaults to 0.\n        num_qkv (int, optional): Number of different <Q, K, V>. Defaults to 0.\n        seg_emb (bool, optional): Whether to use segment embedding for\n            self-attention. 
Defaults to False.\n\n    \"\"\"\n\n    def __init__(\n        self,\n        new_pos_ids=False,\n        min_len=1,\n        ngram_size=3,\n        mode=\"s2s\",\n        s2s_special_token=False,\n        s2s_add_segment=False,\n        s2s_share_segment=False,\n        pos_shift=False,\n        ffn_type=0,\n        num_qkv=0,\n        seg_emb=False,\n    ):\n\n        self.new_pos_ids = new_pos_ids\n        self.min_len = min_len\n        self.forbid_ngram_size = ngram_size\n        self.mode = mode\n        self.s2s_special_token = s2s_special_token\n        self.s2s_add_segment = s2s_add_segment\n        self.s2s_share_segment = s2s_share_segment\n        self.pos_shift = pos_shift\n        self.ffn_type = ffn_type\n        self.num_qkv = num_qkv\n        self.seg_emb = seg_emb\n\n    def save_to_json(self, json_file):\n        with open(json_file, \"w\") as f:\n            json.dump(self.__dict__, f)\n\n    @classmethod\n    def load_from_json(cls, json_file):\n        config = cls()\n        with open(json_file, \"r\") as f:\n            config.__dict__ = json.load(f)\n        return config\n\n\nclass S2SAbstractiveSummarizer(Transformer):\n    def __init__(\n        self,\n        model_name=\"unilm-base-cased\",\n        to_lower=False,\n        cache_dir=\".\",\n        load_model_from_dir=None,\n        model_file_name=None,\n        label_smoothing=0.1,\n        max_seq_length=512,\n        max_source_seq_length=464,\n        max_target_seq_length=48,\n    ):\n        \"\"\"\n        Abstractive summarizer based on s2s-ft.\n\n        Args:\n            model_name (str, optional): Name of the model.\n                Call `S2SAbstractiveSummarizer.list_supported_models()` to see all\n                supported model names. Defaults to \"unilm-base-cased\".\n            to_lower (bool, optional): Whether to convert all letters to lower case\n                during tokenization. 
This is determined by if a cased model is used.\n                Defaults to False, which corresponds to a cased model.\n            cache_dir (str, optional): Directory to cache downloaded model files.\n                Defaults to \".\".\n            load_model_from_dir (str, optional): Directory to load the model from. If\n                model_file_name is not provided, assume model was saved by\n                `:func:`~transformers.PreTrainedModel.save_pretrained`` and the\n                directory should contain pytorch_model.bin and config.json.\n                Defaults to None.\n            model_file_name (str, optional): Name of the model file under\n                `load_model_from_dir`. If provided, assume model was saved by\n                `S2SAbstractiveSummarizer.save_model`.\n            label_smoothing (float, optional): Alpha in label smoothing.\n                Defaults to 0.1.\n            max_seq_length (int, optional): Maximum length of the sequence that\n                concatenates source sequence tokens, target sequence tokens, and\n                special tokens like cls and sep. Defaults to 512.\n            max_source_seq_length (int, optional): Maximum number of tokens in the\n                source sequence after tokenization. Defaults to 464.\n            max_target_seq_length (int, optional); Maximum number of tokens in the\n                target sequence after tokenization. Defaults to 48.\n\n        \"\"\"\n\n        if model_name not in self.list_supported_models():\n            raise ValueError(\n                \"Model name {0} is not supported by {1}. 
\"\n                \"Call '{1}.list_supported_models()' to get all supported model \"\n                \"names.\".format(model_name, self.__class__.__name__)\n            )\n        model_class = MODEL_CLASS[model_name]\n        config_class = CONFIG_CLASS[model_name]\n\n        self._model_name = model_name\n        self._model_type = _get_model_type(self._model_name)\n        if \"uncased\" in model_name:\n            to_lower = True\n\n        # self._bert_model_name is needed for BertForSeq2SeqDecoder\n        if self._model_type != \"bert\":\n            if self._model_type == \"roberta\":\n                self._bert_model_name = (\n                    self._model_name.replace(\"roberta\", \"bert\") + \"-cased\"\n                )\n            else:\n                self._bert_model_name = \"bert-\" + self._model_name.split(\"-\", 1)[-1]\n        else:\n            self._bert_model_name = self._model_name\n\n        self.cache_dir = cache_dir\n        self.load_model_from_dir = load_model_from_dir\n        self.do_lower_case = to_lower\n        self.max_seq_length = max_seq_length\n        self.max_source_seq_length = max_source_seq_length\n        self.max_target_seq_length = max_target_seq_length\n\n        if load_model_from_dir is None:\n            model_to_load = self._model_name\n        elif model_file_name is None:\n            # Assume model was saved by\n            # `:func:`~transformers.PreTrainedModel.save_pretrained``,\n            # The load_model_from_dir should contain pytorch_model.bin and config.json\n            # and can be loaded by\n            # `:func:`~transformers.PreTrainedModel.from_pretrained``.\n            logger.info(\"Loading cached model from {}\".format(load_model_from_dir))\n            model_to_load = load_model_from_dir\n        else:\n            # Assume model was saved by S2SAbstractiveSummarizer.save_model\n            model_to_load = os.path.join(load_model_from_dir, model_file_name)\n            
logger.info(\"Loading cached model from {}\".format(model_to_load))\n\n        if load_model_from_dir is not None and model_file_name is None:\n            # Assume config.json is in load_model_from_dir\n            model_config = config_class.from_pretrained(\n                load_model_from_dir, cache_dir=cache_dir\n            )\n        else:\n            model_config = config_class.from_pretrained(\n                self._model_name, cache_dir=cache_dir\n            )\n        \n        self.model_config = model_config\n\n        # Convert regular model config to sequence to sequence config\n        config = BertForSeq2SeqConfig.from_exist_config(\n            config=model_config,\n            label_smoothing=label_smoothing,\n            max_position_embeddings=self.max_source_seq_length\n            + self.max_target_seq_length,\n        )\n        logger.info(\"Model config for seq2seq: %s\", str(config))\n\n        self.model = model_class.from_pretrained(\n            model_to_load,\n            config=config,\n            model_type=self._model_type,\n            cache_dir=cache_dir,\n            reuse_position_embedding=True,\n        )\n\n        self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(\n            self._model_name,\n            do_lower_case=to_lower,\n            cache_dir=cache_dir,\n            output_loading_info=False,\n        )\n\n    @staticmethod\n    def list_supported_models():\n        return list(MODEL_CLASS)\n\n    def fit(\n        self,\n        train_dataset,\n        learning_rate=5e-5,\n        per_gpu_batch_size=8,\n        num_epochs=1,\n        recover_step=-1,\n        recover_dir=None,\n        save_model_to_dir=None,\n        max_steps=-1,\n        local_rank=-1,\n        num_gpus=None,\n        gpu_ids=None,\n        gradient_accumulation_steps=1,\n        weight_decay=0.01,\n        adam_epsilon=1e-8,\n        warmup_steps=0,\n        fp16=False,\n        fp16_opt_level=\"O1\",\n        
max_grad_norm=1.0,\n        verbose=True,\n        seed=None,\n        random_prob=0.1,\n        keep_prob=0.1,\n    ):\n\n        \"\"\"\n        Method for model fine-tuning.\n\n        Args:\n            train_dataset (S2SAbsSumDataset): Training dataset.\n            learning_rate (float, optional): Learning rate. Defaults to 5e-5.\n            per_gpu_batch_size (int, optional): Number of samples in each batch per\n                GPU. Defaults to 8.\n            num_epochs (int, optional): Number of passes through the entire training\n                dataset. Ignored if `max_steps` is set. Defaults to 1.\n            recover_step (int, optional): Step number to resume model fine-tuning from,\n                assuming the model was saved by `S2SAbstractiveSummarizer.save_model`\n                and the name is in the format \"model.{recover_step}.bin\".\n                Defaults to -1, which means start model fine-tuning from scratch.\n            recover_dir (str, optional): Directory to load model from if recover_step is\n                provided. Defaults to None.\n            save_model_to_dir (str, optional): Directory to save the model to. Defaults\n                to None and the fine-tuned model is not saved.\n            max_steps (int, optional): Maximum number of training steps. Defaults to -1\n                and the number of training steps is determined by `num_epochs` and the\n                length of `train_dataset`.\n            local_rank (int, optional): Rank of the device in distributed training.\n                Defaults to -1 which means non-distributed training.\n            num_gpus (int, optional): Number of GPUs to use. Ignored if `gpu_ids` is\n                provided. Defaults to None and all available GPUs are used.\n            gpu_ids (list, optional): List of GPU IDs to use. 
Defaults to None and GPUs\n                used are determined by num_gpus.\n            gradient_accumulation_steps (int, optional): Number of steps to accumulate\n                gradient before each back propagation and model parameters update.\n                Defaults to 1.\n            weight_decay (float, optional): Weight decay to apply after each parameter\n                update. Defaults to 0.01.\n            adam_epsilon (float, optional): Epsilon of the AdamW optimizer.\n                Defaults to 1e-8.\n            warmup_steps (int, optional): Number of steps taken to increase learning\n                rate from 0 to `learning_rate`. Defaults to 0.\n            fp16 (bool, optional): Whether to use 16-bit mixed precision through Apex.\n                Defaults to False.\n            fp16_opt_level (str, optional): Apex AMP optimization level for fp16.\n                One of ['O0', 'O1', 'O2', 'O3'].\n                See https://nvidia.github.io/apex/amp.html\n                Defaults to \"O1\".\n            max_grad_norm (float, optional): Maximum gradient norm for gradient\n                clipping. Defaults to 1.0.\n            verbose (bool, optional): Whether to output training log. Defaults to True.\n            seed (int, optional): Random seed for model initialization.\n                Defaults to None.\n            random_prob (float, optional): Probability to randomly replace a masked\n                token. Defaults to 0.1.\n            keep_prob (float, optional): Probability to keep no change for a masked\n                token. 
Defaults to 0.1.\n\n        \"\"\"\n        global_step = 0\n        if recover_step > 0:\n            model_recover_checkpoint = os.path.join(\n                recover_dir, \"model.{}.bin\".format(recover_step)\n            )\n            logger.info(\n                \" ** Recover model checkpoint in %s ** \", model_recover_checkpoint\n            )\n            model_state_dict = torch.load(model_recover_checkpoint, map_location=\"cpu\")\n            optimizer_recover_checkpoint = os.path.join(\n                recover_dir, \"optim.{}.bin\".format(recover_step)\n            )\n            checkpoint_state_dict = torch.load(\n                optimizer_recover_checkpoint, map_location=\"cpu\"\n            )\n\n            checkpoint_state_dict[\"model\"] = model_state_dict\n            global_step = recover_step\n        else:\n            checkpoint_state_dict = None\n\n        device, num_gpus, amp = self.prepare_model_and_optimizer(\n            num_gpus=num_gpus,\n            gpu_ids=gpu_ids,\n            local_rank=local_rank,\n            fp16=fp16,\n            fp16_opt_level=fp16_opt_level,\n            weight_decay=weight_decay,\n            learning_rate=learning_rate,\n            adam_epsilon=adam_epsilon,\n            checkpoint_state_dict=checkpoint_state_dict,\n        )\n\n        per_node_train_batch_size = (\n            per_gpu_batch_size * max(1, num_gpus) * gradient_accumulation_steps\n        )\n\n        # actual batch size, i.e. number of samples between each parameter update\n        batch_size = per_node_train_batch_size * (\n            torch.distributed.get_world_size() if local_rank != -1 else 1\n        )\n\n        # max_steps is mainly used by the scheduler to determine the learning rate,\n        # together with global_step\n        if max_steps == -1:\n            max_steps = max(num_epochs * len(train_dataset) // batch_size, 1)\n\n        if max_steps <= global_step:\n            logger.info(\"Training is done. 
Please use a new dir or clean this dir!\")\n\n            return\n\n        self.scheduler = Transformer.get_default_scheduler(\n            optimizer=self.optimizer,\n            warmup_steps=warmup_steps,\n            num_training_steps=max_steps,\n        )\n        if recover_step > 0:\n            self.scheduler.load_state_dict(checkpoint_state_dict[\"lr_scheduler\"])\n\n        train_dataset = Seq2seqDatasetForBert(\n            features=train_dataset,\n            max_source_len=self.max_source_seq_length,\n            max_target_len=self.max_target_seq_length,\n            vocab_size=self.tokenizer.vocab_size,\n            cls_id=self.tokenizer.cls_token_id,\n            sep_id=self.tokenizer.sep_token_id,\n            pad_id=self.tokenizer.pad_token_id,\n            mask_id=self.tokenizer.mask_token_id,\n            random_prob=random_prob,\n            keep_prob=keep_prob,\n            num_training_instances=batch_size * max_steps,\n            offset=batch_size * global_step,\n        )\n\n        # The training features are shuffled\n        train_sampler = (\n            SequentialSampler(train_dataset)\n            if local_rank == -1\n            else DistributedSampler(train_dataset, shuffle=False)\n        )\n        # batch_size of the dataloader is the number of samples to load each\n        # iteration on each node\n        train_dataloader = DataLoader(\n            train_dataset,\n            sampler=train_sampler,\n            batch_size=per_node_train_batch_size // gradient_accumulation_steps,\n            collate_fn=batch_list_to_batch_tensors,\n        )\n\n        global_step, _ = super().fine_tune(\n            train_dataloader=train_dataloader,\n            device=device,\n            num_gpus=num_gpus,\n            get_inputs=S2SAbsSumProcessor.get_inputs,\n            max_steps=max_steps,\n            global_step=global_step,\n            gradient_accumulation_steps=gradient_accumulation_steps,\n            optimizer=self.optimizer,\n 
           scheduler=self.scheduler,\n            local_rank=local_rank,\n            fp16=fp16,\n            amp=amp,\n            max_grad_norm=max_grad_norm,\n            verbose=verbose,\n            seed=seed,\n        )\n\n        if save_model_to_dir is not None and local_rank in [-1, 0]:\n            self.save_model(save_model_to_dir, global_step, fp16)\n\n        # release GPU memories\n        self.model.cpu()\n        torch.cuda.empty_cache()\n        return global_step\n\n    def predict(\n        self,\n        test_dataset,\n        per_gpu_batch_size=4,\n        max_tgt_length=64,\n        beam_size=1,\n        need_score_traces=False,\n        length_penalty=0,\n        forbid_duplicate_ngrams=True,\n        forbid_ignore_word=\".\",\n        s2s_config=S2SConfig(),\n        num_gpus=None,\n        gpu_ids=None,\n        local_rank=-1,\n        fp16=False,\n        verbose=True,\n    ):\n        \"\"\"\n        Method for predicting, i.e. generating summaries.\n        Args:\n            test_dataset (S2SAbsSumDataset): Testing dataset.\n            per_gpu_batch_size (int, optional): Number of testing samples in each\n                batch per GPU. Defaults to 4.\n            max_tgt_length (int, optional): Maximum number of tokens in output\n                sequence. Defaults to 64.\n            beam_size (int, optional): Beam size of beam search. Defaults to 1.\n            need_score_traces (bool, optional): Whether to return score traces of\n                beam search. Defaults to False.\n            length_penalty (float, optional): Length penalty for beam search.\n                Defaults to 0.\n            forbid_duplicate_ngrams (bool, optional): Whether to forbid duplicate\n                n-grams when generating output. Size of the n-gram is determined by\n                `S2SConfig.ngram_size` which defaults to 3. 
Defaults to True.\n            forbid_ignore_word (str, optional): Words to ignore when forbidding\n                duplicate ngrams. Multiple words should be separated by \"|\", for\n                example, \".|[X_SEP]\". Defaults to \".\".\n            s2s_config (S2SConfig, optional): Some default decoding settings that\n                the users usually don't need to change. Defaults to S2SConfig().\n            num_gpus (int, optional): Number of GPUs to use. Ignored if `gpu_ids` is\n                provided. Defaults to None and all available GPUs are used.\n            gpu_ids (list, optional): List of GPU IDs to use. Defaults to None and GPUs\n                used are determined by num_gpus.\n            local_rank (int, optional): Rank of the device in distributed training.\n                Defaults to -1 which means non-distributed training.\n            fp16 (bool, optional): Whether to use 16-bit mixed precision through Apex.\n                Defaults to False.\n            verbose(bool, optional): Whether to output predicting log. Defaults to True.\n\n        Returns:\n            List or tuple of lists: List of generated summaries. 
If `need_score_traces`\n                is True, also returns the score traces of beam search.\n\n        \"\"\"\n\n        if need_score_traces and beam_size <= 1:\n            raise ValueError(\n                \"Score trace is only available for beam search with beam size > 1.\"\n            )\n        if max_tgt_length >= self.max_seq_length - 2:\n            raise ValueError(\"Maximum tgt length exceeds max seq length - 2.\")\n\n        # preprocessing pipeline\n        if self._model_type == \"roberta\":\n            is_roberta = True\n            no_segment_embedding = True\n            vocab = self.tokenizer.encoder\n        else:\n            is_roberta = False\n            no_segment_embedding = False\n            vocab = self.tokenizer.vocab\n\n        if not self._model_name.startswith(\"unilm1.2\"):\n            if self._model_name.startswith(\"unilm-\") or self._model_name.startswith(\n                \"unilm1-\"\n            ):\n                new_segment_ids = True\n            else:\n                new_segment_ids = False\n        else:\n            new_segment_ids = False\n\n        cls_token = \"<s>\" if is_roberta else \"[CLS]\"\n        sep_token = \"</s>\" if is_roberta else \"[SEP]\"\n        pad_token = \"<pad>\" if is_roberta else \"[PAD]\"\n        mask_token = \"<mask>\" if is_roberta else \"[MASK]\"\n\n        max_src_length = self.max_seq_length - 2 - max_tgt_length\n        tokenizer = self.tokenizer\n        bi_uni_pipeline = []\n        bi_uni_pipeline.append(\n            seq2seq_loader.Preprocess4Seq2seqDecoder(\n                list(vocab.keys()),\n                tokenizer.convert_tokens_to_ids,\n                self.max_seq_length,\n                max_tgt_length=max_tgt_length,\n                pos_shift=s2s_config.pos_shift,\n                source_type_id=self.model_config.source_type_id,\n                target_type_id=self.model_config.target_type_id,\n                cls_token=tokenizer.cls_token,\n                
sep_token=tokenizer.sep_token,\n                pad_token=tokenizer.pad_token,\n            )\n        )\n        mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids(\n        [tokenizer.mask_token, tokenizer.sep_token, tokenizer.sep_token])\n\n        def collate_fn(input_batch):\n            buf_id = [x[0] for x in input_batch]\n            buf = [x[1][:max_src_length] for x in input_batch]\n            max_a_len = max([len(x) for x in buf])\n            instances = []\n            for instance in [(x, max_a_len) for x in buf]:\n                for proc in bi_uni_pipeline:\n                    instance = proc(instance)\n                instances.append(instance)\n            batch = seq2seq_loader.batch_list_to_batch_tensors(instances)\n\n            return (batch, buf_id)\n\n        # prepare decoder\n        pair_num_relation = 0\n        cls_num_labels = 2\n        type_vocab_size = (\n            6 + (1 if s2s_config.s2s_add_segment else 0) if new_segment_ids else 2\n        )\n        forbid_ignore_set = None\n        if forbid_ignore_word:\n            w_list = []\n            for w in forbid_ignore_word.split(\"|\"):\n                if w.startswith(\"[\") and w.endswith(\"]\"):\n                    w_list.append(w.upper())\n                else:\n                    w_list.append(w)\n            forbid_ignore_set = set(self.tokenizer.convert_tokens_to_ids(w_list))\n\n        if hasattr(self.model, \"module\"):\n            state_dict = self.model.module.state_dict()\n        else:\n            state_dict = self.model.state_dict()\n        \n        bert_config = None\n        if self._model_type == \"minilm\":\n            bert_config = s2s_ft.modeling_decoding.BertConfig(\n                    30522,\n                    type_vocab_size=type_vocab_size,\n                    hidden_size=384,\n                    intermediate_size=1536, \n                    max_position_embeddings=self.max_seq_length,)\n\n        else: # 
self._model_type == \"unilm\":\n            bert_config = s2s_ft.modeling_decoding.BertConfig(\n                    len(list(vocab.keys())) if not is_roberta else 50265,\n                    type_vocab_size=type_vocab_size,\n                    max_position_embeddings=self.max_seq_length,)\n\n       \n        model = BertForSeq2SeqDecoder.from_pretrained(\n            self._bert_model_name,\n            bert_config,\n            state_dict=state_dict,\n            cache_dir=self.cache_dir,\n            mask_word_id=mask_word_id,\n            search_beam_size=beam_size,\n            length_penalty=length_penalty,\n            eos_id=eos_word_ids,\n            sos_id=sos_word_id,\n            forbid_duplicate_ngrams=forbid_duplicate_ngrams,\n            forbid_ignore_set=forbid_ignore_set,\n            ngram_size=s2s_config.forbid_ngram_size,\n            min_len=s2s_config.min_len,\n            mode=s2s_config.mode,\n            max_position_embeddings=self.max_seq_length,\n            pos_shift=s2s_config.pos_shift,\n        )\n\n        del state_dict\n\n        if fp16:\n            model.half()\n        # get device\n        device, num_gpus = get_device(\n            num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank\n        )\n\n        # # move model\n        model = move_model_to_device(model=model, device=device)\n\n        batch_size = per_gpu_batch_size * max(1, num_gpus)\n\n        model = parallelize_model(\n            model=model,\n            device=device,\n            num_gpus=num_gpus,\n            gpu_ids=gpu_ids,\n            local_rank=local_rank,\n        )\n\n        # torch.cuda.empty_cache()\n        model.eval()\n        first_batch = True\n        batch_count = 0\n\n        output_lines = [\"\"] * len(test_dataset)\n        score_trace_list = [None] * len(test_dataset)\n\n        test_sampler = SequentialSampler(test_dataset)\n        test_dataloader = DataLoader(\n            test_dataset,\n            sampler=test_sampler,\n     
       batch_size=batch_size,\n            collate_fn=collate_fn,\n        )\n        for batch, buf_id in tqdm(\n            test_dataloader, desc=\"Evaluating\", disable=not verbose\n        ):\n            batch_count += 1\n            with torch.no_grad():\n                batch = [t.to(device) if t is not None else None for t in batch]\n                (\n                    input_ids,\n                    token_type_ids,\n                    position_ids,\n                    input_mask,\n                    mask_qkv,\n                    task_idx,\n                ) = batch\n                traces = model(\n                    input_ids,\n                    token_type_ids,\n                    position_ids,\n                    input_mask,\n                    task_idx=task_idx,\n                    mask_qkv=mask_qkv,\n                )\n                if beam_size > 1:\n                    traces = {k: v.tolist() for k, v in traces.items()}\n                    output_ids = traces[\"pred_seq\"]\n                else:\n                    output_ids = traces.tolist()\n\n                for i in range(len(batch[0])):\n                    w_ids = output_ids[i]\n                    output_buf = self.tokenizer.convert_ids_to_tokens(w_ids)\n                    output_tokens = []\n                    for t in output_buf:\n                        if t in (sep_token, pad_token):\n                            break\n                        output_tokens.append(t)\n                    if is_roberta:\n                        output_sequence = self.tokenizer.convert_tokens_to_string(\n                            output_tokens\n                        )\n                    else:\n                        output_sequence = \" \".join(detokenize(output_tokens))\n                    if \"\\n\" in output_sequence:\n                        output_sequence = \" [X_SEP] \".join(output_sequence.split(\"\\n\"))\n                    output_lines[buf_id[i]] = output_sequence\n     
               if first_batch or batch_count % 50 == 0:\n                        logger.info(\"{} = {}\".format(buf_id[i], output_sequence))\n                    if need_score_traces:\n                        score_trace_list[buf_id[i]] = {\n                            \"scores\": traces[\"scores\"][i],\n                            \"wids\": traces[\"wids\"][i],\n                            \"ptrs\": traces[\"ptrs\"][i],\n                        }\n\n            first_batch = False\n\n        del model\n        del batch\n        torch.cuda.empty_cache()\n\n        if need_score_traces:\n            return output_lines, score_trace_list\n        else:\n            return output_lines\n\n    def save_model(self, output_dir, global_step, fp16):\n        model_to_save = (\n            self.model.module if hasattr(self.model, \"module\") else self.model\n        )\n        torch.save(\n            model_to_save.state_dict(),\n            os.path.join(output_dir, \"model.{}.bin\".format(global_step)),\n        )\n        optim_to_save = {\n            \"optimizer\": self.optimizer.state_dict(),\n            \"lr_scheduler\": self.scheduler.state_dict(),\n        }\n        if fp16:\n            optim_to_save[\"amp\"] = self.amp_state_dict\n        torch.save(\n            optim_to_save, os.path.join(output_dir, \"optim.{}.bin\".format(global_step)),\n        )\n\n\ndef load_and_cache_examples(\n    input_examples,\n    tokenizer,\n    local_rank,\n    train_mode=True,\n    cached_features_file=None,\n    shuffle=True,\n    top_n=-1,\n):\n    # Make sure only the first process in distributed training process the dataset,\n    # and the others will use the cache\n    if local_rank not in [-1, 0]:\n        torch.distributed.barrier()\n\n    if cached_features_file is not None and os.path.exists(cached_features_file):\n        logger.info(\"Loading features from cached file %s\", cached_features_file)\n        features = torch.load(cached_features_file)\n    else:\n        
if isinstance(input_examples, str):\n            logger.info(\"Creating features from dataset file at %s\", input_examples)\n            examples = []\n            with open(input_examples, mode=\"r\", encoding=\"utf-8\") as reader:\n                for line in reader:\n                    examples.append(json.loads(line))\n        else:\n            examples = input_examples\n\n        if top_n != -1:\n            examples = examples[0:top_n]\n\n        features = []\n        if train_mode:\n            for example in tqdm(examples):\n                if isinstance(example[\"src\"], list):\n                    source_tokens = example[\"src\"]\n                    target_tokens = example[\"tgt\"]\n                else:\n                    source_tokens = tokenizer.tokenize(example[\"src\"])\n                    target_tokens = tokenizer.tokenize(example[\"tgt\"])\n                features.append(\n                    {\n                        \"source_ids\": tokenizer.convert_tokens_to_ids(source_tokens),\n                        \"target_ids\": tokenizer.convert_tokens_to_ids(target_tokens),\n                    }\n                )\n        else:\n            for example in tqdm(examples):\n                if isinstance(example[\"src\"], list):\n                    source_tokens = example[\"src\"]\n                else:\n                    source_tokens = tokenizer.tokenize(example[\"src\"])\n                features.append(\n                    {\"source_ids\": tokenizer.convert_tokens_to_ids(source_tokens),}\n                )\n\n        if shuffle:\n            random.shuffle(features)\n\n        if cached_features_file is not None:\n            logger.info(\"Saving features into cached file %s\", cached_features_file)\n            torch.save(features, cached_features_file)\n\n    # Make sure only the first process in distributed training process the dataset,\n    # and the others will use the cache\n    if local_rank == 0:\n        
torch.distributed.barrier()\n\n    return features\n"
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/__init__.py",
    "content": ""
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/adam.py",
    "content": "# Licensed under the MIT License.\n# This script reuses code from https://github.com/nlpyang/Presumm\n\n\"\"\" Implementation of ADAM optimizer. \"\"\"\n\nimport math\nimport torch\nfrom torch.optim.optimizer import Optimizer\n\n\nclass Adam(Optimizer):\n    r\"\"\"Implements Adam algorithm.\n\n    It has been proposed in `Adam: A Method for Stochastic Optimization`_.\n\n    Arguments:\n        params (iterable): iterable of parameters to optimize or dicts defining\n            parameter groups\n        lr (float, optional): learning rate (default: 1e-3)\n        betas (Tuple[float, float], optional): coefficients used for computing\n            running averages of gradient and its square (default: (0.9, 0.999))\n        eps (float, optional): term added to the denominator to improve\n            numerical stability (default: 1e-8)\n        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)\n        amsgrad (boolean, optional): whether to use the AMSGrad variant of this\n            algorithm from the paper `On the Convergence of Adam and Beyond`_\n            (default: False)\n\n    .. _Adam\\: A Method for Stochastic Optimization:\n        https://arxiv.org/abs/1412.6980\n    .. 
_On the Convergence of Adam and Beyond:\n        https://openreview.net/forum?id=ryQu7f-RZ\n    \"\"\"\n\n    def __init__(\n        self,\n        params,\n        lr=1e-3,\n        betas=(0.9, 0.999),\n        eps=1e-8,\n        weight_decay=0,\n        amsgrad=False,\n    ):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        defaults = dict(\n            lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad\n        )\n        super(Adam, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(Adam, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault(\"amsgrad\", False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            for p in group[\"params\"]:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data\n                if grad.is_sparse:\n                    raise RuntimeError(\n                        \"Adam does not support sparse gradients, please consider SparseAdam instead\"\n                    )\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state[\"step\"] = 0\n                    # 
Exponential moving average of gradient values\n                    state[\"next_m\"] = torch.zeros_like(p.data)\n                    # Exponential moving average of squared gradient values\n                    state[\"next_v\"] = torch.zeros_like(p.data)\n\n                next_m, next_v = state[\"next_m\"], state[\"next_v\"]\n                beta1, beta2 = group[\"betas\"]\n\n                # Decay the first and second moment running average coefficient\n                # In-place operations to update the averages at the same time\n                next_m.mul_(beta1).add_(1 - beta1, grad)\n                next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n                update = next_m / (next_v.sqrt() + group[\"eps\"])\n\n                # Just adding the square of the weights to the loss function is *not*\n                # the correct way of using L2 regularization/weight decay with Adam,\n                # since that will interact with the m and v parameters in strange ways.\n                #\n                # Instead we want to decay the weights in a manner that doesn't interact\n                # with the m/v parameters. This is equivalent to adding the square\n                # of the weights to the loss with plain (non-momentum) SGD.\n                if group[\"weight_decay\"] > 0.0:\n                    update += group[\"weight_decay\"] * p.data\n\n                lr_scheduled = group[\"lr\"]\n\n                update_with_lr = lr_scheduled * update\n                p.data.add_(-update_with_lr)\n\n                state[\"step\"] += 1\n\n                # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1\n                # No bias correction\n                # bias_correction1 = 1 - beta1 ** state['step']\n                # bias_correction2 = 1 - beta2 ** state['step']\n\n        return loss\n"
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/beam.py",
    "content": "# Licensed under the MIT License.\n# This script reuses code from https://github.com/nlpyang/Presumm\n\n\"\"\" Beam classes used in the beam search. \"\"\"\n\nfrom __future__ import division\nimport torch\nfrom .penalties import PenaltyBuilder\n\n\nclass Beam(object):\n    \"\"\"\n    Class for managing the internals of the beam search process.\n\n    Takes care of beams, back pointers, and scores.\n\n    Args:\n       size (int): beam size\n       pad, bos, eos (int): indices of padding, beginning, and ending.\n       n_best (int): nbest size to use\n       cuda (bool): use gpu\n       global_scorer (:obj:`GlobalScorer`)\n    \"\"\"\n\n    def __init__(\n        self,\n        size,\n        pad,\n        bos,\n        eos,\n        n_best=1,\n        cuda=False,\n        global_scorer=None,\n        min_length=0,\n        stepwise_penalty=False,\n        block_ngram_repeat=0,\n        exclusion_tokens=set(),\n    ):\n\n        self.size = size\n        self.tt = torch.cuda if cuda else torch\n\n        # The score for each translation on the beam.\n        self.scores = self.tt.FloatTensor(size).zero_()\n        self.all_scores = []\n\n        # The backpointers at each time-step.\n        self.prev_ks = []\n\n        # The outputs at each time-step.\n        self.next_ys = [self.tt.LongTensor(size).fill_(pad)]\n        self.next_ys[0][0] = bos\n\n        # Has EOS topped the beam yet.\n        self._eos = eos\n        self.eos_top = False\n\n        # The attentions (matrix) for each time.\n        self.attn = []\n\n        # Time and k pair for finished.\n        self.finished = []\n        self.n_best = n_best\n\n        # Information for global scoring.\n        self.global_scorer = global_scorer\n        self.global_state = {}\n\n        # Minimum prediction length\n        self.min_length = min_length\n\n        # Apply Penalty at every step\n        self.stepwise_penalty = stepwise_penalty\n        self.block_ngram_repeat = 
block_ngram_repeat\n        self.exclusion_tokens = exclusion_tokens\n\n    def get_current_state(self):\n        \"Get the outputs for the current timestep.\"\n        return self.next_ys[-1]\n\n    def get_current_origin(self):\n        \"Get the backpointers for the current timestep.\"\n        return self.prev_ks[-1]\n\n    def advance(self, word_probs, attn_out):\n        \"\"\"\n        Given prob over words for every last beam `wordLk` and attention\n        `attn_out`: Compute and update the beam search.\n\n        Parameters:\n\n        * `word_probs`- probs of advancing from the last step (K x words)\n        * `attn_out`- attention at the last step\n\n        Returns: True if beam search is complete.\n        \"\"\"\n        num_words = word_probs.size(1)\n        if self.stepwise_penalty:\n            self.global_scorer.update_score(self, attn_out)\n        # force the output to be longer than self.min_length\n        cur_len = len(self.next_ys)\n        if cur_len < self.min_length:\n            for k in range(len(word_probs)):\n                word_probs[k][self._eos] = -1e20\n        # Sum the previous scores.\n        if len(self.prev_ks) > 0:\n            beam_scores = word_probs + self.scores.unsqueeze(1).expand_as(word_probs)\n            # Don't let EOS have children.\n            for i in range(self.next_ys[-1].size(0)):\n                if self.next_ys[-1][i] == self._eos:\n                    beam_scores[i] = -1e20\n\n            # Block ngram repeats\n            if self.block_ngram_repeat > 0:\n                ngrams = []\n                le = len(self.next_ys)\n                for j in range(self.next_ys[-1].size(0)):\n                    hyp, _ = self.get_hyp(le - 1, j)\n                    ngrams = set()\n                    fail = False\n                    gram = []\n                    for i in range(le - 1):\n                        # Last n tokens, n = block_ngram_repeat\n                        gram = (gram + 
[hyp[i].item()])[-self.block_ngram_repeat :]\n                        # Skip the blocking if it is in the exclusion list\n                        if set(gram) & self.exclusion_tokens:\n                            continue\n                        if tuple(gram) in ngrams:\n                            fail = True\n                        ngrams.add(tuple(gram))\n                    if fail:\n                        beam_scores[j] = -10e20\n        else:\n            beam_scores = word_probs[0]\n        flat_beam_scores = beam_scores.view(-1)\n        best_scores, best_scores_id = flat_beam_scores.topk(self.size, 0, True, True)\n\n        self.all_scores.append(self.scores)\n        self.scores = best_scores\n\n        # best_scores_id is flattened beam x word array, so calculate which\n        # word and beam each score came from\n        prev_k = best_scores_id / num_words\n        self.prev_ks.append(prev_k)\n        self.next_ys.append((best_scores_id - prev_k * num_words))\n        self.attn.append(attn_out.index_select(0, prev_k))\n        self.global_scorer.update_global_state(self)\n\n        for i in range(self.next_ys[-1].size(0)):\n            if self.next_ys[-1][i] == self._eos:\n                global_scores = self.global_scorer.score(self, self.scores)\n                s = global_scores[i]\n                self.finished.append((s, len(self.next_ys) - 1, i))\n\n        # End condition is when top-of-beam is EOS and no global score.\n        if self.next_ys[-1][0] == self._eos:\n            self.all_scores.append(self.scores)\n            self.eos_top = True\n\n    def done(self):\n        return self.eos_top and len(self.finished) >= self.n_best\n\n    def sort_finished(self, minimum=None):\n        if minimum is not None:\n            i = 0\n            # Add from beam until we have minimum outputs.\n            while len(self.finished) < minimum:\n                global_scores = self.global_scorer.score(self, self.scores)\n                s = 
global_scores[i]\n                self.finished.append((s, len(self.next_ys) - 1, i))\n                i += 1\n\n        self.finished.sort(key=lambda a: -a[0])\n        scores = [sc for sc, _, _ in self.finished]\n        ks = [(t, k) for _, t, k in self.finished]\n        return scores, ks\n\n    def get_hyp(self, timestep, k):\n        \"\"\"\n        Walk back to construct the full hypothesis.\n        \"\"\"\n        hyp, attn = [], []\n        for j in range(len(self.prev_ks[:timestep]) - 1, -1, -1):\n            hyp.append(self.next_ys[j + 1][k])\n            attn.append(self.attn[j][k])\n            k = self.prev_ks[j][k]\n        return hyp[::-1], torch.stack(attn[::-1])\n\n\nclass GNMTGlobalScorer(object):\n    \"\"\"\n    NMT re-ranking score from\n    \"Google's Neural Machine Translation System\" :cite:`wu2016google`\n\n    Args:\n       alpha (float): length parameter\n       length_penalty: length penalty option passed to PenaltyBuilder\n    \"\"\"\n\n    def __init__(self, alpha, length_penalty):\n        self.alpha = alpha\n        penalty_builder = PenaltyBuilder(length_penalty)\n        # Term will be subtracted from probability\n        # Probability will be divided by this\n        self.length_penalty = penalty_builder.length_penalty()\n\n    def score(self, beam, logprobs):\n        \"\"\"\n        Rescores a prediction based on penalty functions\n        \"\"\"\n        normalized_probs = self.length_penalty(beam, logprobs, self.alpha)\n\n        return normalized_probs\n"
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/data_loader.py",
    "content": "\"\"\"\nPortions Copyright (c) Microsoft Corporation\n\"\"\"\nimport gc\nimport itertools\nimport random\nimport torch\n\n\nclass IterableDistributedSampler(object):\n    \"\"\" Distributed sampler for iterable dataset.\n\n    Args:\n        world_size (int, optional): Total number of GPUs that will be used.\n            Defaults to 1.\n        rank (int, optional): Rank of the current GPU. Defaults to 0.\n        local_rank(int, optional): local_rank of the current GPU. Defaults to -1.\n\n    \"\"\"\n\n    def __init__(self, world_size=1, rank=0, local_rank=-1):\n        self.world_size = world_size\n        self.rank = rank\n        self.local_rank = local_rank\n\n    def iter(self, iterable):\n        if self.local_rank == -1:\n            return iterable\n        if self.world_size > 1:\n            return itertools.islice(iterable, self.rank, None, self.world_size)\n        else:\n            return iterable\n\n\nclass ChunkDataLoader(object):\n    \"\"\" Data Loader for Chunked Dataset.\n\n    Args:\n        datasets (list): list of data item list.\n        batch_size (int): Number of tokens per batch.\n        shuffle (bool): Whether the data is shuffled.\n        is_labeled (bool): Whether the data is labeled.\n        sampler (obj): Data sampler.\n\n    \"\"\"\n\n    def __init__(self, datasets, batch_size, shuffle, is_labeled, sampler):\n        self.datasets = datasets\n        self.batch_size = batch_size\n        self.shuffle = shuffle\n        self.is_labeled = is_labeled\n        self.cur_iter = self._next_dataset_iterator(datasets)\n        assert self.cur_iter is not None\n        self.sampler = sampler\n\n    def eachiter(self):\n        dataset_iter = (d for d in self.datasets)\n        while self.cur_iter is not None:\n            for batch in self.cur_iter:\n                yield batch\n            self.cur_iter = self._next_dataset_iterator(dataset_iter)\n\n    def __iter__(self):\n        return 
self.sampler.iter(self.eachiter())\n\n    def _next_dataset_iterator(self, dataset_iter):\n        try:\n            # Drop the current dataset for decreasing memory\n            if hasattr(self, \"cur_dataset\"):\n                self.cur_dataset = None\n                gc.collect()\n                del self.cur_dataset\n                gc.collect()\n\n            self.cur_dataset = next(dataset_iter)\n        except StopIteration:\n            return None\n\n        return DataIterator(\n            dataset=self.cur_dataset,\n            batch_size=self.batch_size,\n            shuffle=self.shuffle,\n            is_labeled=self.is_labeled,\n        )\n\n\nclass Batch(object):\n    def _pad(self, data, pad_id, width=-1):\n        if width == -1:\n            width = max(len(d) for d in data)\n        rtn_data = [d + [pad_id] * (width - len(d)) for d in data]\n        return rtn_data\n\n    def __init__(self, data=None, is_labeled=False):\n        \"\"\"Create a Batch from a list of examples.\"\"\"\n        if data is None or len(data) == 0:\n            raise ValueError(\"data is empty\")\n        self.batch_size = len(data)\n        pre_src = [x[0] for x in data]\n        pre_segs = [x[2] for x in data]\n        pre_clss = [x[3] for x in data]\n\n        src = torch.tensor(self._pad(pre_src, 0))\n\n        pre_labels = None\n        labels = None\n        if is_labeled:\n            pre_labels = [x[1] for x in data]\n            labels = torch.tensor(self._pad(pre_labels, 0))\n        segs = torch.tensor(self._pad(pre_segs, 0))\n        mask = ~(src == 0)\n\n        clss = torch.tensor(self._pad(pre_clss, -1))\n        mask_cls = ~(clss == -1)\n        clss[clss == -1] = 0\n\n        setattr(self, \"clss\", clss)\n        setattr(self, \"mask_cls\", mask_cls)\n        setattr(self, \"src\", src)\n        setattr(self, \"segs\", segs)\n        setattr(self, \"mask\", mask)\n\n        src_str = [x[4] for x in data]\n        setattr(self, \"src_str\", src_str)\n\n   
     if is_labeled:\n            setattr(self, \"labels\", labels)\n            tgt_str = [x[5] for x in data]\n            setattr(self, \"tgt_str\", tgt_str)\n\n    def to(self, device):\n        src = self.src.to(device)\n        segs = self.segs.to(device)\n        clss = self.clss.to(device)\n        mask = self.mask.to(device)\n        mask_cls = self.mask_cls.to(device)\n\n        setattr(self, \"clss\", clss)\n        setattr(self, \"mask_cls\", mask_cls)\n        setattr(self, \"src\", src)\n        setattr(self, \"segs\", segs)\n        setattr(self, \"mask\", mask)\n        if hasattr(self, \"labels\"):\n            labels = self.labels.to(device)\n            setattr(self, \"labels\", labels.to(device))\n\n        return self\n\n    def __len__(self):\n        return self.batch_size\n\n\ndef create_batch_with_size(data, batch_size):\n    \"\"\"Yield elements from data in chunks of batch_size.\"\"\"\n    minibatch, size_so_far = [], 0\n    for ex in data:\n        minibatch.append(ex)\n        size_so_far = simple_batch_size_fn(ex, len(minibatch))\n        if size_so_far == batch_size:\n            yield minibatch\n            minibatch, size_so_far = [], 0\n        elif size_so_far > batch_size:\n            yield minibatch[:-1]\n            minibatch, size_so_far = minibatch[-1:], simple_batch_size_fn(ex, 1)\n    if minibatch:\n        yield minibatch\n\n\ndef simple_batch_size_fn(new, count):\n    src, labels = new[0], new[1]\n    global max_n_sents, max_n_tokens, max_size\n    if count == 1:\n        max_size = 0\n        max_n_sents = 0\n        max_n_tokens = 0\n    max_n_sents = max(max_n_sents, len(src))\n    max_size = max(max_size, max_n_sents)\n    src_elements = count * max_size\n    return src_elements\n\n\nclass DataIterator(object):\n    def __init__(self, dataset, batch_size, is_labeled=False, shuffle=True, sort=True):\n        self.dataset = dataset\n        self.batch_size = batch_size\n        self.is_labeled = is_labeled\n        
self.iterations = 0\n        self.shuffle = shuffle\n        self.sort = sort\n\n        self.sort_key = lambda x: len(x[1])\n\n        self._iterations_this_epoch = 0\n\n    def data(self):\n        if self.shuffle:\n            random.shuffle(self.dataset)\n        xs = self.dataset\n        return xs\n\n    def preprocess(self, ex, is_labeled):\n        src = ex[\"src\"]\n        if \"labels\" in ex:\n            labels = ex[\"labels\"]\n        else:\n            labels = None  # ex['src_sent_labels']\n\n        segs = ex[\"segs\"]\n        # if(not self.args.use_interval):\n        #    segs=[0]*len(segs)\n        clss = ex[\"clss\"]\n        src_txt = ex[\"src_txt\"]\n        tgt_txt = ex[\"tgt_txt\"]\n\n        if is_labeled:\n            return src, labels, segs, clss, src_txt, tgt_txt\n        else:\n            return src, labels, segs, clss, src_txt, None\n\n    def batch_buffer(self, data, batch_size):\n        minibatch, size_so_far = [], 0\n        for ex in data:\n            if len(ex[\"src\"]) == 0:\n                continue\n            ex = self.preprocess(ex, self.is_labeled)\n            if ex is None:\n                continue\n            minibatch.append(ex)\n            size_so_far = simple_batch_size_fn(ex, len(minibatch))\n            if size_so_far == batch_size:\n                yield minibatch\n                minibatch, size_so_far = [], 0\n            elif size_so_far > batch_size:\n                yield minibatch[:-1]\n                minibatch, size_so_far = minibatch[-1:], simple_batch_size_fn(ex, 1)\n        if minibatch:\n            yield minibatch\n\n    def create_batches(self):\n        \"\"\" Create batches \"\"\"\n        data = self.data()\n        for buffer in self.batch_buffer(data, self.batch_size * 50):\n\n            if self.sort:\n                p_batch = sorted(buffer, key=lambda x: len(x[3]))\n            else:\n                p_batch = buffer\n            p_batch = create_batch_with_size(p_batch, 
self.batch_size)\n\n            p_batch = list(p_batch)\n            if self.shuffle:\n                random.shuffle(p_batch)\n            for b in p_batch:\n                yield b\n\n    def __iter__(self):\n        while True:\n            self.batches = self.create_batches()\n            for idx, minibatch in enumerate(self.batches):\n                # fast-forward if loaded from state\n                if self._iterations_this_epoch > idx:\n                    continue\n                self.iterations += 1\n                self._iterations_this_epoch += 1\n                # if len(minibatch) == 0:\n                #    continue\n                batch = Batch(minibatch, self.is_labeled)\n                yield batch\n            return\n"
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/dataset.py",
    "content": "import itertools\nimport torch\nfrom torch.utils.data import (\n    Dataset,\n    IterableDataset,\n)\n\n\ndef get_dataset(file):\n    yield torch.load(file)\n\n\nclass ExtSumProcessedIterableDataset(IterableDataset):\n    \"\"\"Iterable dataset for extractive summarization preprocessed data\n    \"\"\"\n\n    def __init__(self, file_list, is_shuffle=False):\n        \"\"\" Initiation function for iterable dataset for extractive summarization\n            preprocessed data.\n\n        Args:\n            file_list (list of strings): List of files that the dataset is loaded from.\n            is_shuffle (bool, optional): A boolean value specifies whether the list of\n                files is shuffled when the dataset is loaded. Defaults to False.\n        \"\"\"\n\n        self.file_list = file_list\n        self.is_shuffle = is_shuffle\n\n    def get_stream(self):\n        \"\"\" get a stream of cycled data from the dataset\"\"\"\n\n        if self.is_shuffle:\n            return itertools.chain.from_iterable(\n                map(get_dataset, itertools.cycle(self.file_list))\n            )\n        else:\n            return itertools.chain.from_iterable(\n                map(get_dataset, itertools.cycle(random.shuffle(self.file_list)))\n            )\n\n    def __iter__(self):\n        return self.get_stream()\n\n\nclass ExtSumProcessedDataset(Dataset):\n    \"\"\"Dataset for extractive summarization preprocessed data\n    \"\"\"\n\n    def __init__(self, file_list, is_shuffle=False):\n        \"\"\" Initiation function for dataset for extractive summarization preprocessed data.\n\n        Args:\n            file_list (list of strings): List of files that the dataset is loaded from.\n            is_shuffle (bool, optional): A boolean value specifies whether the list of\n                files is shuffled when the dataset is loaded. 
Defaults to False.\n        \"\"\"\n\n        self.file_list = sorted(file_list)\n        if is_shuffle:\n            random.shuffle(self.file_list)\n        self.data = []\n        for f in self.file_list:\n            self.data.extend(torch.load(f))\n\n    def __len__(self):\n        return len(self.data)\n\n    def __getitem__(self, idx):\n        return self.data[idx]\n"
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/decoder.py",
    "content": "# Licensed under the MIT License.\n# This script reuses code from https://github.com/nlpyang/Presumm\n\n\"\"\" \nDecoder implementation of \"Attention is All You Need\"\n\"\"\"\n\nimport torch\nimport torch.nn as nn\nimport numpy as np\n\nfrom .encoder import PositionalEncoding\nfrom .neural import MultiHeadedAttention, PositionwiseFeedForward, DecoderState\n\nMAX_SIZE = 5000\n\n\nclass TransformerDecoderLayer(nn.Module):\n    \"\"\"\n    Args:\n      d_model (int): the dimension of keys/values/queries in\n                       MultiHeadedAttention, also the input size of\n                       the first-layer of the PositionwiseFeedForward.\n      heads (int): the number of heads for MultiHeadedAttention.\n      d_ff (int): the second-layer of the PositionwiseFeedForward.\n      dropout (float): dropout probability(0-1.0).\n      self_attn_type (string): type of self-attention scaled-dot, average\n    \"\"\"\n\n    def __init__(self, d_model, heads, d_ff, dropout):\n        super(TransformerDecoderLayer, self).__init__()\n\n        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)\n\n        self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)\n        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)\n        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)\n        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)\n        self.drop = nn.Dropout(dropout)\n        mask = self._get_attn_subsequent_mask(MAX_SIZE)\n        # Register self.mask as a buffer in TransformerDecoderLayer, so\n        # it gets TransformerDecoderLayer's cuda behavior automatically.\n        self.register_buffer(\"mask\", mask)\n\n    def forward(\n        self,\n        inputs,\n        memory_bank,\n        src_pad_mask,\n        tgt_pad_mask,\n        previous_input=None,\n        layer_cache=None,\n        step=None,\n    ):\n        \"\"\"\n        Args:\n            inputs (`FloatTensor`): `[batch_size 
x 1 x model_dim]`\n            memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]`\n            src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]`\n            tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]`\n\n        Returns:\n            (`FloatTensor`, `FloatTensor`, `FloatTensor`):\n\n            * output `[batch_size x 1 x model_dim]`\n            * attn `[batch_size x 1 x src_len]`\n            * all_input `[batch_size x current_step x model_dim]`\n\n        \"\"\"\n        dec_mask = torch.gt(\n            tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)],\n            0,\n        )\n        input_norm = self.layer_norm_1(inputs)\n        all_input = input_norm\n        if previous_input is not None:\n            all_input = torch.cat((previous_input, input_norm), dim=1)\n            dec_mask = None\n\n        query = self.self_attn(\n            all_input,\n            all_input,\n            input_norm,\n            mask=dec_mask,\n            layer_cache=layer_cache,\n            type=\"self\",\n        )\n\n        query = self.drop(query) + inputs\n\n        query_norm = self.layer_norm_2(query)\n        mid = self.context_attn(\n            memory_bank,\n            memory_bank,\n            query_norm,\n            mask=src_pad_mask,\n            layer_cache=layer_cache,\n            type=\"context\",\n        )\n        output = self.feed_forward(self.drop(mid) + query)\n\n        return output, all_input\n        # return output\n\n    def _get_attn_subsequent_mask(self, size):\n        \"\"\"\n        Get an attention mask to avoid using the subsequent info.\n\n        Args:\n            size: int\n\n        Returns:\n            (`LongTensor`):\n\n            * subsequent_mask `[1 x size x size]`\n        \"\"\"\n        attn_shape = (1, size, size)\n        subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype(\"uint8\")\n        subsequent_mask = torch.from_numpy(subsequent_mask)\n        
return subsequent_mask\n\n\nclass TransformerDecoder(nn.Module):\n    \"\"\"\n    The Transformer decoder from \"Attention is All You Need\".\n\n\n    .. mermaid::\n\n       graph BT\n          A[input]\n          B[multi-head self-attn]\n          BB[multi-head src-attn]\n          C[feed forward]\n          O[output]\n          A --> B\n          B --> BB\n          BB --> C\n          C --> O\n\n\n    Args:\n       num_layers (int): number of decoder layers.\n       d_model (int): size of the model\n       heads (int): number of heads\n       d_ff (int): size of the inner FF layer\n       dropout (float): dropout parameters\n       embeddings (:obj:`onmt.modules.Embeddings`):\n          embeddings to use, should have positional encodings\n       attn_type (str): if using a separate copy attention\n    \"\"\"\n\n    def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings):\n        super(TransformerDecoder, self).__init__()\n\n        # Basic attributes.\n        self.decoder_type = \"transformer\"\n        self.num_layers = num_layers\n        self.embeddings = embeddings\n        self.pos_emb = PositionalEncoding(dropout, self.embeddings.embedding_dim)\n\n        # Build TransformerDecoder.\n        self.transformer_layers = nn.ModuleList(\n            [\n                TransformerDecoderLayer(d_model, heads, d_ff, dropout)\n                for _ in range(num_layers)\n            ]\n        )\n\n        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)\n\n    def forward(\n        self,\n        tgt,\n        memory_bank,\n        state,\n        memory_lengths=None,\n        step=None,\n        cache=None,\n        memory_masks=None,\n    ):\n        \"\"\"\n        See :obj:`onmt.modules.RNNDecoderBase.forward()`\n        \"\"\"\n\n        src_words = state.src\n        tgt_words = tgt\n        src_batch, src_len = src_words.size()\n        tgt_batch, tgt_len = tgt_words.size()\n\n        # Run the forward pass of the TransformerDecoder.\n   
     # emb = self.embeddings(tgt, step=step)\n        emb = self.embeddings(tgt)\n        assert emb.dim() == 3  # len x batch x embedding_dim\n\n        output = self.pos_emb(emb, step)\n\n        src_memory_bank = memory_bank\n        padding_idx = self.embeddings.padding_idx\n        tgt_pad_mask = (\n            tgt_words.data.eq(padding_idx)\n            .type(torch.uint8)\n            .unsqueeze(1)\n            .expand(tgt_batch, tgt_len, tgt_len)\n        )\n\n        if not memory_masks is None:\n            src_len = memory_masks.size(-1)\n            src_pad_mask = memory_masks.expand(src_batch, tgt_len, src_len)\n\n        else:\n            src_pad_mask = (\n                src_words.data.eq(padding_idx)\n                .unsqueeze(1)\n                .expand(src_batch, tgt_len, src_len)\n            )\n\n        if state.cache is None:\n            saved_inputs = []\n\n        for i in range(self.num_layers):\n            prev_layer_input = None\n            if state.cache is None:\n                if state.previous_input is not None:\n                    prev_layer_input = state.previous_layer_inputs[i]\n            output, all_input = self.transformer_layers[i](\n                output,\n                src_memory_bank,\n                src_pad_mask,\n                tgt_pad_mask,\n                previous_input=prev_layer_input,\n                layer_cache=state.cache[\"layer_{}\".format(i)]\n                if state.cache is not None\n                else None,\n                step=step,\n            )\n            if state.cache is None:\n                saved_inputs.append(all_input)\n\n        if state.cache is None:\n            saved_inputs = torch.stack(saved_inputs)\n\n        output = self.layer_norm(output)\n\n        # Process the result and update the attentions.\n\n        if state.cache is None:\n            state = state.update_state(tgt, saved_inputs)\n\n        return output, state\n\n    def init_decoder_state(self, src, 
memory_bank, with_cache=False):\n        \"\"\" Init decoder state \"\"\"\n        state = TransformerDecoderState(src)\n        if with_cache:\n            state._init_cache(memory_bank, self.num_layers)\n        return state\n\n\nclass TransformerDecoderState(DecoderState):\n    \"\"\" Transformer Decoder state base class \"\"\"\n\n    def __init__(self, src):\n        \"\"\"\n        Args:\n            src (FloatTensor): a sequence of source words tensors\n                    with optional feature tensors, of size (len x batch).\n        \"\"\"\n        self.src = src\n        self.previous_input = None\n        self.previous_layer_inputs = None\n        self.cache = None\n\n    @property\n    def _all(self):\n        \"\"\"\n        Contains attributes that need to be updated in self.beam_update().\n        \"\"\"\n        if self.previous_input is not None and self.previous_layer_inputs is not None:\n            return (self.previous_input, self.previous_layer_inputs, self.src)\n        else:\n            return (self.src,)\n\n    def detach(self):\n        if self.previous_input is not None:\n            self.previous_input = self.previous_input.detach()\n        if self.previous_layer_inputs is not None:\n            self.previous_layer_inputs = self.previous_layer_inputs.detach()\n        self.src = self.src.detach()\n\n    def update_state(self, new_input, previous_layer_inputs):\n        state = TransformerDecoderState(self.src)\n        state.previous_input = new_input\n        state.previous_layer_inputs = previous_layer_inputs\n        return state\n\n    def _init_cache(self, memory_bank, num_layers):\n        self.cache = {}\n\n        for l in range(num_layers):\n            layer_cache = {\"memory_keys\": None, \"memory_values\": None}\n            layer_cache[\"self_keys\"] = None\n            layer_cache[\"self_values\"] = None\n            self.cache[\"layer_{}\".format(l)] = layer_cache\n\n    def repeat_beam_size_times(self, beam_size):\n      
  \"\"\" Repeat beam_size times along batch dimension. \"\"\"\n        self.src = self.src.data.repeat(1, beam_size, 1)\n\n    def map_batch_fn(self, fn):\n        def _recursive_map(struct, batch_dim=0):\n            for k, v in struct.items():\n                if v is not None:\n                    if isinstance(v, dict):\n                        _recursive_map(v)\n                    else:\n                        struct[k] = fn(v, batch_dim)\n\n        self.src = fn(self.src, 0)\n        if self.cache is not None:\n            _recursive_map(self.cache)\n"
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/encoder.py",
    "content": "# Licensed under the MIT License.\n# This script reuses code from https://github.com/nlpyang/Presumm\n\n\"Encoder classes used in the BertSum models.\"\nimport math\n\nimport torch\nimport torch.nn as nn\n\nfrom .neural import MultiHeadedAttention, PositionwiseFeedForward\n\n\nclass Classifier(nn.Module):\n    def __init__(self, hidden_size):\n        super(Classifier, self).__init__()\n        self.linear1 = nn.Linear(hidden_size, 1)\n        self.sigmoid = nn.Sigmoid()\n\n    def forward(self, x, mask_cls):\n        h = self.linear1(x).squeeze(-1)\n        sent_scores = self.sigmoid(h) * mask_cls.float()\n        return sent_scores\n\n\nclass PositionalEncoding(nn.Module):\n    def __init__(self, dropout, dim, max_len=5000):\n        pe = torch.zeros(max_len, dim)\n        position = torch.arange(0, max_len).unsqueeze(1)\n        div_term = torch.exp(\n            (torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim))\n        )\n        pe[:, 0::2] = torch.sin(position.float() * div_term)\n        pe[:, 1::2] = torch.cos(position.float() * div_term)\n        pe = pe.unsqueeze(0)\n        super(PositionalEncoding, self).__init__()\n        self.register_buffer(\"pe\", pe)\n        self.dropout = nn.Dropout(p=dropout)\n        self.dim = dim\n\n    def forward(self, emb, step=None):\n        emb = emb * math.sqrt(self.dim)\n        if step:\n            emb = emb + self.pe[:, step][:, None, :]\n\n        else:\n            emb = emb + self.pe[:, : emb.size(1)]\n        emb = self.dropout(emb)\n        return emb\n\n    def get_emb(self, emb):\n        return self.pe[:, : emb.size(1)]\n\n\nclass TransformerEncoderLayer(nn.Module):\n    def __init__(self, d_model, heads, d_ff, dropout):\n        super(TransformerEncoderLayer, self).__init__()\n\n        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)\n        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)\n        self.layer_norm = 
nn.LayerNorm(d_model, eps=1e-6)\n        self.dropout = nn.Dropout(dropout)\n\n    def forward(self, iter, query, inputs, mask):\n        if iter != 0:\n            input_norm = self.layer_norm(inputs)\n        else:\n            input_norm = inputs\n\n        mask = mask.unsqueeze(1)\n        context = self.self_attn(input_norm, input_norm, input_norm, mask=mask)\n        out = self.dropout(context) + inputs\n        return self.feed_forward(out)\n\n\nclass ExtTransformerEncoder(nn.Module):\n    def __init__(self, d_model, d_ff, heads, dropout, num_inter_layers=0):\n        super(ExtTransformerEncoder, self).__init__()\n        self.d_model = d_model\n        self.num_inter_layers = num_inter_layers\n        self.pos_emb = PositionalEncoding(dropout, d_model)\n        self.transformer_inter = nn.ModuleList(\n            [\n                TransformerEncoderLayer(d_model, heads, d_ff, dropout)\n                for _ in range(num_inter_layers)\n            ]\n        )\n        self.dropout = nn.Dropout(dropout)\n        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)\n        self.wo = nn.Linear(d_model, 1, bias=True)\n        self.sigmoid = nn.Sigmoid()\n\n    def forward(self, top_vecs, mask):\n        \"\"\" See :obj:`EncoderBase.forward()`\"\"\"\n\n        batch_size, n_sents = top_vecs.size(0), top_vecs.size(1)\n        pos_emb = self.pos_emb.pe[:, :n_sents]\n        x = top_vecs * mask[:, :, None].float()\n        x = x + pos_emb\n\n        for i in range(self.num_inter_layers):\n            x = self.transformer_inter[i](\n                i, x, x, ~mask\n            )  # all_sents * max_tokens * dim\n\n        x = self.layer_norm(x)\n        sent_scores = self.sigmoid(self.wo(x))\n        sent_scores = sent_scores.squeeze(-1) * mask.float()\n\n        return sent_scores\n\nclass RNNEncoder(nn.Module):\n\n    def __init__(self, bidirectional, num_layers, input_size,\n                 hidden_size, dropout=0.0):\n        super(RNNEncoder, self).__init__()\n    
    num_directions = 2 if bidirectional else 1\n        assert hidden_size % num_directions == 0\n        hidden_size = hidden_size // num_directions\n\n        self.rnn = LayerNormLSTM(\n            input_size=input_size,\n            hidden_size=hidden_size,\n            num_layers=num_layers,\n            bidirectional=bidirectional)\n\n        self.wo = nn.Linear(num_directions * hidden_size, 1, bias=True)\n        self.dropout = nn.Dropout(dropout)\n        self.sigmoid = nn.Sigmoid()\n\n    def forward(self, x, mask):\n        \"\"\"See :func:`EncoderBase.forward()`\"\"\"\n        x = torch.transpose(x, 1, 0)\n        memory_bank, _ = self.rnn(x)\n        memory_bank = self.dropout(memory_bank) + x\n        memory_bank = torch.transpose(memory_bank, 1, 0)\n\n        sent_scores = self.sigmoid(self.wo(memory_bank))\n        sent_scores = sent_scores.squeeze(-1) * mask.float()\n        return sent_scores\n"
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/loss.py",
    "content": "# Modifications Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n# This script reuses code from https://github.com/nlpyang/Presumm\n\n\n\n\"\"\"\nThis file handles the details of the loss function during training.\n\nThis includes: LossComputeBase and the standard NMTLossCompute, and\n               sharded loss compute stuff.\n\"\"\"\nfrom __future__ import division\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n# from models.reporter import Statistics\n\n\ndef abs_loss(generator, symbols, vocab_size, train=True, label_smoothing=0.0):\n    compute = NMTLossCompute(\n        generator,\n        symbols,\n        vocab_size,\n        label_smoothing=label_smoothing if train else 0.0,\n    )\n    # compute.to(device)\n    return compute\n\n\nclass LossComputeBase(nn.Module):\n    \"\"\"\n    Class for managing efficient loss computation. Handles\n    sharding next step predictions and accumulating mutiple\n    loss computations\n\n\n    Users can implement their own loss computation strategy by making\n    subclass of this one.  Users need to implement the _compute_loss()\n    and make_shard_state() methods.\n\n    Args:\n        generator (:obj:`nn.Module`) :\n             module that maps the output of the decoder to a\n             distribution over the target vocabulary.\n        tgt_vocab (:obj:`Vocab`) :\n             torchtext vocab object representing the target output\n        normalzation (str): normalize by \"sents\" or \"tokens\"\n    \"\"\"\n\n    def __init__(self, generator, pad_id):\n        super(LossComputeBase, self).__init__()\n        self.generator = generator\n        self.padding_idx = pad_id\n\n    def _make_shard_state(self, batch, output, attns=None):\n        \"\"\"\n        Make shard state dictionary for shards() to return iterable\n        shards for efficient loss computation. 
Subclass must define\n        this method to match its own _compute_loss() interface.\n        Args:\n            batch: the current batch.\n            output: the predict output from the model.\n            range_: the range of examples for computing, the whole\n                    batch or a trunc of it?\n            attns: the attns dictionary returned from the model.\n        \"\"\"\n        return NotImplementedError\n\n    def _compute_loss(self, batch, output, target, **kwargs):\n        \"\"\"\n        Compute the loss. Subclass must define this method.\n\n        Args:\n\n            batch: the current batch.\n            output: the predict output from the model.\n            target: the validate target to compare output with.\n            **kwargs(optional): additional info for computing loss.\n        \"\"\"\n        return NotImplementedError\n\n    def monolithic_compute_loss(self, output, target, number_tokens):\n        \"\"\"\n        Compute the forward loss for the batch.\n\n        Args:\n          batch (batch): batch of labeled examples\n          output (:obj:`FloatTensor`):\n              output of decoder model `[tgt_len x batch x hidden]`\n          attns (dict of :obj:`FloatTensor`) :\n              dictionary of attention distributions\n              `[tgt_len x batch x src_len]`\n        Returns:\n            :obj:`onmt.utils.Statistics`: loss statistics\n        \"\"\"\n        # shard_state = self._make_shard_state(output, target)\n        loss, batch_stats = self._compute_loss(output, target)\n        normalization = number_tokens.sum()\n\n        return loss.div(float(normalization))\n\n    def sharded_compute_loss(self, batch, output, shard_size, normalization):\n        \"\"\"Compute the forward loss and backpropagate.  
Computation is done\n        with shards and optionally truncation for memory efficiency.\n\n        Also supports truncated BPTT for long sequences by taking a\n        range in the decoder output sequence to back propagate in.\n        Range is from `(cur_trunc, cur_trunc + trunc_size)`.\n\n        Note sharding is an exact efficiency trick to relieve memory\n        required for the generation buffers. Truncation is an\n        approximate efficiency trick to relieve the memory required\n        in the RNN buffers.\n\n        Args:\n          batch (batch) : batch of labeled examples\n          output (:obj:`FloatTensor`) :\n              output of decoder model `[tgt_len x batch x hidden]`\n          attns (dict) : dictionary of attention distributions\n              `[tgt_len x batch x src_len]`\n          cur_trunc (int) : starting position of truncation window\n          trunc_size (int) : length of truncation window\n          shard_size (int) : maximum number of examples in a shard\n          normalization (int) : Loss is divided by this number\n\n        Returns:\n            :obj:`onmt.utils.Statistics`: validation loss statistics\n\n        \"\"\"\n        # batch_stats = Statistics()\n        shard_state = self._make_shard_state(batch, output)\n        for shard in shards(shard_state, shard_size):\n            loss, stats = self._compute_loss(batch, **shard)\n            loss.div(float(normalization)).backward()\n            # batch_stats.update(stats)\n\n        # return batch_stats\n        return loss\n\n    def _stats(self, loss, scores, target):\n        \"\"\"\n        Args:\n            loss (:obj:`FloatTensor`): the loss computed by the loss criterion.\n            scores (:obj:`FloatTensor`): a score for each possible output\n            target (:obj:`FloatTensor`): true targets\n\n        Returns:\n            :obj:`onmt.utils.Statistics` : statistics for this batch.\n        \"\"\"\n        pred = scores.max(1)[1]\n        non_padding = 
target.ne(self.padding_idx)\n        num_correct = pred.eq(target).masked_select(non_padding).sum().item()\n        num_non_padding = non_padding.sum().item()\n        # return Statistics(loss.item(), num_non_padding, num_correct)\n        return loss.item()\n\n    def _bottle(self, _v):\n        return _v.view(-1, _v.size(2))\n\n    def _unbottle(self, _v, batch_size):\n        return _v.view(-1, batch_size, _v.size(1))\n\n\nclass LabelSmoothingLoss(nn.Module):\n    \"\"\"\n    With label smoothing,\n    KL-divergence between q_{smoothed ground truth prob.}(w)\n    and p_{prob. computed by model}(w) is minimized.\n    \"\"\"\n\n    def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100):\n        assert 0.0 < label_smoothing <= 1.0\n        self.padding_idx = ignore_index\n        super(LabelSmoothingLoss, self).__init__()\n\n        smoothing_value = label_smoothing / (tgt_vocab_size - 2)\n        one_hot = torch.full((tgt_vocab_size,), smoothing_value)\n        one_hot[self.padding_idx] = 0\n        self.register_buffer(\"one_hot\", one_hot.unsqueeze(0))\n        self.confidence = 1.0 - label_smoothing\n\n    def forward(self, output, target):\n        \"\"\"\n        output (FloatTensor): batch_size x n_classes\n        target (LongTensor): batch_size\n        \"\"\"\n        model_prob = self.one_hot.repeat(target.size(0), 1)\n        model_prob.scatter_(1, target.unsqueeze(1), self.confidence)\n        model_prob.masked_fill_((target == self.padding_idx).unsqueeze(1), 0)\n\n        return F.kl_div(output, model_prob, reduction=\"sum\")\n\n\nclass NMTLossCompute(LossComputeBase):\n    \"\"\"\n    Standard NMT Loss Computation.\n    \"\"\"\n\n    def __init__(self, generator, symbols, vocab_size, label_smoothing=0.0):\n        super(NMTLossCompute, self).__init__(generator, symbols[\"PAD\"])\n        self.sparse = not isinstance(generator[1], nn.LogSoftmax)\n        if label_smoothing > 0:\n            self.criterion = LabelSmoothingLoss(\n       
         label_smoothing, vocab_size, ignore_index=self.padding_idx\n            )\n        else:\n            self.criterion = nn.NLLLoss(ignore_index=self.padding_idx, reduction=\"sum\")\n\n    def _make_shard_state(self, target, tgt_num_tokens, output):\n        return {\n            \"output\": output,\n            \"target\": target,\n            \"number_tokens\": tgt_num_tokens,\n        }\n\n    def _compute_loss(self, output, target, **kwargs):\n        bottled_output = self._bottle(output)\n        scores = self.generator(bottled_output)\n        gtruth = target.contiguous().view(-1)\n\n        loss = self.criterion(scores, gtruth)\n\n        stats = self._stats(loss.clone(), scores, gtruth)\n\n        return loss, stats\n\n\ndef filter_shard_state(state, shard_size=None):\n    \"\"\" ? \"\"\"\n    for k, v in state.items():\n        if shard_size is None:\n            yield k, v\n\n        if v is not None:\n            v_split = []\n            if isinstance(v, torch.Tensor):\n                for v_chunk in torch.split(v, shard_size):\n                    v_chunk = v_chunk.data.clone()\n                    v_chunk.requires_grad = v.requires_grad\n                    v_split.append(v_chunk)\n            yield k, (v, v_split)\n\n\ndef shards(state, shard_size, eval_only=False):\n    \"\"\"\n    Args:\n        state: A dictionary which corresponds to the output of\n               *LossCompute._make_shard_state(). 
The values for\n               those keys are Tensor-like or None.\n        shard_size: The maximum size of the shards yielded by the model.\n        eval_only: If True, only yield the state, nothing else.\n              Otherwise, yield shards.\n\n    Yields:\n        Each yielded shard is a dict.\n\n    Side effect:\n        After the last shard, this function does back-propagation.\n    \"\"\"\n    if eval_only:\n        yield filter_shard_state(state)\n    else:\n        # non_none: the subdict of the state dictionary where the values\n        # are not None.\n        non_none = dict(filter_shard_state(state, shard_size))\n\n        # Now, the iteration:\n        # state is a dictionary of sequences of tensor-like but we\n        # want a sequence of dictionaries of tensors.\n        # First, unzip the dictionary into a sequence of keys and a\n        # sequence of tensor-like sequences.\n        keys, values = zip(\n            *(\n                (k, [v_chunk for v_chunk in v_split])\n                for k, (_, v_split) in non_none.items()\n            )\n        )\n\n        # Now, yield a dictionary for each shard. The keys are always\n        # the same. values is a sequence of length #keys where each\n        # element is a sequence of length #shards. 
We want to iterate\n        # over the shards, not over the keys: therefore, the values need\n        # to be re-zipped by shard and then each shard can be paired\n        # with the keys.\n        for shard_tensors in zip(*values):\n            yield dict(zip(keys, shard_tensors))\n\n        # Assumed backprop'd\n        variables = []\n        for k, (v, v_split) in non_none.items():\n            if isinstance(v, torch.Tensor) and state[k].requires_grad:\n                variables.extend(\n                    zip(\n                        torch.split(state[k], shard_size),\n                        [v_chunk.grad for v_chunk in v_split],\n                    )\n                )\n        inputs, grads = zip(*variables)\n        torch.autograd.backward(inputs, grads)\n"
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/model_builder.py",
    "content": "# Modifications Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n# This script reuses code from https://github.com/nlpyang/Presumm\n\n\"\"\"\nThe BertSum models for both extractive and abstractive summarization.\n\"\"\"\n\nimport sys\nimport copy\n\nimport torch\nimport torch.nn as nn\nfrom transformers import BertModel, BertConfig\nfrom torch.nn.init import xavier_uniform_\n\nfrom .decoder import TransformerDecoder\nfrom .encoder import Classifier, ExtTransformerEncoder, RNNEncoder\nfrom .optimizers import Optimizer\nfrom .loss import abs_loss\n\n\ndef load_optimizer_checkpoint(optimizer, checkpoint):\n    if checkpoint is not None:\n        saved_optimizer_state_dict = checkpoint  # .state_dict()\n        optimizer.optimizer.load_state_dict(saved_optimizer_state_dict)\n        if (optimizer.method == \"adam\") and (len(optimizer.optimizer.state) < 1):\n            raise RuntimeError(\n                \"Error: loaded Adam optimizer from existing model\"\n                + \" but optimizer state is empty\"\n            )\n\n\ndef build_optim(\n    model,\n    optim=\"adam\",\n    lr=0.002,\n    max_grad_norm=0,\n    beta1=0.9,\n    beta2=0.999,\n    decay_method=\"noam\",\n    warmup_steps=8000,\n):\n    \"\"\" Build optimizer \"\"\"\n    optim = Optimizer(\n        optim,\n        lr,\n        max_grad_norm,\n        beta1=beta1,\n        beta2=beta2,\n        decay_method=decay_method,\n        warmup_steps=warmup_steps,\n    )\n\n    optim.set_parameters(list(model.named_parameters()))\n\n    return optim\n\n\ndef build_optim_bert(\n    model,\n    optim=\"adam\",\n    lr_bert=0.002,\n    max_grad_norm=0,\n    beta1=0.9,\n    beta2=0.999,\n    warmup_steps_bert=8000,\n):\n\n    optim = Optimizer(\n        optim,\n        lr_bert,\n        max_grad_norm,\n        beta1=beta1,\n        beta2=beta2,\n        decay_method=\"noam\",\n        warmup_steps=warmup_steps_bert,\n    )\n\n    params = [\n        (n, p)\n        for 
n, p in list(model.named_parameters())\n        if (n.startswith(\"bert.model\") or n.startswith(\"module.bert.model\"))\n    ]\n    optim.set_parameters(params)\n\n    return optim\n\n\ndef build_optim_dec(\n    model,\n    optim=\"adam\",\n    lr_dec=0.2,\n    max_grad_norm=0,\n    beta1=0.9,\n    beta2=0.999,\n    warmup_steps_dec=8000,\n):\n    optim = Optimizer(\n        optim,\n        lr_dec,\n        max_grad_norm,\n        beta1=beta1,\n        beta2=beta2,\n        decay_method=\"noam\",\n        warmup_steps=warmup_steps_dec,\n    )\n\n    params = [\n        (n, p)\n        for n, p in list(model.named_parameters())\n        if (not n.startswith(\"bert.model\") and not n.startswith(\"module.bert.model\"))\n    ]\n    optim.set_parameters(params)\n\n    return optim\n\n\ndef get_generator(vocab_size, dec_hidden_size):\n    gen_func = nn.LogSoftmax(dim=-1)\n    generator = nn.Sequential(nn.Linear(dec_hidden_size, vocab_size), gen_func)\n    # generator.to(device)\n\n    return generator\n\nclass Transformer(nn.Module):\n    def __init__(self, temp_dir, model_class, pretrained_model_name, pretrained_config):\n        super(Transformer, self).__init__()\n        if(pretrained_model_name):\n            self.model = model_class.from_pretrained(pretrained_model_name,\n                                                   cache_dir=temp_dir)\n            #self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)\n        else:\n            self.model = model_class(pretrained_config)\n\n    def forward(self, x, segs, mask):\n        if \"DistilBertModel\" in str(type(self.model)):\n            outputs = self.model(x, attention_mask =mask)\n        else:\n            outputs = self.model(x, token_type_ids=segs, attention_mask =mask)\n        #print(outputs)\n        #print(len(outputs))\n        top_vec = outputs[0] \n        \n        return top_vec\n\nclass BertSumExt(nn.Module):\n    def __init__(self, encoder, args, model_class, 
pretrained_model_name, max_pos=512, pretrained_config = None, temp_dir=\"./\"):\n        super(BertSumExt, self).__init__()\n        self.loss = torch.nn.BCELoss(reduction='none')\n        #self.device = device\n        self.transformer = Transformer(temp_dir, model_class, pretrained_model_name, pretrained_config)\n        if (encoder == 'classifier'):\n            self.encoder = Classifier(self.transformer.model.config.hidden_size)\n        elif(encoder=='transformer'):\n            self.encoder = ExtTransformerEncoder(self.transformer.model.config.hidden_size, args.ff_size, args.heads,\n                                                   args.dropout, args.inter_layers)\n        elif(encoder=='rnn'):\n            self.encoder = RNNEncoder(bidirectional=True, num_layers=1,\n                                      input_size=self.transformer.model.config.hidden_size, hidden_size=args.rnn_size,\n                                      dropout=args.dropout)\n        elif (encoder == 'baseline'):\n            bert_config = BertConfig(self.transformer.model.config.vocab_size, hidden_size=args.hidden_size,\n                                     num_hidden_layers=6, num_attention_heads=8, intermediate_size=args.ff_size)\n            self.transformer.model = BertModel(bert_config)\n            self.encoder = Classifier(self.transformer.model.config.hidden_size)\n        \n        self.max_pos = max_pos\n        if(max_pos > 512):\n            my_pos_embeddings = nn.Embedding(self.max_pos, self.transformer.model.config.hidden_size)\n            my_pos_embeddings.weight.data[:512] = self.transformer.model.embeddings.position_embeddings.weight.data\n            my_pos_embeddings.weight.data[512:] = self.transformer.model.embeddings.position_embeddings.weight.data[-1][None,:].repeat(self.max_pos-512,1)\n            self.transformer.model.embeddings.position_embeddings = my_pos_embeddings\n\n        if args.param_init != 0.0:\n            for p in self.encoder.parameters():\n        
        p.data.uniform_(-args.param_init, args.param_init)\n        if args.param_init_glorot:\n            for p in self.encoder.parameters():\n                if p.dim() > 1:\n                    xavier_uniform_(p)\n\n        #self.to(device)\n    def load_cp(self, pt):\n        self.load_state_dict(pt['model'], strict=True)\n\n    def forward(self, x, segs, clss, mask, mask_cls, labels=None, sentence_range=None):\n\n        top_vec = self.transformer(x, segs, mask)\n        sents_vec = top_vec[torch.arange(top_vec.size(0)).unsqueeze(1), clss]\n        sents_vec = sents_vec * mask_cls[:, :, None].float()\n        sent_scores = self.encoder(sents_vec, mask_cls).squeeze(-1)\n        if labels is not None:\n            loss = self.loss(sent_scores, labels.float())\n            loss = (loss*mask_cls.float()).sum()\n            sent_scores = sent_scores + mask_cls.float()\n            return loss, sent_scores, mask_cls\n        else:\n            sent_scores = sent_scores + mask_cls.float()\n            return sent_scores, mask_cls\n\n\n\n\nclass Bert(nn.Module):\n    def __init__(self, large, temp_dir, finetune=False):\n        super(Bert, self).__init__()\n        if large:\n            self.model = BertModel.from_pretrained(\n                \"bert-large-uncased\", cache_dir=temp_dir\n            )\n        else:\n            self.model = BertModel.from_pretrained(\n                \"bert-base-uncased\", cache_dir=temp_dir\n            )\n\n        self.finetune = finetune\n\n    def forward(self, x, segs, mask):\n        if self.finetune:\n            outputs = self.model(x, attention_mask=mask)\n        else:\n            self.eval()\n            with torch.no_grad():\n                outputs = self.model(x, attention_mask=mask)\n        top_vec = outputs[0]\n        return top_vec\n\n\nclass AbsSummarizer(nn.Module):\n    def __init__(\n        self,\n        large=False,\n        symbols=None,\n        temp_dir=\"./\",\n        finetune_bert=True,\n        
encoder=\"bert\",\n        max_pos=512,\n        use_bert_emb=True,\n        share_emb=False,\n        dec_dropout=0.2,\n        dec_layers=6,\n        dec_hidden_size=768,\n        dec_heads=8,\n        dec_ff_size=2048,\n        enc_hidden_size=512,\n        enc_ff_size=512,\n        enc_dropout=0.2,\n        enc_layers=6,\n        label_smoothing=0.1,\n        checkpoint=None,\n        bert_from_extractive=None,\n        test=False,\n    ):\n        super(AbsSummarizer, self).__init__()\n        self.bert = Bert(large, temp_dir, finetune_bert)\n\n        if bert_from_extractive is not None:\n            self.bert.model.load_state_dict(\n                dict(\n                    [\n                        (n[11:], p)\n                        for n, p in bert_from_extractive.items()\n                        if n.startswith(\"bert.model\")\n                    ]\n                ),\n                strict=True,\n            )\n\n        if encoder == \"baseline\":\n            bert_config = BertConfig(\n                self.bert.model.config.vocab_size,\n                hidden_size=enc_hidden_size,\n                num_hidden_layers=enc_layers,\n                num_attention_heads=8,\n                intermediate_size=enc_ff_size,\n                hidden_dropout_prob=enc_dropout,\n                attention_probs_dropout_prob=enc_dropout,\n            )\n            self.bert.model = BertModel(bert_config)\n\n        if max_pos > 512:\n            my_pos_embeddings = nn.Embedding(\n                max_pos, self.bert.model.config.hidden_size\n            )\n            my_pos_embeddings.weight.data[\n                :512\n            ] = self.bert.model.embeddings.position_embeddings.weight.data\n            my_pos_embeddings.weight.data[\n                512:\n            ] = self.bert.model.embeddings.position_embeddings.weight.data[-1][\n                None, :\n            ].repeat(\n                max_pos - 512, 1\n            )\n            
self.bert.model.embeddings.position_embeddings = my_pos_embeddings\n        self.vocab_size = self.bert.model.config.vocab_size\n        tgt_embeddings = nn.Embedding(\n            self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0\n        )\n        if share_emb:\n            tgt_embeddings.weight = copy.deepcopy(\n                self.bert.model.embeddings.word_embeddings.weight\n            )\n\n        self.decoder = TransformerDecoder(\n            dec_layers,\n            dec_hidden_size,\n            heads=dec_heads,\n            d_ff=dec_ff_size,\n            dropout=dec_dropout,\n            embeddings=tgt_embeddings,\n        )\n\n        self.generator = get_generator(self.vocab_size, dec_hidden_size)\n        self.generator[0].weight = self.decoder.embeddings.weight\n\n        for module in self.decoder.modules():\n            if isinstance(module, (nn.Linear, nn.Embedding)):\n                module.weight.data.normal_(mean=0.0, std=0.02)\n            elif isinstance(module, nn.LayerNorm):\n                module.bias.data.zero_()\n                module.weight.data.fill_(1.0)\n            if isinstance(module, nn.Linear) and module.bias is not None:\n                module.bias.data.zero_()\n        for p in self.generator.parameters():\n            if p.dim() > 1:\n                xavier_uniform_(p)\n            else:\n                p.data.zero_()\n        if use_bert_emb:\n            tgt_embeddings = nn.Embedding(\n                self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0\n            )\n            tgt_embeddings.weight = copy.deepcopy(\n                self.bert.model.embeddings.word_embeddings.weight\n            )\n            self.decoder.embeddings = tgt_embeddings\n            self.generator[0].weight = self.decoder.embeddings.weight\n\n        self.symbols = symbols\n        self.label_smoothing = label_smoothing\n        self.test = test\n        if not test:\n            self.train_loss = 
abs_loss(\n                self.generator,\n                self.symbols,\n                self.vocab_size,\n                train=True,\n                label_smoothing=self.label_smoothing,\n            )\n\n    def load_checkpoint(self, checkpoint):\n        if checkpoint is not None:\n            self.load_state_dict(checkpoint, strict=False)\n        if not self.test:\n            self.train_loss = abs_loss(\n                self.generator,\n                self.symbols,\n                self.vocab_size,\n                train=True,\n                label_smoothing=self.label_smoothing,\n            )\n\n    # def move_to_device(self, device, move_to_device_fn):\n    # self.to(device)\n    # self.generator = move_to_device_fn(self.generator, device)\n    #    self = move_to_device_fn(self, device)\n    #    return self\n\n    # def forward(self, src, tgt, segs, clss, mask_src, mask_tgt, mask_cls):\n    def forward(\n        self, src, segs, mask_src, tgt, tgt_num_tokens\n    ):  # , mask_tgt, mask_cls):\n        top_vec = self.bert(src, segs, mask_src)\n        dec_state = self.decoder.init_decoder_state(src, top_vec)\n        decoder_outputs, state = self.decoder(tgt[:, :-1], top_vec, dec_state)\n        loss = self.train_loss.monolithic_compute_loss(\n            decoder_outputs, tgt[:, 1:], tgt_num_tokens\n        )\n        return loss, decoder_outputs\n"
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/neural.py",
    "content": "# Licensed under the MIT License.\n# This script reuses code from https://github.com/nlpyang/Presumm\n\n\nimport math\n\nimport torch\nimport torch.nn as nn\n\n\ndef aeq(*args):\n    \"\"\"\n    Assert all arguments have the same value\n    \"\"\"\n    arguments = (arg for arg in args)\n    first = next(arguments)\n    assert all(\n        arg == first for arg in arguments\n    ), \"Not all arguments have the same value: \" + str(args)\n\n\ndef sequence_mask(lengths, max_len=None):\n    \"\"\"\n    Creates a boolean mask from sequence lengths.\n    \"\"\"\n    batch_size = lengths.numel()\n    max_len = max_len or lengths.max()\n    return (\n        torch.arange(0, max_len)\n        .type_as(lengths)\n        .repeat(batch_size, 1)\n        .lt(lengths.unsqueeze(1))\n    )\n\n\ndef gelu(x):\n    return (\n        0.5\n        * x\n        * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))\n    )\n\n\n\"\"\" Global attention modules (Luong / Bahdanau) \"\"\"\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass GlobalAttention(nn.Module):\n    \"\"\"\n    Global attention takes a matrix and a query vector. It\n    then computes a parameterized convex combination of the matrix\n    based on the input query.\n\n    Constructs a unit mapping a query `q` of size `dim`\n    and a source matrix `H` of size `n x dim`, to an output\n    of size `dim`.\n\n\n    .. 
mermaid::\n\n       graph BT\n          A[Query]\n          subgraph RNN\n            C[H 1]\n            D[H 2]\n            E[H N]\n          end\n          F[Attn]\n          G[Output]\n          A --> F\n          C --> F\n          D --> F\n          E --> F\n          C -.-> G\n          D -.-> G\n          E -.-> G\n          F --> G\n\n    All models compute the output as\n    :math:`c = sum_{j=1}^{SeqLength} a_j H_j` where\n    :math:`a_j` is the softmax of a score function.\n    Then then apply a projection layer to [q, c].\n\n    However they\n    differ on how they compute the attention score.\n\n    * Luong Attention (dot, general):\n       * dot: :math:`score(H_j,q) = H_j^T q`\n       * general: :math:`score(H_j, q) = H_j^T W_a q`\n\n\n    * Bahdanau Attention (mlp):\n       * :math:`score(H_j, q) = v_a^T tanh(W_a q + U_a h_j)`\n\n\n    Args:\n       dim (int): dimensionality of query and key\n       coverage (bool): use coverage term\n       attn_type (str): type of attention to use, options [dot,general,mlp]\n\n    \"\"\"\n\n    def __init__(self, dim, attn_type=\"dot\"):\n        super(GlobalAttention, self).__init__()\n\n        self.dim = dim\n        assert attn_type in [\n            \"dot\",\n            \"general\",\n            \"mlp\",\n        ], \"Please select a valid attention type.\"\n        self.attn_type = attn_type\n\n        if self.attn_type == \"general\":\n            self.linear_in = nn.Linear(dim, dim, bias=False)\n        elif self.attn_type == \"mlp\":\n            self.linear_context = nn.Linear(dim, dim, bias=False)\n            self.linear_query = nn.Linear(dim, dim, bias=True)\n            self.v = nn.Linear(dim, 1, bias=False)\n        # mlp wants it with bias\n        out_bias = self.attn_type == \"mlp\"\n        self.linear_out = nn.Linear(dim * 2, dim, bias=out_bias)\n\n    def score(self, h_t, h_s):\n        \"\"\"\n        Args:\n          h_t (`FloatTensor`): sequence of queries `[batch x tgt_len x dim]`\n        
  h_s (`FloatTensor`): sequence of sources `[batch x src_len x dim]`\n\n        Returns:\n          :obj:`FloatTensor`:\n           raw attention scores (unnormalized) for each src index\n          `[batch x tgt_len x src_len]`\n\n        \"\"\"\n\n        # Check input sizes\n        src_batch, src_len, src_dim = h_s.size()\n        tgt_batch, tgt_len, tgt_dim = h_t.size()\n\n        if self.attn_type in [\"general\", \"dot\"]:\n            if self.attn_type == \"general\":\n                h_t_ = h_t.view(tgt_batch * tgt_len, tgt_dim)\n                h_t_ = self.linear_in(h_t_)\n                h_t = h_t_.view(tgt_batch, tgt_len, tgt_dim)\n            h_s_ = h_s.transpose(1, 2)\n            # (batch, t_len, d) x (batch, d, s_len) --> (batch, t_len, s_len)\n            return torch.bmm(h_t, h_s_)\n        else:\n            dim = self.dim\n            wq = self.linear_query(h_t.view(-1, dim))\n            wq = wq.view(tgt_batch, tgt_len, 1, dim)\n            wq = wq.expand(tgt_batch, tgt_len, src_len, dim)\n\n            uh = self.linear_context(h_s.contiguous().view(-1, dim))\n            uh = uh.view(src_batch, 1, src_len, dim)\n            uh = uh.expand(src_batch, tgt_len, src_len, dim)\n\n            # (batch, t_len, s_len, d)\n            wquh = torch.tanh(wq + uh)\n\n            return self.v(wquh.view(-1, dim)).view(tgt_batch, tgt_len, src_len)\n\n    def forward(self, source, memory_bank, memory_lengths=None, memory_masks=None):\n        \"\"\"\n\n        Args:\n          source (`FloatTensor`): query vectors `[batch x tgt_len x dim]`\n          memory_bank (`FloatTensor`): source vectors `[batch x src_len x dim]`\n          memory_lengths (`LongTensor`): the source context lengths `[batch]`\n          coverage (`FloatTensor`): None (not supported yet)\n\n        Returns:\n          (`FloatTensor`, `FloatTensor`):\n\n          * Computed vector `[tgt_len x batch x dim]`\n          * Attention distribtutions for each query\n             `[tgt_len x batch 
x src_len]`\n        \"\"\"\n\n        # one step input\n        if source.dim() == 2:\n            one_step = True\n            source = source.unsqueeze(1)\n        else:\n            one_step = False\n\n        batch, source_l, dim = memory_bank.size()\n        batch_, target_l, dim_ = source.size()\n\n        # compute attention scores, as in Luong et al.\n        align = self.score(source, memory_bank)\n\n        if memory_masks is not None:\n            memory_masks = memory_masks.transpose(0, 1)\n            memory_masks = memory_masks.transpose(1, 2)\n            align.masked_fill_(1 - memory_masks.byte(), -float(\"inf\"))\n\n        if memory_lengths is not None:\n            mask = sequence_mask(memory_lengths, max_len=align.size(-1))\n            mask = mask.unsqueeze(1)  # Make it broadcastable.\n            align.masked_fill_(1 - mask, -float(\"inf\"))\n\n        align_vectors = F.softmax(align.view(batch * target_l, source_l), -1)\n        align_vectors = align_vectors.view(batch, target_l, source_l)\n\n        c = torch.bmm(align_vectors, memory_bank)\n\n        # concatenate\n        concat_c = torch.cat([c, source], 2).view(batch * target_l, dim * 2)\n        attn_h = self.linear_out(concat_c).view(batch, target_l, dim)\n        if self.attn_type in [\"general\", \"dot\"]:\n            attn_h = torch.tanh(attn_h)\n\n        if one_step:\n            attn_h = attn_h.squeeze(1)\n            align_vectors = align_vectors.squeeze(1)\n\n        else:\n            attn_h = attn_h.transpose(0, 1).contiguous()\n            align_vectors = align_vectors.transpose(0, 1).contiguous()\n\n        return attn_h, align_vectors\n\n\nclass PositionwiseFeedForward(nn.Module):\n    \"\"\" A two-layer Feed-Forward-Network with residual layer norm.\n\n    Args:\n        d_model (int): the size of input for the first-layer of the FFN.\n        d_ff (int): the hidden layer size of the second-layer\n            of the FNN.\n        dropout (float): dropout probability in 
:math:`[0, 1)`.\n    \"\"\"\n\n    def __init__(self, d_model, d_ff, dropout=0.1):\n        super(PositionwiseFeedForward, self).__init__()\n        self.w_1 = nn.Linear(d_model, d_ff)\n        self.w_2 = nn.Linear(d_ff, d_model)\n        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)\n        self.actv = gelu\n        self.dropout_1 = nn.Dropout(dropout)\n        self.dropout_2 = nn.Dropout(dropout)\n\n    def forward(self, x):\n        inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x))))\n        output = self.dropout_2(self.w_2(inter))\n        return output + x\n\n\nclass MultiHeadedAttention(nn.Module):\n    \"\"\"\n    Multi-Head Attention module from\n    \"Attention is All You Need\"\n    :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`.\n\n    Similar to standard `dot` attention but uses\n    multiple attention distributions simulataneously\n    to select relevant items.\n\n    .. mermaid::\n\n       graph BT\n          A[key]\n          B[value]\n          C[query]\n          O[output]\n          subgraph Attn\n            D[Attn 1]\n            E[Attn 2]\n            F[Attn N]\n          end\n          A --> D\n          C --> D\n          A --> E\n          C --> E\n          A --> F\n          C --> F\n          D --> O\n          E --> O\n          F --> O\n          B --> O\n\n    Also includes several additional tricks.\n\n    Args:\n       head_count (int): number of parallel heads\n       model_dim (int): the dimension of keys/values/queries,\n           must be divisible by head_count\n       dropout (float): dropout parameter\n    \"\"\"\n\n    def __init__(self, head_count, model_dim, dropout=0.1, use_final_linear=True):\n        assert model_dim % head_count == 0\n        self.dim_per_head = model_dim // head_count\n        self.model_dim = model_dim\n\n        super(MultiHeadedAttention, self).__init__()\n        self.head_count = head_count\n\n        self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head)\n        
self.linear_values = nn.Linear(model_dim, head_count * self.dim_per_head)\n        self.linear_query = nn.Linear(model_dim, head_count * self.dim_per_head)\n        self.softmax = nn.Softmax(dim=-1)\n        self.dropout = nn.Dropout(dropout)\n        self.use_final_linear = use_final_linear\n        if self.use_final_linear:\n            self.final_linear = nn.Linear(model_dim, model_dim)\n\n    def forward(\n        self,\n        key,\n        value,\n        query,\n        mask=None,\n        layer_cache=None,\n        type=None,\n        predefined_graph_1=None,\n    ):\n        \"\"\"\n        Compute the context vector and the attention vectors.\n\n        Args:\n           key (`FloatTensor`): set of `key_len`\n                key vectors `[batch, key_len, dim]`\n           value (`FloatTensor`): set of `key_len`\n                value vectors `[batch, key_len, dim]`\n           query (`FloatTensor`): set of `query_len`\n                 query vectors  `[batch, query_len, dim]`\n           mask: binary mask indicating which keys have\n                 non-zero attention `[batch, query_len, key_len]`\n        Returns:\n           (`FloatTensor`, `FloatTensor`) :\n\n           * output context vectors `[batch, query_len, dim]`\n           * one of the attention vectors `[batch, query_len, key_len]`\n        \"\"\"\n\n        # CHECKS\n        # batch, k_len, d = key.size()\n        # batch_, k_len_, d_ = value.size()\n        # aeq(batch, batch_)\n        # aeq(k_len, k_len_)\n        # aeq(d, d_)\n        # batch_, q_len, d_ = query.size()\n        # aeq(batch, batch_)\n        # aeq(d, d_)\n        # aeq(self.model_dim % 8, 0)\n        # if mask is not None:\n        #    batch_, q_len_, k_len_ = mask.size()\n        #    aeq(batch_, batch)\n        #    aeq(k_len_, k_len)\n        #    aeq(q_len_ == q_len)\n        # END CHECKS\n\n        batch_size = key.size(0)\n        dim_per_head = self.dim_per_head\n        head_count = self.head_count\n        
key_len = key.size(1)\n        query_len = query.size(1)\n\n        def shape(x):\n            \"\"\"  projection \"\"\"\n            return x.view(batch_size, -1, head_count, dim_per_head).transpose(1, 2)\n\n        def unshape(x):\n            \"\"\"  compute context \"\"\"\n            return (\n                x.transpose(1, 2)\n                .contiguous()\n                .view(batch_size, -1, head_count * dim_per_head)\n            )\n\n        # 1) Project key, value, and query.\n        if layer_cache is not None:\n            if type == \"self\":\n                query, key, value = (\n                    self.linear_query(query),\n                    self.linear_keys(query),\n                    self.linear_values(query),\n                )\n\n                key = shape(key)\n                value = shape(value)\n\n                if layer_cache is not None:\n                    device = key.device\n                    if layer_cache[\"self_keys\"] is not None:\n                        key = torch.cat(\n                            (layer_cache[\"self_keys\"].to(device), key), dim=2\n                        )\n                    if layer_cache[\"self_values\"] is not None:\n                        value = torch.cat(\n                            (layer_cache[\"self_values\"].to(device), value), dim=2\n                        )\n                    layer_cache[\"self_keys\"] = key\n                    layer_cache[\"self_values\"] = value\n            elif type == \"context\":\n                query = self.linear_query(query)\n                if layer_cache is not None:\n                    if layer_cache[\"memory_keys\"] is None:\n                        key, value = self.linear_keys(key), self.linear_values(value)\n                        key = shape(key)\n                        value = shape(value)\n                    else:\n                        key, value = (\n                            layer_cache[\"memory_keys\"],\n                            
layer_cache[\"memory_values\"],\n                        )\n                    layer_cache[\"memory_keys\"] = key\n                    layer_cache[\"memory_values\"] = value\n                else:\n                    key, value = self.linear_keys(key), self.linear_values(value)\n                    key = shape(key)\n                    value = shape(value)\n        else:\n            key = self.linear_keys(key)\n            value = self.linear_values(value)\n            query = self.linear_query(query)\n            key = shape(key)\n            value = shape(value)\n\n        query = shape(query)\n\n        key_len = key.size(2)\n        query_len = query.size(2)\n\n        # 2) Calculate and scale scores.\n        query = query / math.sqrt(dim_per_head)\n        scores = torch.matmul(query, key.transpose(2, 3))\n\n        if mask is not None:\n            mask = mask.unsqueeze(1).expand_as(scores)\n            # scores = scores.masked_fill(mask, -1e18)\n            scores = scores.masked_fill(mask, torch.finfo(torch.float16).min)\n            # scores = scores.masked_fill(mask, -2**16+1)\n\n        # 3) Apply attention dropout and compute context vectors.\n\n        attn = self.softmax(scores)\n\n        if not predefined_graph_1 is None:\n            attn_masked = attn[:, -1] * predefined_graph_1\n            attn_masked = attn_masked / (torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9)\n\n            attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1)\n\n        drop_attn = self.dropout(attn)\n        if self.use_final_linear:\n            context = unshape(torch.matmul(drop_attn, value))\n            output = self.final_linear(context)\n            return output\n        else:\n            context = torch.matmul(drop_attn, value)\n            return context\n\n        # CHECK\n        # batch_, q_len_, d_ = output.size()\n        # aeq(q_len, q_len_)\n        # aeq(batch, batch_)\n        # aeq(d, d_)\n\n        # Return one attn\n\n\nclass 
DecoderState(object):\n    \"\"\"Interface for grouping together the current state of a recurrent\n    decoder. In the simplest case just represents the hidden state of\n    the model.  But can also be used for implementing various forms of\n    input_feeding and non-recurrent models.\n\n    Modules need to implement this to utilize beam search decoding.\n    \"\"\"\n\n    def detach(self):\n        \"\"\" Need to document this \"\"\"\n        self.hidden = tuple([_.detach() for _ in self.hidden])\n        self.input_feed = self.input_feed.detach()\n\n    def beam_update(self, idx, positions, beam_size):\n        \"\"\" Need to document this \"\"\"\n        for e in self._all:\n            sizes = e.size()\n            br = sizes[1]\n            if len(sizes) == 3:\n                sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[\n                    :, :, idx\n                ]\n            else:\n                sent_states = e.view(\n                    sizes[0], beam_size, br // beam_size, sizes[2], sizes[3]\n                )[:, :, idx]\n\n            sent_states.data.copy_(sent_states.data.index_select(1, positions))\n\n    def map_batch_fn(self, fn):\n        raise NotImplementedError()\n"
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/optimizers.py",
    "content": "# Modifications Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n# This script reuses code from https://github.com/nlpyang/Presumm\n\n\n\"\"\" Optimizers class \"\"\"\nimport torch\nimport torch.optim as optim\nfrom torch.nn.utils import clip_grad_norm_\n\n\n# from onmt.utils import use_gpu\n# from models.adam import Adam\n\n\ndef use_gpu(opt):\n    \"\"\"\n    Creates a boolean if gpu used\n    \"\"\"\n    return (hasattr(opt, \"gpu_ranks\") and len(opt.gpu_ranks) > 0) or (\n        hasattr(opt, \"gpu\") and opt.gpu > -1\n    )\n\n\ndef build_optim(model, opt, checkpoint):\n    \"\"\" Build optimizer \"\"\"\n    saved_optimizer_state_dict = None\n\n    if opt.train_from:\n        optim = checkpoint[\"optim\"]\n        # We need to save a copy of optim.optimizer.state_dict() for setting\n        # the, optimizer state later on in Stage 2 in this method, since\n        # the method optim.set_parameters(model.parameters()) will overwrite\n        # optim.optimizer, and with ith the values stored in\n        # optim.optimizer.state_dict()\n        # saved_optimizer_state_dict = optim.optimizer.state_dict()\n        saved_optimizer_state_dict = optim\n    else:\n        optim = Optimizer(\n            opt.optim,\n            opt.learning_rate,\n            opt.max_grad_norm,\n            lr_decay=opt.learning_rate_decay,\n            start_decay_steps=opt.start_decay_steps,\n            decay_steps=opt.decay_steps,\n            beta1=opt.adam_beta1,\n            beta2=opt.adam_beta2,\n            adagrad_accum=opt.adagrad_accumulator_init,\n            decay_method=opt.decay_method,\n            warmup_steps=opt.warmup_steps,\n        )\n\n    optim.set_parameters(model.named_parameters())\n\n    if opt.train_from:\n        optim.optimizer.load_state_dict(saved_optimizer_state_dict)\n        if use_gpu(opt):\n            for state in optim.optimizer.state.values():\n                for k, v in state.items():\n                    
if torch.is_tensor(v):\n                        state[k] = v.cuda()\n\n        if (optim.method == \"adam\") and (len(optim.optimizer.state) < 1):\n            raise RuntimeError(\n                \"Error: loaded Adam optimizer from existing model\"\n                + \" but optimizer state is empty\"\n            )\n\n    return optim\n\n\nclass MultipleOptimizer(object):\n    \"\"\" Implement multiple optimizers needed for sparse adam \"\"\"\n\n    def __init__(self, op):\n        \"\"\" ? \"\"\"\n        self.optimizers = op\n\n    def zero_grad(self):\n        \"\"\" ? \"\"\"\n        for op in self.optimizers:\n            op.zero_grad()\n\n    def step(self):\n        \"\"\" ? \"\"\"\n        for op in self.optimizers:\n            op.step()\n\n    @property\n    def state(self):\n        \"\"\" ? \"\"\"\n        return {k: v for op in self.optimizers for k, v in op.state.items()}\n\n    def state_dict(self):\n        \"\"\" ? \"\"\"\n        return [op.state_dict() for op in self.optimizers]\n\n    def load_state_dict(self, state_dicts):\n        \"\"\" ? \"\"\"\n        assert len(state_dicts) == len(self.optimizers)\n        for i in range(len(state_dicts)):\n            self.optimizers[i].load_state_dict(state_dicts[i])\n\n\nclass Optimizer(object):\n    \"\"\"\n    Controller class for optimization. 
Mostly a thin\n    wrapper for `optim`, but also useful for implementing\n    rate scheduling beyond what is currently available.\n    Also implements necessary methods for training RNNs such\n    as grad manipulations.\n\n    Args:\n      method (:obj:`str`): one of [sgd, adagrad, adadelta, adam]\n      lr (float): learning rate\n      lr_decay (float, optional): learning rate decay multiplier\n      start_decay_steps (int, optional): step to start learning rate decay\n      beta1, beta2 (float, optional): parameters for adam\n      adagrad_accum (float, optional): initialization parameter for adagrad\n      decay_method (str, option): custom decay options\n      warmup_steps (int, option): parameter for `noam` decay\n      model_size (int, option): parameter for `noam` decay\n\n    We use the default parameters for Adam that are suggested by\n    the original paper https://arxiv.org/pdf/1412.6980.pdf\n    These values are also used by other established implementations,\n    e.g. https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer\n    https://keras.io/optimizers/\n    Recently there are slightly different values used in the paper\n    \"Attention is all you need\"\n    https://arxiv.org/pdf/1706.03762.pdf, particularly the value beta2=0.98\n    was used there however, beta2=0.999 is still arguably the more\n    established value, so we use that here as well\n    \"\"\"\n\n    def __init__(\n        self,\n        method,\n        learning_rate,\n        max_grad_norm,\n        lr_decay=1,\n        start_decay_steps=None,\n        decay_steps=None,\n        beta1=0.9,\n        beta2=0.999,\n        adagrad_accum=0.0,\n        decay_method=None,\n        warmup_steps=4000,\n        weight_decay=0,\n    ):\n        self.last_ppl = None\n        self.learning_rate = learning_rate\n        self.original_lr = learning_rate\n        self.max_grad_norm = max_grad_norm\n        self.method = method\n        self.lr_decay = lr_decay\n        
self.start_decay_steps = start_decay_steps\n        self.decay_steps = decay_steps\n        self.start_decay = False\n        self._step = 0\n        self.betas = [beta1, beta2]\n        self.adagrad_accum = adagrad_accum\n        self.decay_method = decay_method\n        self.warmup_steps = warmup_steps\n        self.weight_decay = weight_decay\n\n    def set_parameters(self, params):\n        \"\"\" ? \"\"\"\n        self.params = []\n        self.sparse_params = []\n        for k, p in params:\n            if p.requires_grad:\n                if self.method != \"sparseadam\" or \"embed\" not in k:\n                    self.params.append(p)\n                else:\n                    self.sparse_params.append(p)\n        if self.method == \"sgd\":\n            self.optimizer = optim.SGD(self.params, lr=self.learning_rate)\n        elif self.method == \"adagrad\":\n            self.optimizer = optim.Adagrad(self.params, lr=self.learning_rate)\n            for group in self.optimizer.param_groups:\n                for p in group[\"params\"]:\n                    self.optimizer.state[p][\"sum\"] = self.optimizer.state[p][\n                        \"sum\"\n                    ].fill_(self.adagrad_accum)\n        elif self.method == \"adadelta\":\n            self.optimizer = optim.Adadelta(self.params, lr=self.learning_rate)\n        elif self.method == \"adam\":\n            self.optimizer = optim.Adam(\n                self.params, lr=self.learning_rate, betas=self.betas, eps=1e-9\n            )\n        else:\n            raise RuntimeError(\"Invalid optim method: \" + self.method)\n\n        self.param_groups = self.optimizer.param_groups\n        self.state = self.optimizer.state\n\n    def _set_rate(self, learning_rate):\n        self.learning_rate = learning_rate\n        if self.method != \"sparseadam\":\n            self.optimizer.param_groups[0][\"lr\"] = self.learning_rate\n        else:\n            for op in self.optimizer.optimizers:\n                
op.param_groups[0][\"lr\"] = self.learning_rate\n\n    def step(self):\n        \"\"\"Update the model parameters based on current gradients.\n\n        Optionally, will employ gradient modification or update learning\n        rate.\n        \"\"\"\n        self._step += 1\n\n        # Decay method used in tensor2tensor.\n        if self.decay_method == \"noam\":\n            self._set_rate(\n                self.original_lr\n                * min(self._step ** (-0.5), self._step * self.warmup_steps ** (-1.5))\n            )\n\n        else:\n            if (self.start_decay_steps is not None) and (\n                self._step >= self.start_decay_steps\n            ):\n                self.start_decay = True\n            if self.start_decay:\n                if (self._step - self.start_decay_steps) % self.decay_steps == 0:\n                    self.learning_rate = self.learning_rate * self.lr_decay\n\n        if self.method != \"sparseadam\":\n            self.optimizer.param_groups[0][\"lr\"] = self.learning_rate\n\n        if self.max_grad_norm:\n            clip_grad_norm_(self.params, self.max_grad_norm)\n        self.optimizer.step()\n\n    def add_param_group(self, param_group):\n        r\"\"\"Add a param group to the :class:`Optimizer` s `param_groups`.\n\n            This can be useful when fine tuning a pre-trained network as frozen layers can be made\n            trainable and added to the :class:`Optimizer` as training progresses.\n\n            Arguments:\n                param_group (dict): Specifies what Tensors should be optimized along with group\n                specific optimization options.\n            \"\"\"\n        assert isinstance(param_group, dict), \"param group must be a dict\"\n\n        params = param_group[\"params\"]\n        if isinstance(params, torch.Tensor):\n            param_group[\"params\"] = [params]\n        elif isinstance(params, set):\n            raise TypeError(\n                \"optimizer parameters need to be 
organized in ordered collections, but \"\n                \"the ordering of tensors in sets will change between runs. Please use a list instead.\"\n            )\n        else:\n            param_group[\"params\"] = list(params)\n\n        for param in param_group[\"params\"]:\n            if not isinstance(param, torch.Tensor):\n                raise TypeError(\n                    \"optimizer can only optimize Tensors, \"\n                    \"but one of the params is \" + torch.typename(param)\n                )\n            if not param.is_leaf:\n                raise ValueError(\"can't optimize a non-leaf Tensor\")\n\n        for name, default in self.defaults.items():\n            if default is required and name not in param_group:\n                raise ValueError(\n                    \"parameter group didn't specify a value of required optimization parameter \"\n                    + name\n                )\n            else:\n                param_group.setdefault(name, default)\n\n        param_set = set()\n        for group in self.param_groups:\n            param_set.update(set(group[\"params\"]))\n\n        if not param_set.isdisjoint(set(param_group[\"params\"])):\n            raise ValueError(\"some parameters appear in more than one parameter group\")\n\n        self.param_groups.append(param_group)\n\n    def load_state_dict(self, state_dict):\n        self.optimizer.load_state_dict(state_dict)\n\n    def state_dict(self):\n        \"\"\" ? \"\"\"\n        return self.optimizer.state_dict()\n\n    def zero_grad(self):\n        \"\"\" ? \"\"\"\n        self.optimizer.zero_grad()\n"
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/penalties.py",
    "content": "# Licensed under the MIT License.\n# This script reuses code from https://github.com/nlpyang/Presumm\n\n\"\"\" PenaltyBuilder Class used in prediction/translation \"\"\"\n\nfrom __future__ import division\nimport torch\n\n\nclass PenaltyBuilder(object):\n    \"\"\"\n    Returns the Length and Coverage Penalty function for Beam Search.\n\n    Args:\n        length_pen (str): option name of length pen\n        cov_pen (str): option name of cov pen\n    \"\"\"\n\n    def __init__(self, length_pen):\n        self.length_pen = length_pen\n\n    def length_penalty(self):\n        if self.length_pen == \"wu\":\n            return self.length_wu\n        elif self.length_pen == \"avg\":\n            return self.length_average\n        else:\n            return self.length_none\n\n    \"\"\"\n    Below are all the different penalty terms implemented so far\n    \"\"\"\n\n    def length_wu(self, beam, logprobs, alpha=0.0):\n        \"\"\"\n        NMT length re-ranking score from\n        \"Google's Neural Machine Translation System\" :cite:`wu2016google`.\n        \"\"\"\n\n        modifier = ((5 + len(beam.next_ys)) ** alpha) / ((5 + 1) ** alpha)\n        return logprobs / modifier\n\n    def length_average(self, beam, logprobs, alpha=0.0):\n        \"\"\"\n        Returns the average probability of tokens in a sequence.\n        \"\"\"\n        return logprobs / len(beam.next_ys)\n\n    def length_none(self, beam, logprobs, alpha=0.0, beta=0.0):\n        \"\"\"\n        Returns unmodified scores.\n        \"\"\"\n        return logprobs\n"
  },
  {
    "path": "utils_nlp/models/transformers/bertsum/predictor.py",
    "content": "# Modifications Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n# This script reuses code from https://github.com/nlpyang/Presumm\n\n\"\"\" Translator Class and builder \"\"\"\nfrom __future__ import print_function\nimport codecs\nimport os\nimport math\n\nimport torch\nfrom torch import nn\nfrom tensorboardX import SummaryWriter\n\n# from others.utils import rouge_results_to_str, test_rouge, tile\nfrom .beam import GNMTGlobalScorer\n\n\ndef build_predictor(\n    tokenizer,\n    symbols,\n    model,\n    alpha=0.6,\n    beam_size=5,\n    min_length=15,\n    max_length=150,\n    logger=None,\n):\n    scorer = GNMTGlobalScorer(alpha, length_penalty=\"wu\")\n\n    translator = Translator(\n        beam_size,\n        min_length,\n        max_length,\n        model,\n        tokenizer,\n        symbols,\n        global_scorer=scorer,\n        logger=logger,\n    )\n    return translator\n\n\ndef tile(x, count, dim=0):\n    \"\"\"\n    Tiles x on dimension dim count times.\n    \"\"\"\n    perm = list(range(len(x.size())))\n    if dim != 0:\n        perm[0], perm[dim] = perm[dim], perm[0]\n        x = x.permute(perm).contiguous()\n    out_size = list(x.size())\n    out_size[0] *= count\n    batch = x.size(0)\n    x = (\n        x.view(batch, -1)\n        .transpose(0, 1)\n        .repeat(count, 1)\n        .transpose(0, 1)\n        .contiguous()\n        .view(*out_size)\n    )\n    if dim != 0:\n        x = x.permute(perm).contiguous()\n    return x\n\n\nclass Translator(nn.Module):\n    \"\"\"\n    Uses a model to translate a batch of sentences.\n\n\n    Args:\n       model (:obj:`onmt.modules.NMTModel`):\n          NMT model to use for translation\n       fields (dict of Fields): data fields\n       beam_size (int): size of beam to use\n       n_best (int): number of translations produced\n       max_length (int): maximum length output to produce\n       global_scores (:obj:`GlobalScorer`):\n         object to rescore final 
translations\n       copy_attn (bool): use copy attention during translation\n       beam_trace (bool): trace beam search for debugging\n       logger(logging.Logger): logger.\n    \"\"\"\n\n    def __init__(\n        self,\n        beam_size,\n        min_length,\n        max_length,\n        model,\n        vocab,\n        symbols,\n        block_trigram=True,\n        global_scorer=None,\n        logger=None,\n        dump_beam=\"\",\n    ):\n        super(Translator, self).__init__()\n        self.logger = logger\n\n        self.model = model.module if hasattr(model, \"module\") else model\n        self.generator = self.model.generator\n        self.decoder = self.model.decoder\n        self.bert = self.model.bert\n\n        self.vocab = vocab\n        self.symbols = symbols\n        self.start_token = symbols[\"BOS\"]\n        self.end_token = symbols[\"EOS\"]\n\n        self.global_scorer = global_scorer\n        self.beam_size = beam_size\n        self.min_length = min_length\n        self.max_length = max_length\n        self.block_trigram = block_trigram\n\n        self.dump_beam = dump_beam\n\n        # for debugging\n        self.beam_trace = self.dump_beam != \"\"\n        self.beam_accum = None\n\n        if self.beam_trace:\n            self.beam_accum = {\n                \"predicted_ids\": [],\n                \"beam_parent_ids\": [],\n                \"scores\": [],\n                \"log_probs\": [],\n            }\n\n    \"\"\"\n    def eval(self):\n        self.model.eval()\n        self.bert.eval()\n        self.decoder.eval()\n        self.generator.eval()\n    \"\"\"\n\n    def forward(self, src, segs, mask_src):\n        \"\"\"\n        Translate a batch of sentences.\n\n        Mostly a wrapper around :obj:`Beam`.\n\n        Args:\n           batch (:obj:`Batch`): a batch from a dataset object\n           data (:obj:`Dataset`): the dataset object\n           fast (bool): enables fast beam search (may not support all features)\n\n        
Todo:\n           Shouldn't need the original dataset.\n        \"\"\"\n        with torch.no_grad():\n            predictions, scores = self._fast_translate_batch(\n                src, segs, mask_src, self.max_length, min_length=self.min_length\n            )\n            return predictions, scores\n\n    def _fast_translate_batch(self, src, segs, mask_src, max_length, min_length=0):\n        # TODO: faster code path for beam_size == 1.\n\n        # TODO: support these blacklisted features.\n        assert not self.dump_beam\n\n        beam_size = self.beam_size\n        batch_size = src.size()[0]  # 32 #batch.batch_size\n\n        src_features = self.bert(src, segs, mask_src)\n        this_decoder = (\n            self.decoder.module if hasattr(self.decoder, \"module\") else self.decoder\n        )\n        dec_states = this_decoder.init_decoder_state(src, src_features, with_cache=True)\n\n        device = src_features.device\n\n        # Tile states and memory beam_size times.\n        dec_states.map_batch_fn(lambda state, dim: tile(state, beam_size, dim=dim))\n        src_features = tile(src_features, beam_size, dim=0)\n        batch_offset = torch.arange(batch_size, dtype=torch.long, device=device)\n        beam_offset = torch.arange(\n            0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device\n        )\n        alive_seq = torch.full(\n            [batch_size * beam_size, 1],\n            self.start_token,\n            dtype=torch.long,\n            device=device,\n        )\n\n        # Give full probability to the first beam on the first step.\n        topk_log_probs = torch.tensor(\n            [0.0] + [float(\"-inf\")] * (beam_size - 1), device=device\n        ).repeat(batch_size)\n\n        # Structure that holds finished hypotheses.\n        hypotheses = [[] for _ in range(batch_size)]  # noqa: F812\n\n        results = {}\n        results[\"predictions\"] = [[] for _ in range(batch_size)]  # noqa: F812\n        
results[\"scores\"] = [[] for _ in range(batch_size)]  # noqa: F812\n        # results[\"gold_score\"] = [0] * batch_size\n        # results[\"batch\"] = batch\n\n        for step in range(max_length):\n            decoder_input = alive_seq[:, -1].view(1, -1)\n\n            # Decoder forward.\n            decoder_input = decoder_input.transpose(0, 1)\n\n            dec_out, dec_states = this_decoder(\n                decoder_input, src_features, dec_states, step=step\n            )\n\n            # Generator forward.\n            log_probs = self.generator.forward(dec_out.transpose(0, 1).squeeze(0))\n            vocab_size = log_probs.size(-1)\n\n            if step < min_length:\n                log_probs[:, self.end_token] = torch.Tensor([-1e20])\n\n            # Multiply probs by the beam probability.\n            log_probs += topk_log_probs.view(-1).unsqueeze(1)\n\n            alpha = self.global_scorer.alpha\n            length_penalty = ((5.0 + (step + 1)) / 6.0) ** alpha\n\n            # Flatten probs into a list of possibilities.\n            curr_scores = log_probs / length_penalty\n\n            if self.block_trigram:\n                cur_len = alive_seq.size(1)\n                if cur_len > 3:\n                    for i in range(alive_seq.size(0)):\n                        fail = False\n                        words = [int(w) for w in alive_seq[i]]\n                        words = [self.vocab.ids_to_tokens[w] for w in words]\n                        words = \" \".join(words).replace(\" ##\", \"\").split()\n                        if len(words) <= 3:\n                            continue\n                        trigrams = [\n                            (words[i - 1], words[i], words[i + 1])\n                            for i in range(1, len(words) - 1)\n                        ]\n                        trigram = tuple(trigrams[-1])\n                        if trigram in trigrams[:-1]:\n                            fail = True\n                        if 
fail:\n                            curr_scores[i] = torch.Tensor([-10e20])\n\n            curr_scores = curr_scores.reshape(-1, beam_size * vocab_size)\n            topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1)\n\n            # Recover log probs.\n            topk_log_probs = topk_scores * length_penalty\n\n            # Resolve beam origin and true word ids.\n            topk_beam_index = topk_ids.div(vocab_size)\n            topk_ids = topk_ids.fmod(vocab_size)\n\n            # Map beam_index to batch_index in the flat representation.\n            batch_index = topk_beam_index + beam_offset[\n                : topk_beam_index.size(0)\n            ].unsqueeze(1)\n            select_indices = batch_index.view(-1)\n\n            # Append last prediction.\n            alive_seq = torch.cat(\n                [alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1\n            )\n\n            is_finished = topk_ids.eq(self.end_token)\n            if step + 1 == max_length:\n                is_finished.fill_(True)\n            # End condition is top beam is finished.\n            end_condition = is_finished[:, 0].eq(True)\n            if step + 1 == max_length:\n                assert not any(end_condition.eq(False))\n\n            # Save finished hypotheses.\n            if is_finished.any():\n                predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1))\n                for i in range(is_finished.size(0)):\n                    b = batch_offset[i]\n                    if end_condition[i]:\n                        is_finished[i].fill_(1)\n                    finished_hyp = is_finished[i].nonzero().view(-1)\n                    # Store finished hypotheses for this batch.\n                    for j in finished_hyp:\n                        hypotheses[b].append((topk_scores[i, j], predictions[i, j, 1:]))\n                    # If the batch reached the end, save the n_best hypotheses.\n                    if end_condition[i]:\n 
                       best_hyp = sorted(\n                            hypotheses[b], key=lambda x: x[0], reverse=True\n                        )\n                        score, pred = best_hyp[0]\n                        results[\"scores\"][b].append(score)\n                        results[\"predictions\"][b].append(pred)\n                non_finished = end_condition.eq(0).nonzero().view(-1)\n                # If all sentences are translated, no need to go further.\n                if len(non_finished) == 0:\n                    break\n                # Remove finished batches for the next step.\n                topk_log_probs = topk_log_probs.index_select(0, non_finished)\n                batch_index = batch_index.index_select(0, non_finished)\n                batch_offset = batch_offset.index_select(0, non_finished)\n                alive_seq = predictions.index_select(0, non_finished).view(\n                    -1, alive_seq.size(-1)\n                )\n            # Reorder states.\n            select_indices = batch_index.view(-1)\n            src_features = src_features.index_select(0, select_indices)\n            dec_states.map_batch_fn(\n                lambda state, dim: state.index_select(dim, select_indices)\n            )\n\n        empty_output = [len(results[\"predictions\"][b]) <= 0 for b in batch_offset]\n        predictions = torch.tensor(\n            [\n                i[0].tolist()[0 : self.max_length]\n                + [0] * (self.max_length - i[0].size()[0])\n                for i in results[\"predictions\"]\n            ],\n            device=device,\n        )\n        scores = torch.tensor([i[0].item() for i in results[\"scores\"]], device=device)\n        return predictions, scores\n"
  },
  {
    "path": "utils_nlp/models/transformers/common.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# This script reuses some code from\n# https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py\n\nimport datetime\nimport logging\nimport os\nimport random\nimport time\n\nimport numpy as np\nimport torch\nfrom tqdm import tqdm\nfrom transformers import AdamW, get_linear_schedule_with_warmup\n\nfrom utils_nlp.common.pytorch_utils import (\n    get_amp,\n    get_device,\n    move_model_to_device,\n    parallelize_model,\n)\n\nMAX_SEQ_LEN = 512\n\nlogger = logging.getLogger(__name__)\n\n\nclass Transformer:\n    def __init__(self, model_name, model, cache_dir):\n        self._model_name = model_name\n        self._model_type = model_name.split(\"-\")[0]\n        self.model = model\n        self.cache_dir = cache_dir\n\n    @property\n    def model_name(self):\n        return self._model_name\n\n    @property\n    def model_type(self):\n        return self._model_type\n\n    @staticmethod\n    def set_seed(seed, cuda=True):\n        random.seed(seed)\n        np.random.seed(seed)\n        torch.manual_seed(seed)\n        if cuda and torch.cuda.is_available():\n            torch.cuda.manual_seed_all(seed)\n\n    @staticmethod\n    def get_default_optimizer(model, weight_decay, learning_rate, adam_epsilon):\n        no_decay = [\"bias\", \"LayerNorm.weight\"]\n        optimizer_grouped_parameters = [\n            {\n                \"params\": [\n                    p\n                    for n, p in model.named_parameters()\n                    if not any(nd in n for nd in no_decay)\n                ],\n                \"weight_decay\": weight_decay,\n            },\n            {\n                \"params\": [\n                    p\n                    for n, p in model.named_parameters()\n                    if any(nd in n for nd in no_decay)\n                ],\n                \"weight_decay\": 0.0,\n            },\n        
]\n        optimizer = AdamW(\n            optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon\n        )\n        return optimizer\n\n    @staticmethod\n    def get_default_scheduler(optimizer, warmup_steps, num_training_steps):\n        scheduler = get_linear_schedule_with_warmup(\n            optimizer,\n            num_warmup_steps=warmup_steps,\n            num_training_steps=num_training_steps,\n        )\n        return scheduler\n\n    def prepare_model_and_optimizer(\n        self,\n        num_gpus,\n        gpu_ids,\n        local_rank,\n        weight_decay,\n        learning_rate,\n        adam_epsilon,\n        fp16=False,\n        fp16_opt_level=\"O1\",\n        checkpoint_state_dict=None,\n    ):\n        \"\"\"\n        This function initializes an optimizer and moves the model to a device.\n        It can be used by most child classes before calling fine_tune.\n        Child classes that require custom optimizers need to either override this\n            function or implement the steps listed below in the specified order\n            before fine-tuning.\n\n        The steps are performed in the following order:\n            1. Move model to device\n            2. Create optimizer\n            3. Initialize amp\n            4. 
Parallelize model\n        \"\"\"\n\n        amp = get_amp(fp16)\n\n        # get device\n        device, num_gpus = get_device(\n            num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank\n        )\n\n        # move model\n        self.model = move_model_to_device(model=self.model, device=device)\n\n        # init optimizer\n        self.optimizer = Transformer.get_default_optimizer(\n            self.model, weight_decay, learning_rate, adam_epsilon\n        )\n\n        if fp16 and amp:\n            self.model, self.optimizer = amp.initialize(\n                self.model, self.optimizer, opt_level=fp16_opt_level\n            )\n\n        if checkpoint_state_dict:\n            self.optimizer.load_state_dict(checkpoint_state_dict[\"optimizer\"])\n            self.model.load_state_dict(checkpoint_state_dict[\"model\"])\n\n            if fp16 and amp:\n                amp.load_state_dict(checkpoint_state_dict[\"amp\"])\n\n        self.model = parallelize_model(\n            model=self.model,\n            device=device,\n            num_gpus=num_gpus,\n            gpu_ids=gpu_ids,\n            local_rank=local_rank,\n        )\n\n        return device, num_gpus, amp\n\n    def fine_tune(\n        self,\n        train_dataloader,\n        get_inputs,\n        device,\n        num_gpus=None,\n        max_steps=-1,\n        global_step=0,\n        max_grad_norm=1.0,\n        gradient_accumulation_steps=1,\n        optimizer=None,\n        scheduler=None,\n        fp16=False,\n        amp=None,\n        local_rank=-1,\n        verbose=True,\n        seed=None,\n        report_every=10,\n        save_every=-1,\n        clip_grad_norm=True,\n        validation_function=None,\n    ):\n\n        if seed is not None:\n            Transformer.set_seed(seed, num_gpus > 0)\n\n        # init training\n        tr_loss = 0.0\n        accum_loss = 0\n        train_size = 0\n        self.model.train()\n        self.model.zero_grad()\n\n        # train\n        start = 
time.time()\n        # TODO: Is this while necessary???\n        while global_step < max_steps:\n            epoch_iterator = tqdm(\n                train_dataloader,\n                desc=\"Iteration\",\n                disable=local_rank not in [-1, 0] or not verbose,\n            )\n            for step, batch in enumerate(epoch_iterator):\n                inputs = get_inputs(batch, device, self.model_name)\n                outputs = self.model(**inputs)\n\n                if isinstance(outputs, tuple):\n                    loss = outputs[0]\n                else:\n                    # Accomondate models based on older versions of Transformers,\n                    # e.g. UniLM\n                    loss = outputs\n\n                if num_gpus > 1:\n                    loss = loss.mean()\n\n                if gradient_accumulation_steps > 1:\n                    loss = loss / gradient_accumulation_steps\n\n                if fp16 and amp:\n                    with amp.scale_loss(loss, optimizer) as scaled_loss:\n                        scaled_loss.backward()\n                else:\n                    loss.backward()\n\n                tr_loss += loss.item()\n                accum_loss += loss.item()\n                train_size += list(inputs.values())[0].size()[0]\n                if (step + 1) % gradient_accumulation_steps == 0:\n\n                    global_step += 1\n\n                    if clip_grad_norm:\n                        if fp16 and amp:\n                            torch.nn.utils.clip_grad_norm_(\n                                amp.master_params(optimizer), max_grad_norm\n                            )\n                        else:\n                            torch.nn.utils.clip_grad_norm_(\n                                self.model.parameters(), max_grad_norm\n                            )\n\n                    if global_step % report_every == 0 and verbose:\n                        end = time.time()\n                        endtime_string 
= datetime.datetime.fromtimestamp(end).strftime(\n                            \"%d/%m/%Y %H:%M:%S\"\n                        )\n                        log_line = \"\"\"timestamp: {0:s}, average loss: {1:.6f}, time duration: {2:f},\n                            number of examples in current reporting: {3:.0f}, step {4:.0f}\n                            out of total {5:.0f}\"\"\".format(\n                            endtime_string,\n                            accum_loss / report_every,\n                            end - start,\n                            # list(inputs.values())[0].size()[0],\n                            train_size,\n                            global_step,\n                            max_steps,\n                        )\n                        logger.info(log_line)\n                        print(log_line)\n                        accum_loss = 0\n                        train_size = 0\n                        start = end\n                    if optimizer:\n                        if type(optimizer) == list:\n                            for o in optimizer:\n                                o.step()\n                        else:\n                            optimizer.step()\n                    if scheduler:\n                        if type(scheduler) == list:\n                            for s in scheduler:\n                                s.step()\n                        else:\n                            scheduler.step()\n                    self.model.zero_grad()\n\n                    if (\n                        save_every != -1\n                        and global_step % save_every == 0\n                        and verbose\n                        and local_rank in [-1, 0]\n                    ):\n                        saved_model_path = os.path.join(\n                            self.cache_dir, f\"{self.model_name}_step_{global_step}.pt\"\n                        )\n                        self.save_model(saved_model_path)\n               
         if validation_function:\n                            validation_log = validation_function(self)\n                            logger.info(validation_log)\n                            print(validation_log)\n                if global_step > max_steps:\n                    epoch_iterator.close()\n                    break\n        if fp16 and amp:\n            self.amp_state_dict = amp.state_dict()\n\n        # release GPU memories\n        self.model.cpu()\n        torch.cuda.empty_cache()\n\n        return global_step, tr_loss / global_step\n\n    def predict(self, eval_dataloader, get_inputs, num_gpus, gpu_ids, verbose=True):\n        # get device\n        device, num_gpus = get_device(num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=-1)\n\n        # move model\n        self.model = move_model_to_device(model=self.model, device=device)\n\n        # parallelize model\n        self.model = parallelize_model(\n            model=self.model,\n            device=device,\n            num_gpus=num_gpus,\n            gpu_ids=gpu_ids,\n            local_rank=-1,\n        )\n\n        # predict\n        self.model.eval()\n        for batch in tqdm(eval_dataloader, desc=\"Scoring\", disable=not verbose):\n            with torch.no_grad():\n                inputs = get_inputs(batch, device, self.model_name, train_mode=False)\n                outputs = self.model(**inputs)\n                logits = outputs[0]\n            yield logits.detach().cpu().numpy()\n\n    def save_model(self, file_name=None):\n        \"\"\"\n        Saves the underlying PyTorch module's state.\n\n        Args:\n            file_name (str, optional): File name to save the model's `state_dict()`\n                that can be loaded by torch.load().\n                If None, the trained model, configuration and tokenizer are saved\n                using `save_pretrained()`; and the file is going to be saved under\n                \"fine_tuned\" folder of the cached directory of the object.\n          
      Defaults to None.\n        \"\"\"\n\n        # Save a trained model, configuration and tokenizer using `save_pretrained()`.\n        # They can then be reloaded using `from_pretrained()`\n        model_to_save = (\n            self.model.module if hasattr(self.model, \"module\") else self.model\n        )  # Take care of distributed/parallel training\n\n        if file_name:\n            logger.info(\"Saving model checkpoint to %s\", file_name)\n            torch.save(model_to_save.state_dict(), file_name)\n        else:\n            output_model_dir = os.path.join(self.cache_dir, \"fine_tuned\")\n\n            os.makedirs(self.cache_dir, exist_ok=True)\n            os.makedirs(output_model_dir, exist_ok=True)\n\n            logger.info(\"Saving model checkpoint to %s\", output_model_dir)\n            model_to_save.save_pretrained(output_model_dir)\n\n    def load_model(self, file_name):\n        \"\"\"\n        Loads a PyTorch module's state.\n\n        Args:\n            file_name (str): File name of saved the model's `state_dict()`\n        \"\"\"\n\n        model_to_load = (\n            self.model.module if hasattr(self.model, \"module\") else self.model\n        )  # Take care of distributed/parallel training\n\n        model_to_load.load_state_dict(torch.load(file_name))\n        logger.info(\"Model checkpoint loaded from %s\", file_name)\n"
  },
  {
    "path": "utils_nlp/models/transformers/datasets.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport collections\nimport itertools\nimport os\nimport torch\nfrom torch.utils.data import Dataset, IterableDataset\nfrom multiprocessing import Pool, cpu_count\nfrom functools import partial\nimport jsonlines\n\n\nclass SCDataSet(Dataset):\n    \"\"\"Dataset for single sequence classification tasks\"\"\"\n\n    def __init__(self, df, text_col, label_col, transform, **transform_args):\n        self.df = df\n        cols = list(df.columns)\n        self.transform = transform\n        self.transform_args = transform_args\n\n        if isinstance(text_col, int):\n            self.text_col = text_col\n        elif isinstance(text_col, str):\n            self.text_col = cols.index(text_col)\n        else:\n            raise TypeError(\"text_col must be of type int or str\")\n\n        if label_col is None:\n            self.label_col = None\n        elif isinstance(label_col, int):\n            self.label_col = label_col\n        elif isinstance(label_col, str):\n            self.label_col = cols.index(label_col)\n        else:\n            raise TypeError(\"label_col must be of type int or str\")\n\n    def __getitem__(self, idx):\n        input_ids, attention_mask, token_type_ids = self.transform(\n            self.df.iloc[idx, self.text_col], **self.transform_args\n        )\n        if self.label_col is None:\n            return tuple(\n                [\n                    torch.tensor(input_ids, dtype=torch.long),\n                    torch.tensor(attention_mask, dtype=torch.long),\n                    torch.tensor(token_type_ids, dtype=torch.long),\n                ]\n            )\n        labels = self.df.iloc[idx, self.label_col]\n        return tuple(\n            [\n                torch.tensor(input_ids, dtype=torch.long),  # input_ids\n                torch.tensor(attention_mask, dtype=torch.long),  # attention_mask\n                
torch.tensor(token_type_ids, dtype=torch.long),  # segment ids\n                torch.tensor(labels, dtype=torch.long),  # labels\n            ]\n        )\n\n    def __len__(self):\n        return self.df.shape[0]\n\n\nclass SPCDataSet(Dataset):\n    \"\"\"Dataset for sequence pair classification tasks\"\"\"\n\n    def __init__(\n        self, df, text1_col, text2_col, label_col, transform, **transform_args\n    ):\n        self.df = df\n        cols = list(df.columns)\n        self.transform = transform\n        self.transform_args = transform_args\n\n        if isinstance(text1_col, int):\n            self.text1_col = text1_col\n        elif isinstance(text1_col, str):\n            self.text1_col = cols.index(text1_col)\n        else:\n            raise TypeError(\"text1_col must be of type int or str\")\n\n        if isinstance(text2_col, int):\n            self.text2_col = text2_col\n        elif isinstance(text2_col, str):\n            self.text2_col = cols.index(text2_col)\n        else:\n            raise TypeError(\"text2_col must be of type int or str\")\n\n        if label_col is None:\n            self.label_col = None\n        elif isinstance(label_col, int):\n            self.label_col = label_col\n        elif isinstance(label_col, str):\n            self.label_col = cols.index(label_col)\n        else:\n            raise TypeError(\"label_col must be of type int or str\")\n\n    def __getitem__(self, idx):\n        input_ids, attention_mask, token_type_ids = self.transform(\n            self.df.iloc[idx, self.text1_col],\n            self.df.iloc[idx, self.text2_col],\n            **self.transform_args,\n        )\n\n        if self.label_col is None:\n            return tuple(\n                [\n                    torch.tensor(input_ids, dtype=torch.long),\n                    torch.tensor(attention_mask, dtype=torch.long),\n                    torch.tensor(token_type_ids, dtype=torch.long),\n                ]\n            )\n\n        labels = 
self.df.iloc[idx, self.label_col]\n        return tuple(\n            [\n                torch.tensor(input_ids, dtype=torch.long),\n                torch.tensor(attention_mask, dtype=torch.long),\n                torch.tensor(token_type_ids, dtype=torch.long),\n                torch.tensor(labels, dtype=torch.long),\n            ]\n        )\n\n    def __len__(self):\n        return self.df.shape[0]\n\n\n# QAInput is a data structure representing an unique document-question-answer triplet.\n# Args:\n#    doc_text (str): Input document text.\n#    question_text(str): Input question text.\n#    qa_id (int or str): An unique id identifying a document-question-answer sample.\n#    is_impossible (bool): If the question is impossible to answer based on the input\n#    document.\n#    answer_start (int or list): Index of the answer start word in doc_text. For\n#        testing data, this can be a list of integers for multiple ground truth answers.\n#    answer_text (str or list): Text of the answer. 
For testing data, this can be a\n#        list of strings\n#        for multiple ground truth answers.\nQAInput = collections.namedtuple(\n    \"QAInput\",\n    [\n        \"doc_text\",\n        \"question_text\",\n        \"qa_id\",\n        \"is_impossible\",\n        \"answer_start\",\n        \"answer_text\",\n    ],\n)\n\n\nclass QADataset(Dataset):\n    def __init__(\n        self,\n        df,\n        doc_text_col,\n        question_text_col,\n        qa_id_col=None,\n        answer_start_col=None,\n        answer_text_col=None,\n        is_impossible_col=None,\n    ):\n        \"\"\"\n        A standard dataset structure for question answering that can be processed by\n        :meth:`utils_nlp.models.transformers.question_answering.QAProcessor.preprocess`\n\n        Args:\n            df (pandas.DataFrame): Input data frame.\n            doc_text_col (str): Name of the column containing the document texts.\n            question_text_col (str): Name of the column containing the question texts.\n            qa_id_col (str, optional): Name of the column containing the unique ids\n                identifying document-question-answer samples. If not provided, a\n                \"qa_id\" column is automatically created. Defaults to None.\n            answer_start_col (str, optional): Name of the column containing answer\n                start indices. For testing data, each value in the column can be a list\n                of integers for multiple ground truth answers. Defaults to None.\n            answer_text_col (str, optional): Name of the column containing answer texts.\n                For testing data, each value in the column can be a list of strings for\n                multiple ground truth answers. Defaults to None.\n            is_impossible_col (str, optional): Name of the column containing boolean\n                values indicating if the question is impossible to answer. 
If not\n                provided, a \"is_impossible\" column is automatically created and\n                populated with False. Defaults to None.\n        \"\"\"\n        self.df = df.copy()\n        self.doc_text_col = doc_text_col\n        self.question_text_col = question_text_col\n\n        if qa_id_col is None:\n            self.qa_id_col = \"qa_id\"\n            self.df[self.qa_id_col] = list(range(self.df.shape[0]))\n        else:\n            self.qa_id_col = qa_id_col\n\n        if is_impossible_col is None:\n            self.is_impossible_col = \"is_impossible\"\n            self.df[self.is_impossible_col] = False\n        else:\n            self.is_impossible_col = is_impossible_col\n\n        if answer_start_col is not None and answer_text_col is not None:\n            self.actual_answer_available = True\n        else:\n            self.actual_answer_available = False\n        self.answer_start_col = answer_start_col\n        self.answer_text_col = answer_text_col\n\n    def __getitem__(self, idx):\n        current_item = self.df.iloc[\n            idx,\n        ]\n        if self.actual_answer_available:\n            return QAInput(\n                doc_text=current_item[self.doc_text_col],\n                question_text=current_item[self.question_text_col],\n                qa_id=current_item[self.qa_id_col],\n                is_impossible=current_item[self.is_impossible_col],\n                answer_start=current_item[self.answer_start_col],\n                answer_text=current_item[self.answer_text_col],\n            )\n        else:\n            return QAInput(\n                doc_text=current_item[self.doc_text_col],\n                question_text=current_item[self.question_text_col],\n                qa_id=current_item[self.qa_id_col],\n                is_impossible=current_item[self.is_impossible_col],\n                answer_start=-1,\n                answer_text=\"\",\n            )\n\n    def __len__(self):\n        return 
self.df.shape[0]\n\n\ndef _line_iter(file_path):\n    with open(file_path, \"r\", encoding=\"utf8\") as fd:\n        for line in fd:\n            yield line\n\n\ndef _preprocess(sentences, preprocess_pipeline, word_tokenize=None):\n    \"\"\"\n    Helper function to preprocess a list of paragraphs.\n\n    Args:\n        param (Tuple): params are tuple of (a list of strings,\n            a list of preprocessing functions, and function to tokenize\n            setences into words). A paragraph is represented with a\n            single string with multiple setnences.\n\n    Returns:\n        list of list of strings, where each string is a token or word.\n    \"\"\"\n    if preprocess_pipeline is not None:\n        for function in preprocess_pipeline:\n            sentences = function(sentences)\n\n    if word_tokenize is None:\n        return sentences\n    else:\n        return sentences, [word_tokenize(sentence) for sentence in sentences]\n\n\ndef _create_data_from_iterator(iterator, preprocessing, word_tokenize):\n    for line in iterator:\n        yield _preprocess(\n            sentences=line,\n            preprocess_pipeline=preprocessing,\n            word_tokenize=word_tokenize,\n        )\n\n\nclass IterableSummarizationDataset(IterableDataset):\n    def __init__(\n        self,\n        source_file,\n        target_file=None,\n        source_preprocessing=None,\n        target_preprocessing=None,\n        word_tokenization=None,\n        top_n=-1,\n    ):\n        \"\"\"\n        Create a summarization dataset instance given the\n        paths of the source file and the target file\n\n        Args:\n            source_file (str): Full path of the file which contains a list of\n                the paragraphs with line break as seperator.\n            target_file (str): Full path of the file which contains a list of\n                the summaries for the paragraphs in the source file with line break as\n                seperator.\n            
source_preprocessing (list of functions): A list of preprocessing functions\n                to process the paragraphs in the source file.\n            target_preprocessing (list of functions): A list of preprocessing functions\n                to process the paragraphs in the source file.\n            word_tokenization (function): Tokenization function for tokenize the\n                paragraphs and summaries. The tokenization method is used for sentence\n                selection in\n                :meth:`utils_nlp.models.transformers.extractive_summarization.\n                ExtSumProcessor.preprocess`\n            top_n (int, optional): The number which specifies how many examples in the\n                beginning of the paragraph and summary lists that will be processed by\n                this function. Defaults to -1, which means the whole lists of paragraphs\n                and summaries should be procsssed.\n        \"\"\"\n\n        source_iter = _line_iter(source_file)\n\n        if top_n != -1:\n            source_iter = itertools.islice(source_iter, top_n)\n\n        self._source = _create_data_from_iterator(\n            source_iter, source_preprocessing, word_tokenization\n        )\n\n        if target_file:\n            target_iter = _line_iter(target_file)\n            if top_n != -1:\n                target_iter = itertools.islice(target_iter, top_n)\n            self._target = _create_data_from_iterator(\n                target_iter, target_preprocessing, word_tokenization\n            )\n        else:\n            self._target = None\n\n    def __iter__(self):\n        for x in self._source:\n            yield x\n\n    def get_source(self):\n        return self._source\n\n    def get_target(self):\n        return self._target\n\n\nclass SummarizationDataset(Dataset):\n    def __init__(\n        self,\n        source_file,\n        source=None,\n        target_file=None,\n        target=None,\n        source_preprocessing=None,\n        
target_preprocessing=None,\n        word_tokenize=None,\n        top_n=-1,\n        n_processes=-1,\n    ):\n        \"\"\"\n        Create a summarization dataset instance given the\n        paths of the source file and the target file.\n\n        Args:\n            source_file (str): Full path of the file which contains a list of\n                the input paragraphs with line break as seperator.\n            source (list of str, optional): a list of input paragraphs.\n                Defaults to None.\n            target_file (str, optional): Full path of the file which contains a list of\n                the summaries for the paragraphs in the source file with line break\n                as seperator.\n            target (list of str, optional): a list of summaries correponding to\n                `source`. Defaults to None.\n            source_preprocessing (list of functions): A list of preprocessing functions\n                to process the paragraphs in the source file.\n            target_preprocessing (list of functions): A list of preprocessing functions\n                to process the summaries in the target file.\n            top_n (int, optional): Number of examples to load from the input files.\n                Defaults to -1, which means the entire dataset is loaded.\n            n_processes (int, optional): Number of CPUs to use to process the data in\n                parallel. 
Defaults to -1, which means all the CPUs will be used.\n        \"\"\"\n        self._source_txt = []\n        if source_file is not None and os.path.exists(source_file):\n            with open(source_file, encoding=\"utf-8\") as f:\n                if top_n != -1:\n                    self._source_txt = list(itertools.islice(f, top_n))\n                else:\n                    self._source_txt = f.readlines()\n        if source:\n            self._source_txt.extend(source)\n\n        self._target_txt = []\n        if target_file is not None and os.path.exists(target_file):\n            with open(target_file, encoding=\"utf-8\") as f:\n                if top_n != -1:\n                    self._target_txt = list(itertools.islice(f, top_n))\n                else:\n                    self._target_txt = f.readlines()\n        if target:\n            self._target_txt.extend(target)\n\n        if len(self._target_txt) == 0:\n            self._target_txt = None\n        else:\n            assert len(self._source_txt) == len(self._target_txt)\n\n        result = parallel_preprocess(\n            self._source_txt,\n            preprocess_pipeline=source_preprocessing,\n            word_tokenize=word_tokenize,\n            num_pool=n_processes,\n        )\n        if word_tokenize:\n            self._source_txt = list(\n                map(lambda x: x[0], filter(lambda x: len(x[0]) > 0, result))\n            )\n            self._source = list(\n                map(lambda x: x[1], filter(lambda x: len(x[1]) > 0, result))\n            )\n        else:\n            self._source = list(\n                map(lambda x: x, filter(lambda x: len(x) > 0, result))\n            )\n\n        if self._target_txt is not None and len(self._target_txt) > 0:\n            result = parallel_preprocess(\n                self._target_txt,\n                preprocess_pipeline=target_preprocessing,\n                word_tokenize=word_tokenize,\n                num_pool=n_processes,\n            
)\n\n            if word_tokenize:\n                self._target_txt = list(\n                    map(lambda x: x[0], filter(lambda x: len(x[0]) > 0, result))\n                )\n                self._target = list(\n                    map(lambda x: x[1], filter(lambda x: len(x[1]) > 0, result))\n                )\n            else:\n                self._target = list(\n                    map(lambda x: x, filter(lambda x: len(x) > 0, result))\n                )\n\n    def shorten(self, top_n=None):\n        if top_n is None:\n            return self\n        elif top_n <= len(self._source):\n            self._source = self._source[0:top_n]\n            self._source_txt = self._source_txt[0:top_n]\n\n            if self._target_txt is not None:\n                self._target = self._target[0:top_n]\n                self._target_txt = self._target_txt[0:top_n]\n            return self\n        else:\n            return self\n\n    def __getitem__(self, idx):\n        ## tupe is more adaptive\n        if self._target_txt is None:\n            return {\"src\": self._source[idx], \"src_txt\": self._source_txt[idx]}\n        else:\n            return {\n                \"src\": self._source[idx],\n                \"src_txt\": self._source_txt[idx],\n                \"tgt\": self._target[idx],\n                \"tgt_txt\": self._target_txt[idx],\n            }\n\n    def __len__(self):\n        return len(self._source)\n\n    def get_source(self):\n        return self._source\n\n    def get_source_txt(self):\n        return self._source_txt\n\n    def get_target_txt(self):\n        return self._target_txt\n\n    def get_target(self):\n        return self._target\n\n    def save_to_jsonl(self, output_file):\n        with jsonlines.open(output_file, mode=\"w\") as writer:\n            if self._target_txt is None:\n                for src in self._source:\n                    writer.write({\"src\": src})\n            else:\n                for src, tgt in zip(self._source, 
self._target):\n                    writer.write({\"src\": src, \"tgt\": tgt})\n\n\ndef parallel_preprocess(\n    input_data, preprocess_pipeline, word_tokenize=None, num_pool=-1\n):\n    \"\"\"\n    Process data in parallel using multiple CPUs.\n\n    Args:\n        input_data (list): List if input strings to process.\n        preprocess_pipeline (list): List of functions to apply on the input data.\n        word_tokenize (func, optional): A tokenization function used to tokenize\n            the results from preprocess_pipeline.\n        num_pool (int, optional): Number of CPUs to use. Defaults to -1 and all\n            available CPUs are used.\n\n    Returns:\n        list: list of processed text strings.\n\n    \"\"\"\n    if num_pool == -1:\n        num_pool = cpu_count()\n\n    num_pool = min(num_pool, len(input_data))\n\n    p = Pool(num_pool)\n\n    results = p.map(\n        partial(\n            _preprocess,\n            preprocess_pipeline=preprocess_pipeline,\n            word_tokenize=word_tokenize,\n        ),\n        input_data,\n        chunksize=min(1, int(len(input_data) / num_pool)),\n    )\n    p.close()\n    p.join()\n\n    return results\n"
  },
  {
    "path": "utils_nlp/models/transformers/extractive_summarization.py",
    "content": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\n# This script reuses some code from https://github.com/nlpyang/BertSum\n\nimport functools\nimport itertools\nimport logging\nimport os\nimport pickle\nfrom multiprocessing import Pool, cpu_count\n\nimport numpy as np\nimport torch\nfrom torch.utils.data import DataLoader, RandomSampler, SequentialSampler\nfrom torch.utils.data.distributed import DistributedSampler\nfrom transformers import AutoTokenizer, BertModel, DistilBertModel\n\nfrom utils_nlp.common.pytorch_utils import (\n    compute_training_steps,\n    get_device,\n    move_model_to_device,\n    parallelize_model,\n)\nfrom utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection\nfrom utils_nlp.models.transformers.abstractive_summarization_bertsum import (\n    fit_to_block_size,\n)\n\nfrom utils_nlp.models.transformers.bertsum import model_builder\nfrom utils_nlp.models.transformers.bertsum.data_loader import (\n    Batch,\n    ChunkDataLoader,\n    IterableDistributedSampler,\n)\nfrom utils_nlp.models.transformers.bertsum.dataset import (\n    ExtSumProcessedDataset,\n    ExtSumProcessedIterableDataset,\n)\nfrom utils_nlp.models.transformers.bertsum.model_builder import BertSumExt\nfrom utils_nlp.models.transformers.common import Transformer\n\nMODEL_CLASS = {\n    \"bert-base-uncased\": BertModel,\n    \"distilbert-base-uncased\": DistilBertModel,\n}\n\nlogger = logging.getLogger(__name__)\n\n\nclass Bunch(object):\n    \"\"\" Class which convert a dictionary to an object \"\"\"\n\n    def __init__(self, adict):\n        self.__dict__.update(adict)\n\n\ndef get_dataloader(\n    data_iter,\n    shuffle=True,\n    is_labeled=False,\n    batch_size=3000,\n    world_size=1,\n    rank=0,\n    local_rank=-1,\n):\n    \"\"\"\n    Function to get data iterator over a list of data objects.\n\n    Args:\n        data_iter (generator): Data generator.\n        shuffle (bool): Whether the data is 
shuffled. Defaults to True.\n        is_labeled (bool): Whether the data objects are labeled data.\n                            Defaults to False.\n        batch_size (int): Number of tokens per batch. Defaults to 3000.\n        world_size (int): Total number of GPUs that will be used. Defaults to 1.\n        rank (int): Rank of the current GPU. Defaults to -1.\n\n    Returns:\n        DataIterator\n    \"\"\"\n    sampler = IterableDistributedSampler(world_size, rank, local_rank)\n    return ChunkDataLoader(\n        data_iter, batch_size, shuffle=shuffle, is_labeled=is_labeled, sampler=sampler\n    )\n\n\ndef get_pred(\n    example,\n    sent_scores,\n    cal_lead=False,\n    sentence_separator=\"<q>\",\n    block_trigram=True,\n    top_n=3,\n):\n    \"\"\"\n        Get the summarization prediction for the paragraph example based on the scores\n        returned by the transformer summarization model.\n\n        Args:\n            example (str): The object with \"src_txt\" field as the paragraph which\n                requries summarization. The \"src_txt\" is a list of strings.\n            sent_scores (list of floats): List of scores of how likely of the\n                sentence is included in the summary.\n            cal_lead (bool, optional): Boolean value which specifies whether the\n                prediction uses the first few sentences as summary. Defaults to False.\n            sentence_separator (str, optional): Seperator used in the generated summary.\n                Defaults to '<q>'.\n            block_trigram (bool, optional): Boolean value which specifies whether the\n                summary should include any sentence that has the same trigram as the\n                already selected sentences. Defaults to True.\n            top_n (int, optional): The maximum number of sentences that the summary\n                should included. 
Defaults to 3.\n\n        Returns:\n            A string which is the summary for the example.\n    \"\"\"\n\n    def _get_ngrams(n, text):\n        ngram_set = set()\n        text_length = len(text)\n        max_index_ngram_start = text_length - n\n        for i in range(max_index_ngram_start + 1):\n            ngram_set.add(tuple(text[i : i + n]))\n        return ngram_set\n\n    def _block_tri(c, p):\n        tri_c = _get_ngrams(3, c.split())\n        for s in p:\n            tri_s = _get_ngrams(3, s.split())\n            if len(tri_c.intersection(tri_s)) > 0:\n                return True\n        return False\n\n    selected_ids = np.argsort(-sent_scores)\n    # selected_ids = np.argsort(-sent_scores, 1)\n    if cal_lead:\n        selected_ids = range(len(example[\"clss\"]))\n\n    pred = []\n    _pred = []\n    final_selections = []\n    for j in selected_ids[: len(example[\"src_txt\"])]:\n        if j >= len(example[\"src_txt\"]):\n            continue\n        candidate = example[\"src_txt\"][j].strip()\n        if block_trigram:\n            if not _block_tri(candidate, _pred):\n                _pred.append(candidate)\n                final_selections.append(j)\n        else:\n            _pred.append(candidate)\n            final_selections.append(j)\n\n        # only select the top n\n        if len(_pred) == top_n:\n            break\n\n    sorted_selections = sorted(final_selections)\n    _pred = []\n    for i in sorted_selections:\n        _pred.append(example[\"src_txt\"][i].strip())\n    _pred = sentence_separator.join(_pred)\n    pred.append(_pred.strip())\n    return pred\n\n\nclass ExtSumProcessedData:\n    \"\"\"class loaded data preprocessed as in\n    :class:`utils_nlp.models.transformers.datasets.SummarizationDataset`\"\"\"\n\n    @staticmethod\n    def save_data(data_iter, is_test=False, save_path=\"./\", chunk_size=None):\n        \"\"\" Save the preprocessed data into files with specified chunk size\n\n        Args:\n            data_iter 
(iterator): Data iterator returned from\n                :class:`utils_nlp.models.transformers.datasets.SummarizationDataset`\n            is_test (bool): Boolean value which indicates whether target data\n                is included. If set to True, the file name contains \"test\", otherwise,\n                the file name contains \"train\". Defaults to False.\n            save_path (str): Directory where the data should be saved. Defaults to \"./\".\n            chunk_size (int): The number of examples that should be included in each\n                file. Defaults to None, which means only one file is used.\n\n        Returns:\n            a list of strings which are the files the data is saved to.\n        \"\"\"\n        os.makedirs(save_path, exist_ok=True)\n\n        def _chunks(iterable, chunk_size):\n            iterator = filter(None, iterable)\n            for first in iterator:\n                if chunk_size:\n                    yield itertools.chain(\n                        [first], itertools.islice(iterator, chunk_size - 1)\n                    )\n                else:\n                    yield itertools.chain([first], itertools.islice(iterator, None))\n\n        chunks = _chunks(data_iter, chunk_size)\n        filename_list = []\n        for i, chunked_data in enumerate(chunks):\n            filename = f\"{i}_test\" if is_test else f\"{i}_train\"\n            torch.save(list(chunked_data), os.path.join(save_path, filename))\n            filename_list.append(os.path.join(save_path, filename))\n        return filename_list\n\n    def _get_files(self, root):\n        train_files = []\n        test_files = []\n        files = [\n            os.path.join(root, f)\n            for f in os.listdir(root)\n            if os.path.isfile(os.path.join(root, f))\n        ]\n        for fname in files:\n            if fname.find(\"train\") != -1:\n                train_files.append(fname)\n            elif fname.find(\"test\") != -1:\n                
test_files.append(fname)\n\n        return train_files, test_files\n\n    def splits(self, root, train_iterable=False):\n        \"\"\"Get the train and test dataset from the folder\n\n        Args:\n            root (str): Directory where the data can be loaded.\n\n        Returns:\n            Tuple of ExtSumProcessedIterableDataset as train dataset\n            and ExtSumProcessedDataset as test dataset.\n        \"\"\"\n        train_files, test_files = self._get_files(root)\n        if train_iterable:\n            return (\n                ExtSumProcessedIterableDataset(train_files, is_shuffle=True),\n                ExtSumProcessedDataset(test_files, is_shuffle=False),\n            )\n        else:\n            return (\n                ExtSumProcessedDataset(train_files, is_shuffle=True),\n                ExtSumProcessedDataset(test_files, is_shuffle=False),\n            )\n\n\ndef preprocess_single_add_oracleids(input_data, oracle_mode=\"greedy\", selections=3):\n    \"\"\" Preprocess single data point to generate oracle summaries and\n        sentence tokenization of the source text.\n\n        Args:\n            input_data (dict): An item from `SummarizationDataset`\n            oracle_mode (str, optional): Sentence selection method.\n                Defaults to \"greedy\".\n            selections (int, optional): The number of sentence used as summary.\n                Defaults to 3.\n        Returns:\n            Dictionary of fields \"src\", \"src_txt\", \"tgt\", \"tgt_txt\" and \"oracle_ids\"\n    \"\"\"\n\n    oracle_ids = None\n    if \"tgt\" in input_data:\n        if oracle_mode == \"greedy\":\n            oracle_ids = greedy_selection(\n                input_data[\"src\"], input_data[\"tgt\"], selections\n            )\n        elif oracle_mode == \"combination\":\n            oracle_ids = combination_selection(\n                input_data[\"src\"], input_data[\"tgt\"], selections\n            )\n        input_data[\"oracle_ids\"] = oracle_ids\n  
  # input_data[\"src_txt\"] = tokenize.sent_tokenize(input_data[\"src_txt\"])\n    return input_data\n\n\ndef parallel_preprocess(input_data, preprocess, num_pool=-1):\n    \"\"\"\n    Process data in parallel using multiple GPUs.\n\n    Args:\n        input_data (list): List if input strings to process.\n        preprocess_pipeline (list): List of functions to apply on the input data.\n        word_tokenize (func, optional): A tokenization function used to tokenize\n            the results from preprocess_pipeline.\n        num_pool (int, optional): Number of CPUs to use. Defaults to -1 and all\n            available CPUs are used.\n\n    Returns:\n        list: list of processed text strings.\n\n    \"\"\"\n    if num_pool == -1:\n        num_pool = cpu_count()\n\n    num_pool = min(num_pool, len(input_data))\n\n    p = Pool(num_pool)\n\n    results = p.map(\n        preprocess, input_data, chunksize=min(1, int(len(input_data) / num_pool))\n    )\n    p.close()\n    p.join()\n\n    return results\n\n\nclass ExtSumProcessor:\n    \"\"\"Class for preprocessing extractive summarization data.\"\"\"\n\n    def __init__(\n        self,\n        model_name=\"distilbert-base-uncased\",\n        to_lower=False,\n        cache_dir=\".\",\n        max_nsents=200,\n        max_src_ntokens=2000,\n        min_nsents=3,\n        min_src_ntokens=5,\n    ):\n        \"\"\" Initialize the preprocessor.\n\n        Args:\n            model_name (str, optional): Transformer model name used in preprocessing.\n                check MODEL_CLASS for supported models. Defaults to \"bert-base-cased\".\n            to_lower (bool, optional): Whether to convert all letters to lower case\n                during tokenization. 
This is determined by if a cased model is used.\n                Defaults to False, which corresponds to a cased model.\n            cache_dir (str, optional): Directory to cache the tokenizer.\n                Defaults to \".\".\n            max_nsents (int, optional): Max number of sentences that can be used\n                as input. Defaults to 200.\n            max_src_ntokens (int, optional): Max number of tokens that be used\n                as input. Defaults to 2000.\n            min_nsents (int, optional): Minimum number of sentences that are required\n                as input. If the input has less number of sentences than this value,\n                it's skipped and cannot be used as a valid input. Defaults to 3.\n            min_src_ntokens (int, optional): Minimum number of tokens that are required\n                as an input sentence.If the input sentence has less number of tokens\n                than this value, it's skipped and cannot be used as a valid sentence.\n                Defaults to 5.\n\n        \"\"\"\n        self.model_name = model_name\n        self.tokenizer = AutoTokenizer.from_pretrained(\n            model_name,\n            do_lower_case=to_lower,\n            cache_dir=cache_dir,\n            output_loading_info=False,\n        )\n        self.sep_vid = self.tokenizer.vocab[\"[SEP]\"]\n        self.cls_vid = self.tokenizer.vocab[\"[CLS]\"]\n        self.pad_vid = self.tokenizer.vocab[\"[PAD]\"]\n\n        self.max_nsents = max_nsents\n        self.max_src_ntokens = max_src_ntokens\n        self.min_nsents = min_nsents\n        self.min_src_ntokens = min_src_ntokens\n\n    @staticmethod\n    def list_supported_models():\n        return list(MODEL_CLASS)\n\n    @property\n    def model_name(self):\n        return self._model_name\n\n    @model_name.setter\n    def model_name(self, value):\n        if value not in self.list_supported_models():\n            raise ValueError(\n                \"Model name {} is not supported by 
ExtSumProcessor. \"\n                \"Call 'ExtSumProcessor.list_supported_models()' to get all supported \"\n                \"model names.\".format(value)\n            )\n\n        self._model_name = value\n\n    @staticmethod\n    def get_inputs(batch, device, model_name, train_mode=True):\n        \"\"\"\n        Creates an input dictionary given a model name.\n\n        Args:\n            batch (object): A Batch containing input ids, segment ids, sentence class\n                ids, masks for the input ids, masks for  sentence class ids and source\n                text. If train_model is True, it also contains the labels and target\n                text.\n            device (torch.device): A PyTorch device.\n            model_name (bool): Model name used to format the inputs.\n            train_mode (bool, optional): Training mode flag.\n                Defaults to True.\n\n        Returns:\n            dict: Dictionary containing input ids, segment ids, sentence class ids,\n            masks for the input ids, masks for the sentence class ids and labels.\n            Labels are only returned when train_mode is True.\n        \"\"\"\n\n        if model_name.split(\"-\")[0] in [\"bert\", \"distilbert\"]:\n            if train_mode:\n                batch = batch.to(device)\n                # labels must be the last\n                return {\n                    \"x\": batch.src,\n                    \"segs\": batch.segs,\n                    \"clss\": batch.clss,\n                    \"mask\": batch.mask,\n                    \"mask_cls\": batch.mask_cls,\n                    \"labels\": batch.labels,\n                }\n            else:\n                batch = batch.to(device)\n                return {\n                    \"x\": batch.src,\n                    \"segs\": batch.segs,\n                    \"clss\": batch.clss,\n                    \"mask\": batch.mask,\n                    \"mask_cls\": batch.mask_cls,\n                    # \"labels\": 
batch.labels,\n                }\n                \"\"\"\n                return {\n                    \"x\": batch.src.to(device),\n                    \"segs\": batch.segs.to(device),\n                    \"clss\": batch.clss.to(device),\n                    \"mask\": batch.mask.to(device),\n                    \"mask_cls\": batch.mask_cls.to(device),\n                }\n                \"\"\"\n        else:\n            raise ValueError(\"Model not supported: {}\".format(model_name))\n\n    def preprocess(self, input_data_list, oracle_mode=\"greedy\", selections=3):\n        \"\"\" Preprocess multiple data points.\n\n           Args:\n              input_data_list (SummarizationDataset): The dataset to be preprocessed.\n              oracle_mode (str, optional): Sentence selection method.\n                Defaults to \"greedy\".\n              selections (int, optional): The number of sentence used as summary.\n                Defaults to 3.\n\n            Returns:\n                Iterator of dictory objects containing input ids, segment ids,\n                sentence class ids, labels, source text and target text.\n                If targets is None, the label and target text are None.\n        \"\"\"\n        preprocess = functools.partial(\n            preprocess_single_add_oracleids, oracle_mode=\"greedy\", selections=3\n        )\n        return parallel_preprocess(input_data_list, preprocess)\n\n    def collate(self, data, block_size, device, train_mode=True):\n        \"\"\" Collcate function for pytorch data loaders.\n            Args:\n                data (list): A list of samples from SummarizationDataset.\n                block_size (int): maximum input length for the model.\n                train_mode (bool): whether the collate function is used for training\n                    or not. 
Defaults to True.\n\n            Returns:\n                `Batch` object: a data minibatch as the input of a model.\n\n        \"\"\"\n\n        if len(data) == 0:\n            return None\n        else:\n            if train_mode is True and \"tgt\" in data[0] and \"oracle_ids\" in data[0]:\n                encoded_text = [self.encode_single(d, block_size) for d in data]\n                batch = Batch(list(filter(None, encoded_text)), True)\n            else:\n                encoded_text = [\n                    self.encode_single(d, block_size, train_mode) for d in data\n                ]\n                # src, labels, segs, clss, src_txt, tgt_txt =  zip(*encoded_text)\n                # new_data = [list(i) for i in list(zip(*encoded_text))]\n                # batch =  Batch(new_data)\n                filtered_list = list(filter(None, encoded_text))\n                # if len(filtered_list) != len(data):\n                #    raise ValueError(\"no test data shouldn't be skipped\")\n                batch = Batch(filtered_list)\n            return batch.to(device)\n\n    def encode_single(self, d, block_size, train_mode=True):\n        \"\"\" Enocde a single sample.\n            Args:\n                d (dict): s data sample from SummarizationDataset.\n                block_size (int): maximum input length for the model.\n\n            Returns:\n                Tuple of encoded data.\n\n        \"\"\"\n\n        src = d[\"src\"]\n\n        if len(src) == 0:\n            raise ValueError(\"source doesn't have any sentences\")\n\n        original_src_txt = [\" \".join(s) for s in src]\n        # no filtering for prediction\n        idxs = [i for i, s in enumerate(src)]\n        src = [src[i] for i in idxs]\n\n        tgt_txt = None\n        labels = None\n        if (\n            train_mode and \"oracle_ids\" in d and \"tgt\" in d and \"tgt_txt\" in d\n        ):  # is not None and tgt is not None:\n            labels = [0] * len(src)\n            for l in 
d[\"oracle_ids\"]:\n                labels[l] = 1\n\n            # source filtering for only training\n            idxs = [i for i, s in enumerate(src) if (len(s) > self.min_src_ntokens)]\n            src = [src[i][: self.max_src_ntokens] for i in idxs]\n            src = src[: self.max_nsents]\n            labels = [labels[i] for i in idxs]\n            labels = labels[: self.max_nsents]\n\n            if len(src) < self.min_nsents:\n                return None\n            if len(labels) == 0:\n                return None\n            tgt_txt = \"\".join([\" \".join(tt) for tt in d[\"tgt\"]])\n\n        src_txt = [\" \".join(sent) for sent in src]\n        text = \" [SEP] [CLS] \".join(src_txt)\n        src_subtokens = self.tokenizer.tokenize(text)\n        # src_subtokens = src_subtokens[:510]\n        src_subtokens = (\n            [\"[CLS]\"]\n            + fit_to_block_size(\n                src_subtokens, block_size - 2, self.tokenizer.pad_token_id\n            )\n            + [\"[SEP]\"]\n        )\n        src_subtoken_idxs = self.tokenizer.convert_tokens_to_ids(src_subtokens)\n        _segs = [-1] + [i for i, t in enumerate(src_subtoken_idxs) if t == self.sep_vid]\n        segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]\n        segments_ids = []\n        for i, s in enumerate(segs):\n            if i % 2 == 0:\n                segments_ids += s * [0]\n            else:\n                segments_ids += s * [1]\n        cls_ids = [i for i, t in enumerate(src_subtoken_idxs) if t == self.cls_vid]\n        if labels:\n            labels = labels[: len(cls_ids)]\n        src_txt = [original_src_txt[i] for i in idxs]\n        return src_subtoken_idxs, labels, segments_ids, cls_ids, src_txt, tgt_txt\n\n\nclass ExtractiveSummarizer(Transformer):\n    \"\"\"class which performs extractive summarization fine tuning and prediction \"\"\"\n\n    def __init__(\n        self,\n        processor,\n        model_name=\"distilbert-base-uncased\",\n        
encoder=\"transformer\",\n        max_pos_length=512,\n        cache_dir=\".\",\n    ):\n        \"\"\"Initialize a ExtractiveSummarizer.\n\n        Args:\n            model_name (str, optional): Transformer model name used in preprocessing.\n                check MODEL_CLASS for supported models.\n                Defaults to \"distilbert-base-uncased\".\n            encoder (str, optional): Encoder algorithm used by summarization layer.\n                There are four options:\n                    - baseline: it used a smaller transformer model to replace the bert\n                        model and with transformer summarization layer.\n                    - classifier: it uses pretrained BERT and fine-tune BERT with simple\n                        logistic classification summarization layer.\n                    - transformer: it uses pretrained BERT and fine-tune BERT with\n                        transformer summarization layer.\n                    - RNN: it uses pretrained BERT and fine-tune BERT with LSTM\n                        summarization layer.\n                Defaults to \"transformer\".\n            cache_dir (str, optional): Directory to cache the tokenizer.\n                Defaults to \".\".\n        \"\"\"\n\n        model = MODEL_CLASS[model_name].from_pretrained(\n            model_name, cache_dir=cache_dir, num_labels=0, output_loading_info=False\n        )\n        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)\n\n        if model_name not in self.list_supported_models():\n            raise ValueError(\n                \"Model name {} is not supported by ExtractiveSummarizer. 
\"\n                \"Call 'ExtractiveSummarizer.list_supported_models()' to get all  \"\n                \"supported model names.\".format(model_name)\n            )\n        self.processor = processor\n        self.max_pos_length = max_pos_length\n        self.model_class = MODEL_CLASS[model_name]\n        default_summarizer_layer_parameters = {\n            \"ff_size\": 512,\n            \"heads\": 4,\n            \"dropout\": 0.1,\n            \"inter_layers\": 2,\n            \"hidden_size\": 128,\n            \"rnn_size\": 512,\n            \"param_init\": 0.0,\n            \"param_init_glorot\": True,\n        }\n\n        args = Bunch(default_summarizer_layer_parameters)\n        self.model = BertSumExt(\n            encoder, args, self.model_class, model_name, max_pos_length, None, cache_dir\n        )\n\n    @staticmethod\n    def list_supported_models():\n        return list(MODEL_CLASS)\n\n    def fit(\n        self,\n        train_dataset,\n        num_gpus=None,\n        gpu_ids=None,\n        batch_size=3000,\n        local_rank=-1,\n        max_steps=5e5,\n        warmup_steps=1e5,\n        learning_rate=2e-3,\n        optimization_method=\"adam\",\n        max_grad_norm=0,\n        beta1=0.9,\n        beta2=0.999,\n        decay_method=\"noam\",\n        gradient_accumulation_steps=1,\n        report_every=50,\n        verbose=True,\n        seed=None,\n        save_every=-1,\n        world_size=1,\n        rank=0,\n        use_preprocessed_data=False,\n        **kwargs,\n    ):\n        \"\"\"\n        Fine-tune pre-trained transofmer models for extractive summarization.\n\n        Args:\n            train_dataset (ExtSumProcessedIterableDataset): Training dataset.\n            num_gpus (int, optional): The number of GPUs to use.\n                If None, all available GPUs will be used. If set to 0 or GPUs are not\n                available, CPU device will be used. 
Defaults to None.\n            gpu_ids (list): List of GPU IDs to be used.\n                If set to None, the first num_gpus GPUs will be used.\n                Defaults to None.\n            batch_size (int, optional): Maximum number of tokens in each batch.\n            local_rank (int, optional): Local_rank for distributed training on GPUs.\n                Defaults to -1, which means non-distributed training.\n            max_steps (int, optional): Maximum number of training steps.\n                Defaults to 5e5.\n            warmup_steps (int, optional): Number of steps taken to increase learning\n                rate from 0 to `learning_rate`. Defaults to 1e5.\n            learning_rate (float, optional):  Learning rate of the AdamW optimizer.\n                Defaults to 5e-5.\n            optimization_method (string, optional): Optimization method used in\n                fine tuning.\n            max_grad_norm (float, optional): Maximum gradient norm for gradient\n                clipping.\n                Defaults to 0.\n            gradient_accumulation_steps (int, optional): Number of batches to accumulate\n                gradients on between each model parameter update. 
Defaults to 1.\n            decay_method (string, optional): learning rate decrease method.\n                Defaults to 'noam'.\n            report_every (int, optional): The interval by steps to print out the\n                training log.\n                Defaults to 50.\n            beta1 (float, optional): The exponential decay rate for the first moment\n                estimates.\n                Defaults to 0.9.\n            beta2 (float, optional): The exponential decay rate for the second-moment\n                estimates.\n                This value should be set close to 1.0 on problems with a sparse\n                gradient.\n                Defaults to 0.999.\n            verbose (bool, optional): Whether to print out the training log.\n                Defaults to True.\n            seed (int, optional): Random seed used to improve reproducibility.\n                Defaults to None.\n            rank (int, optional): Global rank of the current GPU in distributed\n                training. It's calculated with the rank of the current node in\n                the cluster/world and the `local_rank` of the device in the current\n                node. 
See an example in :file: `examples/text_summarization/\n                extractive_summarization_cnndm_distributed_train.py`.\n                Defaults to 0.\n        \"\"\"\n\n        # get device\n        device, num_gpus = get_device(\n            num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank\n        )\n        # move model\n        self.model = move_model_to_device(model=self.model, device=device)\n\n        # init optimizer\n        optimizer = model_builder.build_optim(\n            self.model,\n            optimization_method,\n            learning_rate,\n            max_grad_norm,\n            beta1,\n            beta2,\n            decay_method,\n            warmup_steps,\n        )\n        self.model = parallelize_model(\n            model=self.model,\n            device=device,\n            num_gpus=num_gpus,\n            gpu_ids=gpu_ids,\n            local_rank=local_rank,\n        )\n\n        # batch_size is the number of tokens in a batch\n        if use_preprocessed_data:\n            train_dataloader = get_dataloader(\n                train_dataset.get_stream(),\n                is_labeled=True,\n                batch_size=batch_size,\n                world_size=world_size,\n                rank=rank,\n                local_rank=local_rank,\n            )\n        else:\n            if local_rank == -1:\n                sampler = RandomSampler(train_dataset)\n            else:\n                sampler = DistributedSampler(\n                    train_dataset, num_replicas=world_size, rank=rank\n                )\n\n            def collate_fn(data):\n                return self.processor.collate(\n                    data, block_size=self.max_pos_length, device=device\n                )\n\n            train_dataloader = DataLoader(\n                train_dataset,\n                sampler=sampler,\n                batch_size=batch_size,\n                collate_fn=collate_fn,\n            )\n\n        # compute the max number of training 
steps\n        max_steps = compute_training_steps(\n            train_dataloader,\n            max_steps=max_steps,\n            gradient_accumulation_steps=gradient_accumulation_steps,\n        )\n\n        super().fine_tune(\n            train_dataloader=train_dataloader,\n            get_inputs=ExtSumProcessor.get_inputs,\n            device=device,\n            num_gpus=num_gpus,\n            max_steps=max_steps,\n            max_grad_norm=max_grad_norm,\n            gradient_accumulation_steps=gradient_accumulation_steps,\n            optimizer=optimizer,\n            scheduler=None,\n            verbose=verbose,\n            seed=seed,\n            report_every=report_every,\n            clip_grad_norm=False,\n            save_every=save_every,\n        )\n\n    def predict(\n        self,\n        test_dataset,\n        num_gpus=None,\n        gpu_ids=None,\n        batch_size=16,\n        sentence_separator=\"<q>\",\n        top_n=3,\n        block_trigram=True,\n        cal_lead=False,\n        verbose=True,\n        local_rank=-1,\n    ):\n        \"\"\"\n        Predict the summarization for the input data iterator.\n\n        Args:\n            test_dataset (Dataset): Dataset for which the summary to be predicted\n            num_gpus (int, optional): The number of GPUs used in prediction.\n                Defaults to 1.\n            gpu_ids (list): List of GPU IDs to be used.\n                If set to None, the first num_gpus GPUs will be used.\n                Defaults to None.\n            batch_size (int, optional): The number of test examples in each batch.\n                Defaults to 16.\n            sentence_separator (str, optional): String to be inserted between\n                sentences in the prediction. Defaults to '<q>'.\n            top_n (int, optional): The number of sentences that should be selected\n                from the paragraph as summary. 
Defaults to 3.\n            block_trigram (bool, optional): Boolean value which specifies whether\n                the summary should include any sentence that has the same trigram\n                as the already selected sentences. Defaults to True.\n            cal_lead (bool, optional): Boolean value which specifies whether the\n                prediction uses the first few sentences as summary. Defaults to False.\n            verbose (bool, optional): Whether to print out the training log.\n                Defaults to True.\n\n        Returns:\n            List of strings which are the summaries\n\n        \"\"\"\n\n        device, num_gpus = get_device(\n            num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank\n        )\n\n        def collate_processed_data(dict_list):\n            # tuple_batch =  [list(col) for col in zip(*[d.values() for d in dict_list]\n            if dict_list is None or len(dict_list) <= 0:\n                return None\n            tuple_batch = [list(d.values()) for d in dict_list]\n            # generate mask and mask_cls, and only select tensors for the model input\n            # the labels are never used in prediction, set is_labeled as False\n            batch = Batch(tuple_batch, is_labeled=False)\n            return batch\n\n        def collate(data):\n            return self.processor.collate(\n                data, block_size=self.max_pos_length, train_mode=False, device=device\n            )\n\n        if len(test_dataset) == 0:\n            return None\n        if \"segs\" in test_dataset[0]:\n            collate_fn = collate_processed_data\n        else:\n            collate_fn = collate\n\n        test_sampler = SequentialSampler(test_dataset)\n        test_dataloader = DataLoader(\n            test_dataset,\n            sampler=test_sampler,\n            batch_size=batch_size,\n            collate_fn=collate_fn,\n        )\n        sent_scores = self.predict_scores(\n            test_dataloader, 
num_gpus=num_gpus, gpu_ids=gpu_ids\n        )\n\n        sent_scores_list = list(sent_scores)\n        scores_list = []\n        for i in sent_scores_list:\n            scores_list.extend(i)\n        prediction = []\n        for i in range(len(test_dataset)):\n            temp_pred = get_pred(\n                test_dataset[i],\n                scores_list[i],\n                cal_lead=cal_lead,\n                sentence_separator=sentence_separator,\n                block_trigram=block_trigram,\n                top_n=top_n,\n            )\n            prediction.extend(temp_pred)\n\n        # release GPU memories\n        self.model.cpu()\n        torch.cuda.empty_cache()\n\n        return prediction\n\n    def predict_scores(self, test_dataloader, num_gpus=1, gpu_ids=None, verbose=True):\n        \"\"\"\n        Scores a dataset using a fine-tuned model and a given dataloader.\n\n        Args:\n            test_dataloader (Dataloader): Dataloader for scoring the data.\n            num_gpus (int, optional): The number of GPUs to use.\n                If None, all available GPUs will be used.\n                If set to 0 or GPUs are not available, CPU device will be used.\n                Defaults to None.\n            gpu_ids (list): List of GPU IDs to be used.\n                If set to None, the first num_gpus GPUs will be used.\n                Defaults to None.\n            verbose (bool, optional): Whether to print out the training log.\n                Defaults to True.\n\n        Returns\n            1darray: numpy array of predicted sentence scores.\n        \"\"\"\n\n        preds = list(\n            super().predict(\n                eval_dataloader=test_dataloader,\n                get_inputs=ExtSumProcessor.get_inputs,\n                num_gpus=num_gpus,\n                gpu_ids=gpu_ids,\n                verbose=verbose,\n            )\n        )\n        return preds\n\n    def save_model(self, full_name=None):\n        \"\"\"\n        save the trained 
model.\n\n        Args:\n            full_name (str, optional): File name to save the model's `state_dict()`.\n                If it's None, the model is going to be saved under \"fine_tuned\"\n                folder of the cached directory of the object. Defaults to None.\n        \"\"\"\n        model_to_save = (\n            self.model.module if hasattr(self.model, \"module\") else self.model\n        )  # Take care of distributed/parallel training\n\n        if full_name is None:\n            output_model_dir = os.path.join(self.cache_dir, \"fine_tuned\")\n            os.makedirs(self.cache_dir, exist_ok=True)\n            os.makedirs(output_model_dir, exist_ok=True)\n            full_name = os.path.join(output_model_dir, self.model_name)\n\n        logger.info(\"Saving model checkpoint to %s\", full_name)\n        try:\n            print(\"saving through pytorch\")\n            torch.save(model_to_save.state_dict(), full_name)\n        except OSError:\n            try:\n                print(\"saving as pickle\")\n                pickle.dump(model_to_save.state_dict(), open(full_name, \"wb\"))\n            except Exception:\n                raise\n        except Exception:\n            raise\n"
  },
  {
    "path": "utils_nlp/models/transformers/named_entity_recognition.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport logging\nfrom collections import Iterable\n\nimport numpy as np\nimport torch\nfrom torch.utils.data import TensorDataset\nfrom transformers import (\n    MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,\n    AutoConfig,\n    AutoModelForTokenClassification,\n    AutoTokenizer,\n)\n\nfrom utils_nlp.common.pytorch_utils import compute_training_steps\nfrom utils_nlp.models.transformers.common import MAX_SEQ_LEN, Transformer\n\nsupported_models = [\n    list(x.pretrained_config_archive_map)\n    for x in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING\n]\nsupported_models = sorted([x for y in supported_models for x in y])\n\n\nclass TokenClassificationProcessor:\n    \"\"\"\n    Process raw dataset for training and testing.\n\n    Args:\n        model_name (str, optional): The pretained model name.\n            Defaults to \"bert-base-cased\".\n        to_lower (bool, optional): Lower case text input.\n            Defaults to False.\n        cache_dir (str, optional): The default folder for saving cache files.\n            Defaults to \".\".\n    \"\"\"\n\n    def __init__(self, model_name=\"bert-base-cased\", to_lower=False, cache_dir=\".\"):\n        self.model_name = model_name\n        self.to_lower = to_lower\n        self.cache_dir = cache_dir\n        self.tokenizer = AutoTokenizer.from_pretrained(\n            model_name,\n            do_lower_case=to_lower,\n            cache_dir=cache_dir,\n            output_loading_info=False,\n        )\n\n    @staticmethod\n    def get_inputs(batch, device, model_name, train_mode=True):\n        \"\"\"\n        Creates an input dictionary given a model name.\n\n        Args:\n            batch (tuple): A tuple containing input ids, attention mask,\n                segment ids, and labels tensors.\n            device (torch.device): A PyTorch device.\n            model_name (bool): Model name used to format the inputs.\n 
           train_mode (bool, optional): Training mode flag.\n                Defaults to True.\n\n        Returns:\n            dict: Dictionary containing input ids, segment ids, masks, and labels.\n                Labels are only returned when train_mode is True.\n        \"\"\"\n        batch = tuple(t.to(device) for t in batch)\n        if model_name in supported_models:\n            if train_mode:\n                inputs = {\n                    \"input_ids\": batch[0],\n                    \"attention_mask\": batch[1],\n                    \"labels\": batch[3],\n                }\n            else:\n                inputs = {\"input_ids\": batch[0], \"attention_mask\": batch[1]}\n\n            # distilbert doesn't support segment ids\n            if model_name.split(\"-\")[0] not in [\"distilbert\"]:\n                inputs[\"token_type_ids\"] = batch[2]\n\n            return inputs\n        else:\n            raise ValueError(\"Model not supported: {}\".format(model_name))\n\n    @staticmethod\n    def create_label_map(label_lists, trailing_piece_tag=\"X\"):\n        \"\"\"\n        Create a dictionary object to map a label (str) to an ID (int).\n\n        Args:\n            label_lists (list): A list of label lists. 
Each element is a list of labels\n                which presents class of each token.\n            trailing_piece_tag (str, optional): Tag used to label trailing word pieces.\n                Defaults to \"X\".\n\n        Returns:\n            dict: A dictionary object to map a label (str) to an ID (int).\n        \"\"\"\n\n        unique_labels = sorted(set([x for y in label_lists for x in y]))\n        label_map = {label: i for i, label in enumerate(unique_labels)}\n\n        if trailing_piece_tag not in unique_labels:\n            label_map[trailing_piece_tag] = len(unique_labels)\n\n        return label_map\n\n    def preprocess(\n        self,\n        text,\n        max_len=MAX_SEQ_LEN,\n        labels=None,\n        label_map=None,\n        trailing_piece_tag=\"X\",\n    ):\n        \"\"\"\n        Tokenize and preprocesses input word lists, involving the following steps\n            0. WordPiece tokenization.\n            1. Convert string tokens to token ids.\n            2. Convert input labels to label ids, if labels and label_map are\n                provided.\n            3. If a word is tokenized into multiple pieces of tokens by the\n                WordPiece tokenizer, label the extra tokens with\n                trailing_piece_tag.\n            4. Pad or truncate input text according to max_seq_length\n            5. Create input_mask for masking out padded tokens.\n\n        Args:\n            text (list): List of lists. Each sublist is a list of words in an\n                input sentence.\n            max_len (int, optional): Maximum length of the list of\n                tokens. Lists longer than this are truncated and shorter\n                ones are padded with \"O\"s. Default value is BERT_MAX_LEN=512.\n            labels (list, optional): List of word label lists. Each sublist\n                contains labels corresponding to the input word list. The lengths\n                of the label list and word list must be the same. 
Default\n                value is None.\n            label_map (dict, optional): Dictionary for mapping original token\n                labels (which may be string type) to integers. Default value\n                is None.\n            trailing_piece_tag (str, optional): Tag used to label trailing\n                word pieces. For example, \"criticize\" is broken into \"critic\"\n                and \"##ize\", \"critic\" preserves its original label and \"##ize\"\n                is labeled as trailing_piece_tag. Default value is \"X\".\n\n        Returns:\n            TensorDataset: A TensorDataset containing the following four tensors.\n                1. input_ids_all: Tensor. Each sublist contains numerical values,\n                    i.e. token ids, corresponding to the tokens in the input\n                    text data.\n                2. input_mask_all: Tensor. Each sublist contains the attention\n                    mask of the input token id list, 1 for input tokens and 0 for\n                    padded tokens, so that padded tokens are not attended to.\n                3. trailing_token_mask_all: Tensor. Each sublist is\n                    a boolean list, True for the first word piece of each\n                    original word, False for the trailing word pieces,\n                    e.g. \"##ize\". This mask is useful for removing the\n                    predictions on trailing word pieces, so that each\n                    original word in the input text has a unique predicted\n                    label.\n                4. label_ids_all: Tensor, each sublist contains token labels of\n                    a input sentence/paragraph, if labels is provided. 
If the\n                    `labels` argument is not provided, it will not return this tensor.\n        \"\"\"\n\n        def _is_iterable_but_not_string(obj):\n            return isinstance(obj, Iterable) and not isinstance(obj, str)\n\n        if max_len > MAX_SEQ_LEN:\n            logging.warning(\n                \"Setting max_len to max allowed sequence length: {}\".format(MAX_SEQ_LEN)\n            )\n            max_len = MAX_SEQ_LEN\n\n        logging.warn(\n            \"Token lists with length > {} will be truncated\".format(MAX_SEQ_LEN)\n        )\n\n        if not _is_iterable_but_not_string(text):\n            # The input text must be an non-string Iterable\n            raise ValueError(\"Input text must be an iterable and not a string.\")\n        else:\n            # If the input text is a single list of words, convert it to\n            # list of lists for later iteration\n            if not _is_iterable_but_not_string(text[0]):\n                text = [text]\n\n        if labels is not None:\n            if not _is_iterable_but_not_string(labels):\n                raise ValueError(\"labels must be an iterable and not a string.\")\n            else:\n                if not _is_iterable_but_not_string(labels[0]):\n                    labels = [labels]\n\n        label_available = True\n        if labels is None:\n            label_available = False\n            # create an artificial label list for creating trailing token mask\n            labels = [[\"O\"] * len(t) for t in text]\n\n        input_ids_all = []\n        input_mask_all = []\n        label_ids_all = []\n        trailing_token_mask_all = []\n\n        for t, t_labels in zip(text, labels):\n            if len(t) != len(t_labels):\n                raise ValueError(\n                    \"Num of words and num of labels should be the same {0}!={1}\".format(\n                        len(t), len(t_labels)\n                    )\n                )\n\n            new_labels = []\n            
new_tokens = []\n            for word, tag in zip(t, t_labels):\n                sub_words = self.tokenizer.tokenize(word)\n                for count, sub_word in enumerate(sub_words):\n                    if count > 0:\n                        tag = trailing_piece_tag\n                    new_labels.append(tag)\n                    new_tokens.append(sub_word)\n\n            if len(new_tokens) > max_len:\n                new_tokens = new_tokens[:max_len]\n                new_labels = new_labels[:max_len]\n            input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens)\n\n            # The mask has 1 for real tokens and 0 for padding tokens.\n            # Only real tokens are attended to.\n            input_mask = [1.0] * len(input_ids)\n\n            # Zero-pad up to the max sequence length.\n            padding = [0.0] * (max_len - len(input_ids))\n            label_padding = [\"O\"] * (max_len - len(input_ids))\n\n            input_ids += padding\n            input_mask += padding\n            new_labels += label_padding\n\n            trailing_token_mask_all.append(\n                [True if label != trailing_piece_tag else False for label in new_labels]\n            )\n\n            if label_map:\n                label_ids = [label_map[label] for label in new_labels]\n            else:\n                label_ids = new_labels\n\n            input_ids_all.append(input_ids)\n            input_mask_all.append(input_mask)\n            label_ids_all.append(label_ids)\n\n        if label_available:\n            td = TensorDataset(\n                torch.LongTensor(input_ids_all),\n                torch.LongTensor(input_mask_all),\n                torch.LongTensor(trailing_token_mask_all),\n                torch.LongTensor(label_ids_all),\n            )\n        else:\n            td = TensorDataset(\n                torch.LongTensor(input_ids_all),\n                torch.LongTensor(input_mask_all),\n                
torch.LongTensor(trailing_token_mask_all),\n            )\n        return td\n\n\nclass TokenClassifier(Transformer):\n    \"\"\"\n    A wrapper for token classification use case based on Transformer.\n\n    Args:\n        model_name (str, optional): The pretained model name.\n            Defaults to \"bert-base-cased\".\n        num_labels (int, optional): The number of labels.\n            Defaults to 2.\n        cache_dir (str, optional): The default folder for saving cache files.\n            Defaults to \".\".\n    \"\"\"\n\n    def __init__(self, model_name=\"bert-base-cased\", num_labels=2, cache_dir=\".\"):\n        config = AutoConfig.from_pretrained(\n            model_name, num_labels=num_labels, cache_dir=cache_dir\n        )\n        model = AutoModelForTokenClassification.from_pretrained(\n            model_name, cache_dir=cache_dir, config=config, output_loading_info=False\n        )\n        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)\n\n    @staticmethod\n    def list_supported_models():\n        return supported_models\n\n    def fit(\n        self,\n        train_dataloader,\n        num_epochs=1,\n        max_steps=-1,\n        gradient_accumulation_steps=1,\n        num_gpus=None,\n        gpu_ids=None,\n        local_rank=-1,\n        weight_decay=0.0,\n        learning_rate=5e-5,\n        adam_epsilon=1e-8,\n        warmup_steps=0,\n        fp16=False,\n        fp16_opt_level=\"O1\",\n        checkpoint_state_dict=None,\n        verbose=True,\n        seed=None,\n    ):\n        \"\"\"\n        Fine-tunes a pre-trained sequence classification model.\n\n        Args:\n            train_dataloader (Dataloader): A PyTorch DataLoader to be used for training.\n            num_epochs (int, optional): Number of training epochs. 
Defaults to 1.\n            max_steps (int, optional): Total number of training steps.\n                If set to a positive value, it overrides num_epochs.\n                Otherwise, it's determined by the dataset length,\n                gradient_accumulation_steps, and num_epochs.\n                Defaults to -1.\n            gradient_accumulation_steps (int, optional): Number of steps to accumulate\n                before performing a backward/update pass.\n                Defaults to 1.\n            num_gpus (int, optional): The number of GPUs to use.\n                If None, all available GPUs will be used.\n                If set to 0 or GPUs are not available, CPU device will be used.\n                Defaults to None.\n            gpu_ids (list): List of GPU IDs to be used.\n                If set to None, the first num_gpus GPUs will be used.\n                Defaults to None.\n            local_rank (int, optional): Local_rank for distributed training on GPUs.\n                Defaults to -1, which means non-distributed training.\n            weight_decay (float, optional): Weight decay to apply after each\n                parameter update.\n                Defaults to 0.0.\n            learning_rate (float, optional):  Learning rate of the AdamW optimizer.\n                Defaults to 5e-5.\n            adam_epsilon (float, optional): Epsilon of the AdamW optimizer.\n                Defaults to 1e-8.\n            warmup_steps (int, optional): Number of steps taken to increase learning\n                rate from 0 to `learning_rate`. 
Defaults to 0.\n            fp16 (bool): Whether to use 16-bit mixed precision through Apex\n                Defaults to False\n            fp16_opt_level (str): Apex AMP optimization level for fp16.\n                One of in ['O0', 'O1', 'O2', and 'O3']\n                See https://nvidia.github.io/apex/amp.html\"\n                Defaults to \"01\"\n            checkpoint_state_dict (dict): Checkpoint states of model and optimizer.\n                If specified, the model and optimizer's parameters are loaded using\n                checkpoint_state_dict[\"model\"] and checkpoint_state_dict[\"optimizer\"]\n                Defaults to None.\n            verbose (bool, optional): Whether to print out the training log.\n                Defaults to True.\n            seed (int, optional): Random seed used to improve reproducibility.\n                Defaults to None.\n        \"\"\"\n\n        # init device and optimizer\n        device, num_gpus, amp = self.prepare_model_and_optimizer(\n            num_gpus=num_gpus,\n            gpu_ids=gpu_ids,\n            local_rank=local_rank,\n            weight_decay=weight_decay,\n            learning_rate=learning_rate,\n            adam_epsilon=adam_epsilon,\n            fp16=fp16,\n            fp16_opt_level=fp16_opt_level,\n            checkpoint_state_dict=checkpoint_state_dict,\n        )\n\n        # compute the max number of training steps\n        max_steps = compute_training_steps(\n            dataloader=train_dataloader,\n            num_epochs=num_epochs,\n            max_steps=max_steps,\n            gradient_accumulation_steps=gradient_accumulation_steps,\n        )\n\n        # init scheduler\n        scheduler = Transformer.get_default_scheduler(\n            optimizer=self.optimizer,\n            warmup_steps=warmup_steps,\n            num_training_steps=max_steps,\n        )\n\n        # fine tune\n        super().fine_tune(\n            train_dataloader=train_dataloader,\n            
get_inputs=TokenClassificationProcessor.get_inputs,\n            device=device,\n            num_gpus=num_gpus,\n            max_steps=max_steps,\n            gradient_accumulation_steps=gradient_accumulation_steps,\n            optimizer=self.optimizer,\n            scheduler=scheduler,\n            fp16=fp16,\n            amp=amp,\n            local_rank=local_rank,\n            verbose=verbose,\n            seed=seed,\n        )\n\n    def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True):\n        \"\"\"\n        Scores a dataset using a fine-tuned model and a given dataloader.\n\n        Args:\n            test_dataloader (DataLoader): DataLoader for scoring the data.\n            num_gpus (int, optional): The number of GPUs to use.\n                If None, all available GPUs will be used. If set to 0 or GPUs are\n                not available, CPU device will be used.\n                Defaults to None.\n            gpu_ids (list): List of GPU IDs to be used.\n                If set to None, the first num_gpus GPUs will be used.\n                Defaults to None.\n            verbose (bool, optional): Whether to print out the training log.\n                Defaults to True.\n\n        Returns\n            1darray: numpy array of predicted label indices.\n        \"\"\"\n\n        preds = list(\n            super().predict(\n                eval_dataloader=test_dataloader,\n                get_inputs=TokenClassificationProcessor.get_inputs,\n                num_gpus=num_gpus,\n                gpu_ids=gpu_ids,\n                verbose=verbose,\n            )\n        )\n        preds = np.concatenate(preds)\n        return preds\n\n    def get_predicted_token_labels(self, predictions, label_map, dataset):\n        \"\"\"\n        Post-process the raw prediction values and get the class label for each token.\n\n        Args:\n            predictions (ndarray): A numpy ndarray produced from the `predict`\n                function call. 
The shape of the ndarray is:\n                [number_of_examples, sequence_length, number_of_labels].\n            label_map (dict): A dictionary object to map a label (str) to an ID (int).\n                dataset (TensorDataset): The TensorDataset for evaluation.\n            dataset (Dataset): The test Dataset instance.\n\n        Returns:\n            list: A list of lists. The size of the returned list is the number of\n                testing samples.\n            Each sublist represents the predicted label for each token.\n        \"\"\"\n\n        num_samples = len(dataset.tensors[0])\n        if num_samples != predictions.shape[0]:\n            raise ValueError(\n                \"Predictions have {0} samples, but got {1} samples in dataset\".format(\n                    predictions.shape[0], num_samples\n                )\n            )\n\n        label_id2str = {v: k for k, v in label_map.items()}\n        attention_mask_all = dataset.tensors[1].data.numpy()\n        trailing_mask_all = dataset.tensors[2].data.numpy()\n        seq_len = len(trailing_mask_all[0])\n        labels = []\n\n        for idx in range(num_samples):\n            seq_probs = predictions[idx]\n            attention_mask = attention_mask_all[idx]\n            trailing_mask = trailing_mask_all[idx]\n            one_sample = []\n\n            for sid in range(seq_len):\n                if attention_mask[sid] == 0:\n                    break\n\n                if not bool(trailing_mask[sid]):\n                    continue\n\n                label_id = seq_probs[sid].argmax()\n                one_sample.append(label_id2str[label_id])\n            labels.append(one_sample)\n        return labels\n\n    def get_true_test_labels(self, label_map, dataset):\n        \"\"\"\n        Get the true testing label values.\n\n        Args:\n            label_map (dict): A dictionary object to map a label (str) to an ID (int).\n                dataset (TensorDataset): The TensorDataset for 
evaluation.\n            dataset (Dataset): The test Dataset instance.\n\n        Returns:\n            list: A list of lists. The size of the returned list is the number\n                of testing samples.\n            Each sublist represents the true label for each token.\n        \"\"\"\n\n        num_samples = len(dataset.tensors[0])\n        label_id2str = {v: k for k, v in label_map.items()}\n        attention_mask_all = dataset.tensors[1].data.numpy()\n        trailing_mask_all = dataset.tensors[2].data.numpy()\n        label_ids_all = dataset.tensors[3].data.numpy()\n        seq_len = len(trailing_mask_all[0])\n        labels = []\n\n        for idx in range(num_samples):\n            attention_mask = attention_mask_all[idx]\n            trailing_mask = trailing_mask_all[idx]\n            label_ids = label_ids_all[idx]\n            one_sample = []\n\n            for sid in range(seq_len):\n                if attention_mask[sid] == 0:\n                    break\n\n                if not trailing_mask[sid]:\n                    continue\n\n                label_id = label_ids[sid]\n                one_sample.append(label_id2str[label_id])\n            labels.append(one_sample)\n        return labels\n"
  },
  {
    "path": "utils_nlp/models/transformers/question_answering.py",
    "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Modifications copyright © Microsoft Corporation\n\n\nimport collections\nimport json\nimport logging\nimport math\nimport os\n\nimport jsonlines\nimport torch\nfrom torch.utils.data import TensorDataset\nfrom tqdm import tqdm\n\nfrom transformers import AutoTokenizer\nfrom transformers.modeling_albert import (\n    ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,\n    AlbertForQuestionAnswering,\n)\nfrom transformers.modeling_bert import (\n    BERT_PRETRAINED_MODEL_ARCHIVE_MAP,\n    BertForQuestionAnswering,\n)\nfrom transformers.modeling_distilbert import (\n    DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,\n    DistilBertForQuestionAnswering,\n)\nfrom transformers.modeling_xlnet import (\n    XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,\n    XLNetForQuestionAnswering,\n)\nfrom transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize\n\nfrom utils_nlp.common.pytorch_utils import (\n    compute_training_steps,\n    get_device,\n    move_model_to_device,\n    parallelize_model,\n)\nfrom utils_nlp.models.transformers.common import MAX_SEQ_LEN, Transformer\n\nMODEL_CLASS = {}\nMODEL_CLASS.update(\n    {k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}\n)\nMODEL_CLASS.update(\n    {k: XLNetForQuestionAnswering for k in 
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP}\n)\nMODEL_CLASS.update(\n    {k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}\n)\nMODEL_CLASS.update(\n    {k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP}\n)\n\n# cached files during preprocessing\n# these are used in postprocessing to generate the final answer texts\nCACHED_EXAMPLES_TRAIN_FILE = \"cached_examples_train.jsonl\"\nCACHED_FEATURES_TRAIN_FILE = \"cached_features_train.jsonl\"\n\nCACHED_EXAMPLES_TEST_FILE = \"cached_examples_test.jsonl\"\nCACHED_FEATURES_TEST_FILE = \"cached_features_test.jsonl\"\n\nlogger = logging.getLogger(__name__)\n\n\ndef _list_supported_models():\n    return list(MODEL_CLASS)\n\n\nclass QAProcessor:\n    \"\"\"\n    Class for preprocessing and postprocessing question answering data.\n\n    Args:\n        model_name (str, optional): Name of the model.\n            Call QAProcessor.list_supported_models() to get all supported models.\n            Defaults to \"bert-base-cased\".\n        to_lower (bool, optional): Whether to convert all letters to lower case during\n            tokenization. This is determined by if a cased model is used.\n            Defaults to False, which corresponds to a cased model.\n        custom_tokenize (function, optional): A custom tokenize function\n            used to tokenize the input text. If not provided, the default tokenizer\n            corresponding to the model_name is loaded and its `tokenize` method is used.\n            NOTE that even this function is provided, the numerical token ids are still\n            generated by the `convert_tokens_to_ids` method of the default tokenizer,\n            so there is a risk that tokens generated by the custom_tokenize\n            function don't have corresponding token ids in the default tokenizer.\n            Defaults to None.\n        cache_dir (str, optional): Directory to cache the tokenizer. 
Defaults to \".\".\n    \"\"\"\n\n    def __init__(\n        self,\n        model_name=\"bert-base-cased\",\n        to_lower=False,\n        custom_tokenize=None,\n        cache_dir=\".\",\n    ):\n        self.model_name = model_name\n        self.tokenizer = AutoTokenizer.from_pretrained(\n            model_name,\n            do_lower_case=to_lower,\n            cache_dir=cache_dir,\n            output_loading_info=False,\n        )\n        self.do_lower_case = to_lower\n        self.custom_tokenize = custom_tokenize\n\n    @property\n    def model_name(self):\n        return self._model_name\n\n    @model_name.setter\n    def model_name(self, value):\n        if value not in self.list_supported_models():\n            raise ValueError(\n                \"Model name {} is not supported by QAProcessor. \"\n                \"Call 'QAProcessor.list_supported_models()' to get all supported model \"\n                \"names.\".format(value)\n            )\n\n        self._model_name = value\n        self._model_type = value.split(\"-\")[0]\n\n    @property\n    def model_type(self):\n        return self._model_type\n\n    @staticmethod\n    def get_inputs(batch, device, model_name, train_mode=True):\n        \"\"\"\n        Creates an input dictionary given a model name.\n\n        Args:\n            batch (tuple): A tuple containing input ids, attention mask,\n                segment ids, and labels tensors.\n            device (torch.device): A PyTorch device.\n            model_name (str): Model name used to format the inputs.\n            train_mode (bool, optional): Training mode flag.\n                Defaults to True.\n\n        Returns:\n            dict: Dictionary containing input ids, segment ids, masks, and labels.\n                Labels are only returned when train_mode is True.\n        \"\"\"\n        batch = tuple(t.to(device) for t in batch)\n        model_type = model_name.split(\"-\")[0]\n\n        inputs = {\"input_ids\": batch[0], 
\"attention_mask\": batch[1]}\n\n        if train_mode:\n            inputs.update({\"start_positions\": batch[3], \"end_positions\": batch[4]})\n\n        if model_type not in [\"distilbert\"]:\n            inputs.update({\"token_type_ids\": batch[2]})\n\n        if model_type in [\"xlnet\"]:\n            if train_mode:\n                inputs.update({\"cls_index\": batch[5], \"p_mask\": batch[6]})\n            else:\n                inputs.update({\"cls_index\": batch[3], \"p_mask\": batch[4]})\n\n        return inputs\n\n    @staticmethod\n    def list_supported_models():\n        return _list_supported_models()\n\n    def preprocess(\n        self,\n        qa_dataset,\n        is_training,\n        max_question_length=64,\n        max_seq_length=MAX_SEQ_LEN,\n        doc_stride=128,\n        feature_cache_dir=\"./cached_qa_features\",\n    ):\n        \"\"\"\n        Preprocesses raw question answering data and generates train/test features.\n\n        Args:\n            qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`):\n                Question answering data in standard QADataset format.\n            is_training (bool): Whether the input data is training data.\n            max_question_length (int, optional): Maximum number of tokens\n                of the question sequence after tokenization, so the number of words\n                in the raw question is usually less than max_question_length.\n                Defaults to 64.\n            max_seq_length (int, optional): Maximum number of tokens of the entire\n                feature token sequence after tokenization. The entire feature token\n                sequence is composed of:\n                [CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP]\n                for models other than XLNet,\n                and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS} for\n                XLNet. 
Defaults to MAX_SEQ_LEN.\n            doc_stride (int, optional): Size (number of tokens) of the sliding window\n                when breaking down a long document paragraph in to multiple document\n                spans. Defaults to 128.\n            feature_cache_dir (int, optional): Directory to save some intermediate\n                preprocessing results.\n                If `is_training` is True, CACHED_EXAMPLES_TRAIN_FILE and\n                CACHED_FEATURES_TRAIN_FILE are saved to this directory. Otherwise,\n                CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved\n                to this directory. These files are required during postprocessing to\n                generate the final answer texts from predicted answer start and answer\n                end indices. Defaults to \"./cached_qa_features\".\n        Returns:\n            DataSet: A Pytorch DataSet.\n        \"\"\"\n\n        if not os.path.exists(feature_cache_dir):\n            os.makedirs(feature_cache_dir)\n\n        if is_training and not qa_dataset.actual_answer_available:\n            raise Exception(\n                \"answer_start and answer_text must be provided for training data.\"\n            )\n\n        if is_training:\n            examples_file = os.path.join(feature_cache_dir, CACHED_EXAMPLES_TRAIN_FILE)\n            features_file = os.path.join(feature_cache_dir, CACHED_FEATURES_TRAIN_FILE)\n        else:\n            examples_file = os.path.join(feature_cache_dir, CACHED_EXAMPLES_TEST_FILE)\n            features_file = os.path.join(feature_cache_dir, CACHED_FEATURES_TEST_FILE)\n\n        with jsonlines.open(examples_file, \"w\") as examples_writer, jsonlines.open(\n            features_file, \"w\"\n        ) as features_writer:\n\n            unique_id_all = []\n            unique_id_cur = 1000000000\n\n            features = []\n            qa_examples = []\n            qa_examples_json = []\n            features_json = []\n\n            for qa_input in 
qa_dataset:\n                qa_example_cur = _create_qa_example(qa_input, is_training=is_training)\n\n                qa_examples.append(qa_example_cur)\n\n                qa_examples_json.append(\n                    {\n                        \"qa_id\": qa_example_cur.qa_id,\n                        \"doc_tokens\": qa_example_cur.doc_tokens,\n                    }\n                )\n\n                features_cur = _create_qa_features(\n                    qa_example_cur,\n                    model_type=self.model_type,\n                    tokenizer=self.tokenizer,\n                    unique_id=unique_id_cur,\n                    is_training=is_training,\n                    max_question_length=max_question_length,\n                    max_seq_length=max_seq_length,\n                    doc_stride=doc_stride,\n                    custom_tokenize=self.custom_tokenize,\n                )\n                features += features_cur\n\n                for f in features_cur:\n                    features_json.append(\n                        {\n                            \"qa_id\": f.qa_id,\n                            \"unique_id\": f.unique_id,\n                            \"tokens\": f.tokens,\n                            \"token_to_orig_map\": f.token_to_orig_map,\n                            \"token_is_max_context\": f.token_is_max_context,\n                            \"paragraph_len\": f.paragraph_len,\n                        }\n                    )\n                    unique_id_cur = f.unique_id\n                    unique_id_all.append(unique_id_cur)\n\n            examples_writer.write_all(qa_examples_json)\n            features_writer.write_all(features_json)\n\n            logger.info(\"QA examples are saved to {}\".format(examples_file))\n            logger.info(\"QA features are saved to {}\".format(features_file))\n\n        # TODO: maybe generalize the following code\n        input_ids = torch.tensor([f.input_ids for f in features], 
dtype=torch.long)\n        input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n        segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)\n        cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)\n        p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.long)\n\n        if is_training:\n            start_positions = torch.tensor(\n                [f.start_position for f in features], dtype=torch.long\n            )\n            end_positions = torch.tensor(\n                [f.end_position for f in features], dtype=torch.long\n            )\n            qa_dataset = TensorDataset(\n                input_ids,\n                input_mask,\n                segment_ids,\n                start_positions,\n                end_positions,\n                cls_index,\n                p_mask,\n            )\n        else:\n            unique_id_all = torch.tensor(unique_id_all, dtype=torch.long)\n            qa_dataset = TensorDataset(\n                input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all\n            )\n\n        return qa_dataset\n\n    def postprocess(\n        self,\n        results,\n        examples_file,\n        features_file,\n        n_best_size=20,\n        n_top_start=5,\n        n_top_end=5,\n        max_answer_length=30,\n        unanswerable_exists=False,\n        output_prediction_file=\"./qa_predictions.json\",\n        output_nbest_file=\"./nbest_predictions.json\",\n        output_null_log_odds_file=\"./null_odds.json\",\n        null_score_diff_threshold=0.0,\n        verbose_logging=False,\n    ):\n\n        \"\"\"\n        Postprocesses start and end logits generated by :meth:`AnswerExtractor.fit`.\n\n        Args:\n            results (list): List of :class:`QAResultExtended`.\n            examples_file (str): One of the files cached by\n                :meth:`QAProcessor.preprocess`. 
This file contains the original\n                document tokens that are used to generate the final answers from\n                the predicted start and end positions.\n            features_file (str): One of the files cached by\n                :meth:`QAProcessor.preprocess`. This file contains the mapping from\n                indices in the processed token list to the original document tokens\n                that are used to generate the final predicted answers.\n            n_best_size (int, optional): The number of candidates to choose from each\n                QAResult to generate the final prediction.\n                It's also the maximum number of n-best answers to output for\n                each question. Note that the number of n-best answers can be smaller\n                than `n_best_size` because some unqualified answers, e.g. answers\n                that are too long are removed.\n            n_top_start (int, optional): For XLNet only. Beam size for span start.\n                Note that this needs to be consistent with the XLNet model\n                configuration. Defaults to 5.\n            n_top_end (int, optional): For XLNet only. Beam size for span end.\n                Note that this needs to be consistent with the XLNet model\n                configuration. Defaults to 5.\n            max_answer_length (int, optional): Maximum length of the answer.\n                Defaults to 30.\n            unanswerable_exists (bool, optional): Whether there are unanswerable\n                questions in the data. 
If True, the start and end logits of the\n                [CLS] token, which indicate the probability of the answer being empty,\n                are included in the candidate answer list.\n                Defaults to False.\n            output_prediction_file (str, optional): Path of the file to save the\n                predicted answers.\n                Defaults to \"./qa_predictions.json\".\n            output_nbest_file (str, optional): Path of the file to save the n-best\n                answers. Defaults to \"./nbest_predictions.json\".\n            output_null_log_odds_file (str, optional): If unanswerable_exists is True,\n                the score difference between empty prediction and best non-empty\n                prediction are saved to this file. These scores can be used to find the\n                best threshold for predicting an empty answer.\n                Defaults to \"./null_odds.json\".\n            null_score_diff_threshold (float, optional): For BERT models only.\n                If unanswerable_exists=True and the score difference between empty\n                prediction and best non-empty prediction is higher than this threshold,\n                the final predicted answer is empty. Defaults to 0.0.\n            verbose_logging (bool, optional): Whether to log details of answer\n                postprocessing. Defaults to False.\n\n        Returns:\n            tuple: (OrderedDict, OrderedDict, OrderedDict)\n                The keys of the dictionaries are the `qa_id` in the original\n                :class:`utils_nlp.dataset.pytorch.QADataset`\n                The values of the first dictionary are the predicted answer texts\n                    in string type.\n                The values of the second dictionary are the softmax probabilities\n                    of the predicted answers.\n                The values of the third dictionary are the n-best answers for\n                    each qa_id. 
Note that the number of n-best answers can be smaller\n                    than `n_best_size` because some unqualified answers, e.g. answers\n                    that are too long, are removed.\n        \"\"\"\n        if self.model_type == \"xlnet\":\n            final_answers, answer_probs, nbest_answers = postprocess_xlnet_answer(\n                results=results,\n                examples_file=examples_file,\n                features_file=features_file,\n                tokenizer=self.tokenizer,\n                n_best_size=n_best_size,\n                n_top_start=n_top_start,\n                n_top_end=n_top_end,\n                max_answer_length=max_answer_length,\n                unanswerable_exists=unanswerable_exists,\n                output_prediction_file=output_prediction_file,\n                output_nbest_file=output_nbest_file,\n                output_null_log_odds_file=output_null_log_odds_file,\n                verbose_logging=verbose_logging,\n            )\n        else:\n            final_answers, answer_probs, nbest_answers = postprocess_bert_answer(\n                results=results,\n                examples_file=examples_file,\n                features_file=features_file,\n                do_lower_case=self.do_lower_case,\n                n_best_size=n_best_size,\n                max_answer_length=max_answer_length,\n                unanswerable_exists=unanswerable_exists,\n                output_prediction_file=output_prediction_file,\n                output_nbest_file=output_nbest_file,\n                output_null_log_odds_file=output_null_log_odds_file,\n                null_score_diff_threshold=null_score_diff_threshold,\n                verbose_logging=verbose_logging,\n            )\n        return final_answers, answer_probs, nbest_answers\n\n\nQAResult_ = collections.namedtuple(\n    \"QAResult\", [\"unique_id\", \"start_logits\", \"end_logits\"]\n)\n\n\n# create a wrapper class so that we can add docstrings\nclass 
QAResult(QAResult_):\n    \"\"\"\n    Question answering prediction result returned by AnswerExtractor.predict\n        for BERT models.\n\n    Args:\n        unique_id (int): An id identifying a unique document-question-answer triplet.\n            During postprocessing, this id is used to map the prediction results back\n            to the original document-question-answer triplet.\n        start_logits (list): List of logits for predicting each token being the\n            start of the answer span.\n        end_logits (list): List of logits for predicting each token being the end\n            of the answer span.\n\n    \"\"\"\n\n    pass\n\n\nQAResultExtended_ = collections.namedtuple(\n    \"QAResultExtended\",\n    [\n        \"unique_id\",\n        \"start_top_log_probs\",\n        \"start_top_index\",\n        \"end_top_log_probs\",\n        \"end_top_index\",\n        \"cls_logits\",\n    ],\n)\n\n\n# create a wrapper class so that we can add docstrings\nclass QAResultExtended(QAResultExtended_):\n    \"\"\"\n    Question answering prediction result returned by AnswerExtractor.predict for\n        XLNet models.\n\n    Args:\n        unique_id (int): An id identifying a unique document-question-answer triplet.\n            During postprocessing, this id is used to map the prediction results back\n            to the original document-question-answer triplet.\n        start_top_log_probs (list): Log probabilities for the top config.start_n_top\n            start token possibilities (beam-search).\n        start_top_index (list): Indices for the top config.start_n_top start token\n            possibilities (beam-search).\n        end_top_log_probs (list): Log probabilities for the top ``config.start_n_top *\n            config.end_n_top`` end token possibilities (beam-search).\n        end_top_index (list): Indices for the top\n            ``config.start_n_top * config.end_n_top`` end token possibilities\n            (beam-search).\n        cls_logits (float): 
Log probabilities for the ``is_impossible`` label\n            of the answers.\n\n    \"\"\"\n\n    pass\n\n\nclass AnswerExtractor(Transformer):\n    \"\"\"\n    Answer extractor based on pre-trained transformers models.\n\n    Args:\n        model_name (model_name, optional): Name of the pre-trained transformers model.\n            Call AnswerExtractor.list_supported_models() to see all the\n                models supported. Defaults to \"bert-base-cased\".\n        cache_dir (str, optional):  Location of BERT's cache directory.\n            When calling the `fit` method, if `cache_model` is `True`, the fine-tuned\n            model is saved to a `fine_tuned` folder under this directory.\n            Defaults to \".\".\n        load_model_from_dir (str, optional): Directory to load the model from.\n            The directory must contain a model file \"pytorch_model.bin\" and a\n            configuration file \"config.json\".\n            Defaults to None.\n\n    \"\"\"\n\n    def __init__(\n        self, model_name=\"bert-base-cased\", cache_dir=\".\", load_model_from_dir=None\n    ):\n        model = MODEL_CLASS[model_name].from_pretrained(\n            model_name if load_model_from_dir is None else load_model_from_dir,\n            cache_dir=cache_dir,\n            output_loading_info=False,\n        )\n        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)\n\n    @staticmethod\n    def list_supported_models():\n        return _list_supported_models()\n\n    def fit(\n        self,\n        train_dataloader,\n        num_epochs=1,\n        max_steps=-1,\n        gradient_accumulation_steps=1,\n        num_gpus=None,\n        gpu_ids=None,\n        local_rank=-1,\n        weight_decay=0.0,\n        learning_rate=5e-5,\n        adam_epsilon=1e-8,\n        warmup_steps=0,\n        fp16=False,\n        fp16_opt_level=\"O1\",\n        checkpoint_state_dict=None,\n        verbose=True,\n        seed=None,\n        cache_model=True,\n    
):\n        \"\"\"\n        Fine-tune pre-trained transformer models for question answering.\n\n        Args:\n            train_dataloader (Dataloader): A PyTorch DataLoader to be used for training.\n            num_epochs (int, optional): Number of training epochs. Defaults to 1.\n            max_steps (int, optional): Total number of training steps.\n                If set to a positive value, it overrides num_epochs.\n                Otherwise, it's determined by the dataset length,\n                    gradient_accumulation_steps, and num_epochs.\n                Defaults to -1.\n            gradient_accumulation_steps (int, optional): Number of steps to accumulate\n                before performing a backward/update pass.\n                Defaults to 1.\n            num_gpus (int, optional): The number of GPUs to use.\n                If None, all available GPUs will be used.\n                If set to 0 or GPUs are not available, CPU device will be used.\n                Defaults to None.\n            gpu_ids (list): List of GPU IDs to be used.\n                If set to None, the first num_gpus GPUs will be used.\n                Defaults to None.\n            local_rank (int, optional): Local_rank for distributed training on GPUs.\n                Defaults to -1, which means non-distributed training.\n            weight_decay (float, optional): Weight decay to apply after each\n                parameter update. 
Defaults to 0.0.\n            learning_rate (float, optional):  Learning rate of the AdamW optimizer.\n                Defaults to 5e-5.\n            adam_epsilon (float, optional): Epsilon of the AdamW optimizer.\n                Defaults to 1e-8.\n            warmup_steps (int, optional): Number of steps taken to increase\n                learning rate from 0 to `learning rate`.\n                Defaults to 0.\n            fp16 (bool): Whether to use 16-bit mixed precision through Apex\n                Defaults to False\n            fp16_opt_level (str): Apex AMP optimization level for fp16.\n                One of ['O0', 'O1', 'O2', 'O3'].\n                See https://nvidia.github.io/apex/amp.html\n                Defaults to \"O1\"\n            checkpoint_state_dict (dict): Checkpoint states of model and optimizer.\n                If specified, the model and optimizer's parameters are loaded using\n                checkpoint_state_dict[\"model\"] and checkpoint_state_dict[\"optimizer\"]\n                Defaults to None.\n            verbose (bool, optional): Whether to print out the training log.\n                Defaults to True.\n            seed (int, optional): Random seed used to improve reproducibility.\n                Defaults to None.\n            cache_model (bool, optional): Whether to save the fine-tuned model.\n                If True, the fine-tuned model is saved to a `fine_tuned` folder\n                    under the `cache_dir` of AnswerExtractor.\n                Defaults to True.\n\n        \"\"\"\n\n        # init device and optimizer\n        device, num_gpus, amp = self.prepare_model_and_optimizer(\n            num_gpus=num_gpus,\n            gpu_ids=gpu_ids,\n            local_rank=local_rank,\n            weight_decay=weight_decay,\n            learning_rate=learning_rate,\n            adam_epsilon=adam_epsilon,\n            fp16=fp16,\n            fp16_opt_level=fp16_opt_level,\n            
checkpoint_state_dict=checkpoint_state_dict,\n        )\n\n        # compute the max number of training steps\n        max_steps = compute_training_steps(\n            train_dataloader,\n            num_epochs=num_epochs,\n            max_steps=max_steps,\n            gradient_accumulation_steps=gradient_accumulation_steps,\n        )\n\n        # init scheduler\n        scheduler = Transformer.get_default_scheduler(\n            optimizer=self.optimizer,\n            warmup_steps=warmup_steps,\n            num_training_steps=max_steps,\n        )\n\n        # fine tune\n        super().fine_tune(\n            train_dataloader=train_dataloader,\n            get_inputs=QAProcessor.get_inputs,\n            device=device,\n            num_gpus=num_gpus,\n            max_steps=max_steps,\n            gradient_accumulation_steps=gradient_accumulation_steps,\n            optimizer=self.optimizer,\n            scheduler=scheduler,\n            fp16=fp16,\n            amp=amp,\n            local_rank=local_rank,\n            verbose=verbose,\n            seed=seed,\n        )\n\n        if cache_model:\n            self.save_model()\n\n    def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True):\n\n        \"\"\"\n        Predicts answer start and end logits.\n\n        Args:\n            test_dataloader (DataLoader): DataLoader for scoring the data.\n            num_gpus (int, optional): The number of GPUs to use.\n                If None, all available GPUs will be used.\n                If set to 0 or GPUs are not available, CPU device will be used.\n                Defaults to None.\n            gpu_ids (list): List of GPU IDs to be used.\n                If set to None, the first num_gpus GPUs will be used.\n                Defaults to None.\n            verbose (bool, optional): Whether to print out the predicting log.\n                Defaults to True.\n\n        Returns:\n            list: List of :class:`QAResult` or 
:class:`QAResultExtended`.\n        \"\"\"\n\n        def _to_list(tensor):\n            return tensor.detach().cpu().tolist()\n\n        # get device\n        device, num_gpus = get_device(num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=-1)\n\n        # move model\n        self.model = move_model_to_device(model=self.model, device=device)\n\n        # parallelize model\n        self.model = parallelize_model(\n            model=self.model,\n            device=device,\n            num_gpus=num_gpus,\n            gpu_ids=gpu_ids,\n            local_rank=-1,\n        )\n\n        all_results = []\n        for batch in tqdm(test_dataloader, desc=\"Evaluating\", disable=not verbose):\n            with torch.no_grad():\n                inputs = QAProcessor.get_inputs(\n                    batch, device, self.model_name, train_mode=False\n                )\n                outputs = self.model(**inputs)\n                unique_id_tensor = batch[5]\n\n            for i, u_id in enumerate(unique_id_tensor):\n                if self.model_type in [\"xlnet\"]:\n                    result = QAResultExtended(\n                        unique_id=u_id.item(),\n                        start_top_log_probs=_to_list(outputs[0][i]),\n                        start_top_index=_to_list(outputs[1][i]),\n                        end_top_log_probs=_to_list(outputs[2][i]),\n                        end_top_index=_to_list(outputs[3][i]),\n                        cls_logits=_to_list(outputs[4][i]),\n                    )\n                else:\n                    result = QAResult(\n                        unique_id=u_id.item(),\n                        start_logits=_to_list(outputs[0][i]),\n                        end_logits=_to_list(outputs[1][i]),\n                    )\n                all_results.append(result)\n            torch.cuda.empty_cache()\n\n        return all_results\n\n\ndef postprocess_bert_answer(\n    results,\n    examples_file,\n    features_file,\n    do_lower_case,\n    
unanswerable_exists=False,\n    n_best_size=20,\n    max_answer_length=30,\n    output_prediction_file=\"./qa_predictions.json\",\n    output_nbest_file=\"./nbest_predictions.json\",\n    output_null_log_odds_file=\"./null_odds.json\",\n    null_score_diff_threshold=0.0,\n    verbose_logging=False,\n):\n    \"\"\"\n    Postprocesses start and end logits\n    generated by :meth:`AnswerExtractor.fit` for BERT.\n\n    Args:\n        results (list): List of :class:`QAResult`.\n        examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.\n            This file contains the original document tokens that are used to generate\n            the final answers from the predicted start and end positions.\n        features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.\n            This file contains the mapping from indices in the processed token list\n            to the original document tokens that are used to generate the final\n            predicted answers.\n        do_lower_case (bool): Whether an uncased tokenizer was used during\n            data preprocessing. This is required during answer finalization\n            by comparing the predicted answer text and the original\n            text span in :func:`_get_final_text`.\n        unanswerable_exists (bool, optional): Whether there are unanswerable\n            questions in the data. If True, the start and end logits of the [CLS]\n            token, which indicate the probability of the answer being empty,\n            are included in the candidate answer list.\n            Defaults to False.\n        n_best_size (int, optional): The number of candidates to choose from each\n            QAResult to generate the final prediction. It's also the maximum number\n            of n-best answers to output for each question.\n            Note that the number of n-best answers can be smaller than `n_best_size`\n            because some unqualified answers,\n            e.g. 
answer that are too long, are removed.\n        max_answer_length (int, optional): Maximum length of the answer. Defaults to 30.\n        output_prediction_file (str, optional): Path of the file to save the\n            predicted answers. Defaults to \"./qa_predictions.json\".\n        output_nbest_file (str, optional): Path of the file to save the n-best answers.\n            Defaults to \"./nbest_predictions.json\".\n        output_null_log_odds_file (str, optional): If unanswerable_exists is True,\n            the score difference between empty prediction and best non-empty prediction\n            are saved to this file. These scores can be used to find the best threshold\n            for predicting an empty answer. Defaults to \"./null_odds.json\".\n        null_score_diff_threshold (float, optional): If unanswerable_exists=True\n            and the score difference between empty prediction and best non-empty\n            prediction is higher than this threshold, the final predicted\n            answer is empty.\n            Defaults to 0.0.\n        verbose_logging (bool, optional): Whether to log details of\n            answer postprocessing. Defaults to False.\n\n    Returns:\n        tuple: (OrderedDict, OrderedDict, OrderedDict)\n            The keys of the dictionaries are the `qa_id` in the original\n            :class:`utils_nlp.dataset.pytorch.QADataset`\n            The values of the first dictionary are the predicted answer texts\n            in string type. The values of the second dictionary are the softmax\n            probabilities of the predicted answers.\n            The values of the third dictionary are the n-best answers for each qa_id.\n            Note that the number of n-best answers can be smaller than `n_best_size`\n            because some unqualified answers, e.g. 
answers that are too long,\n            are removed.\n\n    \"\"\"\n    with jsonlines.open(examples_file) as reader:\n        examples_all = list(reader.iter())\n\n    with jsonlines.open(features_file) as reader:\n        features_all = list(reader.iter())\n\n    qa_id_to_features = collections.defaultdict(list)\n    # Map unique features to the original doc-question-answer triplet\n    # Each doc-question-answer triplet can have multiple features because the doc\n    # could be split into multiple spans\n    for f in features_all:\n        qa_id_to_features[f[\"qa_id\"]].append(f)\n\n    unique_id_to_result = {}\n    for r in results:\n        unique_id_to_result[r.unique_id] = r\n\n    all_predictions = collections.OrderedDict()\n    all_probs = collections.OrderedDict()\n    all_nbest_json = collections.OrderedDict()\n    scores_diff_json = collections.OrderedDict()\n\n    for example in examples_all:\n        # get all the features belonging to the same example,\n        # i.e. paragaraph/question pair.\n        features = qa_id_to_features[example[\"qa_id\"]]\n\n        prelim_predictions = []\n        # keep track of the minimum score of null start+end of position 0\n        score_null = 1000000  # large and positive\n\n        min_null_feature_index = 0  # the paragraph slice with min null score\n        null_start_logit = 0  # the start logit at the slice with min null score\n        null_end_logit = 0  # the end logit at the slice with min null score\n        for (feature_index, f) in enumerate(features):\n            result = unique_id_to_result[f[\"unique_id\"]]\n            start_indexes = _get_best_indexes(result.start_logits, n_best_size)\n            end_indexes = _get_best_indexes(result.end_logits, n_best_size)\n            # if we could have irrelevant answers, get the min score of irrelevant\n            if unanswerable_exists:\n                # The first element of the start end end logits is the\n                # probability of predicting 
the [CLS] token as the start and\n                # end positions of the answer, which means the answer is\n                # empty.\n                feature_null_score = result.start_logits[0] + result.end_logits[0]\n                if feature_null_score < score_null:\n                    score_null = feature_null_score\n                    min_null_feature_index = feature_index\n                    null_start_logit = result.start_logits[0]\n                    null_end_logit = result.end_logits[0]\n            for start_index in start_indexes:\n                for end_index in end_indexes:\n                    # We could hypothetically create invalid predictions, e.g., predict\n                    # that the start of the span is in the question. We throw out all\n                    # invalid predictions.\n                    if start_index >= len(f[\"tokens\"]):\n                        continue\n                    if end_index >= len(f[\"tokens\"]):\n                        continue\n                    if str(start_index) not in f[\"token_to_orig_map\"]:\n                        continue\n                    if str(end_index) not in f[\"token_to_orig_map\"]:\n                        continue\n                    if not f[\"token_is_max_context\"].get(str(start_index), False):\n                        continue\n                    if end_index < start_index:\n                        continue\n                    length = end_index - start_index + 1\n                    if length > max_answer_length:\n                        continue\n                    prelim_predictions.append(\n                        _PrelimPrediction(\n                            feature_index=feature_index,\n                            start_index=start_index,\n                            end_index=end_index,\n                            start_logit=result.start_logits[start_index],\n                            end_logit=result.end_logits[end_index],\n                        )\n          
          )\n        if unanswerable_exists:\n            prelim_predictions.append(\n                _PrelimPrediction(\n                    feature_index=min_null_feature_index,\n                    start_index=0,\n                    end_index=0,\n                    start_logit=null_start_logit,\n                    end_logit=null_end_logit,\n                )\n            )\n\n        # Sort by the sum of the start and end logits in ascending order,\n        # so that the first element is the most probable answer\n        prelim_predictions = sorted(\n            prelim_predictions,\n            key=lambda x: (x.start_logit + x.end_logit),\n            reverse=True,\n        )\n\n        seen_predictions = {}\n        nbest = []\n        for pred in prelim_predictions:\n            if len(nbest) >= n_best_size:\n                break\n            f = features[pred.feature_index]\n            if pred.start_index > 0:  # this is a non-null prediction\n                tok_tokens = f[\"tokens\"][pred.start_index : (pred.end_index + 1)]\n                orig_doc_start = f[\"token_to_orig_map\"][str(pred.start_index)]\n                orig_doc_end = f[\"token_to_orig_map\"][str(pred.end_index)]\n                orig_tokens = example[\"doc_tokens\"][orig_doc_start : (orig_doc_end + 1)]\n                tok_text = \" \".join(tok_tokens)\n\n                # De-tokenize WordPieces that have been split off.\n                tok_text = tok_text.replace(\" ##\", \"\")\n                tok_text = tok_text.replace(\"##\", \"\")\n\n                # Clean whitespace\n                tok_text = tok_text.strip()\n                tok_text = \" \".join(tok_text.split())\n                orig_text = \" \".join(orig_tokens)\n\n                final_text = _get_final_text(\n                    tok_text, orig_text, do_lower_case, verbose_logging\n                )\n                if final_text in seen_predictions:\n                    continue\n\n                
seen_predictions[final_text] = True\n            else:\n                final_text = \"\"\n                seen_predictions[final_text] = True\n\n            nbest.append(\n                _NbestPrediction(\n                    text=final_text,\n                    start_logit=pred.start_logit,\n                    end_logit=pred.end_logit,\n                )\n            )\n        # if we didn't include the empty option in the n-best, include it\n        if unanswerable_exists:\n            if \"\" not in seen_predictions:\n                nbest.append(\n                    _NbestPrediction(\n                        text=\"\", start_logit=null_start_logit, end_logit=null_end_logit\n                    )\n                )\n\n            # In very rare edge cases we could only have single null prediction.\n            # So we just create a nonce prediction in this case to avoid failure.\n            if len(nbest) == 1:\n                nbest.insert(\n                    0, _NbestPrediction(text=\"empty\", start_logit=0.0, end_logit=0.0)\n                )\n\n        # In very rare edge cases we could have no valid predictions. 
So we\n        # just create a nonce prediction in this case to avoid failure.\n        if not nbest:\n            nbest.append(_NbestPrediction(text=\"empty\", start_logit=0.0, end_logit=0.0))\n\n        assert len(nbest) >= 1\n\n        total_scores = []\n        best_non_null_entry = None\n        for ie, entry in enumerate(nbest):\n            total_scores.append(entry.start_logit + entry.end_logit)\n            if not best_non_null_entry:\n                if entry.text:\n                    best_non_null_entry = entry\n                    best_non_null_entry_index = ie\n\n        probs = _compute_softmax(total_scores)\n\n        nbest_json = []\n        for (i, entry) in enumerate(nbest):\n            output = collections.OrderedDict()\n            output[\"text\"] = entry.text\n            output[\"probability\"] = probs[i]\n            output[\"start_logit\"] = entry.start_logit\n            output[\"end_logit\"] = entry.end_logit\n            nbest_json.append(output)\n\n            if entry.text == \"\":\n                null_prediction_index = i\n\n        assert len(nbest_json) >= 1\n\n        if not unanswerable_exists:\n            all_predictions[example[\"qa_id\"]] = nbest_json[0][\"text\"]\n            all_probs[example[\"qa_id\"]] = nbest_json[0][\"probability\"]\n        else:\n            # predict \"\" iff the null score - the score of best non-null > threshold\n            score_diff = (\n                score_null\n                - best_non_null_entry.start_logit\n                - (best_non_null_entry.end_logit)\n            )\n            scores_diff_json[example[\"qa_id\"]] = score_diff\n            if score_diff > null_score_diff_threshold:\n                all_predictions[example[\"qa_id\"]] = \"\"\n                # TODO: double check this\n                all_probs[example[\"qa_id\"]] = probs[null_prediction_index]\n            else:\n                all_predictions[example[\"qa_id\"]] = best_non_null_entry.text\n                
all_probs[example[\"qa_id\"]] = probs[best_non_null_entry_index]\n        all_nbest_json[example[\"qa_id\"]] = nbest_json\n\n    \"\"\"Write final predictions to the json file and log-odds of null if needed.\"\"\"\n    logger.info(\"Writing predictions to: %s\" % (output_prediction_file))\n    logger.info(\"Writing nbest to: %s\" % (output_nbest_file))\n\n    with open(output_prediction_file, \"w\") as writer:\n        writer.write(json.dumps(all_predictions, indent=4) + \"\\n\")\n\n    with open(output_nbest_file, \"w\") as writer:\n        writer.write(json.dumps(all_nbest_json, indent=4) + \"\\n\")\n\n    if unanswerable_exists:\n        logger.info(\"Writing null odds to: %s\" % (output_null_log_odds_file))\n        with open(output_null_log_odds_file, \"w\") as writer:\n            writer.write(json.dumps(scores_diff_json, indent=4) + \"\\n\")\n\n    return all_predictions, all_probs, all_nbest_json\n\n\ndef postprocess_xlnet_answer(\n    results,\n    examples_file,\n    features_file,\n    tokenizer,\n    n_best_size=20,\n    n_top_start=5,\n    n_top_end=5,\n    max_answer_length=30,\n    unanswerable_exists=False,\n    output_prediction_file=\"./qa_predictions.json\",\n    output_nbest_file=\"./nbest_predictions.json\",\n    output_null_log_odds_file=\"./null_odds.json\",\n    verbose_logging=False,\n):\n    \"\"\"\n    Postprocesses start and end logits generated by :meth:`AnswerExtractor.fit`\n        for XLNet.\n\n    Args:\n        results (list): List of :class:`QAResultExtended`.\n        examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.\n            This file contains the original document tokens that are used to generate\n            the final answers from the predicted start and end positions.\n        features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.\n            This file contains the mapping from indices in the processed token list to\n            the original document tokens that are 
used to generate the final\n            predicted answers.\n        tokenizer (XLNetTokenizer): Tokenizer used during data preprocessing.\n        n_best_size (int, optional): The number of candidates to choose from each\n            QAResult to generate the final prediction. It's also the maximum number\n            of n-best answers to output for each question.\n            Note that the number of n-best answers can be smaller than `n_best_size`\n            because some unqualified answers, e.g. answer that are too long are removed.\n        n_top_start (int, optional): Beam size for span start. Note that this needs to\n            be consistent with the XLNet model configuration. Defaults to 5.\n        n_top_end (int, optional): Beam size for span end. Note that this needs to be\n            consistent with the XLNet model configuration. Defaults to 5.\n        max_answer_length (int, optional): Maximum length of the answer. Defaults to 30.\n        unanswerable_exists (bool, optional): Whether there are unanswerable questions\n            in the data. If True, the start and end logits of the [CLS] token, which\n            indicate the probability of the answer being empty, are included in the\n            candidate answer list.\n            Defaults to False.\n        output_prediction_file (str, optional): Path of the file to save the\n            predicted answers. Defaults to \"./qa_predictions.json\".\n        output_nbest_file (str, optional): Path of the file to save the n-best answers.\n            Defaults to \"./nbest_predictions.json\".\n        output_null_log_odds_file (str, optional): If unanswerable_exists is True,\n            the score difference between empty prediction and best non-empty prediction\n            are saved to this file. These scores can be used to find the best threshold\n            for predicting an empty answer. 
Defaults to \"./null_odds.json\".\n        verbose_logging (bool, optional): Whether to log details of answer\n            postprocessing. Defaults to False.\n\n    Returns:\n        tuple: (OrderedDict, OrderedDict, OrderedDict)\n            The keys of the dictionaries are the `qa_id` in the original\n            :class:`utils_nlp.dataset.pytorch.QADataset`\n            The values of the first dictionary are the predicted answer texts in\n                string type.\n            The values of the second dictionary are the softmax probabilities of\n                the predicted answers.\n            The values of the third dictionary are the n-best answers for each qa_id.\n            Note that the number of n-best answers can be smaller than `n_best_size`\n            because some unqualified answers, e.g. answers that are too\n            long are removed.\n\n    \"\"\"\n    with jsonlines.open(examples_file) as reader:\n        examples_all = list(reader.iter())\n\n    with jsonlines.open(features_file) as reader:\n        features_all = list(reader.iter())\n\n    qa_id_to_features = collections.defaultdict(list)\n    # Map unique features to the original doc-question-answer triplet\n    # Each doc-question-answer triplet can have multiple features because the doc\n    # could be split into multiple spans\n    for f in features_all:\n        qa_id_to_features[f[\"qa_id\"]].append(f)\n\n    unique_id_to_result = {}\n    for r in results:\n        unique_id_to_result[r.unique_id] = r\n\n    all_predictions = collections.OrderedDict()\n    all_probs = collections.OrderedDict()\n    all_nbest_json = collections.OrderedDict()\n    scores_diff_json = collections.OrderedDict()\n\n    for example in examples_all:\n        features = qa_id_to_features[example[\"qa_id\"]]\n\n        prelim_predictions = []\n        # keep track of the minimum score of null start+end of position 0\n        score_null = 1000000  # large and positive\n\n        for (feature_index, feature) 
in enumerate(features):\n            result = unique_id_to_result[feature[\"unique_id\"]]\n\n            cur_null_score = result.cls_logits\n\n            # if we could have irrelevant answers, get the min score of irrelevant\n            score_null = min(score_null, cur_null_score)\n\n            for i in range(n_top_start):\n                for j in range(n_top_end):\n                    start_log_prob = result.start_top_log_probs[i]\n                    start_index = result.start_top_index[i]\n\n                    j_index = i * n_top_end + j\n\n                    end_log_prob = result.end_top_log_probs[j_index]\n                    end_index = result.end_top_index[j_index]\n\n                    # We could hypothetically create invalid predictions, e.g., predict\n                    # that the start of the span is in the question. We throw out all\n                    # invalid predictions.\n                    if start_index >= feature[\"paragraph_len\"] - 1:\n                        continue\n                    if end_index >= feature[\"paragraph_len\"] - 1:\n                        continue\n\n                    if not feature[\"token_is_max_context\"].get(str(start_index), False):\n                        continue\n                    if end_index < start_index:\n                        continue\n                    length = end_index - start_index + 1\n                    if length > max_answer_length:\n                        continue\n\n                    prelim_predictions.append(\n                        _PrelimPrediction(\n                            feature_index=feature_index,\n                            start_index=start_index,\n                            end_index=end_index,\n                            start_logit=start_log_prob,\n                            end_logit=end_log_prob,\n                        )\n                    )\n\n        prelim_predictions = sorted(\n            prelim_predictions,\n            key=lambda x: 
(x.start_logit + x.end_logit),\n            reverse=True,\n        )\n\n        seen_predictions = {}\n        nbest = []\n        for pred in prelim_predictions:\n            if len(nbest) >= n_best_size:\n                break\n            feature = features[pred.feature_index]\n\n            # XLNet un-tokenizer\n            # Let's keep it simple for now and see if we need all this later.\n            #\n            # tok_start_to_orig_index = feature.tok_start_to_orig_index\n            # tok_end_to_orig_index = feature.tok_end_to_orig_index\n            # start_orig_pos = tok_start_to_orig_index[pred.start_index]\n            # end_orig_pos = tok_end_to_orig_index[pred.end_index]\n            # paragraph_text = example.paragraph_text\n            # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()\n\n            # Previously used Bert untokenizer\n            tok_tokens = feature[\"tokens\"][pred.start_index : (pred.end_index + 1)]\n            orig_doc_start = feature[\"token_to_orig_map\"][str(pred.start_index)]\n            orig_doc_end = feature[\"token_to_orig_map\"][str(pred.end_index)]\n            orig_tokens = example[\"doc_tokens\"][orig_doc_start : (orig_doc_end + 1)]\n            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)\n\n            # Clean whitespace\n            tok_text = tok_text.strip()\n            tok_text = \" \".join(tok_text.split())\n            orig_text = \" \".join(orig_tokens)\n\n            final_text = _get_final_text(\n                tok_text, orig_text, tokenizer.do_lower_case, verbose_logging\n            )\n\n            if final_text in seen_predictions:\n                continue\n\n            seen_predictions[final_text] = True\n\n            nbest.append(\n                _NbestPrediction(\n                    text=final_text,\n                    start_logit=pred.start_logit,\n                    end_logit=pred.end_logit,\n                )\n            )\n\n        # In very rare 
edge cases we could have no valid predictions. So we\n        # just create a nonce prediction in this case to avoid failure.\n        if not nbest:\n            nbest.append(_NbestPrediction(text=\"\", start_logit=-1e6, end_logit=-1e6))\n\n        total_scores = []\n        best_non_null_entry = None\n        for ie, entry in enumerate(nbest):\n            total_scores.append(entry.start_logit + entry.end_logit)\n            if not best_non_null_entry:\n                best_non_null_entry = entry\n                best_non_null_entry_index = ie\n\n        probs = _compute_softmax(total_scores)\n\n        nbest_json = []\n        for (i, entry) in enumerate(nbest):\n            output = collections.OrderedDict()\n            output[\"text\"] = entry.text\n            output[\"probability\"] = probs[i]\n            output[\"start_logit\"] = entry.start_logit\n            output[\"end_logit\"] = entry.end_logit\n            nbest_json.append(output)\n\n        assert len(nbest_json) >= 1\n        assert best_non_null_entry is not None\n\n        score_diff = score_null\n        scores_diff_json[example[\"qa_id\"]] = score_diff\n        # note(zhiliny): always predict best_non_null_entry\n        # and the evaluation script will search for the best threshold\n        all_predictions[example[\"qa_id\"]] = best_non_null_entry.text\n\n        all_probs[example[\"qa_id\"]] = probs[best_non_null_entry_index]\n\n        all_nbest_json[example[\"qa_id\"]] = nbest_json\n\n    \"\"\"Write final predictions to the json file and log-odds of null if needed.\"\"\"\n    logger.info(\"Writing predictions to: %s\" % (output_prediction_file))\n    logger.info(\"Writing nbest to: %s\" % (output_nbest_file))\n\n    with open(output_prediction_file, \"w\") as writer:\n        writer.write(json.dumps(all_predictions, indent=4) + \"\\n\")\n\n    with open(output_nbest_file, \"w\") as writer:\n        writer.write(json.dumps(all_nbest_json, indent=4) + \"\\n\")\n\n    if 
unanswerable_exists:\n        logger.info(\"Writing null odds to: %s\" % (output_null_log_odds_file))\n        with open(output_null_log_odds_file, \"w\") as writer:\n            writer.write(json.dumps(scores_diff_json, indent=4) + \"\\n\")\n\n    return all_predictions, all_probs, all_nbest_json\n\n\n# -------------------------------------------------------------------------------------------------\n# Preprocessing helper functions\ndef _is_iterable_but_not_string(obj):\n    \"\"\"Check whether obj is a non-string Iterable.\"\"\"\n    return isinstance(obj, collections.Iterable) and not isinstance(obj, str)\n\n\ndef _create_qa_example(qa_input, is_training):\n    \"\"\" Initial preprocessing to create _QAExample for feature extraction. \"\"\"\n\n    # _QAExample is a data structure representing an unique document-question-answer\n    #   triplet.\n    # Args:\n    #     qa_id (int): An unique id identifying the document-question pair.\n    #         This is used to map prediction results to ground truth answers\n    #         during evaluation, because the data order is not preserved\n    #         during pre-processing and post-processing.\n    #     doc_tokens (list): White-space tokenized tokens of the document\n    #         text. 
This is used to generate the final answer based on\n    #         predicted start and end token indices during post-processing.\n    #     question_text (str): Text of the question.\n    #     orig_answer_text (str): Text of the ground truth answer if available.\n    #     start_position (int): Index of the starting token of the answer\n    #         span, if available.\n    #     end_position (int): Index of the ending token of the answer span,\n    #         if available.\n    #     is_impossible (bool): If the question is impossible to answer based\n    #         on the given document.\n    _QAExample = collections.namedtuple(\n        \"_QAExample\",\n        [\n            \"qa_id\",\n            \"doc_tokens\",\n            \"question_text\",\n            \"orig_answer_text\",\n            \"start_position\",\n            \"end_position\",\n            \"is_impossible\",\n        ],\n    )\n\n    def _is_whitespace(c):\n        if c == \" \" or c == \"\\t\" or c == \"\\r\" or c == \"\\n\" or ord(c) == 0x202F:\n            return True\n        return False\n\n    d_text = qa_input.doc_text\n    q_text = qa_input.question_text\n    a_start = qa_input.answer_start\n    a_text = qa_input.answer_text\n    q_id = qa_input.qa_id\n    impossible = qa_input.is_impossible\n\n    d_tokens = []\n    char_to_word_offset = []\n    prev_is_whitespace = True\n    for c in d_text:\n        if _is_whitespace(c):\n            prev_is_whitespace = True\n        else:\n            if prev_is_whitespace:\n                d_tokens.append(c)\n            else:\n                d_tokens[-1] += c\n            prev_is_whitespace = False\n        char_to_word_offset.append(len(d_tokens) - 1)\n\n    if _is_iterable_but_not_string(a_start):\n        if not _is_iterable_but_not_string(a_text):\n            raise Exception(\n                \"The answer text must be a list when answer start is a list.\"\n            )\n        if len(a_start) != 1 and is_training and not impossible:\n       
     raise Exception(\"For training, each question should have exactly 1 answer.\")\n        a_start = a_start[0]\n        a_text = a_text[0]\n\n    start_position = None\n    end_position = None\n    if is_training:\n        if not impossible:\n            answer_length = len(a_text)\n            start_position = char_to_word_offset[a_start]\n            end_position = char_to_word_offset[a_start + answer_length - 1]\n            # Only add answers where the text can be exactly recovered from the\n            # document. If this CAN'T happen it's likely due to weird Unicode\n            # stuff so we will just skip the example.\n            #\n            # Note that this means for training mode, every example is NOT\n            # guaranteed to be preserved.\n            actual_text = \" \".join(d_tokens[start_position : (end_position + 1)])\n            cleaned_answer_text = \" \".join(whitespace_tokenize(a_text))\n            if actual_text.find(cleaned_answer_text) == -1:\n                logger.warning(\n                    \"Could not find answer: '%s' vs. 
'%s'\",\n                    actual_text,\n                    cleaned_answer_text,\n                )\n                return\n        else:\n            start_position = -1\n            end_position = -1\n\n    return _QAExample(\n        qa_id=q_id,\n        doc_tokens=d_tokens,\n        question_text=q_text,\n        orig_answer_text=a_text,\n        start_position=start_position,\n        end_position=end_position,\n        is_impossible=impossible,\n    )\n\n\ndef _create_qa_features(\n    example,\n    model_type,\n    tokenizer,\n    unique_id,\n    is_training,\n    max_question_length,\n    max_seq_length,\n    doc_stride,\n    custom_tokenize=None,\n):\n    \"\"\"Extracts features for model training and scoring from document-question-answer\n        triplet.\n    \"\"\"\n\n    # _QAFeatures is a data structure representing features of a unique document\n    # span-question-answer triplet.\n    # Args:\n    #     unique_id (int): A unique id identifying the span-question-answer triplet.\n    #     qa_id (int or str): A unique id identifying the document-question-answer\n    #     sample in the original :class:`utils_nlp.dataset.pytorch.QADataset`\n    #     tokens (list): Concatenated question tokens and paragraph tokens.\n    #     token_to_orig_map (dict): A dictionary mapping token indices in the\n    #         document span back to the token indices in the original document\n    #         before document splitting.\n    #         This is needed during post-processing to generate the final\n    #         predicted answer.\n    #     token_is_max_context (dict): Dict of booleans indicating whether a\n    #         token has the maximum context in the current document span if it\n    #         appears in multiple document spans and gets multiple predicted\n    #         scores. 
We only want to consider the score with \"maximum context\".\n    #         \"Maximum context\" is defined as the *minimum* of its left and\n    #         right context.\n    #         For example:\n    #             Doc: the man went to the store and bought a gallon of milk\n    #             Span A: the man went to the\n    #             Span B: to the store and bought\n    #             Span C: and bought a gallon of\n\n    #         In the example the maximum context for 'bought' would be span C\n    #         since it has 1 left context and 3 right context, while span B\n    #         has 4 left context and 0 right context.\n    #         This is needed during post-processing to generate the final\n    #         predicted answer.\n    #     input_ids (list): List of numerical token indices corresponding to\n    #         the tokens.\n    #     input_mask (list): List of 1s and 0s indicating if a token is from\n    #         the input data or padded to conform to the maximum sequence\n    #         length. 
1 for actual token and 0 for padded token.\n    #     segment_ids (list): List of 0s and 1s indicating if a token is from\n    #         the question text (0) or paragraph text (1).\n    #     start_position (int): Index of the starting token of the answer span.\n    #     end_position (int): Index of the ending token of the answer span.\n    #     cls_index (int): Index of the CLS token.\n    #     p_mask (list): Mask with 1 for token that cannot be in the answer,\n    #     0 for token which can be in an answer.\n    #     paragraph_len (int): Number of tokens in the document span.\n    _QAFeatures = collections.namedtuple(\n        \"_QAFeatures\",\n        [\n            \"unique_id\",\n            \"qa_id\",\n            \"tokens\",\n            \"token_to_orig_map\",\n            \"token_is_max_context\",\n            \"input_ids\",\n            \"input_mask\",\n            \"segment_ids\",\n            \"start_position\",\n            \"end_position\",\n            \"cls_index\",\n            \"p_mask\",\n            \"paragraph_len\",\n        ],\n    )\n\n    if custom_tokenize:\n        tokenize_func = custom_tokenize\n    else:\n        tokenize_func = tokenizer.tokenize\n\n    def _improve_answer_span(doc_tokens, input_start, input_end, orig_answer_text):\n        \"\"\"Returns tokenized answer spans that better match the annotated answer.\"\"\"\n\n        # We first project character-based annotations to\n        # whitespace-tokenized words. But then after WordPiece tokenization, we can\n        # often find a \"better match\". For example:\n        #\n        #   Question: What year was John Smith born?\n        #   Context: The leader was John Smith (1895-1943).\n        #   Answer: 1895\n        #\n        # The original whitespace-tokenized answer will be \"(1895-1943).\". However\n        # after tokenization, our tokens will be \"( 1895 - 1943 ) .\". 
So we can match\n        # the exact answer, 1895.\n        #\n        # However, this is not always possible. Consider the following:\n        #\n        #   Question: What country is the top exporter of electronics?\n        #   Context: The Japanese electronics industry is the largest in the world.\n        #   Answer: Japan\n        #\n        # In this case, the annotator chose \"Japan\" as a character sub-span of\n        # the word \"Japanese\". Since our WordPiece tokenizer does not split\n        # \"Japanese\", we just use \"Japanese\" as the annotation. This is fairly rare,\n        # but does happen.\n        tok_answer_text = \" \".join(tokenize_func(orig_answer_text))\n\n        for new_start in range(input_start, input_end + 1):\n            for new_end in range(input_end, new_start - 1, -1):\n                text_span = \" \".join(doc_tokens[new_start : (new_end + 1)])\n                if text_span == tok_answer_text:\n                    return (new_start, new_end)\n\n        return (input_start, input_end)\n\n    def _check_is_max_context(doc_spans, cur_span_index, position):\n        \"\"\"Check if this is the 'max context' doc span for the token.\"\"\"\n\n        # Because of the sliding window approach taken to scoring documents, a single\n        # token can appear in multiple spans. E.g.\n        #  Doc: the man went to the store and bought a gallon of milk\n        #  Span A: the man went to the\n        #  Span B: to the store and bought\n        #  Span C: and bought a gallon of\n        #  ...\n        #\n        # Now the word 'bought' will have two scores from spans B and C. 
We only\n        # want to consider the score with \"maximum context\", which we define as\n        # the *minimum* of its left and right context (the *sum* of left and\n        # right context will always be the same, of course).\n        #\n        # In the example the maximum context for 'bought' would be span C since\n        # it has 1 left context and 3 right context, while span B has 4 left context\n        # and 0 right context.\n        best_score = None\n        best_span_index = None\n        for (span_index, doc_span) in enumerate(doc_spans):\n            end = doc_span.start + doc_span.length - 1\n            if position < doc_span.start:\n                continue\n            if position > end:\n                continue\n            num_left_context = position - doc_span.start\n            num_right_context = end - position\n            score = min(num_left_context, num_right_context) + 0.01 * doc_span.length\n            if best_score is None or score > best_score:\n                best_score = score\n                best_span_index = span_index\n\n        return cur_span_index == best_span_index\n\n    cls_token = \"[CLS]\"\n    sep_token = \"[SEP]\"\n    pad_token = 0\n    sequence_a_segment_id = 0\n    sequence_b_segment_id = 1\n    mask_padding_with_zero = True\n\n    if model_type == \"xlnet\":\n        cls_token_segment_id = 2\n        # Should this be 4, or it doesn't matter?\n        pad_token_segment_id = 3\n        cls_token_at_end = True\n    else:\n        cls_token_segment_id = 0\n        pad_token_segment_id = 0\n        cls_token_at_end = False\n\n    qa_features = []\n\n    # unique_id identified unique feature/label pairs. 
It's different\n    # from qa_id in that each qa_example can be broken down into\n    # multiple feature samples if the paragraph length is longer than\n    # maximum sequence length allowed\n    query_tokens = tokenize_func(example.question_text)\n\n    if len(query_tokens) > max_question_length:\n        query_tokens = query_tokens[0:max_question_length]\n    # map word-piece tokens to original tokens\n    tok_to_orig_index = []\n    # map original tokens to corresponding word-piece tokens\n    orig_to_tok_index = []\n    all_doc_tokens = []\n    for (i, token) in enumerate(example.doc_tokens):\n        orig_to_tok_index.append(len(all_doc_tokens))\n        sub_tokens = tokenize_func(token)\n        for sub_token in sub_tokens:\n            tok_to_orig_index.append(i)\n            all_doc_tokens.append(sub_token)\n\n    tok_start_position = None\n    tok_end_position = None\n    if is_training and example.is_impossible:\n        tok_start_position = -1\n        tok_end_position = -1\n    if is_training and not example.is_impossible:\n        tok_start_position = orig_to_tok_index[example.start_position]\n        if example.end_position < len(example.doc_tokens) - 1:\n            # +1: move the the token after the ending token in\n            # original tokens\n            # -1, moves one step back\n            # these two operations ensures word piece is covered\n            # when it's part of the original ending token.\n            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1\n        else:\n            tok_end_position = len(all_doc_tokens) - 1\n        (tok_start_position, tok_end_position) = _improve_answer_span(\n            all_doc_tokens,\n            tok_start_position,\n            tok_end_position,\n            example.orig_answer_text,\n        )\n\n    # The -3 accounts for [CLS], [SEP] and [SEP]\n    max_tokens_for_doc = max_seq_length - len(query_tokens) - 3\n\n    # We can have documents that are longer than the maximum 
sequence length.\n    # To deal with this we do a sliding window approach, where we take chunks\n    # of the up to our max length with a stride of `doc_stride`.\n    _DocSpan = collections.namedtuple(\"DocSpan\", [\"start\", \"length\"])\n    doc_spans = []\n    start_offset = 0\n    while start_offset < len(all_doc_tokens):\n        length = len(all_doc_tokens) - start_offset\n        if length > max_tokens_for_doc:\n            length = max_tokens_for_doc\n        doc_spans.append(_DocSpan(start=start_offset, length=length))\n        if start_offset + length == len(all_doc_tokens):\n            break\n        start_offset += min(length, doc_stride)\n\n    for (doc_span_index, doc_span) in enumerate(doc_spans):\n        if is_training:\n            unique_id += 1\n        else:\n            unique_id += 2\n\n        tokens = []\n        token_to_orig_map = {}\n        token_is_max_context = {}\n        segment_ids = []\n\n        # p_mask: mask with 1 for token than cannot be in the answer\n        # (0 for token which can be in an answer)\n        # Original TF implem also keep the classification token (set to 0)\n        # (not sure why...)\n        # TODO: Should we set p_mask = 1 for cls token?\n        p_mask = []\n\n        # CLS token at the beginning\n        if not cls_token_at_end:\n            tokens.append(cls_token)\n            segment_ids.append(cls_token_segment_id)\n            p_mask.append(0)\n            cls_index = 0\n\n        # XLNet: P SEP Q SEP CLS\n        # Others: CLS Q SEP P SEP\n        if model_type != \"xlnet\":\n            # Query\n            tokens += query_tokens\n            segment_ids += [sequence_a_segment_id] * len(query_tokens)\n            p_mask += [1] * len(query_tokens)\n\n            # SEP token\n            tokens.append(sep_token)\n            segment_ids.append(sequence_a_segment_id)\n            p_mask.append(1)\n\n        # Paragraph\n        for i in range(doc_span.length):\n            split_token_index = 
doc_span.start + i\n            token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]\n\n            # TODO: maybe this can be improved to compute\n            # is_max_context for each token only once.\n            is_max_context = _check_is_max_context(\n                doc_spans, doc_span_index, split_token_index\n            )\n            token_is_max_context[len(tokens)] = is_max_context\n            tokens.append(all_doc_tokens[split_token_index])\n            if model_type == \"xlnet\":\n                segment_ids.append(sequence_a_segment_id)\n            else:\n                segment_ids.append(sequence_b_segment_id)\n            p_mask.append(0)\n        paragraph_len = doc_span.length\n\n        if model_type == \"xlnet\":\n            # SEP token\n            tokens.append(sep_token)\n            segment_ids.append(sequence_a_segment_id)\n            p_mask.append(1)\n\n            tokens += query_tokens\n            segment_ids += [sequence_b_segment_id] * len(query_tokens)\n            p_mask += [1] * len(query_tokens)\n\n        # SEP token\n        tokens.append(sep_token)\n        segment_ids.append(sequence_b_segment_id)\n        p_mask.append(1)\n\n        # CLS token at the end\n        if cls_token_at_end:\n            tokens.append(cls_token)\n            segment_ids.append(cls_token_segment_id)\n            p_mask.append(0)\n            cls_index = len(tokens) - 1  # Index of classification token\n\n        input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n        # The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n        # tokens are attended to.\n        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)\n\n        # Zero-pad up to the sequence length.\n        if len(input_ids) < max_seq_length:\n            pad_token_length = max_seq_length - len(input_ids)\n            pad_mask = 0 if mask_padding_with_zero else 1\n            input_ids += [pad_token] * pad_token_length\n            input_mask += [pad_mask] * pad_token_length\n            segment_ids += [pad_token_segment_id] * pad_token_length\n            p_mask += [1] * pad_token_length\n\n        assert len(input_ids) == max_seq_length\n        assert len(input_mask) == max_seq_length\n        assert len(segment_ids) == max_seq_length\n\n        span_is_impossible = example.is_impossible\n        start_position = None\n        end_position = None\n        if is_training and not span_is_impossible:\n            # For training, if our document chunk does not contain an annotation\n            # we throw it out, since there is nothing to predict.\n            doc_start = doc_span.start\n            doc_end = doc_span.start + doc_span.length - 1\n            out_of_span = False\n            if not (tok_start_position >= doc_start and tok_end_position <= doc_end):\n                out_of_span = True\n            if out_of_span:\n                start_position = 0\n                end_position = 0\n                span_is_impossible = True\n            else:\n                # +1 for [CLS] token\n                # +1 for [SEP] token\n                if model_type == \"xlnet\":\n                    doc_offset = 0\n                else:\n                    doc_offset = len(query_tokens) + 2\n                start_position = tok_start_position - doc_start + doc_offset\n                end_position = tok_end_position - doc_start + doc_offset\n\n        if is_training and span_is_impossible:\n            start_position = cls_index\n            end_position = cls_index\n\n        
qa_features.append(\n            _QAFeatures(\n                unique_id=unique_id,\n                qa_id=example.qa_id,\n                tokens=tokens,\n                token_to_orig_map=token_to_orig_map,\n                token_is_max_context=token_is_max_context,\n                input_ids=input_ids,\n                input_mask=input_mask,\n                segment_ids=segment_ids,\n                start_position=start_position,\n                end_position=end_position,\n                cls_index=cls_index,\n                p_mask=p_mask,\n                paragraph_len=paragraph_len,\n            )\n        )\n\n        return qa_features\n\n\n# Preprocessing helper functions end\n\n# -------------------------------------------------------------------------------------------------\n# Post processing helper functions\n_PrelimPrediction = collections.namedtuple(\n    \"PrelimPrediction\",\n    [\"feature_index\", \"start_index\", \"end_index\", \"start_logit\", \"end_logit\"],\n)\n\n_NbestPrediction = collections.namedtuple(\n    \"NbestPrediction\", [\"text\", \"start_logit\", \"end_logit\"]\n)\n\n\ndef _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):\n    \"\"\"Project the tokenized prediction back to the original text.\"\"\"\n\n    # When we created the data, we kept track of the alignment between original\n    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. 
So\n    # now `orig_text` contains the span of our original text corresponding to the\n    # span that we predicted.\n    #\n    # However, `orig_text` may contain extra characters that we don't want in\n    # our prediction.\n    #\n    # For example, let's say:\n    #   pred_text = steve smith\n    #   orig_text = Steve Smith's\n    #\n    # We don't want to return `orig_text` because it contains the extra \"'s\".\n    #\n    # We don't want to return `pred_text` because it's already been normalized\n    # (the SQuAD eval script also does punctuation stripping/lower casing but\n    # our tokenizer does additional normalization like stripping accent\n    # characters).\n    #\n    # What we really want to return is \"Steve Smith\".\n    #\n    # Therefore, we have to apply a semi-complicated alignment heuristic between\n    # `pred_text` and `orig_text` to get a character-to-character alignment. This\n    # can fail in certain cases in which case we just return `orig_text`.\n\n    def _strip_spaces(text):\n        ns_chars = []\n        ns_to_s_map = collections.OrderedDict()\n        for (i, c) in enumerate(text):\n            if c == \" \":\n                continue\n            ns_to_s_map[len(ns_chars)] = i\n            ns_chars.append(c)\n        ns_text = \"\".join(ns_chars)\n        return (ns_text, ns_to_s_map)\n\n    # We first tokenize `orig_text`, strip whitespace from the result\n    # and `pred_text`, and check if they are the same length. If they are\n    # NOT the same length, the heuristic has failed. 
If they are the same\n    # length, we assume the characters are one-to-one aligned.\n    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)\n\n    tok_text = \" \".join(tokenizer.tokenize(orig_text))\n\n    start_position = tok_text.find(pred_text)\n    if start_position == -1:\n        if verbose_logging:\n            logger.info(\"Unable to find text: '%s' in '%s'\" % (pred_text, orig_text))\n        return orig_text\n    end_position = start_position + len(pred_text) - 1\n\n    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)\n    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)\n\n    if len(orig_ns_text) != len(tok_ns_text):\n        if verbose_logging:\n            logger.info(\n                \"Length not equal after stripping spaces: '%s' vs '%s'\",\n                orig_ns_text,\n                tok_ns_text,\n            )\n        return orig_text\n\n    # We then project the characters in `pred_text` back to `orig_text` using\n    # the character-to-character alignment.\n    tok_s_to_ns_map = {}\n    for (i, tok_index) in tok_ns_to_s_map.items():\n        tok_s_to_ns_map[tok_index] = i\n\n    orig_start_position = None\n    if start_position in tok_s_to_ns_map:\n        ns_start_position = tok_s_to_ns_map[start_position]\n        if ns_start_position in orig_ns_to_s_map:\n            orig_start_position = orig_ns_to_s_map[ns_start_position]\n\n    if orig_start_position is None:\n        if verbose_logging:\n            logger.info(\"Couldn't map start position\")\n        return orig_text\n\n    orig_end_position = None\n    if end_position in tok_s_to_ns_map:\n        ns_end_position = tok_s_to_ns_map[end_position]\n        if ns_end_position in orig_ns_to_s_map:\n            orig_end_position = orig_ns_to_s_map[ns_end_position]\n\n    if orig_end_position is None:\n        if verbose_logging:\n            logger.info(\"Couldn't map end position\")\n        return orig_text\n\n    output_text = orig_text[orig_start_position : 
(orig_end_position + 1)]\n    return output_text\n\n\ndef _get_best_indexes(logits, n_best_size):\n    \"\"\"Get the n-best logits from a list.\"\"\"\n    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)\n\n    best_indexes = []\n    for i in range(len(index_and_score)):\n        if i >= n_best_size:\n            break\n        best_indexes.append(index_and_score[i][0])\n    return best_indexes\n\n\ndef _compute_softmax(scores):\n    \"\"\"Compute softmax probability over raw logits.\"\"\"\n    if not scores:\n        return []\n\n    max_score = None\n    for score in scores:\n        if max_score is None or score > max_score:\n            max_score = score\n\n    exp_scores = []\n    total_sum = 0.0\n    for score in scores:\n        x = math.exp(score - max_score)\n        exp_scores.append(x)\n        total_sum += x\n\n    probs = []\n    for score in exp_scores:\n        probs.append(score / total_sum)\n    return probs\n\n\n# Post processing helper functions end\n"
  },
  {
    "path": "utils_nlp/models/transformers/sequence_classification.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport numpy as np\nfrom transformers import (\n    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,\n    AutoConfig,\n    AutoModelForSequenceClassification,\n    AutoTokenizer,\n)\n\nfrom utils_nlp.common.pytorch_utils import compute_training_steps\nfrom utils_nlp.models.transformers.common import MAX_SEQ_LEN, Transformer\nfrom utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet\n\nsupported_models = [\n    list(x.pretrained_config_archive_map)\n    for x in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING\n]\nsupported_models = sorted([x for y in supported_models for x in y])\n\n\nclass Processor:\n    \"\"\"\n    Class for preprocessing sequence classification data.\n\n    Args:\n        model_name (str, optional): Name of the model.\n            Call SequenceClassifier.list_supported_models() to get all supported models.\n            Defaults to \"bert-base-cased\".\n        to_lower (bool, optional): Whether to convert all letters to lower case during\n            tokenization. This is determined by if a cased model is used.\n            Defaults to False, which corresponds to a cased model.\n        cache_dir (str, optional): Directory to cache the tokenizer. 
Defaults to \".\".\n        output_loading_info (bool, optional): Display tokenizer loading info if True.\n    \"\"\"\n\n    def __init__(self, model_name=\"bert-base-cased\", to_lower=False, cache_dir=\".\"):\n        self.model_name = model_name\n        self.to_lower = to_lower\n        self.cache_dir = cache_dir\n        self.tokenizer = AutoTokenizer.from_pretrained(\n            model_name,\n            do_lower_case=to_lower,\n            cache_dir=cache_dir,\n            output_loading_info=False,\n        )\n\n    @staticmethod\n    def get_inputs(batch, device, model_name, train_mode=True):\n        \"\"\"\n        Creates an input dictionary given a model name.\n\n        Args:\n            batch (tuple): A tuple containing input ids, attention mask,\n                segment ids, and labels tensors.\n            device (torch.device): A PyTorch device.\n            model_name (bool): Model name used to format the inputs.\n            train_mode (bool, optional): Training mode flag.\n                Defaults to True.\n\n        Returns:\n            dict: Dictionary containing input ids, segment ids, masks, and labels.\n                Labels are only returned when train_mode is True.\n        \"\"\"\n        batch = tuple(t.to(device) for t in batch)\n        if model_name in supported_models:\n            if train_mode:\n                inputs = {\n                    \"input_ids\": batch[0],\n                    \"attention_mask\": batch[1],\n                    \"labels\": batch[3],\n                }\n            else:\n                inputs = {\"input_ids\": batch[0], \"attention_mask\": batch[1]}\n\n            # distilbert, bart don't support segment ids\n            if model_name.split(\"-\")[0] not in [\"distilbert\", \"bart\"]:\n                inputs[\"token_type_ids\"] = batch[2]\n\n            return inputs\n        else:\n            raise ValueError(\"Model not supported: {}\".format(model_name))\n\n    @staticmethod\n    def 
text_transform(text, tokenizer, max_len=MAX_SEQ_LEN):\n        \"\"\"\n        Text transformation function for sequence classification.\n        The function can be passed to a map-style PyTorch DataSet.\n\n        Args:\n            text (str): Input text.\n            tokenizer (PreTrainedTokenizer): A pretrained tokenizer.\n            max_len (int, optional): Max sequence length. Defaults to 512.\n\n        Returns:\n            tuple: Tuple containing input ids, attention masks, and segment ids.\n        \"\"\"\n        if max_len > MAX_SEQ_LEN:\n            print(\"setting max_len to max allowed seq length: {}\".format(MAX_SEQ_LEN))\n            max_len = MAX_SEQ_LEN\n        # truncate and add CLS & SEP markers\n        tokens = tokenizer.tokenize(text)[0 : max_len - 2]\n        tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]\n\n        # get input ids\n        input_ids = tokenizer.convert_tokens_to_ids(tokens)\n        # pad sequence\n        input_ids = input_ids + [0] * (max_len - len(input_ids))\n        # create input mask\n        attention_mask = [min(1, x) for x in input_ids]\n        # create segment ids\n        token_type_ids = [0] * len(input_ids)\n\n        return input_ids, attention_mask, token_type_ids\n\n    @staticmethod\n    def text_pair_transform(text_1, text_2, tokenizer, max_len=MAX_SEQ_LEN):\n        \"\"\"\n        Text transformation function for sentence pair classification.\n        The function can be passed to a map-style PyTorch DataSet.\n\n        Args:\n            text_1 (str): Input text 1.\n            text_2 (str): Input text 2.\n            tokenizer (PreTrainedTokenizer): A pretrained tokenizer.\n            max_len (int, optional): Max sequence length. 
Defaults to 512.\n\n        Returns:\n            tuple: Tuple containing input ids, attention masks, and segment ids.\n        \"\"\"\n\n        def _truncate_seq_pair(tokens_a, tokens_b, max_length):\n            \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n            # This is a simple heuristic which will always truncate the longer\n            # sequence one token at a time. This makes more sense than\n            # truncating an equal percent of tokens from each, since if one\n            # sequence is very short then each token that's truncated likely\n            # contains more information than a longer sequence.\n\n            if not tokens_b:\n                max_length += 1\n\n            while True:\n                total_length = len(tokens_a) + len(tokens_b)\n                if total_length <= max_length:\n                    break\n                if len(tokens_a) > len(tokens_b):\n                    tokens_a.pop()\n                else:\n                    tokens_b.pop()\n\n            tokens_a.append(tokenizer.sep_token)\n\n            if tokens_b:\n                tokens_b.append(tokenizer.sep_token)\n\n            return tokens_a, tokens_b\n\n        if max_len > MAX_SEQ_LEN:\n            print(\"setting max_len to max allowed tokens: {}\".format(MAX_SEQ_LEN))\n            max_len = MAX_SEQ_LEN\n\n        tokens_1 = tokenizer.tokenize(text_1)\n\n        tokens_2 = tokenizer.tokenize(text_2)\n\n        tokens_1, tokens_2 = _truncate_seq_pair(tokens_1, tokens_2, max_len - 3)\n\n        # construct token_type_ids, prefix with [0] for [CLS]\n        # [0, 0, 0, 0, ... 0, 1, 1, 1, ... 
1]\n        token_type_ids = [0] + [0] * len(tokens_1) + [1] * len(tokens_2)\n        # pad sequence\n        token_type_ids = token_type_ids + [0] * (max_len - len(token_type_ids))\n        # merge sentences\n        tokens = [tokenizer.cls_token] + tokens_1 + tokens_2\n        # convert tokens to indices\n        input_ids = tokenizer.convert_tokens_to_ids(tokens)\n        # pad sequence\n        input_ids = input_ids + [0] * (max_len - len(input_ids))\n        # create input mask\n        attention_mask = [min(1, x) for x in input_ids]\n\n        return input_ids, attention_mask, token_type_ids\n\n    def dataset_from_dataframe(\n        self, df, text_col, label_col=None, text2_col=None, max_len=MAX_SEQ_LEN\n    ):\n        if text2_col is None:\n            return SCDataSet(\n                df,\n                text_col,\n                label_col,\n                transform=Processor.text_transform,\n                tokenizer=self.tokenizer,\n                max_len=max_len,\n            )\n        else:\n            return SPCDataSet(\n                df,\n                text_col,\n                text2_col,\n                label_col,\n                transform=Processor.text_pair_transform,\n                tokenizer=self.tokenizer,\n                max_len=max_len,\n            )\n\n\nclass SequenceClassifier(Transformer):\n    def __init__(self, model_name=\"bert-base-cased\", num_labels=2, cache_dir=\".\"):\n        config = AutoConfig.from_pretrained(\n            model_name, num_labels=num_labels, cache_dir=cache_dir\n        )\n        model = AutoModelForSequenceClassification.from_pretrained(\n            model_name, cache_dir=cache_dir, config=config, output_loading_info=False\n        )\n        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)\n\n    @staticmethod\n    def list_supported_models():\n        return supported_models\n\n    def fit(\n        self,\n        train_dataloader,\n        num_epochs=1,\n        
max_steps=-1,\n        gradient_accumulation_steps=1,\n        num_gpus=None,\n        gpu_ids=None,\n        local_rank=-1,\n        weight_decay=0.0,\n        learning_rate=5e-5,\n        adam_epsilon=1e-8,\n        warmup_steps=0,\n        fp16=False,\n        fp16_opt_level=\"O1\",\n        checkpoint_state_dict=None,\n        verbose=True,\n        seed=None,\n    ):\n        \"\"\"\n        Fine-tunes a pre-trained sequence classification model.\n\n        Args:\n            train_dataloader (Dataloader): A PyTorch DataLoader to be used for training.\n            num_epochs (int, optional): Number of training epochs. Defaults to 1.\n            max_steps (int, optional): Total number of training steps.\n                If set to a positive value, it overrides num_epochs.\n                Otherwise, it's determined by the dataset length,\n                gradient_accumulation_steps, and num_epochs.\n                Defualts to -1.\n            gradient_accumulation_steps (int, optional): Number of steps to accumulate\n                before performing a backward/update pass.\n                Default to 1.\n            num_gpus (int, optional): The number of GPUs to use.\n                If None, all available GPUs will be used.\n                If set to 0 or GPUs are not available, CPU device will be used.\n                Defaults to None.\n            gpu_ids (list): List of GPU IDs to be used.\n                If set to None, the first num_gpus GPUs will be used.\n                Defaults to None.\n            local_rank (int, optional): Local_rank for distributed training on GPUs.\n                Defaults to -1, which means non-distributed training.\n            weight_decay (float, optional): Weight decay to apply after each\n                parameter update.\n                Defaults to 0.0.\n            learning_rate (float, optional):  Learning rate of the AdamW optimizer.\n                Defaults to 5e-5.\n            adam_epsilon (float, 
optional): Epsilon of the AdamW optimizer.\n                Defaults to 1e-8.\n            warmup_steps (int, optional): Number of steps taken to increase learning\n                rate from 0 to `learning rate`. Defaults to 0.\n            fp16 (bool): Whether to use 16-bit mixed precision through Apex\n                Defaults to False\n            fp16_opt_level (str): Apex AMP optimization level for fp16.\n                One of in ['O0', 'O1', 'O2', and 'O3']\n                See https://nvidia.github.io/apex/amp.html\"\n                Defaults to \"01\"\n            checkpoint_state_dict (dict): Checkpoint states of model and optimizer.\n                If specified, the model and optimizer's parameters are loaded using\n                checkpoint_state_dict[\"model\"] and checkpoint_state_dict[\"optimizer\"]\n                Defaults to None.\n            verbose (bool, optional): Whether to print out the training log.\n                Defaults to True.\n            seed (int, optional): Random seed used to improve reproducibility.\n                Defaults to None.\n        \"\"\"\n\n        # init device and optimizer\n        device, num_gpus, amp = self.prepare_model_and_optimizer(\n            num_gpus=num_gpus,\n            gpu_ids=gpu_ids,\n            local_rank=local_rank,\n            weight_decay=weight_decay,\n            learning_rate=learning_rate,\n            adam_epsilon=adam_epsilon,\n            fp16=fp16,\n            fp16_opt_level=fp16_opt_level,\n            checkpoint_state_dict=checkpoint_state_dict,\n        )\n\n        # compute the max number of training steps\n        max_steps = compute_training_steps(\n            dataloader=train_dataloader,\n            num_epochs=num_epochs,\n            max_steps=max_steps,\n            gradient_accumulation_steps=gradient_accumulation_steps,\n        )\n\n        # init scheduler\n        scheduler = Transformer.get_default_scheduler(\n            optimizer=self.optimizer,\n            
warmup_steps=warmup_steps,\n            num_training_steps=max_steps,\n        )\n\n        # fine tune\n        super().fine_tune(\n            train_dataloader=train_dataloader,\n            get_inputs=Processor.get_inputs,\n            device=device,\n            num_gpus=num_gpus,\n            max_steps=max_steps,\n            gradient_accumulation_steps=gradient_accumulation_steps,\n            optimizer=self.optimizer,\n            scheduler=scheduler,\n            fp16=fp16,\n            amp=amp,\n            local_rank=local_rank,\n            verbose=verbose,\n            seed=seed,\n        )\n\n    def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True):\n        \"\"\"\n        Scores a dataset using a fine-tuned model and a given dataloader.\n\n        Args:\n            test_dataloader (DataLoader): DataLoader for scoring the data.\n            num_gpus (int, optional): The number of GPUs to use.\n                If None, all available GPUs will be used. If set to 0 or GPUs are\n                not available, CPU device will be used.\n                Defaults to None.\n            gpu_ids (list): List of GPU IDs to be used.\n                If set to None, the first num_gpus GPUs will be used.\n                Defaults to None.\n            verbose (bool, optional): Whether to print out the training log.\n                Defaults to True.\n\n        Returns\n            1darray: numpy array of predicted label indices.\n        \"\"\"\n\n        preds = list(\n            super().predict(\n                eval_dataloader=test_dataloader,\n                get_inputs=Processor.get_inputs,\n                num_gpus=num_gpus,\n                gpu_ids=gpu_ids,\n                verbose=verbose,\n            )\n        )\n        preds = np.concatenate(preds)\n        return np.argmax(preds, axis=1)\n"
  },
  {
    "path": "utils_nlp/models/xlnet/README.md",
    "content": "# XLNet-based Classes\n\nThis folder contains utility functions and classes based on the implementation of [Transformers](https://github.com/huggingface/transformers). \n\n## Summary\n\nThe following table summarizes each Python script.\n\n|Script|Description|\n|---|---|\n|[common.py](common.py)| This script includes <ul><li>the languages supported by XLNet-based classes</li><li> tokenization for text classification</li> <li>utilities to load data, etc.</li></ul>|\n|[sequence_classification.py](sequence_classification.py)| An implementation of sequence classification based on fine-tuning XLNet. It is commonly used for text classification. The module includes logging functionality using MLflow.|\n|[utils.py](utils.py)| This script includes a function to visualize a confusion matrix.|\n"
  },
  {
    "path": "utils_nlp/models/xlnet/common.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\n# This script reuses some code from\n# https://github.com/huggingface/transformers/blob/master/examples/utils_glue.py\nfrom enum import Enum\nfrom transformers import XLNetTokenizer\nfrom mlflow import log_metric, log_param, log_artifact\n\n\nclass Language(Enum):\n    \"\"\"\n    An enumeration of the supported pretrained models and languages.\n    \"\"\"\n\n    ENGLISHCASED = \"xlnet-base-cased\" #: Base cased model for xlnet\n    ENGLISHLARGECASED = \"xlnet-large-cased\" #: Large cased model for xlnet\n\nclass Tokenizer:\n    def __init__(\n        self, language=Language.ENGLISHCASED, cache_dir=\".\"\n    ):\n        \"\"\"Initializes the underlying pretrained XLNet tokenizer.\n\n        Args:\n            language (Language, optional): The pretrained model's language.\n                                           Defaults to Language.ENGLISHCASED\n        \"\"\"\n        self.tokenizer = XLNetTokenizer.from_pretrained(language.value, cache_dir=cache_dir)\n        self.language = language\n\n    def preprocess_classification_tokens(self, examples, max_seq_length):\n        \"\"\"Preprocessing of example input tokens:\n            - add XLNet sentence markers ([CLS] and [SEP])\n            - pad and truncate sequences\n            - create an input_mask\n            - create token type ids, aka. 
segment ids\n\n        Args:\n            examples (list): List of input strings to preprocess.\n            max_seq_length (int, optional): Maximum number of tokens\n                            (documents will be truncated or padded).\n                            Defaults to 512.\n        Returns:\n            (tuple): A tuple containing:\n                list of input ids\n                list of input mask\n                list of segment ids\n\n        \"\"\"\n        features = []\n        cls_token = self.tokenizer.cls_token\n        sep_token = self.tokenizer.sep_token\n        cls_token_segment_id=2\n        pad_on_left=True\n        pad_token_segment_id=4\n        sequence_a_segment_id=0\n        cls_token_at_end=True\n        mask_padding_with_zero=True\n        pad_token=0\n        \n        list_input_ids = []\n        list_input_mask = []\n        list_segment_ids = []\n        \n        \n        for (ex_index, example) in enumerate(examples):\n\n            tokens_a = self.tokenizer.tokenize(example)\n\n            if len(tokens_a) > max_seq_length - 2:\n                tokens_a = tokens_a[:(max_seq_length - 2)]\n\n            tokens = tokens_a + [sep_token]\n            segment_ids = [sequence_a_segment_id] * len(tokens)\n\n            if cls_token_at_end:\n                tokens = tokens + [cls_token]\n                segment_ids = segment_ids + [cls_token_segment_id]\n            else:\n                tokens = [cls_token] + tokens\n                segment_ids = [cls_token_segment_id] + segment_ids\n\n            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)\n\n\n            # The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n            # tokens are attended to.\n            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)\n\n            # Zero-pad up to the sequence length.\n            padding_length = max_seq_length - len(input_ids)\n            if pad_on_left:\n                input_ids = ([pad_token] * padding_length) + input_ids\n                input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask\n                segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids\n            else:\n                input_ids = input_ids + ([pad_token] * padding_length)\n                input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)\n                segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)\n\n            assert len(input_ids) == max_seq_length\n            assert len(input_mask) == max_seq_length\n            assert len(segment_ids) == max_seq_length\n          \n            list_input_ids.append(input_ids)\n            list_input_mask.append(input_mask)\n            list_segment_ids.append(segment_ids)\n\n#             features.append({\"input_ids\":input_ids,\"input_mask\":input_mask,\"segment_ids\":segment_ids,\"label_id\":label_id})\n        return (list_input_ids, list_input_mask, list_segment_ids)\n\ndef log_xlnet_params(local_dict):\n    \"\"\"wrapper that abstracts away logging of ipython notebook local training parameters described at definition\n    Args:\n        local_dict(dict): dict containing all local variables from notebook \n    \"\"\"\n    params = [\"DATA_FOLDER\",\"XLNET_CACHE_DIR\",\"LANGUAGE\",\"MAX_SEQ_LENGTH\",\"BATCH_SIZE\",\"NUM_GPUS\",\n              \"NUM_EPOCHS\",\"TRAIN_SIZE\",\"LABEL_COL\",\"TEXT_COL\",\"LEARNING_RATE\",\"WEIGHT_DECAY\",\n              \"ADAM_EPSILON\",\"WARMUP_STEPS\",\"DEBUG\"]\n    for i in params:\n         log_param(i,local_dict[i])\n    return"
  },
  {
    "path": "utils_nlp/models/xlnet/sequence_classification.py",
    "content": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Utilities for Xlnet Sequence Classification\"\"\"\nimport os\nfrom collections import namedtuple\n\nimport mlflow\nimport mlflow.pytorch\nimport numpy as np\nimport torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, RandomSampler, TensorDataset\nfrom tqdm import tqdm\nfrom transformers import AdamW, WarmupLinearSchedule, XLNetConfig, XLNetForSequenceClassification\n\nfrom utils_nlp.common.pytorch_utils import get_device, move_model_to_device\nfrom utils_nlp.models.xlnet.common import Language\n\n\nclass XLNetSequenceClassifier:\n    \"\"\"XLNet-based sequence classifier\"\"\"\n\n    def __init__(\n        self,\n        language=Language.ENGLISHCASED,\n        num_labels=5,\n        cache_dir=\".\",\n        num_gpus=None,\n        num_epochs=1,\n        batch_size=8,\n        lr=5e-5,\n        adam_eps=1e-8,\n        warmup_steps=0,\n        weight_decay=0.0,\n        max_grad_norm=1.0,\n    ):\n        \"\"\"Initializes the classifier and the underlying pretrained model.\n\n        Args:\n            language (Language, optional): The pretrained model's language.\n                                           Defaults to 'xlnet-base-cased'.\n            num_labels (int, optional): The number of unique labels in the\n                training data. Defaults to 5.\n            cache_dir (str, optional): Location of XLNet's cache directory.\n                Defaults to \".\".\n            num_gpus (int, optional): The number of gpus to use.\n                                      If None is specified, all available GPUs\n                                      will be used. Defaults to None.\n            num_epochs (int, optional): Number of training epochs.\n                Defaults to 1.\n            batch_size (int, optional): Training batch size. Defaults to 8.\n            lr (float): Learning rate of the Adam optimizer. 
Defaults to 5e-5.\n            adam_eps (float, optional): term added to the denominator to improve\n                                        numerical stability. Defaults to 1e-8.\n            warmup_steps (int, optional): Number of steps in which to increase\n                                        learning rate linearly from 0 to 1. Defaults to 0.\n            weight_decay (float, optional): Weight decay. Defaults to 0.\n            max_grad_norm (float, optional): Maximum norm for the gradients. Defaults to 1.0\n        \"\"\"\n\n        if num_labels < 2:\n            raise ValueError(\"Number of labels should be at least 2.\")\n\n        self.language = language\n        self.num_labels = num_labels\n        self.cache_dir = cache_dir\n\n        self.num_gpus = num_gpus\n        self.num_epochs = num_epochs\n        self.batch_size = batch_size\n        self.lr = lr\n        self.adam_eps = adam_eps\n        self.warmup_steps = warmup_steps\n        self.weight_decay = weight_decay\n        self.max_grad_norm = max_grad_norm\n\n        # create classifier\n        self.config = XLNetConfig.from_pretrained(self.language.value, num_labels=num_labels, cache_dir=cache_dir)\n        self.model = XLNetForSequenceClassification(self.config)\n\n    def fit(\n        self,\n        token_ids,\n        input_mask,\n        labels,\n        val_token_ids,\n        val_input_mask,\n        val_labels,\n        token_type_ids=None,\n        val_token_type_ids=None,\n        verbose=True,\n        logging_steps=0,\n        save_steps=0,\n        val_steps=0,\n    ):\n        \"\"\"Fine-tunes the XLNet classifier using the given training data.\n\n        Args:\n            token_ids (list): List of training token id lists.\n            input_mask (list): List of input mask lists.\n            labels (list): List of training labels.\n            token_type_ids (list, optional): List of lists. 
Each sublist\n                contains segment ids indicating if the token belongs to\n                the first sentence(0) or second sentence(1). Only needed\n                for two-sentence tasks.\n            verbose (bool, optional): If True, shows the training progress and\n                loss values. Defaults to True.\n        \"\"\"\n\n        device, num_gpus = get_device(self.num_gpus)\n        self.model = move_model_to_device(self.model, device, self.num_gpus)\n\n        token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)\n        input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)\n        labels_tensor = torch.tensor(labels, dtype=torch.long)\n\n        val_token_ids_tensor = torch.tensor(val_token_ids, dtype=torch.long)\n        val_input_mask_tensor = torch.tensor(val_input_mask, dtype=torch.long)\n        val_labels_tensor = torch.tensor(val_labels, dtype=torch.long)\n\n        if token_type_ids:\n            token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)\n            val_token_type_ids_tensor = torch.tensor(val_token_type_ids, dtype=torch.long)\n\n            train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor)\n\n            val_dataset = TensorDataset(\n                val_token_ids_tensor, val_input_mask_tensor, val_token_type_ids_tensor, val_labels_tensor,\n            )\n\n        else:\n\n            train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, labels_tensor)\n\n            val_dataset = TensorDataset(val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor)\n\n        # define optimizer and model parameters\n        param_optimizer = list(self.model.named_parameters())\n        no_decay = [\"bias\", \"LayerNorm.weight\"]\n        optimizer_grouped_parameters = [\n            {\n                \"params\": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],\n                \"weight_decay\": 
self.weight_decay,\n            },\n            {\"params\": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], \"weight_decay\": 0.0},\n        ]\n\n        val_sampler = RandomSampler(val_dataset)\n\n        val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=self.batch_size)\n\n        num_examples = len(token_ids)\n        num_batches = int(np.ceil(num_examples / self.batch_size))\n        num_train_optimization_steps = num_batches * self.num_epochs\n\n        optimizer = AdamW(optimizer_grouped_parameters, lr=self.lr, eps=self.adam_eps)\n        scheduler = WarmupLinearSchedule(\n            optimizer, warmup_steps=self.warmup_steps, t_total=num_train_optimization_steps\n        )\n\n        global_step = 0\n        self.model.train()\n        optimizer.zero_grad()\n        for epoch in range(self.num_epochs):\n\n            train_sampler = RandomSampler(train_dataset)\n\n            train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.batch_size)\n\n            tr_loss = 0.0\n            logging_loss = 0.0\n            val_loss = 0.0\n\n            for i, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")):\n                if token_type_ids:\n                    x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(t.to(device) for t in batch)\n                else:\n                    token_type_ids_batch = None\n                    x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch)\n\n                outputs = self.model(\n                    input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=y_batch,\n                )\n\n                loss = outputs[0]  # model outputs are always tuple in pytorch-transformers\n\n                loss.sum().backward()\n                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)\n\n                tr_loss += loss.sum().item()\n                
optimizer.step()\n                # Update learning rate schedule\n                scheduler.step()\n                optimizer.zero_grad()\n                global_step += 1\n                # logging of learning rate and loss\n                if logging_steps > 0 and global_step % logging_steps == 0:\n                    mlflow.log_metric(\"learning rate\", scheduler.get_lr()[0], step=global_step)\n                    mlflow.log_metric(\n                        \"training loss\", (tr_loss - logging_loss) / (logging_steps * self.batch_size), step=global_step,\n                    )\n                    logging_loss = tr_loss\n                # model checkpointing\n                if save_steps > 0 and global_step % save_steps == 0:\n                    checkpoint_dir = os.path.join(os.getcwd(), \"checkpoints\")\n                    if not os.path.isdir(checkpoint_dir):\n                        os.makedirs(checkpoint_dir)\n                    checkpoint_path = checkpoint_dir + \"/\" + str(global_step) + \".pth\"\n                    torch.save(self.model.state_dict(), checkpoint_path)\n                    mlflow.log_artifact(checkpoint_path)\n                # model validation\n                if val_steps > 0 and global_step % val_steps == 0:\n                    # run model on validation set\n                    self.model.eval()\n                    val_loss = 0.0\n                    for j, val_batch in enumerate(val_dataloader):\n                        if token_type_ids:\n                            val_x_batch, val_mask_batch, val_token_type_ids_batch, val_y_batch = tuple(\n                                t.to(device) for t in val_batch\n                            )\n                        else:\n                            token_type_ids_batch = None\n                            val_x_batch, val_mask_batch, val_y_batch = tuple(t.to(device) for t in val_batch)\n                        val_outputs = self.model(\n                            
input_ids=val_x_batch,\n                            token_type_ids=val_token_type_ids_batch,\n                            attention_mask=val_mask_batch,\n                            labels=val_y_batch,\n                        )\n                        vloss = val_outputs[0]\n                        val_loss += vloss.sum().item()\n                    mlflow.log_metric(\"validation loss\", val_loss / len(val_dataset), step=global_step)\n                    self.model.train()\n\n                if verbose:\n                    if i % ((num_batches // 10) + 1) == 0:\n                        if val_loss > 0:\n                            print(\n                                \"epoch:{}/{}; batch:{}->{}/{}; average training loss:{:.6f};\\\n                                 average val loss:{:.6f}\".format(\n                                    epoch + 1,\n                                    self.num_epochs,\n                                    i + 1,\n                                    min(i + 1 + num_batches // 10, num_batches),\n                                    num_batches,\n                                    tr_loss / (i + 1),\n                                    val_loss / (j + 1),\n                                )\n                            )\n                        else:\n                            print(\n                                \"epoch:{}/{}; batch:{}->{}/{}; average train loss:{:.6f}\".format(\n                                    epoch + 1,\n                                    self.num_epochs,\n                                    i + 1,\n                                    min(i + 1 + num_batches // 10, num_batches),\n                                    num_batches,\n                                    tr_loss / (i + 1),\n                                )\n                            )\n        checkpoint_dir = os.path.join(os.getcwd(), \"checkpoints\")\n        if not os.path.isdir(checkpoint_dir):\n            os.makedirs(checkpoint_dir)\n   
     checkpoint_path = checkpoint_dir + \"/\" + \"final\" + \".pth\"\n        torch.save(self.model.state_dict(), checkpoint_path)\n        mlflow.log_artifact(checkpoint_path)\n        # empty cache\n        del [x_batch, y_batch, mask_batch, token_type_ids_batch]\n        if val_steps > 0:\n            del [val_x_batch, val_y_batch, val_mask_batch, val_token_type_ids_batch]\n        torch.cuda.empty_cache()\n\n    def predict(\n        self, token_ids, input_mask, token_type_ids=None, num_gpus=None, batch_size=8, probabilities=False,\n    ):\n        \"\"\"Scores the given dataset and returns the predicted classes.\n\n        Args:\n            token_ids (list): List of training token lists.\n            input_mask (list): List of input mask lists.\n            token_type_ids (list, optional): List of lists. Each sublist\n                contains segment ids indicating if the token belongs to\n                the first sentence(0) or second sentence(1). Only needed\n                for two-sentence tasks.\n            num_gpus (int, optional): The number of gpus to use.\n                                      If None is specified, all available GPUs\n                                      will be used. Defaults to None.\n            batch_size (int, optional): Scoring batch size. Defaults to 8.\n            probabilities (bool, optional):\n                If True, the predicted probability distribution\n                is also returned. 
Defaults to False.\n        Returns:\n            1darray, namedtuple(1darray, ndarray): Predicted classes or\n                (classes, probabilities) if probabilities is True.\n        \"\"\"\n\n        device, num_gpus = get_device(num_gpus)\n        self.model = move_model_to_device(self.model, device, num_gpus)\n\n        self.model.eval()\n        preds = []\n\n        with tqdm(total=len(token_ids)) as pbar:\n            for i in range(0, len(token_ids), batch_size):\n                start = i\n                end = start + batch_size\n                x_batch = torch.tensor(token_ids[start:end], dtype=torch.long, device=device)\n                mask_batch = torch.tensor(input_mask[start:end], dtype=torch.long, device=device)\n\n                token_type_ids_batch = torch.tensor(token_type_ids[start:end], dtype=torch.long, device=device)\n\n                with torch.no_grad():\n                    pred_batch = self.model(\n                        input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=None,\n                    )\n                    preds.append(pred_batch[0].cpu())\n                    if i % batch_size == 0:\n                        pbar.update(batch_size)\n\n            preds = np.concatenate(preds)\n\n            if probabilities:\n                return namedtuple(\"Predictions\", \"classes probabilities\")(\n                    preds.argmax(axis=1), nn.Softmax(dim=1)(torch.Tensor(preds)).numpy()\n                )\n            else:\n                return preds.argmax(axis=1)\n"
  }
]