[
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.md",
    "content": "---\nname: Bug report\nabout: Report on a bug you encountered\ntitle: ''\nlabels: bug\nassignees: ''\n\n---\n\nWant to contribute to DeepPavlov? Please read the [contributing guideline](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html) first.\n\nPlease enter all the information below, otherwise your issue may be closed without a warning. \n\n\n**DeepPavlov version** (you can look it up by running `pip show deeppavlov`):\n\n**Python version**:\n\n**Operating system** (ubuntu linux, windows, ...):\n\n**Issue**:\n\n\n**Content or a name of a configuration file**:\n```\n\n```\n\n\n**Command that led to error**:\n```\n\n```\n\n**Error (including full traceback)**:\n```\n\n```\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "content": "blank_issues_enabled: false\ncontact_links:\n  - name: Ask a question\n    url: https://forum.deeppavlov.ai/\n    about: If you have a different question, please ask it in the forum https://forum.deeppavlov.ai\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature-request.md",
    "content": "---\nname: Feature request\nabout: Suggest a feature to improve the DeepPavlov library\ntitle: ''\nlabels: enhancement\nassignees: ''\n\n---\n\nWant to contribute to DeepPavlov? Please read the [contributing guideline](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html) first.\n\n\n**What problem are we trying to solve?**:\n```\n\n```\n\n**How can we solve it?**:\n```\n\n```\n\n**Are there other issues that block this solution?**:\n```\n\n```\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nenv/\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# pyenv\n.python-version\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# dotenv\n.env\n\n# virtualenv\n.venv\nvenv/\nENV/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n\n#IDEA\n.idea/\n\n#Atom IDE\n.ftpconfig\n\n#vscode IDE\n.vscode\n\n# Vim\n*.vim\n*.vimrc\n\n#GIT\n.git/\n\n#Default usr dir\ndownload/\n\n#project test\n/test/\n.pytest_cache\n\n# project data\n/data/\n\n# local dockerfiles\n/Dockerfile\n/entrypoint.sh\n/.dockerignore\n"
  },
  {
    "path": ".readthedocs.yml",
    "content": "# .readthedocs.yml\nversion: 2\n\nbuild:\n  os: \"ubuntu-20.04\"\n  tools:\n    python: \"3.10\"\nformats: []\n\npython:\n  install:\n    - method: pip\n      path: .\n      extra_requirements:\n        - docs\n"
  },
  {
    "path": "CNAME",
    "content": "deeppavlov.ai"
  },
  {
    "path": "Jenkinsfile",
    "content": "node('cuda-module') {\n    timestamps {\n        try {\n            stage('Clean') {\n                sh \"rm -rf .[^.] .??* *\"\n            }\n            stage('Checkout') {\n                checkout scm\n            }\n            stage('Setup') {\n                env.TFHUB_CACHE_DIR=\"tfhub_cache\"\n                sh \"\"\"\n                    EPOCH=\\$(date +%s) docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG build\n                \"\"\"\n            }\n            stage('Tests') {\n                sh \"\"\"\n                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py36 py37\n                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1\n                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py38 py39\n                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1\n                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py310 py311\n                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 || exit 0\n                \"\"\"\n                currentBuild.result = 'SUCCESS'\n            }\n        }\n        catch(e) {\n            currentBuild.result = 'FAILURE'\n            throw e\n        }\n        finally {\n            sh \"\"\"\n                docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG rm -f\n                docker network rm \\$(echo $BUILD_TAG | awk '{print tolower(\\$0)}')_default\n            \"\"\"\n            emailext to: \"\\${DEFAULT_RECIPIENTS}\",\n                subject: \"${env.JOB_NAME} - Build # ${currentBuild.number} - ${currentBuild.result}!\",\n                body: '${BRANCH_NAME} - ${BUILD_URL}',\n                attachLog: true\n        }\n    }\n}\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright 2018 Neural Systems and Deep Learning Laboratory\n                  Moscow Institute of Physics and Technology\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "MANIFEST.in",
    "content": "include README.md\ninclude LICENSE\ninclude requirements.txt\ninclude deeppavlov/requirements/*.txt\nrecursive-include deeppavlov *.json\nrecursive-include deeppavlov *.md\n"
  },
  {
    "path": "README.md",
    "content": "# DeepPavlov 1.0\n\n[![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE)\n![Python 3.6, 3.7, 3.8, 3.9, 3.10, 3.11](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-green.svg)\n[![Downloads](https://pepy.tech/badge/deeppavlov)](https://pepy.tech/project/deeppavlov)\n[![Static Badge](https://img.shields.io/badge/DeepPavlov%20Community-blue)](https://forum.deeppavlov.ai/)\n[![Static Badge](https://img.shields.io/badge/DeepPavlov%20Demo-blue)](https://demo.deeppavlov.ai/)\n\n\nDeepPavlov 1.0 is an open-source NLP framework built on [PyTorch](https://pytorch.org/) and [transformers](https://github.com/huggingface/transformers). DeepPavlov 1.0 is created for modular and configuration-driven development of state-of-the-art NLP models and supports a wide range of NLP model applications. DeepPavlov 1.0 is designed for practitioners with limited knowledge of NLP/ML.\n\n## Quick Links\n\n|name|Description|\n|--|--|\n| ⭐️ [*Demo*](https://demo.deeppavlov.ai/)|Check out our NLP models in the online demo|\n| 📚 [*Documentation*](http://docs.deeppavlov.ai/)|How to use DeepPavlov 1.0 and its features|\n| 🚀 [*Model List*](http://docs.deeppavlov.ai/en/master/features/overview.html)|Find the NLP model you need in the list of available models|\n| 🪐 [*Contribution Guide*](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html)|Please read the contribution guidelines before making a contribution|\n| 🎛 [*Issues*](https://github.com/deeppavlov/DeepPavlov/issues)|If you have an issue with DeepPavlov, please let us know|\n| ⏩ [*Forum*](https://forum.deeppavlov.ai/)|Please let us know if you have a problem with DeepPavlov|\n| 📦 [*Blogs*](https://medium.com/deeppavlov)|Read about our current development|\n| 🦙 [Extended colab tutorials](https://github.com/deeppavlov/dp_tutorials)|Check out the code tutorials for our models|\n| 🌌 [*Docker 
Hub*](https://hub.docker.com/u/deeppavlov/)|Check out the Docker images for rapid deployment|\n| 👩‍🏫 [*Feedback*](https://forms.gle/i64fowQmiVhMMC7f9)|Please leave us your feedback to make DeepPavlov better|\n\n\n## Installation\n\n0. DeepPavlov supports `Linux`, `Windows 10+` (through WSL/WSL2), `MacOS` (Big Sur+) platforms, `Python 3.6`, `3.7`, `3.8`, `3.9`, `3.10` and `3.11`.\n    Depending on the model used, you may need from 4 to 16 GB RAM.\n\n1. Create and activate a virtual environment:\n    * `Linux`\n\n    ```\n    python -m venv env\n    source ./env/bin/activate\n    ```\n\n2. Install the package inside the environment:\n\n    ```\n    pip install deeppavlov\n    ```\n\n## QuickStart\n\nThere is a bunch of great pre-trained NLP models in DeepPavlov. Each model is\ndetermined by its config file.\n\nList of models is available on\n[the doc page](http://docs.deeppavlov.ai/en/master/features/overview.html) in\nthe `deeppavlov.configs` (Python):\n\n```python\nfrom deeppavlov import configs\n```\n\nOnce you have decided on the model (+ config file), there are two ways to train,\nevaluate and infer it:\n\n* via [Command line interface (CLI)](#command-line-interface-cli) and\n* via [Python](#python).\n\n#### GPU requirements\n\nBy default, DeepPavlov installs model requirements from PyPI. PyTorch from PyPI may not support your device's CUDA\ncapability. 
To run supported DeepPavlov models on GPU you should have [CUDA](https://developer.nvidia.com/cuda-toolkit)\ncompatible with used GPU and [PyTorch version](deeppavlov/requirements/pytorch.txt) required by DeepPavlov models.\nSee [docs](https://docs.deeppavlov.ai/en/master/intro/quick_start.html#using-gpu) for details.\nGPU with Pascal or newer architecture and 4+ GB VRAM is recommended.\n\n### Command line interface (CLI)\n\nTo get predictions from a model interactively through CLI, run\n\n```bash\npython -m deeppavlov interact <config_path> [-d] [-i]\n```\n\n* `-d` downloads required data - pretrained model files and embeddings (optional).\n* `-i` installs model requirements (optional).\n\nYou can train it in the same simple way:\n\n```bash\npython -m deeppavlov train <config_path> [-d] [-i]\n```\n\nDataset will be downloaded regardless of whether there was `-d` flag or not.\n\nTo train on your own data you need to modify dataset reader path in the\n[train config doc](http://docs.deeppavlov.ai/en/master/intro/config_description.html#train-config).\nThe data format is specified in the corresponding model doc page.\n\nThere are even more actions you can perform with configs:\n\n```bash\npython -m deeppavlov <action> <config_path> [-d] [-i]\n```\n\n* `<action>` can be\n  * `install` to install model requirements (same as `-i`),\n  * `download` to download model's data (same as `-d`),\n  * `train` to train the model on the data specified in the config file,\n  * `evaluate` to calculate metrics on the same dataset,\n  * `interact` to interact via CLI,\n  * `riseapi` to run a REST API server (see\n    [doc](http://docs.deeppavlov.ai/en/master/integrations/rest_api.html)),\n  * `predict` to get prediction for samples from *stdin* or from\n      *<file_path>* if `-f <file_path>` is specified.\n* `<config_path>` specifies path (or name) of model's config file\n* `-d` downloads required data\n* `-i` installs model requirements\n\n### Python\n\nTo get predictions from a 
model interactively through Python, run\n\n```python\nfrom deeppavlov import build_model\n\nmodel = build_model(<config_path>, install=True, download=True)\n\n# get predictions for 'input_text1', 'input_text2'\nmodel(['input_text1', 'input_text2'])\n```\n\nwhere\n\n* `install=True` installs model requirements (optional),\n* `download=True` downloads required data from web - pretrained model files and embeddings (optional),\n* `<config_path>` is model name (e.g. `'ner_ontonotes_bert_mult'`), path to the chosen model's config file (e.g.\n  `\"deeppavlov/configs/ner/ner_ontonotes_bert_mult.json\"`),  or `deeppavlov.configs` attribute (e.g.\n  `deeppavlov.configs.ner.ner_ontonotes_bert_mult` without quotation marks).\n\nYou can train it in the same simple way:\n\n```python\nfrom deeppavlov import train_model \n\nmodel = train_model(<config_path>, install=True, download=True)\n```\n\nTo train on your own data you need to modify dataset reader path in the\n[train config doc](http://docs.deeppavlov.ai/en/master/intro/config_description.html#train-config).\nThe data format is specified in the corresponding model doc page.\n\nYou can also calculate metrics on the dataset specified in your config file:\n\n```python\nfrom deeppavlov import evaluate_model \n\nmodel = evaluate_model(<config_path>, install=True, download=True)\n```\n\nDeepPavlov also [allows](https://docs.deeppavlov.ai/en/master/intro/python.html) you to build a model from components for\ninference using Python.\n\n## License\n\nDeepPavlov is Apache 2.0 - licensed.\n\n## Citation\n```\n@inproceedings{savkin-etal-2024-deeppavlov,\n    title = \"DeepPavlov 1.0: Your Gateway to Advanced NLP Models Backed by Transformers and Transfer Learning\",\n    author = \"Savkin Maksim and Voznyuk Anastasia and Ignatov Fedor and Korzanova Anna and Karpov Dmitry and Popov Alexander and Konovalov Vasily\",\n    editor = \"Hernandez Farias and Delia Irazu and Hope Tom and Li Manling\",\n    booktitle = \"Proceedings of the 2024 
Conference on Empirical Methods in Natural Language Processing: System Demonstrations\",\n    month = nov,\n    year = \"2024\",\n    address = \"Miami, Florida, USA\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://aclanthology.org/2024.emnlp-demo.47\",\n    pages = \"465--474\",\n    abstract = \"We present DeepPavlov 1.0, an open-source framework for using Natural Language Processing (NLP) models by leveraging transfer learning techniques. DeepPavlov 1.0 is created for modular and configuration-driven development of state-of-the-art NLP models and supports a wide range of NLP model applications. DeepPavlov 1.0 is designed for practitioners with limited knowledge of NLP/ML. DeepPavlov is based on PyTorch and supports HuggingFace transformers. DeepPavlov is publicly released under the Apache 2.0 license and provides access to an online demo.\",\n}\n```\n"
  },
  {
    "path": "_config.yml",
    "content": "theme: jekyll-theme-leap-day\ngoogle_analytics: UA-139843736-5\ninclude:\n  - _static\n"
  },
  {
    "path": "_layouts/default.html",
    "content": "<!doctype html>\n<html lang=\"{{ site.lang | default: \"en-US\" }}\">\n  <head>\n    <meta charset=\"utf-8\">\n    <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\">\n\n{% seo %}\n    <link rel=\"stylesheet\" href=\"{{ '/assets/css/style.css?v=' | append: site.github.build_revision | relative_url }}\">\n    <script src=\"https://code.jquery.com/jquery-3.3.0.min.js\" integrity=\"sha256-RTQy8VOmNlT6b2PIRur37p6JEBZUE7o8wPgMvu18MC4=\" crossorigin=\"anonymous\"></script>\n    <script src=\"{{ '/assets/js/main.js' | relative_url }}\"></script>\n    <!--[if lt IE 9]>\n      <script src=\"https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv.min.js\" integrity=\"sha256-3Jy/GbSLrg0o9y5Z5n1uw0qxZECH7C6OQpVBgNFYa0g=\" crossorigin=\"anonymous\"></script>\n    <![endif]-->\n    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1, user-scalable=no\">\n    \n    <link rel=\"stylesheet\" type=\"text/css\" href=\"//cdnjs.cloudflare.com/ajax/libs/cookieconsent2/3.1.0/cookieconsent.min.css\" />\n    <script src=\"//cdnjs.cloudflare.com/ajax/libs/cookieconsent2/3.1.0/cookieconsent.min.js\"></script>\n    <script>\n        window.addEventListener(\"load\", function(){\n            window.cookieconsent.initialise({\n                \"palette\": {\n                    \"popup\": {\n                        \"background\": \"#237afc\"\n                    },\n                    \"button\": {\n                        \"background\": \"#fff\",\n                        \"text\": \"#237afc\"\n                    }\n                },\n                \"showLink\": false,\n                \"position\": \"bottom-right\",\n                \"theme\": \"classic\",\n                \"content\": {\n                    \"message\": \"This website uses cookies. 
By continuing to use this site, you accept our use of cookies.\",\n                    \"dismiss\": \"ACCEPT &amp; CLOSE\"\n                }\n            })});\n    </script>\n\n  </head>\n  <body>\n\n      <header>\n        <h1>{{ site.title | default: site.github.repository_name }}</h1>\n        <p>{{ site.description | default: site.github.project_tagline }}</p>\n      </header>\n\n      <div id=\"banner\">\n        <span id=\"logo\"></span>\n\n        <a href=\"{{ site.github.repository_url }}\" class=\"button fork\"><strong>View On GitHub</strong></a>\n        {% if site.show_downloads %}\n          <div class=\"downloads\">\n            <span>Downloads:</span>\n            <ul>\n              <li><a href=\"{{ site.github.zip_url }}\" class=\"button\">ZIP</a></li>\n              <li><a href=\"{{ site.github.tar_url }}\" class=\"button\">TAR</a></li>\n            </ul>\n          </div>\n        {% endif %}\n      </div><!-- end banner -->\n\n    <div class=\"wrapper\">\n      <nav>\n        <ul></ul>\n      </nav>\n      <section>\n        {{ content }}\n\n      </section>\n      <footer>\n        {% if site.github.is_project_page %}\n          <p>Project maintained by <a href=\"{{ site.github.owner_url }}\">{{ site.github.owner_name }}</a></p>\n        {% endif %}\n        <p><small>Hosted on GitHub Pages &mdash; Theme by <a href=\"https://twitter.com/michigangraham\">mattgraham</a></small></p>\n      </footer>\n    </div>\n\n    {% if site.google_analytics %}\n      <script>\n        (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){\n        (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),\n        m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)\n        })(window,document,'script','//www.google-analytics.com/analytics.js','ga');\n        ga('create', '{{ site.google_analytics }}', 'auto');\n        ga('send', 'pageview');\n      </script>\n    {% endif %}\n  
</body>\n</html>\n"
  },
  {
    "path": "deeppavlov/__init__.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nfrom pathlib import Path\n\nfrom ._meta import __author__, __description__, __email__, __keywords__, __license__, __version__\nfrom .configs import configs\nfrom .core.commands.infer import build_model\nfrom .core.commands.train import train_evaluate_model_from_config\nfrom .core.common.base import Element, Model\nfrom .core.common.chainer import Chainer\nfrom .core.common.log import init_logger\nfrom .download import deep_download\n\n\n# TODO: make better\ndef train_model(config: [str, Path, dict], install: bool = False,\n                download: bool = False, recursive: bool = False) -> Chainer:\n    train_evaluate_model_from_config(config, install=install, download=download, recursive=recursive)\n    return build_model(config, load_trained=True)\n\n\ndef evaluate_model(config: [str, Path, dict], install: bool = False,\n                   download: bool = False, recursive: bool = False) -> dict:\n    return train_evaluate_model_from_config(config, to_train=False, install=install,\n                                            download=download, recursive=recursive)\n\n\n# check version\nassert sys.hexversion >= 0x3060000, 'Does not work in python3.5 or lower'\n\n# resolve conflicts with previous DeepPavlov installations versioned up to 0.0.9\ndot_dp_path = Path('~/.deeppavlov').expanduser().resolve()\nif 
dot_dp_path.is_file():\n    dot_dp_path.unlink()\n\n# initiate logging\ninit_logger()\n"
  },
  {
    "path": "deeppavlov/__main__.py",
    "content": "if __name__ == '__main__':\n    from .deep import main\n\n    main()\n"
  },
  {
    "path": "deeppavlov/_meta.py",
    "content": "__version__ = '1.7.0'\n__author__ = 'Neural Networks and Deep Learning lab, MIPT'\n__description__ = 'An open source library for building end-to-end dialog systems and training chatbots.'\n__keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot']\n__license__ = 'Apache License, Version 2.0'\n__email__ = 'info@deeppavlov.ai'\n"
  },
  {
    "path": "deeppavlov/configs/__init__.py",
    "content": "from pathlib import Path\nfrom typing import Iterator, Dict, Union, Iterable\n\n\nclass Struct:\n    def __iter__(self) -> Iterator[str]:\n        return iter(self._keys)\n\n    def __len__(self) -> int:\n        return len(self._keys)\n\n    def __init__(self, tree: Dict[str, Union[dict, Path]]) -> None:\n        self._keys = set()\n        for key, value in tree.items():\n            key = key.replace('.', '_')\n            self._keys.add(key)\n            setattr(self, key,\n                    Struct(value) if isinstance(value, dict) else value)\n        self._keys = frozenset(self._keys)\n\n        self.keys = lambda: self._keys\n\n    def _asdict(self, *, to_string: bool=False) -> dict:\n        res = []\n        for key in self._keys:\n            value = getattr(self, key)\n            if isinstance(value, Struct):\n                value = value._asdict(to_string=to_string)\n            elif to_string:\n                value = str(value)\n            res.append((key, value))\n\n        return dict(res)\n\n    def __getitem__(self, key: str) -> Union[dict, Path]:\n        if key not in self._keys:\n            raise KeyError(key)\n\n        item = getattr(self, key)\n        if isinstance(item, Struct):\n            item = item._asdict()\n        return item\n\n    def __dir__(self) -> Iterable:\n        return self._keys\n\n    def _ipython_key_completions_(self) -> Iterable:\n        return self._keys\n\n    def __str__(self) -> str:\n        return str(self._asdict(to_string=True))\n\n    def __repr__(self) -> str:\n        return f'Struct({repr(self._asdict())})'\n\n    def _repr_pretty_(self, p, cycle):\n        \"\"\"method that defines ``Struct``'s pretty printing rules for iPython\n\n        Args:\n            p (IPython.lib.pretty.RepresentationPrinter): pretty printer object\n            cycle (bool): is ``True`` if pretty detected a cycle\n        \"\"\"\n        if cycle:\n            p.text('Struct(...)')\n        else:\n         
   with p.group(7, 'Struct(', ')'):\n                p.pretty(self._asdict())\n\n\ndef _build_configs_tree() -> Struct:\n    root = Path(__file__).resolve().parent\n\n    tree = {}\n\n    for config in root.glob('**/*.json'):\n        leaf = tree\n        for part in config.relative_to(root).parent.parts:\n            if part not in leaf:\n                leaf[part] = {}\n            leaf = leaf[part]\n        leaf[config.stem] = config\n\n    return Struct(tree)\n\n\nconfigs = _build_configs_tree()\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/boolqa_rubert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"boolqa_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/boolqa_data\",\n    \"language\": \"ru\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\",\n    \"seed\": 243\n  },\n  \"chainer\": {\n    \"in\": [\"text_a\", \"text_b\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 128,\n        \"in\": [\"text_a\", \"text_b\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": 2,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODELS_PATH}/boolqa_rubert/model_rubert\",\n        \"load_path\": \"{MODELS_PATH}/boolqa_rubert/model_rubert\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 2e-05},\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y\"],\n        \"out\": [\"predictions\"]\n      }\n    ],\n    \"out\": [\"predictions\"]\n  },\n  \"train\": {\n    \"epochs\": 50,\n    \"batch_size\": 32,\n    \"train_metrics\": [\"f1\", \"acc\"],\n    \"metrics\": [\"f1\", \"acc\"],\n    \"validation_patience\": 5,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"evaluation_targets\": [\"valid\", \"train\"],\n    \"show_examples\": false,\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"DeepPavlov/rubert-base-cased\"\n    }\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/few_shot_roberta.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"texts\", \"dataset\"],\n    \"in_y\": [\"y_true\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"dnnc_pair_generator\",\n        \"in\": [\"texts\", \"dataset\"],\n        \"out\": [\"x\", \"x_support\", \"x_populated\", \"y_support\"],\n        \"bidirectional\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"in\": [\"x_populated\", \"x_support\"],\n        \"out\": [\"bert_features\"],\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": true,\n        \"max_seq_length\": 128\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"main\": true,\n        \"in\": [\"bert_features\"],\n        \"out\": [\"simmilarity_scores\"],\n        \"n_classes\": 2,\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\"\n      },\n      {\n        \"class_name\": \"dnnc_proba2labels\",\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"in\": [\"simmilarity_scores\", \"x\", \"x_populated\", \"x_support\", \"y_support\"],\n        \"out\": [\"y_pred\"],\n        \"confidence_threshold\": 0.0\n      }\n    ],\n    \"out\": [\"y_pred\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"MODEL_PATH\": \"{ROOT_PATH}/models/fewshot/roberta_nli_mrpc_1_10\",\n      \"BINARY_CLASSIFICATION\": true,\n      \"BASE_MODEL\": \"roberta-base\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_mrpc_1_10.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/glue/glue_cola_roberta.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": \"sentence\",\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 64,\n        \"in\": [\"x\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        
\"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 32,\n    \"metrics\": [\"matthews_correlation\"],\n    \"validation_patience\": 10,\n    \"val_every_n_batches\": 250,\n    \"log_every_n_batches\": 250,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"roberta-large\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"glue\",\n      \"TASK\": \"cola\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/glue/glue_cola_roberta.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/glue/glue_mnli_cased_bert_torch.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"glue\",\n    \"name\": \"mnli\",\n    \"train\": \"train\",\n    \"valid\": \"validation_matched\",\n    \"test\": \"test_matched\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"hypothesis\", \"premise\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"hypothesis\", \"premise\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 128,\n        \"in\": [\"hypothesis\", \"premise\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": 
[\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 64,\n    \"metrics\": [\"accuracy\"],\n    \"validation_patience\": 10,\n    \"val_every_n_batches\": 250,\n    \"log_every_n_batches\": 250,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/glue_mnli_torch_cased_bert\",\n      \"BASE_MODEL\": \"bert-base-cased\"\n    }\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/glue/glue_mnli_mm_cased_bert_torch.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"glue\",\n    \"name\": \"mnli\",\n    \"train\": \"train\",\n    \"valid\": \"validation_mismatched\",\n    \"test\": \"test_mismatched\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"hypothesis\", \"premise\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"hypothesis\", \"premise\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 128,\n        \"in\": [\"hypothesis\", \"premise\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": 
[\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 64,\n    \"metrics\": [\"accuracy\"],\n    \"validation_patience\": 10,\n    \"val_every_n_batches\": 250,\n    \"log_every_n_batches\": 250,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/glue_mnli_mm_torch_cased_bert\",\n      \"BASE_MODEL\": \"bert-base-cased\"\n    }\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/glue/glue_mnli_roberta.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation_matched\",\n    \"test\": \"test_matched\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"hypothesis\", \"premise\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"hypothesis\", \"premise\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 128,\n        \"in\": [\"hypothesis\", \"premise\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 1e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": 
[\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 4,\n    \"metrics\": [\"accuracy\"],\n    \"validation_patience\": 10,\n    \"val_every_n_batches\": 250,\n    \"log_every_n_batches\": 250,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"roberta-large\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"glue\",\n      \"TASK\": \"mnli\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/0.16/classifiers/glue_mnli.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/glue/glue_mrpc_roberta.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"sentence1\", \"sentence2\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"sentence1\", \"sentence2\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 256,\n        \"in\": [\"sentence1\", \"sentence2\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 1e-06\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": 
[\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 4,\n    \"metrics\": [\"accuracy\"],\n    \"epochs\": 2,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"roberta-large\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"glue\",\n      \"TASK\": \"mrpc\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/glue/glue_mrpc_roberta.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/glue/glue_qnli_roberta.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"question\", \"sentence\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"question\", \"sentence\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 128,\n        \"in\": [\"question\", \"sentence\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n    
    \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 16,\n    \"metrics\": [\"accuracy\"],\n    \"validation_patience\": 10,\n    \"val_every_n_batches\": 250,\n    \"log_every_n_batches\": 250,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"roberta-large\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"glue\",\n      \"TASK\": \"qnli\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/glue/glue_qnli_roberta.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/glue/glue_qqp_roberta.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"question1\", \"question2\"],\n    \"label\": \"label\",\n    \"use_label_name\": false,\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"question1\", \"question2\"],\n    \"in_y\": [\"y_ids\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 128,\n        \"in\": [\"question1\", \"question2\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": 2,\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": 2,\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      }\n    ],\n    \"out\": [\"y_pred_ids\"]\n  },\n  \"train\": {\n    \"batch_size\": 16,\n    \"metrics\": [\n      \"f1\",\n      \"accuracy\"\n    ],\n    \"validation_patience\": 10,\n    \"val_every_n_batches\": 
250,\n    \"log_every_n_batches\": 250,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"roberta-large\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"glue\",\n      \"TASK\": \"qqp\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/glue/glue_qqp_roberta.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/glue/glue_rte_cased_bert_torch.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"glue\",\n    \"name\": \"rte\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"sentence1\", \"sentence2\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"sentence1\", \"sentence2\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 256,\n        \"in\": [\"sentence1\", \"sentence2\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        
\"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 32,\n    \"metrics\": [\"accuracy\"],\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/glue_rte_torch_cased_bert\",\n      \"BASE_MODEL\": \"bert-base-cased\"\n    }\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/glue/glue_rte_roberta_mnli.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"sentence1\", \"sentence2\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"sentence1\", \"sentence2\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 256,\n        \"in\": [\"sentence1\", \"sentence2\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 1e-06\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": 
[\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 4,\n    \"metrics\": [\"accuracy\"],\n    \"epochs\": 2,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"roberta-large-mnli\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"glue\",\n      \"TASK\": \"rte\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/0.16/classifiers/glue_rte.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/glue/glue_sst2_roberta.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": \"sentence\",\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"bert-base-cased\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 64,\n        \"in\": [\"x\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"bert-base-cased\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        
\"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 128,\n    \"metrics\": [\"accuracy\"],\n    \"validation_patience\": 10,\n    \"val_every_n_batches\": 250,\n    \"log_every_n_batches\": 250,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"roberta-large\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"glue\",\n      \"TASK\": \"sst2\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/glue/glue_sst2_roberta.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/glue/glue_stsb_roberta.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"sentence1\", \"sentence2\"],\n    \"label\": \"label\",\n    \"use_label_name\": false,\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"sentence1\", \"sentence2\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 64,\n        \"in\": [\"sentence1\", \"sentence2\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": 1,\n        \"return_probas\": false,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y\"],\n        \"out\": [\"y_pred\"]\n      }\n    ],\n    \"out\": [\"y_pred\"]\n  },\n  \"train\": {\n    \"batch_size\": 32,\n    \"metrics\": [\n      \"pearson_correlation\",\n      \"spearman_correlation\"\n    ],\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"roberta-large\",\n      
\"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"glue\",\n      \"TASK\": \"stsb\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/glue/glue_stsb_roberta.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/glue/glue_wnli_roberta.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"sentence1\", \"sentence2\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"sentence1\", \"sentence2\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 192,\n        \"truncation\": \"longest_first\",\n        \"padding\": \"longest\",\n        \"in\": [\"sentence1\", \"sentence2\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 1e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n  
    {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 8,\n    \"metrics\": [\"accuracy\"],\n    \"epochs\": 1,\n    \"val_every_n_batches\": 250,\n    \"log_every_n_batches\": 250,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"roberta-large\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"glue\",\n      \"TASK\": \"wnli\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/0.16/classifiers/glue_wnli_roberta.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/insults_kaggle_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"basic_classification_reader\",\n    \"x\": \"Comment\",\n    \"y\": \"Class\",\n    \"data_path\": \"{DOWNLOADS_PATH}/insults_data\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": true,\n        \"max_seq_length\": 64,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"bert_features\"\n        ]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\n          \"y\"\n        ],\n        \"out\": [\n          \"y_ids\"\n        ]\n      },\n      {\n        \"in\": [\n          \"y_ids\"\n        ],\n        \"out\": [\n          \"y_onehot\"\n        ],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 1e-05\n        },\n        \"learning_rate_drop_patience\": 5,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\n          \"bert_features\"\n        ],\n        \"in_y\": [\n          \"y_ids\"\n        ],\n        \"out\": [\n          \"y_pred_probas\"\n        ]\n      
},\n      {\n        \"in\": [\n          \"y_pred_probas\"\n        ],\n        \"out\": [\n          \"y_pred_ids\"\n        ],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\n          \"y_pred_ids\"\n        ],\n        \"out\": [\n          \"y_pred_labels\"\n        ],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\n      \"y_pred_labels\"\n    ]\n  },\n  \"train\": {\n    \"epochs\": 100,\n    \"batch_size\": 64,\n    \"metrics\": [\n      {\n        \"name\": \"roc_auc\",\n        \"inputs\": [\n          \"y_onehot\",\n          \"y_pred_probas\"\n        ]\n      },\n      \"accuracy\",\n      \"f1_macro\"\n    ],\n    \"validation_patience\": 5,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\n      \"train\",\n      \"valid\",\n      \"test\"\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"TRANSFORMER\": \"bert-base-uncased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/insults_kaggle_torch_bert\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/insults_data.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_torch_bert_v5.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/classifiers\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json",
    "content": " {\n  \"dataset_reader\": {\n    \"class_name\": \"paraphraser_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/paraphraser_data\",\n    \"do_lower_case\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"siamese_iterator\",\n    \"seed\": 243,\n    \"len_valid\": 500\n  },\n  \"chainer\": {\n    \"in\": [\"text_a\", \"text_b\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 64,\n        \"in\": [\"text_a\", \"text_b\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": 2,\n        \"return_probas\": false,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"attention_probs_keep_prob\": 0.11,\n        \"hidden_keep_prob\": 1.0, \n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 1.89e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 1.5,\n        \"in\": [\n          \"bert_features\"\n        ],\n        \"in_y\": [\n          \"y\"\n        ],\n        \"out\": [\n          \"predictions\"\n        ]\n      }\n    ],\n    \"out\": [\"predictions\"]\n  },\n  \"train\": {\n    \"epochs\": 100,\n    \"batch_size\": 64,\n    \"metrics\": [\n        \"f1\",\n        \"accuracy\"\n    ],\n    \"validation_patience\": 7,\n    \"val_every_n_batches\": 50,\n    \"log_every_n_batches\": 50,\n    \"evaluation_targets\": [\n      \"train\",\n      \"valid\",\n      \"test\"\n    ],\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": 
\"{ROOT_PATH}/downloads\",\n      \"TRANSFORMER\": \"DeepPavlov/distilrubert-tiny-cased-conversational\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/paraphraser_convers_distilrubert_2L\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_2L.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/paraphraser.zip\",\n        \"subdir\": \"{DOWNLOADS_PATH}/paraphraser_data\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/paraphraser_gold.zip\",\n        \"subdir\": \"{DOWNLOADS_PATH}/paraphraser_data\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"paraphraser_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/paraphraser_data\",\n    \"do_lower_case\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"siamese_iterator\",\n    \"seed\": 243,\n    \"len_valid\": 500\n  },\n  \"chainer\": {\n    \"in\": [\"text_a\", \"text_b\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 64,\n        \"in\": [\"text_a\", \"text_b\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": 2,\n        \"return_probas\": false,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"attention_probs_keep_prob\": 0.0,\n        \"hidden_keep_prob\": 0.67, \n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 7.22e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 1.5,\n        \"in\": [\n          \"bert_features\"\n        ],\n        \"in_y\": [\n          \"y\"\n        ],\n        \"out\": [\n          \"predictions\"\n        ]\n      }\n    ],\n    \"out\": [\"predictions\"]\n  },\n  \"train\": {\n    \"epochs\": 100,\n    \"batch_size\": 64,\n    \"metrics\": [\n        \"f1\",\n        \"accuracy\"\n    ],\n    \"validation_patience\": 7,\n    \"val_every_n_batches\": 50,\n    \"log_every_n_batches\": 50,\n    \"evaluation_targets\": [\n      \"train\",\n      \"valid\",\n      \"test\"\n    ],\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": 
\"{ROOT_PATH}/downloads\",\n      \"TRANSFORMER\": \"DeepPavlov/distilrubert-base-cased-conversational\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/paraphraser_convers_distilrubert_6L\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_6L.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/paraphraser.zip\",\n        \"subdir\": \"{DOWNLOADS_PATH}/paraphraser_data\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/paraphraser_gold.zip\",\n        \"subdir\": \"{DOWNLOADS_PATH}/paraphraser_data\"\n      }\n    ]\n  }\n} \n"
  },
  {
    "path": "deeppavlov/configs/classifiers/paraphraser_rubert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"paraphraser_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/paraphraser_data\",\n    \"do_lower_case\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"siamese_iterator\",\n    \"seed\": 243,\n    \"len_valid\": 500\n  },\n  \"chainer\": {\n    \"in\": [\"text_a\", \"text_b\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 64,\n        \"in\": [\"text_a\", \"text_b\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": 2,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 2e-05},\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y\"],\n        \"out\": [\"predictions\"]\n      }\n    ],\n    \"out\": [\"predictions\"]\n  },\n  \"train\": {\n    \"batch_size\": 64,\n    \"pytest_max_batches\": 2,\n    \"train_metrics\": [\"f1\", \"acc\"],\n    \"metrics\": [\"f1\", \"acc\"],\n    \"validation_patience\": 7,\n    \"val_every_n_batches\": 50,\n    \"log_every_n_batches\": 50,\n    \"evaluation_targets\": [\"valid\", \"test\"],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/paraphraser_rubert_torch\",\n      \"TRANSFORMER\": \"DeepPavlov/rubert-base-cased\"\n    },\n    \"download\": [\n      {\n        \"url\": 
\"http://files.deeppavlov.ai/datasets/paraphraser.zip\",\n        \"subdir\": \"{DOWNLOADS_PATH}/paraphraser_data\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/paraphraser_gold.zip\",\n        \"subdir\": \"{DOWNLOADS_PATH}/paraphraser_data\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/classifiers/paraphraser_rubert/paraphraser_rubert_v1.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/query_pr.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"sq_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/query_prediction/query_prediction_eng.pickle\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 64,\n        \"in\": [\"x\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 1e-05},\n        \"learning_rate_drop_patience\": 5,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    
],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"epochs\": 100,\n    \"batch_size\": 64,\n    \"metrics\": [\n      \"f1_macro\",\n      \"accuracy\",\n      {\n        \"name\": \"roc_auc\",\n        \"inputs\": [\"y_onehot\", \"y_pred_probas\"]\n      }\n    ],\n    \"validation_patience\": 10,\n    \"val_every_n_batches\": 100,\n    \"log_every_n_batches\": 100,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\", \"test\"],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"TRANSFORMER\": \"haisongzhang/roberta-tiny-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/query_prediction_eng\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/wikidata/query_prediction_eng.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/classifiers/query_prediction_eng\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/wikidata/query_prediction_eng.pickle\",\n        \"subdir\": \"{DOWNLOADS_PATH}/query_prediction\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/rusentiment_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"basic_classification_reader\",\n    \"x\": \"text\",\n    \"y\": \"label\",\n    \"data_path\": \"{DOWNLOADS_PATH}/rusentiment/\",\n    \"train\": \"rusentiment_random_posts.csv\",\n    \"test\": \"rusentiment_test.csv\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\",\n    \"seed\": 42,\n    \"split_seed\": 23,\n    \"field_to_split\": \"train\",\n    \"split_fields\": [\n      \"train\",\n      \"valid\"\n    ],\n    \"split_proportions\": [\n      0.9,\n      0.1\n    ]\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 64,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"bert_features\"\n        ]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": \"y\",\n        \"out\": \"y_ids\"\n      },\n      {\n        \"in\": \"y_ids\",\n        \"out\": \"y_onehot\",\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer_parameters\": {\"lr\": 1e-05},\n        \"learning_rate_drop_patience\": 5,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\n          \"bert_features\"\n       
 ],\n        \"in_y\": [\n          \"y_onehot\"\n        ],\n        \"out\": [\n          \"y_pred_probas\"\n        ]\n      },\n      {\n        \"in\": \"y_pred_probas\",\n        \"out\": \"y_pred_ids\",\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": \"y_pred_ids\",\n        \"out\": \"y_pred_labels\",\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\n      \"y_pred_labels\"\n    ]\n  },\n  \"train\": {\n    \"batch_size\": 64,\n    \"epochs\": 100,\n    \"metrics\": [\n      \"f1_weighted\",\n      \"f1_macro\",\n      \"accuracy\",\n      {\n        \"name\": \"roc_auc\",\n        \"inputs\": [\n          \"y_onehot\",\n          \"y_pred_probas\"\n        ]\n      }\n    ],\n    \"show_examples\": false,\n    \"pytest_max_batches\": 2,\n    \"validation_patience\": 5,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"evaluation_targets\": [\n      \"train\",\n      \"valid\",\n      \"test\"\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/rusentiment_bert_torch\",\n      \"TRANSFORMER\": \"bert-base-multilingual-cased\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/classifiers/rusentiment_bert/rusentiment_bert_torch.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/rusentiment_convers_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"basic_classification_reader\",\n    \"x\": \"text\",\n    \"y\": \"label\",\n    \"data_path\": \"{DOWNLOADS_PATH}/rusentiment/\",\n    \"train\": \"rusentiment_random_posts.csv\",\n    \"test\": \"rusentiment_test.csv\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\",\n    \"seed\": 42,\n    \"split_seed\": 23,\n    \"field_to_split\": \"train\",\n    \"split_fields\": [\n      \"train\",\n      \"valid\"\n    ],\n    \"split_proportions\": [\n      0.9,\n      0.1\n    ]\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 64,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"bert_features\"\n        ]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": \"y\",\n        \"out\": \"y_ids\"\n      },\n      {\n        \"in\": \"y_ids\",\n        \"out\": \"y_onehot\",\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer_parameters\": {\"lr\": 1e-05},\n        \"learning_rate_drop_patience\": 5,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\n          \"bert_features\"\n       
 ],\n        \"in_y\": [\n          \"y_onehot\"\n        ],\n        \"out\": [\n          \"y_pred_probas\"\n        ]\n      },\n      {\n        \"in\": \"y_pred_probas\",\n        \"out\": \"y_pred_ids\",\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": \"y_pred_ids\",\n        \"out\": \"y_pred_labels\",\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\n      \"y_pred_labels\"\n    ]\n  },\n  \"train\": {\n    \"batch_size\": 64,\n    \"epochs\": 100,\n    \"metrics\": [\n      \"f1_weighted\",\n      \"f1_macro\",\n      \"accuracy\",\n      {\n        \"name\": \"roc_auc\",\n        \"inputs\": [\n          \"y_onehot\",\n          \"y_pred_probas\"\n        ]\n      }\n    ],\n    \"show_examples\": false,\n    \"pytest_max_batches\": 2,\n    \"validation_patience\": 5,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"evaluation_targets\": [\n      \"train\",\n      \"valid\",\n      \"test\"\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/rusentiment_convers_bert_torch\",\n      \"TRANSFORMER\": \"DeepPavlov/rubert-base-cased-conversational\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/classifiers/rusentiment_convers_bert/rusentiment_convers_bert_torch.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"basic_classification_reader\",\n    \"x\": \"text\",\n    \"y\": \"label\",\n    \"data_path\": \"{DOWNLOADS_PATH}/rusentiment/\",\n    \"train\": \"rusentiment_random_posts.csv\",\n    \"test\": \"rusentiment_test.csv\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\",\n    \"seed\": 42,\n    \"split_seed\": 23,\n    \"field_to_split\": \"train\",\n    \"split_fields\": [\n      \"train\",\n      \"valid\"\n    ],\n    \"split_proportions\": [\n      0.9,\n      0.1\n    ]\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": true,\n        \"max_seq_length\": 64,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"bert_features\"\n        ]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": \"y\",\n        \"out\": \"y_ids\"\n      },\n      {\n        \"in\": \"y_ids\",\n        \"out\": \"y_onehot\",\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"attention_probs_keep_prob\": 0.78,\n        \"hidden_keep_prob\": 0.89, \n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 7.22e-05\n        
},\n        \"learning_rate_drop_patience\": 5,\n        \"learning_rate_drop_div\": 1.5,\n        \"in\": [\n          \"bert_features\"\n        ],\n        \"in_y\": [\n          \"y_ids\"\n        ],\n        \"out\": [\n          \"y_pred_probas\"\n        ]\n      },\n      {\n        \"in\": \"y_pred_probas\",\n        \"out\": \"y_pred_ids\",\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": \"y_pred_ids\",\n        \"out\": \"y_pred_labels\",\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\n      \"y_pred_labels\"\n    ]\n  },\n  \"train\": {\n    \"epochs\": 100,\n    \"batch_size\": 64,\n    \"metrics\": [\n        \"f1_weighted\",\n        \"f1_macro\",\n        \"accuracy\",\n        {\n            \"name\": \"roc_auc\",\n            \"inputs\": [\n                \"y_onehot\",\n                \"y_pred_probas\"\n            ]\n        }\n    ],\n    \"validation_patience\": 5,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\n      \"train\",\n      \"valid\",\n      \"test\"\n    ],\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"TRANSFORMER\": \"DeepPavlov/distilrubert-tiny-cased-conversational\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_2L\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_2L.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/classifiers/\"\n      }\n    ]\n  }\n} \n"
  },
  {
    "path": "deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"basic_classification_reader\",\n    \"x\": \"text\",\n    \"y\": \"label\",\n    \"data_path\": \"{DOWNLOADS_PATH}/rusentiment/\",\n    \"train\": \"rusentiment_random_posts.csv\",\n    \"test\": \"rusentiment_test.csv\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\",\n    \"seed\": 42,\n    \"split_seed\": 23,\n    \"field_to_split\": \"train\",\n    \"split_fields\": [\n      \"train\",\n      \"valid\"\n    ],\n    \"split_proportions\": [\n      0.9,\n      0.1\n    ]\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": true,\n        \"max_seq_length\": 64,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"bert_features\"\n        ]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": \"y\",\n        \"out\": \"y_ids\"\n      },\n      {\n        \"in\": \"y_ids\",\n        \"out\": \"y_onehot\",\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"attention_probs_keep_prob\": 0.78,\n        \"hidden_keep_prob\": 0, \n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 4.56e-05\n        },\n  
      \"learning_rate_drop_patience\": 5,\n        \"learning_rate_drop_div\": 1.5,\n        \"in\": [\n          \"bert_features\"\n        ],\n        \"in_y\": [\n          \"y_ids\"\n        ],\n        \"out\": [\n          \"y_pred_probas\"\n        ]\n      },\n      {\n        \"in\": \"y_pred_probas\",\n        \"out\": \"y_pred_ids\",\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": \"y_pred_ids\",\n        \"out\": \"y_pred_labels\",\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\n      \"y_pred_labels\"\n    ]\n  },\n  \"train\": {\n    \"epochs\": 100,\n    \"batch_size\": 64,\n    \"metrics\": [\n        \"f1_weighted\",\n        \"f1_macro\",\n        \"accuracy\",\n        {\n            \"name\": \"roc_auc\",\n            \"inputs\": [\n                \"y_onehot\",\n                \"y_pred_probas\"\n            ]\n        }\n    ],\n    \"validation_patience\": 5,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\n      \"train\",\n      \"valid\",\n      \"test\"\n    ],\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"TRANSFORMER\": \"DeepPavlov/distilrubert-base-cased-conversational\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_6L\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_6L.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/classifiers/\"\n      }\n    ]\n  }\n} \n"
  },
  {
    "path": "deeppavlov/configs/classifiers/sentiment_sst_conv_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"basic_classification_reader\",\n    \"x\": \"text\",\n    \"y\": \"fine_grained_label\",\n    \"data_path\": \"{DOWNLOADS_PATH}/stanfordSentimentTreebank\",\n    \"train\": \"train_fine_grained.csv\",\n    \"valid\": \"valid_fine_grained.csv\",\n    \"test\": \"test_fine_grained.csv\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 64,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"bert_features\"\n        ]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": \"y\",\n        \"out\": \"y_ids\"\n      },\n      {\n        \"in\": \"y_ids\",\n        \"out\": \"y_onehot\",\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer_parameters\": {\"lr\": 1e-05},\n        \"learning_rate_drop_patience\": 5,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\n          \"bert_features\"\n        ],\n        \"in_y\": [\n          \"y_onehot\"\n        ],\n        \"out\": [\n          \"y_pred_probas\"\n        ]\n   
   },\n      {\n        \"in\": \"y_pred_probas\",\n        \"out\": \"y_pred_ids\",\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": \"y_pred_ids\",\n        \"out\": \"y_pred_labels\",\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\n      \"y_pred_labels\"\n    ]\n  },\n  \"train\": {\n    \"epochs\": 100,\n    \"batch_size\": 64,\n    \"metrics\": [\n      \"accuracy\",\n      {\n        \"name\": \"roc_auc\",\n        \"inputs\": [\n          \"y_onehot\",\n          \"y_pred_probas\"\n        ]\n      },\n      \"f1_macro\"\n    ],\n    \"validation_patience\": 5,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\n      \"train\",\n      \"valid\",\n      \"test\"\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/sentiment_sst_bert_torch\",\n      \"TRANSFORMER\": \"DeepPavlov/bert-base-cased-conversational\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/stanfordSentimentTreebank.zip\",\n        \"subdir\": \"{DOWNLOADS_PATH}\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/classifiers/sentiment_sst_bert/sentiment_sst_bert_torch.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/sentiment_twitter.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"basic_classification_reader\",\n    \"x\": \"Twit\",\n    \"y\": \"Class\",\n    \"data_path\": \"{DOWNLOADS_PATH}/sentiment_twitter_data\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": \"y\",\n        \"out\": \"y_ids\"\n      },\n      {\n        \"in\": \"x\",\n        \"out\": \"x_tok\",\n        \"id\": \"my_tokenizer\",\n        \"class_name\": \"nltk_tokenizer\",\n        \"tokenizer\": \"wordpunct_tokenize\"\n      },\n      {\n        \"in\": \"x_tok\",\n        \"out\": \"x_emb\",\n        \"id\": \"my_embedder\",\n        \"class_name\": \"fasttext\",\n        \"load_path\": \"{DOWNLOADS_PATH}/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin\",\n        \"pad_zero\": true\n      },\n      {\n        \"in\": \"y_ids\",\n        \"out\": \"y_onehot\",\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"in\": [\n          \"x_emb\"\n        ],\n        \"in_y\": [\n          \"y_ids\"\n        ],\n        \"out\": [\n          \"y_pred_probas\"\n        ],\n        \"main\": true,\n        \"class_name\": \"torch_text_classification_model\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"embedding_size\": \"#my_embedder.dim\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"kernel_sizes_cnn\": [\n          3,\n          5,\n          7\n        ],\n        \"filters_cnn\": 
256,\n        \"dropout_rate\": 0.5,\n        \"dense_size\": 64,\n        \"optimizer\": \"SGD\",\n        \"optimizer_parameters\": {\n          \"lr\": 0.0001,\n          \"momentum\": 0.9,\n          \"weight_decay\": 0.0001\n        }\n      },\n      {\n        \"in\": \"y_pred_probas\",\n        \"out\": \"y_pred_ids\",\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": \"y_pred_ids\",\n        \"out\": \"y_pred_labels\",\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\n      \"y_pred_labels\"\n    ]\n  },\n  \"train\": {\n    \"epochs\": 100,\n    \"batch_size\": 128,\n    \"metrics\": [\n      \"accuracy\",\n      \"f1_macro\",\n      {\n        \"name\": \"roc_auc\",\n        \"inputs\": [\"y_onehot\", \"y_pred_probas\"]\n      }\n    ],\n    \"validation_patience\": 5,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\n      \"train\",\n      \"valid\",\n      \"test\"\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/sentiment_twitter_torch\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/sentiment_twitter_data.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin\",\n        \"subdir\": \"{DOWNLOADS_PATH}/embeddings\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/classifiers/sentiment_twitter/sentiment_twitter_torch.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/superglue/superglue_boolq_roberta_mnli.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\",\n    \"dev_percentage\": 50\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"question\", \"passage\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"question\", \"passage\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 256,\n        \"in\": [\"question\", \"passage\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 0.1\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": 
[\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"confidence_threshold\": 0.5\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 24,\n    \"metrics\": [\"accuracy\"],\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"roberta-large-mnli\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"super_glue\",\n      \"TASK\": \"boolq\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\",\n      \"BINARY_CLASSIFICATION\": true\n    },\n    \"download\": [\n\t  {\n\t\t\"url\": \"http://files.deeppavlov.ai/v1/superglue/superglue_boolq_roberta_mnli.tar.gz\",\n\t\t\"subdir\": \"{MODEL_PATH}\"\n\t  }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/superglue/superglue_copa_roberta.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"contexts\", \"choices\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"contexts_list\", \"choices_list\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_multiplechoice_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 64,\n        \"in\": [\"contexts_list\", \"choices_list\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_multiplechoice\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n     
   \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 16,\n    \"metrics\": [\"accuracy\"],\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"roberta-large\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"super_glue\",\n      \"TASK\": \"copa\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/0.17/classifiers/superglue/superglue_copa_roberta.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/superglue/superglue_record_roberta.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\",\n    \"downsample_ratio\": [1.8, 1.8, 1],\n    \"do_index_correction\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"idx\", \"query\", \"passage\", \"entities\", \"num_examples\"],\n    \"label\": \"label\",\n    \"seed\": 42,\n    \"use_label_name\": false\n  },\n  \"chainer\": {\n    \"in\": [\"idx\", \"query\", \"passage\", \"entities\", \"num_examples\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 320,\n        \"in\": [\"query\", \"passage\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": 2,\n        \"return_probas\": true,\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 0.1\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"class_name\": \"proba2labels\",\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"max_proba\": true\n      },\n      {\n        \"class_name\": \"torch_record_postprocessor\",\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n  
      \"in\": [\"idx\", \"y\", \"y_pred_probas\", \"entities\", \"num_examples\"],\n        \"out\": [\"record_examples\"]\n      }\n    ],\n    \"out\": [\"y_pred_probas\"]\n  },\n  \"train\": {\n    \"batch_size\": 24,\n    \"train_metrics\": [\n      {\n        \"name\": \"accuracy\",\n        \"inputs\": [\"y\", \"y_pred_ids\"]\n      }\n    ],\n    \"metrics\": [\n      {\n        \"name\": \"record_em_score\",\n        \"inputs\": [\"record_examples\"]\n      },\n      {\n        \"name\": \"record_f1_score\",\n        \"inputs\": [\"record_examples\"]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"inputs\": [\"y\", \"y_pred_ids\"]\n      }\n    ],\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"class_name\": \"torch_trainer\",\n    \"evaluation_targets\": [\"valid\"],\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"roberta-large\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"super_glue\",\n      \"TASK\": \"record\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\",\n      \"BINARY_CLASSIFICATION\": false\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/0.17/classifiers/superglue/superglue_record_roberta.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/superglue/superglue_wic_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"sentence1\", \"sentence2\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"sentence1\", \"sentence2\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 256,\n        \"in\": [\"sentence1\", \"sentence2\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 2e-05},\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": 
[\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 16,\n    \"metrics\": [\"accuracy\"],\n    \"epochs\": 10,\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"val_every_n_batches\": 1000,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"bert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"super_glue\",\n      \"TASK\": \"wic\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/superglue/superglue_wic_bert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/classifiers/topics_distilbert_base_uncased.json",
    "content": "{\r\n  \"dataset_reader\": {\r\n    \"class_name\": \"basic_classification_reader\",\r\n    \"class_sep\": \";\",\r\n    \"x\": \"text\",\r\n    \"y\": \"topic\",\r\n    \"data_path\": \"{DOWNLOADS_PATH}/dp_topics_downsampled_data/\",\r\n    \"train\" : \"train.csv\",\r\n    \"valid\" : \"valid.csv\"  \r\n  },\r\n  \"dataset_iterator\": {\r\n    \"class_name\": \"basic_classification_iterator\",\r\n    \"seed\": 42\r\n  },\r\n  \"chainer\": {\r\n    \"in\": [\r\n      \"x\"\r\n    ],\r\n    \"in_y\": [\r\n      \"y\"\r\n    ],\r\n    \"pipe\": [\r\n      {\r\n        \"class_name\": \"torch_transformers_preprocessor\",\r\n        \"vocab_file\": \"{TRANSFORMER}\",\r\n        \"do_lower_case\": true,\r\n        \"max_seq_length\": 128,\r\n        \"in\": [\r\n          \"x\"\r\n        ],\r\n        \"out\": [\r\n          \"bert_features\"\r\n        ]\r\n      },\r\n      {\r\n        \"id\": \"classes_vocab\",\r\n        \"class_name\": \"simple_vocab\",\r\n        \"fit_on\": [\r\n          \"y\"\r\n        ],\r\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\r\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\r\n        \"in\": [\r\n          \"y\"\r\n        ],\r\n        \"out\": [\r\n          \"y_ids\"\r\n        ]\r\n      },\r\n      {\r\n        \"in\": [\r\n          \"y_ids\"\r\n        ],\r\n        \"out\": [\r\n          \"y_onehot\"\r\n        ],\r\n        \"class_name\": \"one_hotter\",\r\n        \"id\": \"my_one_hotter\",\r\n        \"depth\": \"#classes_vocab.len\",\r\n        \"single_vector\": true\r\n      },\r\n      {\r\n        \"class_name\": \"torch_transformers_classifier\",\r\n        \"n_classes\": \"#classes_vocab.len\",\r\n        \"return_probas\": true,\r\n        \"pretrained_bert\": \"{TRANSFORMER}\",\r\n        \"save_path\": \"{MODEL_PATH}/model\",\r\n        \"load_path\": \"{MODEL_PATH}/model\",\r\n        \"multilabel\": true,\r\n        \"optimizer\": \"AdamW\",\r\n        
\"optimizer_parameters\": {\r\n          \"lr\": 1e-05\r\n        },\r\n        \"learning_rate_drop_patience\": 5,\r\n        \"learning_rate_drop_div\": 2.0,\r\n        \"in\": [\r\n          \"bert_features\"\r\n        ],\r\n        \"in_y\": [\r\n          \"y_onehot\"\r\n        ],\r\n        \"out\": [\r\n          \"y_pred_probas\"\r\n        ]\r\n      },\r\n      {\r\n        \"in\": \"y_pred_probas\",\r\n        \"out\": \"y_pred_ids\",\r\n        \"class_name\": \"proba2labels\",\r\n        \"max_proba\": false,\r\n        \"confidence_threshold\": 0.5\r\n      },\r\n      {\r\n        \"in\": \"y_pred_ids\",\r\n        \"out\": \"y_pred_labels\",\r\n        \"ref\": \"classes_vocab\"\r\n      },\r\n      {\r\n        \"ref\": \"my_one_hotter\",\r\n        \"in\": \"y_pred_ids\",\r\n        \"out\": \"y_pred_onehot\"\r\n      }\r\n    ],\r\n    \"out\": [\r\n      \"y_pred_labels\"\r\n    ]\r\n  },\r\n  \"train\": {\r\n    \"epochs\": 100,\r\n    \"batch_size\": 64,\r\n    \"metrics\": [\r\n      {\r\n        \"name\": \"f1_macro\",\r\n        \"inputs\": [\r\n          \"y_onehot\",\r\n          \"y_pred_onehot\"\r\n        ]\r\n      },\r\n      {\r\n        \"name\": \"f1_weighted\",\r\n        \"inputs\": [\r\n          \"y_onehot\",\r\n          \"y_pred_onehot\"\r\n        ]\r\n      },\r\n      {\r\n        \"name\": \"accuracy\",\r\n        \"inputs\": [\r\n          \"y\",\r\n          \"y_pred_labels\"\r\n        ]\r\n      },\r\n      {\r\n        \"name\": \"roc_auc\",\r\n        \"inputs\": [\r\n          \"y_onehot\",\r\n          \"y_pred_probas\"\r\n        ]\r\n      }\r\n    ],\r\n    \"validation_patience\": 10,\r\n    \"val_every_n_epochs\": 1,\r\n    \"log_every_n_epochs\": 1,\r\n    \"log_every_n_batches\": 100,\r\n    \"show_examples\": false,\r\n    \"evaluation_targets\": [\r\n      \"train\",\r\n      \"valid\",\r\n      \"test\"\r\n    ],\r\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/logs\",\r\n    \"class_name\": 
\"torch_trainer\"\r\n  },\r\n  \"metadata\": {\r\n    \"variables\": {\r\n      \"TRANSFORMER\": \"distilbert-base-uncased\",\r\n      \"ROOT_PATH\": \"~/.deeppavlov\",\r\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\r\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\r\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/topic_distilbert_base_v0\"\r\n    },\r\n    \"download\": [\r\n      {\r\n        \"url\": \"http://files.deeppavlov.ai/datasets/dp_topics_downsampled_dataset_v0.tar.gz\",\r\n        \"subdir\": \"{DOWNLOADS_PATH}\"\r\n      },\r\n      {\r\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/classifiers/topic_distilbert_base_v0.tar.gz\",\r\n        \"subdir\": \"{MODELS_PATH}/classifiers\"\r\n      }\r\n    ]\r\n  }\r\n}"
  },
  {
    "path": "deeppavlov/configs/doc_retrieval/en_ranker_pop_wiki.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"odqa_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/odqa/enwiki\",\n    \"save_path\": \"{DOWNLOADS_PATH}/odqa/enwiki.db\",\n    \"dataset_format\": \"wiki\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"sqlite_iterator\",\n    \"shuffle\": false,\n    \"load_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_l100.db\"\n  },\n  \"chainer\": {\n    \"in\": [\"docs\"],\n    \"in_y\": [\"doc_ids\", \"doc_nums\"],\n    \"out\": [\"pop_doc_ids\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"hashing_tfidf_vectorizer\",\n        \"id\": \"vectorizer\",\n        \"fit_on\": [\"docs\", \"doc_ids\", \"doc_nums\"],\n        \"save_path\": \"{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz\",\n        \"load_path\": \"{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz\",\n        \"tokenizer\": {\n          \"class_name\": \"stream_spacy_tokenizer\",\n          \"lemmas\": true,\n          \"lowercase\": true,\n          \"filter_stopwords\": true,\n          \"ngram_range\": [1, 3]\n        }\n      },\n      {\n        \"class_name\": \"tfidf_ranker\",\n        \"top_n\": 100,\n        \"in\": [\"docs\"],\n        \"out\": [\"tfidf_doc_ids\", \"tfidf_doc_scores\"],\n        \"vectorizer\": \"#vectorizer\"\n      },\n      {\n        \"class_name\": \"pop_ranker\",\n        \"pop_dict_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_popularities.json\",\n        \"load_path\": \"{MODELS_PATH}/odqa/logreg_3features_v2.joblib\",\n        \"top_n\": 100,\n        \"in\": [\"tfidf_doc_ids\", \"tfidf_doc_scores\"],\n        \"out\": [\"pop_doc_ids\", \"pop_doc_scores\"]\n      }\n    ]\n  },\n  \"train\": {\n    \"batch_size\": 10000,\n    \"evaluation_targets\": [],\n    \"class_name\": \"fit_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\"\n    },\n    
\"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_l100.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/odqa\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_tfidf_matrix_par_lite.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/odqa\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_popularities.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/odqa\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/ranking/logreg_3features_v2.joblib\",\n        \"subdir\": \"{MODELS_PATH}/odqa\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/doc_retrieval/en_ranker_tfidf_wiki.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"odqa_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/odqa/enwiki\",\n    \"save_path\": \"{DOWNLOADS_PATH}/odqa/enwiki.db\",\n    \"dataset_format\": \"wiki\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"sqlite_iterator\",\n    \"shuffle\": false,\n    \"load_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_l100.db\"\n  },\n  \"chainer\": {\n    \"in\": [\"docs\"],\n    \"in_y\": [\"doc_ids\", \"doc_nums\"],\n    \"out\": [\"tfidf_doc_ids\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"hashing_tfidf_vectorizer\",\n        \"id\": \"vectorizer\",\n        \"fit_on\": [\"docs\", \"doc_ids\", \"doc_nums\"],\n        \"save_path\": \"{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz\",\n        \"load_path\": \"{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz\",\n        \"tokenizer\": {\n          \"class_name\": \"stream_spacy_tokenizer\",\n          \"lemmas\": true,\n          \"lowercase\": true,\n          \"filter_stopwords\": true,\n          \"ngram_range\": [1, 3]\n        }\n      },\n      {\n        \"class_name\": \"tfidf_ranker\",\n        \"top_n\": 100,\n        \"in\": [\"docs\"],\n        \"out\": [\"tfidf_doc_ids\", \"tfidf_doc_scores\"],\n        \"vectorizer\": \"#vectorizer\"\n      }\n    ]\n  },\n  \"train\": {\n    \"batch_size\": 10000,\n    \"evaluation_targets\": [],\n    \"class_name\": \"fit_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_l100.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/odqa\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_tfidf_matrix_par_lite.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/odqa\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/doc_retrieval/ru_ranker_tfidf_wiki.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"odqa_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/odqa/ruwiki\",\n    \"save_path\": \"{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db\",\n    \"dataset_format\": \"wiki\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"sqlite_iterator\",\n    \"shuffle\": false,\n    \"load_path\": \"{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db\"\n  },\n  \"chainer\": {\n    \"in\": [\"docs\"],\n    \"in_y\": [\"doc_ids\", \"doc_nums\"],\n    \"out\": [\"tfidf_doc_ids\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"hashing_tfidf_vectorizer\",\n        \"id\": \"vectorizer\",\n        \"fit_on\": [\"docs\", \"doc_ids\", \"doc_nums\"],\n        \"save_path\": \"{MODELS_PATH}/odqa/ruwiki_tfidf_matrix_compr.npz\",\n        \"load_path\": \"{MODELS_PATH}/odqa/ruwiki_tfidf_matrix_compr.npz\",\n        \"tokenizer\": {\n          \"class_name\": \"stream_spacy_tokenizer\",\n          \"spacy_model\": \"ru_core_news_sm\",\n          \"lemmas\": true,\n          \"lowercase\": true,\n          \"filter_stopwords\": true,\n          \"ngram_range\": [1, 3]\n        }\n      },\n      {\n        \"class_name\": \"tfidf_ranker\",\n        \"top_n\": 100,\n        \"in\": [\"docs\"],\n        \"out\": [\"tfidf_doc_ids\", \"tfidf_doc_scores\"],\n        \"vectorizer\": \"#vectorizer\"\n      }\n    ]\n  },\n  \"train\": {\n    \"batch_size\": 10000,\n    \"evaluation_targets\": [],\n    \"class_name\": \"fit_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/odqa/ruwiki_par_page_compr.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/odqa\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/odqa/ruwiki_tfidf_matrix_compr.tar.gz\",\n  
      \"subdir\": \"{MODELS_PATH}/odqa\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/embedder/bert_embedder.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"texts\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"transformers_bert_preprocessor\",\n        \"vocab_file\": \"{BERT_PATH}/vocab.txt\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"in\": [\"texts\"],\n        \"out\": [\"tokens\", \"subword_tokens\", \"subword_tok_ids\", \"startofword_markers\", \"attention_mask\"]\n      },\n      {\n        \"class_name\": \"transformers_bert_embedder\",\n        \"bert_config_path\": \"{BERT_PATH}/bert_config.json\",\n        \"load_path\": \"{BERT_PATH}\",\n        \"truncate\": true,\n        \"in\": [\"subword_tok_ids\", \"startofword_markers\", \"attention_mask\"],\n        \"out\": [\"word_emb\", \"subword_emb\", \"max_emb\", \"mean_emb\", \"pooler_output\"]\n      }\n    ],\n    \"out\": [\"tokens\", \"word_emb\", \"subword_tokens\", \"subword_emb\", \"max_emb\", \"mean_emb\", \"pooler_output\"]\n  },\n  \"train\": {},\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"BERT_PATH\": \"{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12_pt\"\n    },\n    \"labels\": {},\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12_pt.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/bert_models\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/embedder/bert_sentence_embedder.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"texts\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"transformers_bert_preprocessor\",\n        \"vocab_file\": \"{BERT_PATH}/vocab.txt\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"in\": [\"texts\"],\n        \"out\": [\"tokens\", \"subword_tokens\", \"subword_tok_ids\", \"startofword_markers\", \"attention_mask\"]\n      },\n      {\n        \"class_name\": \"transformers_bert_embedder\",\n        \"bert_config_path\": \"{BERT_PATH}/config.json\",\n        \"load_path\": \"{BERT_PATH}\",\n        \"truncate\": false,\n        \"in\": [\"subword_tok_ids\", \"startofword_markers\", \"attention_mask\"],\n        \"out\": [\"word_emb\", \"subword_emb\", \"max_emb\", \"mean_emb\", \"pooler_output\"]\n      }\n    ],\n    \"out\": [\"max_emb\", \"mean_emb\", \"pooler_output\"]\n  },\n  \"train\": {},\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"BERT_PATH\": \"{DOWNLOADS_PATH}/bert_models/sentence_multi_cased_L-12_H-768_A-12_pt_v1\"\n    },\n    \"labels\": {},\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt_v1.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/bert_models\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/entity_extraction/entity_detection_en.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"ner_chunker\",\n        \"batch_size\": 16,\n        \"max_seq_len\" : 300,\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"in\": [\"x\"],\n        \"out\": [\"x_chunk\", \"chunk_nums\", \"chunk_sentences_offsets\", \"chunk_sentences\"]\n      },\n      {\n        \"thres_proba\": 0.6,\n        \"o_tag\": \"O\",\n        \"tags_file\": \"{NER_PATH}/tag.dict\",\n        \"class_name\": \"entity_detection_parser\",\n        \"id\": \"edp\"\n      },\n      {\n        \"class_name\": \"ner_chunk_model\",\n        \"ner\": {\n          \"config_path\": \"{CONFIGS_PATH}/ner/ner_ontonotes_bert.json\",\n          \"overwrite\": {\n            \"chainer.out\": [\"x_tokens\", \"tokens_offsets\", \"y_pred\", \"probas\"]\n          }\n        },\n        \"ner_parser\": \"#edp\",\n        \"in\": [\"x_chunk\", \"chunk_nums\", \"chunk_sentences_offsets\", \"chunk_sentences\"],\n        \"out\": [\"entity_substr\", \"entity_offsets\", \"entity_positions\", \"tags\", \"sentences_offsets\", \"sentences\", \"probas\"]\n      }\n    ],\n    \"out\": [\"entity_substr\", \"entity_offsets\", \"entity_positions\", \"tags\", \"sentences_offsets\", \"sentences\", \"probas\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"CONFIGS_PATH\": \"{DEEPPAVLOV_PATH}/configs\",\n      \"TRANSFORMER\": \"bert-base-cased\",\n      \"NER_PATH\": \"{MODELS_PATH}/ner_ontonotes_bert_torch_crf\"\n    }\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/entity_extraction/entity_detection_ru.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"ner_chunker\",\n        \"batch_size\": 16,\n        \"max_seq_len\" : 300,\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"in\": [\"x\"],\n        \"out\": [\"x_chunk\", \"chunk_nums\", \"chunk_sentences_offsets\", \"chunk_sentences\"]\n      },\n      {\n        \"thres_proba\": 0.05,\n        \"o_tag\": \"O\",\n        \"tags_file\": \"{NER_PATH}/tag.dict\",\n        \"class_name\": \"entity_detection_parser\",\n        \"id\": \"edp\"\n      },\n      {\n        \"class_name\": \"ner_chunk_model\",\n        \"ner\": {\"config_path\": \"{CONFIGS_PATH}/ner/ner_rus_bert_probas.json\"},\n        \"ner_parser\": \"#edp\",\n        \"in\": [\"x_chunk\", \"chunk_nums\", \"chunk_sentences_offsets\", \"chunk_sentences\"],\n        \"out\": [\"entity_substr\", \"entity_offsets\", \"entity_positions\", \"tags\", \"sentences_offsets\", \"sentences\", \"probas\"]\n      }\n    ],\n    \"out\": [\"entity_substr\", \"entity_offsets\", \"entity_positions\", \"tags\", \"sentences_offsets\", \"sentences\", \"probas\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"CONFIGS_PATH\": \"{DEEPPAVLOV_PATH}/configs\",\n      \"TRANSFORMER\": \"DeepPavlov/rubert-base-cased\",\n      \"NER_PATH\": \"{MODELS_PATH}/wiki_ner_rus_bert\"\n    }\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/entity_extraction/entity_extraction_en.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"pipe\": [\n      {\n        \"config_path\": \"{CONFIGS_PATH}/entity_extraction/entity_detection_en.json\",\n        \"in\": [\"x\"],\n        \"out\": [\"entity_substr\", \"entity_offsets\", \"entity_positions\", \"tags\", \"sentences_offsets\", \"sentences\", \"probas\"]\n      },\n      {\n        \"config_path\": \"{CONFIGS_PATH}/entity_extraction/entity_linking_en.json\",\n        \"in\": [\"entity_substr\", \"tags\", \"probas\", \"sentences\", \"entity_offsets\", \"sentences_offsets\"],\n        \"out\": [\"entity_ids\", \"entity_conf\", \"entity_pages\", \"entity_labels\"]\n      }\n    ],\n    \"out\": [\"entity_substr\", \"tags\", \"entity_offsets\", \"entity_ids\", \"entity_conf\", \"entity_pages\", \"entity_labels\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"CONFIGS_PATH\": \"{DEEPPAVLOV_PATH}/configs\"\n    }\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/entity_extraction/entity_extraction_ru.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"pipe\": [\n      {\n        \"config_path\": \"{CONFIGS_PATH}/entity_extraction/entity_detection_ru.json\",\n        \"in\": [\"x\"],\n        \"out\": [\"entity_substr\", \"entity_offsets\", \"entity_positions\", \"tags\", \"sentences_offsets\", \"sentences\", \"probas\"]\n      },\n      {\n        \"config_path\": \"{CONFIGS_PATH}/entity_extraction/entity_linking_ru.json\",\n        \"in\": [\"entity_substr\", \"tags\", \"probas\", \"sentences\", \"entity_offsets\", \"sentences_offsets\"],\n        \"out\": [\"entity_ids\", \"entity_conf\", \"entity_pages\", \"entity_labels\"]\n      }\n    ],\n    \"out\": [\"entity_substr\", \"tags\", \"entity_offsets\", \"entity_ids\", \"entity_conf\", \"entity_pages\", \"entity_labels\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"CONFIGS_PATH\": \"{DEEPPAVLOV_PATH}/configs\"\n    }\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/entity_extraction/entity_linking_en.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"entity_substr\", \"tags\", \"probas\", \"sentences\", \"entity_offsets\", \"sentences_offsets\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_entity_ranker_infer\",\n        \"id\": \"entity_descr_ranking\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"encoder_weights_path\": \"{MODELS_PATH}/entity_linking_eng/encoder.pth.tar\",\n        \"bilinear_weights_path\": \"{MODELS_PATH}/entity_linking_eng/bilinear.pth.tar\",\n        \"special_token_id\": 30522,\n        \"emb_size\": 512,\n        \"block_size\": 8\n      },\n      {\n        \"class_name\": \"entity_linker\",\n        \"in\": [\"entity_substr\", \"tags\", \"probas\", \"sentences\", \"entity_offsets\", \"sentences_offsets\"],\n        \"out\": [\"entity_ids\", \"entity_conf\", \"entity_pages\", \"entity_labels\"],\n        \"load_path\": \"{DOWNLOADS_PATH}/entity_linking_eng\",\n        \"entities_database_filename\": \"el_eng_v2.db\",\n        \"entity_ranker\": \"#entity_descr_ranking\",\n        \"rank_in_runtime\": true,\n        \"num_entities_for_bert_ranking\": 20,\n        \"include_mention\": false,\n        \"num_entities_to_return\": 3,\n        \"lemmatize\": true,\n        \"use_descriptions\": true,\n        \"use_connections\": true,\n        \"use_tags\": true,\n        \"full_paragraph\": true,\n        \"return_confidences\": true,\n        \"lang\": \"en\"\n      }\n    ],\n    \"out\": [\"entity_ids\", \"entity_conf\", \"entity_pages\", \"entity_labels\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"prajjwal1/bert-small\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/downloads/el_db_eng_v2.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/entity_linking_eng\"\n      },\n      {\n  
      \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/entity_linking/el_ranker_eng.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/entity_linking_eng\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/entity_extraction/entity_linking_ru.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"entity_substr\", \"tags\", \"probas\", \"sentences\", \"entity_offsets\", \"sentences_offsets\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_entity_ranker_infer\",\n        \"id\": \"entity_descr_ranking\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"encoder_weights_path\": \"{MODELS_PATH}/entity_linking_rus/encoder.pth.tar\",\n        \"bilinear_weights_path\": \"{MODELS_PATH}/entity_linking_rus/bilinear.pth.tar\",\n        \"special_token_id\": 30522,\n        \"emb_size\": 264,\n        \"block_size\": 6\n      },\n      {\n        \"class_name\": \"entity_linker\",\n        \"in\": [\"entity_substr\", \"tags\", \"probas\", \"sentences\", \"entity_offsets\", \"sentences_offsets\"],\n        \"out\": [\"entity_ids\", \"entity_conf\", \"entity_pages\", \"entity_labels\"],\n        \"load_path\": \"{DOWNLOADS_PATH}/entity_linking_rus\",\n        \"entities_database_filename\": \"el_rus_v2.db\",\n        \"words_dict_filename\": \"{DOWNLOADS_PATH}/entity_linking_rus/words_dict.pickle\",\n        \"ngrams_matrix_filename\": \"{DOWNLOADS_PATH}/entity_linking_rus/ngrams_matrix.npz\",\n        \"entity_ranker\": \"#entity_descr_ranking\",\n        \"rank_in_runtime\": true,\n        \"num_entities_for_bert_ranking\": 30,\n        \"use_gpu\": false,\n        \"include_mention\": false,\n        \"num_entities_to_return\": 3,\n        \"lemmatize\": true,\n        \"use_descriptions\": true,\n        \"use_connections\": true,\n        \"use_tags\": true,\n        \"kb_filename\": \"{DOWNLOADS_PATH}/wikidata/wikidata_lite.hdt\",\n        \"prefixes\": {\"entity\": [\"http://we\"],\n                     \"rels\": {\"direct\": \"http://wpd\",\n                              \"no_type\": \"http://wp\",\n                              \"statement\": \"http://wps\",\n                              \"qualifier\": \"http://wpq\"\n                              }\n                     
},\n        \"full_paragraph\": true,\n        \"return_confidences\": true,\n        \"lang\": \"ru\"\n      }\n    ],\n    \"out\": [\"entity_ids\", \"entity_conf\", \"entity_pages\", \"entity_labels\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"DeepPavlov/distilrubert-tiny-cased-conversational-v1\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/downloads/el_files_rus_v2.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/entity_linking_rus\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/entity_linking/el_ranker_rus.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/entity_linking_rus\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/wikidata/wikidata_lite.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/wikidata\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/faq/fasttext_logreg.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"basic_classification_reader\",\n    \"format\": \"json\",\n    \"orient\": \"split\",\n    \"x\": \"text\",\n    \"y\": \"category\",\n    \"data_path\": \"{DOWNLOADS_PATH}/massive/{LANGUAGE}\",\n    \"train\": \"train.json\",\n    \"valid\": \"dev.json\",\n    \"test\": \"test.json\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\",\n    \"seed\": 42,\n    \"shuffle\": true,\n    \"shot\": 5\n  },\n  \"chainer\": {\n    \"in\": [\"text\"],\n    \"in_y\": [\"category\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"stream_spacy_tokenizer\",\n        \"in\": [\"text\"],\n        \"id\": \"my_tokenizer\",\n        \"lemmas\": false,\n        \"out\": \"token_lemmas\",\n        \"spacy_model\": \"{SPACY_MODEL}\"\n      },\n      {\n        \"ref\": \"my_tokenizer\",\n        \"in\": [\"token_lemmas\"],\n        \"out\": [\"text_lem\"]\n      },\n      {\n        \"class_name\": \"fasttext\",\n        \"in\": [\"token_lemmas\"],\n        \"load_path\": \"{DOWNLOADS_PATH}/embeddings/fasttext/{LANGUAGE}.bin\",\n        \"mean\": true,\n        \"out\": [\"text_vector\"]\n      },\n      {\n        \"id\": \"answers_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": \"category\",\n        \"save_path\": \"{MODEL_PATH}/cat_answers.dict\",\n        \"load_path\": \"{MODEL_PATH}/cat_answers.dict\",\n        \"in\": [\"category\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"text_vector\"],\n        \"fit_on\": [\"text_vector\", \"y_ids\"],\n        \"out\": [\"y_pred_proba\"],\n        \"class_name\": \"sklearn_component\",\n        \"main\": true,\n        \"save_path\": \"{MODEL_PATH}/model.pkl\",\n        \"load_path\": \"{MODEL_PATH}/model.pkl\",\n        \"model_class\": \"sklearn.linear_model:LogisticRegression\",\n        \"infer_method\": \"predict_proba\",\n        \"C\": 10,\n        \"penalty\": \"l2\"\n    
  },\n      {\n        \"in\": [\"y_pred_proba\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_category\"],\n        \"ref\": \"answers_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_category\"]\n  },\n  \"train\": {\n    \"evaluation_targets\": [\"train\", \"valid\", \"test\"],\n    \"class_name\": \"fit_trainer\",\n    \"metrics\": [\n      {\n        \"name\": \"accuracy\",\n        \"inputs\": [\"category\", \"y_pred_category\"]\n      }\n    ]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"LANGUAGE\": \"en\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"SPACY_MODEL\": \"en_core_web_sm\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODEL_PATH\": \"{ROOT_PATH}/models/faq/{LANGUAGE}/fasttext_logreg\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/embeddings/fasttext/{LANGUAGE}.bin\",\n        \"subdir\": \"{DOWNLOADS_PATH}/embeddings/fasttext\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/massive-{LANGUAGE}.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/massive/{LANGUAGE}\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/faq/fasttext_logreg_{LANGUAGE}.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/kbqa/kbqa_cq_en.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"lcquad_reader\",\n    \"question_types\": [\"statement_property\", \"right-subgraph\", \"simple question left\",\n                      \"simple question right\", \"left-subgraph\", \"rank\"],\n    \"num_samples\": 100,\n    \"data_path\": \"{DOWNLOADS_PATH}/lcquad/lcquad2.json\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"gold_answer_ids\", \"gold_answer_labels\", \"gold_query\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"question_sign_checker\",\n        \"in\": [\"x\"],\n        \"out\": [\"x_punct\"]\n      },\n      {\n        \"config_path\": \"{CONFIGS_PATH}/classifiers/query_pr.json\",\n        \"in\": [\"x_punct\"],\n        \"out\": [\"template_type\"]\n      },\n      {\n        \"class_name\": \"query_formatter\",\n        \"query_info\": {\"unk_var\": \"?answer\", \"mid_var\": \"?ent\"},\n        \"in\": [\"gold_query\"],\n        \"out\": [\"f_gold_query\"]\n      },\n      {\n        \"config_path\": \"{CONFIGS_PATH}/entity_extraction/entity_detection_en.json\",\n        \"overwrite\": {\n            \"chainer.pipe.1.make_tags_from_probas\": true,\n            \"chainer.pipe.2.ner\": {\n              \"config_path\": \"{CONFIGS_PATH}/ner/ner_ontonotes_bert.json\",\n              \"overwrite\": {\n                \"chainer.out\": [\"x_tokens\", \"tokens_offsets\", \"y_pred\", \"probas\"],\n                \"chainer.pipe.2.use_crf\": false,\n                \"metadata.variables.TRANSFORMER\": \"distilbert-base-cased\",\n                \"metadata.variables.MODEL_PATH\": \"{MODELS_PATH}/entity_type_detection_distilbert_lcquad2.0\"\n              }\n            },\n            \"metadata.variables.NER_PATH\": \"{MODELS_PATH}/entity_type_detection_distilbert_lcquad2.0\"\n        },\n        \"in\": [\"x_punct\", \"template_type\"],\n        \"out\": [\"entity_type_substr\", 
\"entity_offsets\", \"entity_positions\", \"tags\", \"sentences_offsets\", \"sentences\", \"probas\"]\n      },\n      {\n        \"class_name\": \"entity_type_split\",\n        \"in\": [\"entity_type_substr\", \"tags\"],\n        \"out\": [\"entity_substr\", \"entity_tags\", \"type_substr\"]\n      },\n      {\n        \"class_name\": \"answer_types_extractor\",\n        \"lang\": \"@en\",\n        \"types_filename\": \"{DOWNLOADS_PATH}/wikidata_eng/types_labels_dict_en.pickle\",\n        \"types_sets_filename\": \"{DOWNLOADS_PATH}/wikidata_eng/answer_types.pickle\",\n        \"in\": [\"x_punct\", \"entity_substr\", \"tags\"],\n        \"out\": [\"answer_types\", \"f_entity_substr\", \"f_tags\"]\n      },\n      {\n        \"class_name\": \"entity_linker\",\n        \"load_path\": \"{DOWNLOADS_PATH}/entity_linking_eng\",\n        \"entities_database_filename\": \"el_db_lcquad2.db\",\n        \"num_entities_to_return\": 7,\n        \"lemmatize\": true,\n        \"use_descriptions\": false,\n        \"use_connections\": false,\n        \"use_tags\": true,\n        \"alias_coef\": 1.0,\n        \"prefixes\": {\"entity\": [\"http://we\"],\n                     \"rels\": {\"direct\": \"http://wpd\",\n                              \"no_type\": \"http://wp\",\n                              \"statement\": \"http://wps\",\n                              \"qualifier\": \"http://wpq\"\n                              }\n                     },\n        \"return_confidences\": true,\n        \"lang\": \"en\",\n        \"id\": \"entity_linker\"\n      },\n      {\n        \"class_name\": \"wiki_parser\",\n        \"id\": \"wiki_p\",\n        \"wiki_filename\": \"{DOWNLOADS_PATH}/wikidata/wikidata_full.hdt\",\n        \"lang\": \"@en\"\n      },\n      {\n        \"class_name\": \"template_matcher\",\n        \"id\": \"template_m\",\n        \"num_processors\": 16,\n        \"load_path\": \"{DOWNLOADS_PATH}/wikidata_eng\",\n        \"templates_filename\": \"templates_eng.json\"\n  
    },\n      {\n        \"class_name\": \"rel_ranking_infer\",\n        \"id\": \"rel_r_inf\",\n        \"ranker\": {\"config_path\": \"{CONFIGS_PATH}/ranking/rel_ranking_roberta_en.json\",\n                   \"overwrite\": {\"chainer.out\": [\"y_pred_probas\"]}\n        },\n        \"wiki_parser\": \"#wiki_p\",\n        \"batch_size\": 32,\n        \"rank_answers\": true,\n        \"load_path\": \"{DOWNLOADS_PATH}/wikidata_eng\",\n        \"rel_q2name_filename\": \"wiki_dict_properties_eng.pickle\"\n      },\n      {\n        \"class_name\": \"query_generator\",\n        \"id\": \"query_g\",\n        \"entity_linker\": \"#entity_linker\",\n        \"template_matcher\": \"#template_m\",\n        \"rel_ranker\": \"#rel_r_inf\",\n        \"wiki_parser\": \"#wiki_p\",\n        \"load_path\": \"{DOWNLOADS_PATH}/wikidata\",\n        \"rels_in_ranking_queries_fname\": \"rels_in_ranking_queries.json\",\n        \"sparql_queries_filename\": \"{DOWNLOADS_PATH}/wikidata/sparql_queries_eng.json\",\n        \"entities_to_leave\": 5,\n        \"rels_to_leave\": 10,\n        \"return_answers\": false,\n        \"map_query_str_to_kb\": [[\"P0\", \"http://wd\"], [\"P00\", \"http://wl\"], [\"wd:\", \"http://we/\"], [\"wdt:\", \"http://wpd/\"],\n                                [\" p:\", \" http://wp/\"], [\"ps:\", \"http://wps/\"], [\"pq:\", \"http://wpq/\"]],\n        \"kb_prefixes\": {\"entity\": \"wd:E\", \"rel\": \"wdt:R\", \"type\": \"wd:T\", \"type_rel\": \"wdt:P\", \"type_rels\": [\"P31\", \"P279\"]},\n        \"gold_query_info\": {\"unk_var\": \"?answer\", \"mid_var\": \"?ent\"},\n        \"in\": [\"x_punct\", \"x_punct\", \"template_type\", \"entity_substr\", \"type_substr\", \"entity_tags\", \"probas\", \"answer_types\"],\n        \"out\": [\"cand_answers\", \"template_answers\"]\n      },\n      {\n        \"class_name\": \"rel_ranking_infer\",\n        \"ranker\": {\"config_path\": \"{CONFIGS_PATH}/ranking/path_ranking_nll_roberta_en.json\"},\n        \"wiki_parser\": 
\"#wiki_p\",\n        \"batch_size\": 32,\n        \"nll_path_ranking\": true,\n        \"return_elements\": [\"answer_ids\", \"queries\"],\n        \"rank_answers\": true,\n        \"load_path\": \"{DOWNLOADS_PATH}/wikidata_eng\",\n        \"rel_q2name_filename\": \"wiki_dict_properties_eng.pickle\",\n        \"in\": [\"x_punct\", \"template_type\", \"cand_answers\", \"entity_substr\", \"template_answers\"],\n        \"out\": [\"answers\", \"answer_ids\", \"query\"]\n      }\n    ],\n    \"out\": [\"answers\", \"answer_ids\", \"query\"]\n  },\n  \"train\": {\n    \"evaluation_targets\": [\"test\"],\n    \"batch_size\": 1,\n    \"metrics\": [\n      {\n        \"name\": \"kbqa_accuracy\",\n        \"inputs\": [\"x\", \"answers\", \"answer_ids\", \"query\", \"gold_answer_labels\", \"gold_answer_ids\", \"f_gold_query\"]\n      }\n    ],\n    \"class_name\": \"nn_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"CONFIGS_PATH\": \"{DEEPPAVLOV_PATH}/configs\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/datasets/lcquad2.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/lcquad\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/models/entity_type_detection_distilbert_lcquad2.0.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/entity_type_detection_distilbert_lcquad2.0\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/wikidata/queries_and_rels_lcquad2_v2.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/wikidata\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/downloads/el_db_lcquad2.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/entity_linking_eng\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/wikidata/wikidata_full.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/wikidata\"\n  
    },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/wikidata/kbqa_files_en.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/wikidata_eng\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/kbqa/kbqa_cq_ru.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"rubq_reader\",\n    \"version\": \"2.0\",\n    \"question_types\": [\"all\"],\n    \"num_samples\": 100,\n    \"data_path\": \"{DOWNLOADS_PATH}/rubq/rubq2.0.json\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"gold_answer_ids\", \"gold_answer_labels\", \"gold_query\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"question_sign_checker\",\n        \"delete_brackets\": true,\n        \"in\": [\"x\"],\n        \"out\": [\"x_punct\"]\n      },\n      {\n        \"class_name\": \"query_formatter\",\n        \"query_info\": {\"unk_var\": \"?answer\", \"mid_var\": \"?ent\"},\n        \"in\": [\"gold_query\"],\n        \"out\": [\"f_gold_query\"]\n      },\n      {\n        \"class_name\": \"ner_chunker\",\n        \"batch_size\": 16,\n        \"max_seq_len\" : 300,\n        \"vocab_file\": \"distilbert-base-multilingual-cased\",\n        \"in\": [\"x_punct\"],\n        \"out\": [\"x_chunk\", \"chunk_nums\", \"chunk_sentences_offsets\", \"chunk_sentences\"]\n      },\n      {\n        \"thres_proba\": 0.05,\n        \"o_tag\": \"O\",\n        \"tags_file\": \"{NER_PATH}/tag.dict\",\n        \"class_name\": \"entity_detection_parser\",\n        \"ignored_tags\": [\"DATE\", \"CARDINAL\", \"ORDINAL\", \"QUANTITY\", \"PERCENT\", \"NORP\"],\n        \"lang\": \"ru\",\n        \"id\": \"edp\"\n      },\n      {\n        \"thres_proba\": 0.05,\n        \"o_tag\": \"O\",\n        \"tags_file\": \"{NER_PATH2}/tag.dict\",\n        \"class_name\": \"entity_detection_parser\",\n        \"ignored_tags\": [\"T\"],\n        \"lang\": \"ru\",\n        \"id\": \"edp2\"\n      },\n      {\n        \"class_name\": \"ner_chunk_model\",\n        \"ner\": {\n          \"config_path\": \"{CONFIGS_PATH}/ner/ner_ontonotes_bert_mult.json\",\n          \"overwrite\": {\n            \"chainer.pipe.2.use_crf\": false,\n            
\"metadata.variables.TRANSFORMER\": \"distilbert-base-multilingual-cased\",\n            \"chainer.out\": [\"x_tokens\", \"tokens_offsets\", \"y_pred\", \"probas\"],\n            \"metadata.variables.MODEL_PATH\": \"{MODELS_PATH}/ner_ontonotes_torch_distilbert_mult\"\n          }\n        },\n        \"ner_parser\": \"#edp\",\n        \"ner2\": {\n          \"config_path\": \"{CONFIGS_PATH}/ner/ner_ontonotes_bert_mult.json\",\n          \"overwrite\": {\n            \"chainer.pipe.2.use_crf\": false,\n            \"metadata.variables.TRANSFORMER\": \"DeepPavlov/distilrubert-small-cased-conversational\",\n            \"chainer.out\": [\"x_tokens\", \"tokens_offsets\", \"y_pred\", \"probas\"],\n            \"metadata.variables.MODEL_PATH\": \"{MODELS_PATH}/entity_detection_rubq\"\n          }\n        },\n        \"ner_parser2\": \"#edp2\",\n        \"in\": [\"x_chunk\", \"chunk_nums\", \"chunk_sentences_offsets\", \"chunk_sentences\"],\n        \"out\": [\"entity_substr\", \"entity_offsets\", \"entity_positions\", \"tags\", \"sentences_offsets\", \"sentences\", \"probas\"]\n      },\n      {\n        \"class_name\": \"answer_types_extractor\",\n        \"lang\": \"@ru\",\n        \"types_filename\": \"{DOWNLOADS_PATH}/wikidata_rus/types_labels_dict_ru.pickle\",\n        \"types_sets_filename\": \"{DOWNLOADS_PATH}/wikidata_rus/answer_types.pickle\",\n        \"in\": [\"x_punct\", \"entity_substr\", \"tags\"],\n        \"out\": [\"answer_types\", \"f_entity_substr\", \"f_tags\"]\n      },\n      {\n        \"class_name\": \"entity_linker\",\n        \"load_path\": \"{DOWNLOADS_PATH}/entity_linking_rus\",\n        \"entities_database_filename\": \"el_db_rus.db\",\n        \"words_dict_filename\": \"{DOWNLOADS_PATH}/entity_linking_rus/words_dict.pickle\",\n        \"ngrams_matrix_filename\": \"{DOWNLOADS_PATH}/entity_linking_rus/ngrams_matrix.npz\",\n        \"include_mention\": false,\n        \"num_entities_to_return\": 7,\n        \"lemmatize\": true,\n        
\"use_descriptions\": false,\n        \"use_connections\": true,\n        \"use_tags\": true,\n        \"kb_filename\": \"{DOWNLOADS_PATH}/wikidata/wikidata_full.hdt\",\n        \"prefixes\": {\"entity\": [\"http://we\"],\n                     \"rels\": {\"direct\": \"http://wpd\",\n                              \"no_type\": \"http://wp\",\n                              \"statement\": \"http://wps\",\n                              \"qualifier\": \"http://wpq\"\n                              }\n                     },\n        \"return_confidences\": true,\n        \"lang\": \"ru\",\n        \"id\": \"entity_linker\"\n      },\n      {\n        \"class_name\": \"wiki_parser\",\n        \"id\": \"wiki_p\",\n        \"wiki_filename\": \"{DOWNLOADS_PATH}/wikidata/wikidata_full.hdt\",\n        \"max_comb_num\": 40000,\n        \"lang\": \"@ru\"\n      },\n      {\n        \"class_name\": \"slovnet_syntax_parser\",\n        \"load_path\": \"{MODELS_PATH}/slovnet_syntax_parser\",\n        \"navec_filename\": \"{MODELS_PATH}/slovnet_syntax_parser/navec_news_v1_1B_250K_300d_100q.tar\",\n        \"syntax_parser_filename\": \"{MODELS_PATH}/slovnet_syntax_parser/slovnet_syntax_news_v1.tar\",\n        \"tree_patterns_filename\": \"{MODELS_PATH}/slovnet_syntax_parser/tree_patterns.json\",\n        \"id\": \"slovnet_parser\"\n      },\n      {\n        \"class_name\": \"ru_adj_to_noun\",\n        \"freq_dict_filename\": \"{DOWNLOADS_PATH}/wikidata_rus/freqrnc2011.csv\",\n        \"id\": \"adj2noun\"\n      },\n      {\n        \"class_name\": \"tree_to_sparql\",\n        \"sparql_queries_filename\": \"{DOWNLOADS_PATH}/wikidata/sparql_queries_rus.json\",\n        \"adj_to_noun\": \"#adj2noun\",\n        \"syntax_parser\": \"#slovnet_parser\",\n        \"kb_prefixes\": {\"entity\": \"wd:E\", \"rel\": \"wdt:R\", \"type\": \"wd:T\", \"type_rel\": \"wdt:P\", \"type_rels\": [\"P31\", \"P279\"]},\n        \"in\": [\"x_punct\", \"entity_substr\", \"tags\", \"entity_offsets\", 
\"entity_positions\", \"probas\"],\n        \"out\": [\"x_sanitized\", \"query_nums\", \"s_entity_substr\", \"s_tags\", \"s_probas\", \"entities_to_link\", \"s_types_substr\"]\n      },\n      {\n        \"class_name\": \"template_matcher\",\n        \"id\": \"template_m\",\n        \"num_processors\": 8,\n        \"load_path\": \"{DOWNLOADS_PATH}/wikidata_rus\",\n        \"templates_filename\": \"templates_rus.json\"\n      },\n      {\n        \"class_name\": \"rel_ranking_infer\",\n        \"id\": \"rel_r_inf\",\n        \"ranker\": {\"config_path\": \"{CONFIGS_PATH}/ranking/rel_ranking_nll_bert_ru.json\"},\n        \"wiki_parser\": \"#wiki_p\",\n        \"batch_size\": 32,\n        \"nll_rel_ranking\": true,\n        \"return_elements\": [\"answer_ids\", \"queries\"],\n        \"load_path\": \"{DOWNLOADS_PATH}/wikidata_rus\",\n        \"rank\": false,\n        \"rel_thres\": -4.0,\n        \"type_rels\": [\"P31\", \"P279\"],\n        \"rel_q2name_filename\": \"wiki_dict_properties_full_rus.pickle\"\n      },\n      {\n        \"class_name\": \"query_generator\",\n        \"id\": \"query_g\",\n        \"entity_linker\": \"#entity_linker\",\n        \"template_matcher\": \"#template_m\",\n        \"rel_ranker\": \"#rel_r_inf\",\n        \"wiki_parser\": \"#wiki_p\",\n        \"load_path\": \"{DOWNLOADS_PATH}/wikidata\",\n        \"rels_in_ranking_queries_fname\": \"rels_in_ranking_queries.json\",\n        \"sparql_queries_filename\": \"{DOWNLOADS_PATH}/wikidata/sparql_queries_rus.json\",\n        \"entities_to_leave\": 9,\n        \"rels_to_leave\": 10,\n        \"max_comb_num\": 1000,\n        \"map_query_str_to_kb\": [[\"P0\", \"http://wd\"], [\"P00\", \"http://wl\"], [\"wd:\", \"http://we/\"], [\"wdt:\", \"http://wpd/\"],\n                                [\" p:\", \" http://wp/\"], [\"ps:\", \"http://wps/\"], [\"pq:\", \"http://wpq/\"]],\n        \"kb_prefixes\": {\"entity\": \"wd:E\", \"rel\": \"wdt:R\", \"type\": \"wd:T\", \"type_rel\": \"wdt:P\", 
\"type_rels\": [\"P31\", \"P279\"]},\n        \"gold_query_info\": {\"unk_var\": \"?answer\", \"mid_var\": \"?ent\"},\n        \"syntax_structure_known\": true,\n        \"in\": [\"x_punct\", \"x_sanitized\", \"query_nums\", \"s_entity_substr\", \"s_types_substr\", \"s_tags\", \"s_probas\", \"answer_types\", \"entities_to_link\"],\n        \"out\": [\"answers\", \"answer_ids\", \"query\"]\n      }\n    ],\n    \"out\": [\"answers\", \"answer_ids\", \"query\"]\n  },\n  \"train\": {\n    \"evaluation_targets\": [\"test\"],\n    \"batch_size\": 1,\n    \"metrics\": [\n      {\n        \"name\": \"kbqa_accuracy\",\n        \"inputs\": [\"x\", \"answers\", \"answer_ids\", \"query\", \"gold_answer_labels\", \"gold_answer_ids\", \"f_gold_query\"]\n      }\n    ],\n    \"class_name\": \"nn_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"CONFIGS_PATH\": \"{DEEPPAVLOV_PATH}/configs\",\n      \"NER_PATH\": \"{MODELS_PATH}/ner_ontonotes_torch_distilbert_mult\",\n      \"NER_PATH2\": \"{MODELS_PATH}/entity_detection_rubq\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/rubq2.0.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/rubq\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/downloads/el_files_rus.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/entity_linking_rus\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/models/ner_ontonotes_torch_distilbert_mult.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/ner_ontonotes_torch_distilbert_mult\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/models/entity_detection_rubq.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/entity_detection_rubq\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/wikidata/queries_and_rels_rus_v2.tar.gz\",\n    
    \"subdir\": \"{DOWNLOADS_PATH}/wikidata\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/wikidata/kbqa_files_ru.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/wikidata_rus\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/syntax_parser/slovnet_syntax_parser_v2.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/slovnet_syntax_parser\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/wikidata/wikidata_full.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/wikidata\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/kbqa/wiki_parser.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"parser_info\", \"query\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"wiki_parser\",\n        \"in\": [\"parser_info\", \"query\"],\n        \"out\": [\"wiki_parser_output\"],\n        \"wiki_filename\": \"{DOWNLOADS_PATH}/wikidata/wikidata_compr.pickle\",\n        \"file_format\": \"pickle\",\n        \"lang\": \"@en\"\n      }\n    ],\n    \"out\": [\"wiki_parser_output\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"CONFIGS_PATH\": \"{DEEPPAVLOV_PATH}/configs\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/wikidata/wikidata_compr.pickle\",\n        \"subdir\": \"{DOWNLOADS_PATH}/wikidata\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/morpho_syntax_parser/morpho_ru_syntagrus_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"morphotagger_dataset_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/UD2.3_source\",\n    \"language\": \"ru_syntagrus\",\n    \"data_types\": [\"train\", \"dev\", \"test\"]\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"morphotagger_dataset_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"in\": [\"x\"],\n        \"class_name\": \"lazy_tokenizer\",\n        \"out\": [\"x_words\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\"x_words\"],\n        \"out\": [\"x_tokens\", \"x_subword_tokens\", \"x_subword_tok_ids\", \"startofword_markers\", \"attention_mask\", \"tokens_offsets\"]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"min_freq\": 3,\n        \"fit_on\": [\"y\"],\n        \"in\": [\"y\"],\n        \"out\": [\"y_ind\"],\n        \"special_tokens\": [\"PAD\", \"BEGIN\", \"END\"],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\"\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.5,\n        \"use_crf\": false,\n        \"encoder_layer_ids\": [-6, -5, -4, -3, -2, -1],\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 1e-06,\n          \"betas\": [0.9, 0.999],\n          \"eps\": 1e-06\n        },\n        \"clip_norm\": 1.0,\n        \"min_learning_rate\": 1e-07,\n        
\"learning_rate_drop_patience\": 10,\n        \"learning_rate_drop_div\": 1.5,\n        \"load_before_drop\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\"x_subword_tok_ids\", \"attention_mask\", \"startofword_markers\"],\n        \"in_y\": [\"y_ind\"],\n        \"out\": [\"y_pred_ind\", \"probas\"]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\"y_pred_ind\"],\n        \"out\": [\"y_pred\"]\n      },\n      {\n        \"in\": [\"x_words\"],\n        \"out\": [\"y_lemmas\"],\n        \"model\": \"ru_core_news_sm\",\n        \"class_name\": \"spacy_lemmatizer\"\n      },\n      {\n        \"in\": [\"x_words\", \"y_pred\", \"y_lemmas\"],\n        \"out\": [\"y_prettified\"],\n        \"id\": \"prettifier\",\n        \"class_name\": \"lemmatized_output_prettifier\"\n      }\n    ],\n    \"out\": [\"y_prettified\"]\n  },\n  \"train\": {\n    \"epochs\": 10,\n    \"batch_size\": 32,\n    \"metrics\": [\n      {\n        \"name\": \"per_token_accuracy\",\n        \"inputs\": [\"y\", \"y_pred\"]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"inputs\": [\"y\", \"y_pred\"]\n      }\n    ],\n    \"validation_patience\": 15,\n    \"val_every_n_epochs\": 1,\n    \"val_every_n_batches\": 300,\n    \"show_examples\": false,\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 8,\n    \"evaluation_targets\": [\"valid\", \"test\"],\n    \"class_name\": \"nn_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"DeepPavlov/rubert-base-cased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/morpho_ru_syntagrus_torch_bert\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.3/ru_syntagrus.tar.gz\",\n        \"subdir\": 
\"{DOWNLOADS_PATH}/UD2.3_source/ru_syntagrus\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/syntax_parsing/morpho_ru_syntagrus_torch_bert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/morpho_syntax_parser/ru_syntagrus_joint_parsing.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"x_words\"],\n    \"pipe\": [\n      {\n        \"id\": \"main\",\n        \"class_name\": \"joint_tagger_parser\",\n        \"tagger\": {\n          \"config_path\": \"{CONFIGS_PATH}/morpho_syntax_parser/morpho_ru_syntagrus_bert.json\",\n          \"overwrite\": {\"chainer.pipe.6.return_string\": false}\n        },\n        \"parser\": {\n          \"config_path\": \"{CONFIGS_PATH}/morpho_syntax_parser/syntax_ru_syntagrus_bert.json\",\n          \"overwrite\": {\"chainer.pipe.6.return_string\": false}\n        },\n        \"in\": [\"x_words\"],\n        \"out\": [\"y_parsed\"]\n      }\n    ],\n    \"out\": [\"y_parsed\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"CONFIGS_PATH\": \"{DEEPPAVLOV_PATH}/configs\"\n    }\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/morpho_syntax_parser/syntax_ru_syntagrus_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"morphotagger_dataset_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/UD2.3_source\",\n    \"language\": \"ru_syntagrus\",\n    \"data_types\": [\"train\", \"dev\", \"test\"],\n    \"read_syntax\": true\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"morphotagger_dataset_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"y_tags\", \"y_heads\", \"y_deps\"],\n    \"pipe\": [\n      {\n        \"in\": [\"x\"],\n        \"class_name\": \"lazy_tokenizer\",\n        \"out\": [\"x_words\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\"x_words\"],\n        \"out\": [\"x_tokens\", \"x_subword_tokens\", \"x_subword_tok_ids\", \"startofword_markers\", \"attention_mask\", \"tokens_offsets\"]\n      },\n      {\n        \"id\": \"dep_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"min_freq\": 3,\n        \"fit_on\": [\"y_deps\"],\n        \"in\": [\"y_deps\"],\n        \"out\": [\"y_deps_indexes\"],\n        \"special_tokens\": [\"PAD\"],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/deps.dict\",\n        \"load_path\": \"{MODEL_PATH}/deps.dict\"\n      },\n      {\n        \"class_name\": \"torch_transformers_syntax_parser\",\n        \"n_deps\": \"#dep_vocab.len\",\n        \"state_size\": 384,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.5,\n        \"return_probas\": true,\n        \"encoder_layer_ids\": [6, 7, 8, 9, 10, 11],\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 1e-06,\n          \"betas\": [0.9, 0.999],\n          \"eps\": 1e-06\n        },\n        
\"clip_norm\": 1.0,\n        \"min_learning_rate\": 1e-07,\n        \"use_birnn\": true,\n        \"learning_rate_drop_patience\": 10,\n        \"learning_rate_drop_div\": 1.5,\n        \"load_before_drop\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\"x_subword_tok_ids\", \"attention_mask\", \"startofword_markers\"],\n        \"in_y\": [\"y_heads\", \"y_deps_indexes\"],\n        \"out\": [\"y_predicted_heads_probs\", \"y_predicted_deps_indexes\"]\n      },\n      {\n        \"class_name\": \"chu_liu_edmonds_transformer\",\n        \"in\": [\"y_predicted_heads_probs\"],\n        \"out\": [\"y_predicted_heads\"]\n      },\n      {\n        \"ref\": \"dep_vocab\",\n        \"in\": [\"y_predicted_deps_indexes\"],\n        \"out\": [\"y_predicted_deps\"]\n      },\n      {\n        \"in\": [\"x_words\", \"y_predicted_heads\", \"y_predicted_deps\"],\n        \"out\": [\"y_prettified\"],\n        \"id\": \"dependency_output_prettifier\",\n        \"class_name\": \"dependency_output_prettifier\"\n      }\n    ],\n    \"out\": [\"y_prettified\"]\n  },\n  \"train\": {\n    \"epochs\": 10,\n    \"batch_size\": 32,\n    \"metrics\": [\n      {\n        \"name\": \"multitask_token_accuracy\",\n        \"alias\": \"LAS\",\n        \"inputs\": [\"y_deps\", \"y_heads\", \"y_predicted_deps\", \"y_predicted_heads\"]\n      },\n      {\n        \"name\": \"per_token_accuracy\",\n        \"alias\": \"UAS\",\n        \"inputs\": [\"y_heads\", \"y_predicted_heads\"]\n      }\n    ],\n    \"validation_patience\": 15,\n    \"val_every_n_batches\": 300,\n    \"show_examples\": false,\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 8,\n    \"evaluation_targets\": [\"valid\", \"test\"],\n    \"class_name\": \"nn_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": 
\"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"DeepPavlov/rubert-base-cased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/syntax_parsing/rus_6layers\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.3/ru_syntagrus.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/UD2.3_source/ru_syntagrus\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/syntax_parsing/rus_6layers.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/multitask/mt_glue.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"multitask_reader\",\n    \"task_defaults\": {\n      \"class_name\": \"huggingface_dataset_reader\",\n      \"path\": \"glue\",\n      \"train\": \"train\",\n      \"valid\": \"validation\"\n    },\n    \"tasks\": {\n      \"cola\": {\"name\": \"cola\"},\n      \"sst2\": {\"name\": \"sst2\"},\n      \"qqp\": {\"name\": \"qqp\"},\n      \"mrpc\": {\"name\": \"mrpc\"},\n      \"rte\": {\"name\": \"rte\"},\n      \"mnli\": {\n        \"name\": \"mnli\",\n        \"valid\": \"validation_matched\"\n      },\n      \"qnli\": {\"name\": \"qnli\"},\n      \"stsb\": {\"name\": \"stsb\"}\n    }\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"multitask_iterator\",\n    \"num_train_epochs\": \"{NUM_TRAIN_EPOCHS}\",\n    \"gradient_accumulation_steps\": \"{GRADIENT_ACC_STEPS}\",\n    \"seed\": 42,\n    \"task_defaults\": {\n      \"class_name\": \"huggingface_dataset_iterator\",\n      \"label\": \"label\",\n      \"use_label_name\": false,\n      \"seed\": 42\n    },\n    \"tasks\": {\n      \"cola\": {\n        \"features\": [\"sentence\"]\n      },\n      \"sst2\": {\n        \"features\": [\"sentence\"]\n      },\n      \"qqp\": {\n        \"features\": [\"question1\", \"question2\"]\n      },\n      \"mrpc\": {\n        \"features\": [\"sentence1\", \"sentence2\"]\n      },\n      \"rte\": {\n        \"features\": [\"sentence1\", \"sentence2\"]\n      },\n      \"mnli\": {\n        \"features\": [\"premise\", \"hypothesis\"]\n      },\n      \"qnli\": {\n        \"features\": [\"question\", \"sentence\"]\n      },\n      \"stsb\": {\n        \"features\": [\"sentence1\", \"sentence2\"]\n      }\n    }\n  },\n  \"chainer\": {\n    \"in\": [\"x_cola\", \"x_sst2\", \"x_qqp\", \"x_mrpc\", \"x_rte\", \"x_mnli\", \"x_qnli\", \"x_stsb\"],\n    \"in_y\": [\"y_cola\", \"y_sst2\", \"y_qqp\", \"y_mrpc\", \"y_rte\", \"y_mnli\", \"y_qnli\", \"y_stsb\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": 
\"multitask_pipeline_preprocessor\",\n        \"possible_keys_to_extract\": [0, 1],\n        \"preprocessor\": \"TorchTransformersPreprocessor\",\n        \"vocab_file\": \"{BACKBONE}\",\n        \"max_seq_length\": 128,\n        \"do_lower_case\": true,\n        \"n_task\": 8,\n        \"in\": [\"x_cola\", \"x_sst2\", \"x_qqp\", \"x_mrpc\", \"x_rte\", \"x_mnli\", \"x_qnli\", \"x_stsb\"],\n        \"out\": [\n          \"bert_features_cola\",\n          \"bert_features_sst2\",\n          \"bert_features_qqp\",\n          \"bert_features_mrpc\",\n          \"bert_features_rte\",\n          \"bert_features_mnli\",\n          \"bert_features_qnli\",\n          \"bert_features_stsb\"\n        ]\n      },\n      {\n        \"id\": \"multitask_transformer\",\n        \"class_name\": \"multitask_transformer\",\n        \"optimizer_parameters\": {\"lr\": 2e-5},\n        \"gradient_accumulation_steps\": \"{GRADIENT_ACC_STEPS}\",\n        \"learning_rate_drop_patience\": 2,\n        \"learning_rate_drop_div\": 2.0,\n        \"return_probas\": true,\n        \"backbone_model\": \"{BACKBONE}\",\n        \"save_path\": \"{MODEL_PATH}\",\n        \"load_path\": \"{MODEL_PATH}\",\n        \"tasks\": {\n          \"cola\": {\n            \"type\": \"classification\",\n            \"options\": 2\n          },\n          \"sst2\": {\n            \"type\": \"classification\",\n            \"options\": 2\n          },\n          \"qqp\": {\n            \"type\": \"classification\",\n            \"options\": 2\n          },\n          \"mrpc\": {\n            \"type\": \"classification\",\n            \"options\": 2\n          },\n          \"rte\": {\n            \"type\": \"classification\",\n            \"options\": 2\n          },\n          \"mnli\": {\n            \"type\": \"classification\",\n            \"options\": 3\n          },\n          \"qnli\": {\n            \"type\": \"classification\",\n            \"options\": 2\n          },\n          \"stsb\": {\n            
\"type\": \"regression\",\n            \"options\": 1\n          }\n        },\n        \"in\": [\n          \"bert_features_cola\",\n          \"bert_features_sst2\",\n          \"bert_features_qqp\",\n          \"bert_features_mrpc\",\n          \"bert_features_rte\",\n          \"bert_features_mnli\",\n          \"bert_features_qnli\",\n          \"bert_features_stsb\"\n        ],\n        \"in_y\": [\"y_cola\", \"y_sst2\", \"y_qqp\", \"y_mrpc\", \"y_rte\", \"y_mnli\", \"y_qnli\", \"y_stsb\"],\n        \"out\": [\n          \"y_cola_pred_probas\",\n          \"y_sst2_pred_probas\",\n          \"y_qqp_pred_probas\",\n          \"y_mrpc_pred_probas\",\n          \"y_rte_pred_probas\",\n          \"y_mnli_pred_probas\",\n          \"y_qnli_pred_probas\",\n          \"y_stsb_pred\"\n        ]\n      },\n      {\n        \"in\": [\n          \"y_cola_pred_probas\",\n          \"y_sst2_pred_probas\",\n          \"y_qqp_pred_probas\",\n          \"y_mrpc_pred_probas\",\n          \"y_rte_pred_probas\",\n          \"y_mnli_pred_probas\",\n          \"y_qnli_pred_probas\"\n        ],\n        \"out\": [\n          \"y_cola_pred_ids\",\n          \"y_sst2_pred_ids\",\n          \"y_qqp_pred_ids\",\n          \"y_mrpc_pred_ids\",\n          \"y_rte_pred_ids\",\n          \"y_mnli_pred_ids\",\n          \"y_qnli_pred_ids\"\n        ],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      }\n    ],\n    \"out\": [\n      \"y_cola_pred_probas\",\n      \"y_sst2_pred_probas\",\n      \"y_qqp_pred_probas\",\n      \"y_mrpc_pred_probas\",\n      \"y_rte_pred_probas\",\n      \"y_mnli_pred_probas\",\n      \"y_qnli_pred_probas\",\n      \"y_stsb_pred\",\n      \"y_cola_pred_ids\",\n      \"y_sst2_pred_ids\",\n      \"y_qqp_pred_ids\",\n      \"y_mrpc_pred_ids\",\n      \"y_rte_pred_ids\",\n      \"y_mnli_pred_ids\",\n      \"y_qnli_pred_ids\",\n      \"y_stsb_pred\"\n    ]\n  },\n  \"train\": {\n    \"epochs\": \"{NUM_TRAIN_EPOCHS}\",\n    \"batch_size\": 
32,\n    \"metrics\": [\n      {\n        \"name\": \"multitask_accuracy\",\n        \"inputs\": [\n          \"y_rte\",\n          \"y_mnli\",\n          \"y_qnli\",\n          \"y_mrpc\",\n          \"y_cola\",\n          \"y_sst2\",\n          \"y_qqp\",\n          \"y_rte_pred_ids\",\n          \"y_mnli_pred_ids\",\n          \"y_qnli_pred_ids\",\n          \"y_mrpc_pred_ids\",\n          \"y_cola_pred_ids\",\n          \"y_sst2_pred_ids\",\n          \"y_qqp_pred_ids\"\n        ]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"alias\": \"accuracy_mrpc\",\n        \"inputs\": [\"y_mrpc\", \"y_mrpc_pred_ids\"]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"alias\": \"accuracy_rte\",\n        \"inputs\": [\"y_rte\", \"y_rte_pred_ids\"]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"alias\": \"accuracy_mnli\",\n        \"inputs\": [\"y_mnli\", \"y_mnli_pred_ids\"]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"alias\": \"accuracy_qnli\",\n        \"inputs\": [\"y_qnli\", \"y_qnli_pred_ids\"]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"alias\": \"accuracy_sst\",\n        \"inputs\": [\"y_sst2\", \"y_sst2_pred_ids\"]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"alias\": \"accuracy_cola\",\n        \"inputs\": [\"y_cola\", \"y_cola_pred_ids\"]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"alias\": \"accuracy_qqp\",\n        \"inputs\": [\"y_qqp\", \"y_qqp_pred_ids\"]\n      },\n      {\n        \"name\": \"pearson_correlation\",\n        \"alias\": \"pearson_correlation_stsb\",\n        \"inputs\": [\"y_stsb\", \"y_stsb_pred\"]\n      },\n      {\n        \"name\": \"spearman_correlation\",\n        \"alias\": \"spearman_correlation_stsb\",\n        \"inputs\": [\"y_stsb\", \"y_stsb_pred\"]\n      }\n    ],\n    \"validation_patience\": 3,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"valid\"],\n    
\"class_name\": \"torch_trainer\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BACKBONE\": \"bert-base-uncased\",\n      \"MODELS_PATH\": \"~/.deeppavlov/models/glue\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/8task\",\n      \"NUM_TRAIN_EPOCHS\": 5,\n      \"GRADIENT_ACC_STEPS\": 1\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/multitask/glue.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/multitask/multitask_example.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"multitask_reader\",\n    \"task_defaults\": {\n      \"class_name\": \"huggingface_dataset_reader\",\n      \"path\": \"glue\",\n      \"train\": \"train\",\n      \"valid\": \"validation\",\n      \"test\": \"test\"\n    },\n    \"tasks\": {\n      \"cola\": {\"name\": \"cola\"},\n      \"rte\": {\"name\": \"rte\"},\n      \"stsb\": {\"name\": \"stsb\"},\n      \"copa\": {\n        \"path\": \"super_glue\",\n        \"name\": \"copa\"\n      },\n      \"conll\": {\n        \"class_name\": \"conll2003_reader\",\n        \"use_task_defaults\": false,\n        \"data_path\": \"{DOWNLOADS_PATH}/conll2003/\",\n        \"dataset_name\": \"conll2003\",\n        \"provide_pos\": false\n      }\n    }\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"multitask_iterator\",\n    \"num_train_epochs\": \"{NUM_TRAIN_EPOCHS}\",\n    \"gradient_accumulation_steps\": \"{GRADIENT_ACC_STEPS}\",\n    \"seed\": 42,\n    \"task_defaults\": {\n      \"class_name\": \"huggingface_dataset_iterator\",\n      \"label\": \"label\",\n      \"use_label_name\": false,\n      \"seed\": 42\n    },\n    \"tasks\": {\n      \"cola\": {\n        \"features\": [\"sentence\"]\n      },\n      \"rte\": {\n        \"features\": [\"sentence1\", \"sentence2\"]\n      },\n      \"stsb\": {\n        \"features\": [\"sentence1\", \"sentence2\"]\n      },\n      \"copa\": {\n        \"features\": [\"contexts\", \"choices\"]\n      },\n      \"conll\": {\n        \"class_name\": \"basic_classification_iterator\",\n        \"seed\": 42,\n        \"use_task_defaults\": false\n      }\n    }\n  },\n  \"chainer\": {\n    \"in\": [\"x_cola\", \"x_rte\", \"x_stsb\", \"x_copa\", \"x_conll\"],\n    \"in_y\": [\"y_cola\", \"y_rte\", \"y_stsb\", \"y_copa\", \"y_conll\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"multitask_pipeline_preprocessor\",\n        \"possible_keys_to_extract\": [0, 1],\n        \"preprocessors\": [\n          
\"TorchTransformersPreprocessor\",\n          \"TorchTransformersPreprocessor\",\n          \"TorchTransformersPreprocessor\",\n          \"TorchTransformersMultiplechoicePreprocessor\",\n          \"TorchTransformersNerPreprocessor\"\n        ],\n        \"do_lower_case\": true,\n        \"n_task\": 5,\n        \"vocab_file\": \"{BACKBONE}\",\n        \"max_seq_length\": 200,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"return_features\": true,\n        \"in\": [\"x_cola\", \"x_rte\", \"x_stsb\", \"x_copa\", \"x_conll\"],\n        \"out\": [\n          \"bert_features_cola\",\n          \"bert_features_rte\",\n          \"bert_features_stsb\",\n          \"bert_features_copa\",\n          \"bert_features_conll\"\n        ]\n      },\n      {\n        \"id\": \"vocab_conll\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\"O\"],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODELS_PATH}/tag.dict\",\n        \"load_path\": \"{MODELS_PATH}/tag.dict\",\n        \"fit_on\": [\"y_conll\"],\n        \"in\": [\"y_conll\"],\n        \"out\": [\"y_ids_conll\"]\n      },\n      {\n        \"id\": \"multitask_transformer\",\n        \"class_name\": \"multitask_transformer\",\n        \"optimizer_parameters\": {\"lr\": 2e-5},\n        \"gradient_accumulation_steps\": \"{GRADIENT_ACC_STEPS}\",\n        \"learning_rate_drop_patience\": 2,\n        \"learning_rate_drop_div\": 2.0,\n        \"return_probas\": true,\n        \"backbone_model\": \"{BACKBONE}\",\n        \"save_path\": \"{MODEL_PATH}\",\n        \"load_path\": \"{MODEL_PATH}\",\n        \"tasks\": {\n          \"cola\": {\n            \"type\": \"classification\",\n            \"options\": 2\n          },\n          \"rte\": {\n            \"type\": \"classification\",\n            \"options\": 2\n          },\n          \"stsb\": {\n            \"type\": \"regression\",\n            \"options\": 1\n          },\n          \"copa\": {\n    
        \"type\": \"multiple_choice\",\n            \"options\": 2\n          },\n          \"conll\": {\n            \"type\": \"sequence_labeling\",\n            \"options\": \"#vocab_conll.len\"\n          }\n        },\n        \"in\": [\n          \"bert_features_cola\",\n          \"bert_features_rte\",\n          \"bert_features_stsb\",\n          \"bert_features_copa\",\n          \"bert_features_conll\"\n        ],\n        \"in_y\": [\"y_cola\", \"y_rte\", \"y_stsb\", \"y_copa\", \"y_ids_conll\"],\n        \"out\": [\n          \"y_cola_pred_probas\",\n          \"y_rte_pred_probas\",\n          \"y_stsb_pred\",\n          \"y_copa_pred_probas\",\n          \"y_conll_pred_ids\"\n        ]\n      },\n      {\n        \"in\": [\"y_cola_pred_probas\"],\n        \"out\": [\"y_cola_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_rte_pred_probas\"],\n        \"out\": [\"y_rte_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_copa_pred_probas\"],\n        \"out\": [\"y_copa_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_conll_pred_ids\"],\n        \"out\": [\"y_conll_pred_labels\"],\n        \"ref\": \"vocab_conll\"\n      }\n    ],\n    \"out\": [\"y_cola_pred_ids\", \"y_rte_pred_ids\", \"y_stsb_pred\", \"y_copa_pred_ids\", \"y_conll_pred_labels\"]\n  },\n  \"train\": {\n    \"epochs\": \"{NUM_TRAIN_EPOCHS}\",\n    \"batch_size\": 32,\n    \"metrics\": [\n      {\n        \"name\": \"multitask_accuracy\",\n        \"inputs\": [\"y_rte\", \"y_cola\", \"y_copa\", \"y_rte_pred_ids\", \"y_cola_pred_ids\", \"y_copa_pred_ids\"]\n      },\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\"y_conll\", \"y_conll_pred_labels\"]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\"y_conll\", 
\"y_conll_pred_labels\"]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"alias\": \"accuracy_cola\",\n        \"inputs\": [\"y_cola\", \"y_cola_pred_ids\"]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"alias\": \"accuracy_rte\",\n        \"inputs\": [\"y_rte\", \"y_rte_pred_ids\"]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"alias\": \"accuracy_copa\",\n        \"inputs\": [\"y_copa\", \"y_copa_pred_ids\"]\n      },\n      {\n        \"name\": \"pearson_correlation\",\n        \"alias\": \"pearson_stsb\",\n        \"inputs\": [\"y_stsb\", \"y_stsb_pred\"]\n      },\n      {\n        \"name\": \"spearman_correlation\",\n        \"alias\": \"spearman_stsb\",\n        \"inputs\": [\"y_stsb\", \"y_stsb_pred\"]\n      }\n    ],\n    \"validation_patience\": 3,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models/multitask_example\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"BACKBONE\": \"distilbert-base-uncased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{BACKBONE}\",\n      \"NUM_TRAIN_EPOCHS\": 5,\n      \"GRADIENT_ACC_STEPS\": 1\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/multitask/multitask_example.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ner/ner_bert_base.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"in\": [\"x\"],\n        \"out\": [\"x_tokens\", \"x_subword_tokens\", \"x_subword_tok_ids\", \"startofword_markers\", \"attention_mask\", \"tokens_offsets\"]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\"O\"],\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\"y\"],\n        \"in\": [\"y\"],\n        \"out\": [\"y_ind\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\"x_subword_tok_ids\", \"attention_mask\", \"startofword_markers\"],\n        \"in_y\": [\"y_ind\"],\n        \"out\": [\"y_pred_ind\", \"probas\"]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\"y_pred_ind\"],\n        \"out\": [\"y_pred\"]\n      }\n    ],\n    \"out\": [\"x_tokens\", \"y_pred\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"bert-base-multilingual-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/ner/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/ner/ner_bert_base.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ner/ner_case_agnostic_mdistilbert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"conll2003_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/conll2003/\",\n    \"dataset_name\": \"conll2003\",\n    \"provide_pos\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\"x\"],\n        \"out\": [\"x_tokens\", \"x_subword_tokens\", \"x_subword_tok_ids\", \"startofword_markers\", \"attention_mask\", \"tokens_offsets\"]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\"O\"],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\"y\"],\n        \"in\": [\"y\"],\n        \"out\": [\"y_ind\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.5,\n        \"use_crf\": true,\n        \"encoder_layer_ids\": [-1],\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 1e-06,\n          \"betas\": [0.9, 0.999],\n          \"eps\": 1e-06\n        },\n        \"clip_norm\": 1.0,\n        \"min_learning_rate\": 1e-07,\n        \"learning_rate_drop_patience\": 20,\n        \"learning_rate_drop_div\": 1.5,\n        \"load_before_drop\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\"x_subword_tok_ids\", \"attention_mask\", 
\"startofword_markers\"],\n        \"in_y\": [\"y_ind\"],\n        \"out\": [\"y_pred_ind\", \"probas\"]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\"y_pred_ind\"],\n        \"out\": [\"y_pred\"]\n      }\n    ],\n    \"out\": [\"x_tokens\", \"y_pred\"]\n  },\n  \"train\": {\n    \"epochs\": 50,\n    \"batch_size\": 8,\n    \"metrics\": [\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\"y\", \"y_pred\"]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\"y\", \"y_pred\"]\n      }\n    ],\n    \"validation_patience\": 100,\n    \"val_every_n_batches\": 50,\n    \"log_every_n_batches\": 50,\n    \"show_examples\": false,\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 8,\n    \"evaluation_targets\": [\"test\", \"valid\"],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"distilbert-base-multilingual-cased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/ner/ner_case_agnostic_mdistilbert\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/ner/ner_case_agnostic_mdistilbert.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ner/ner_collection3_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"conll2003_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/collection3/\",\n    \"dataset_name\": \"collection3\",\n    \"provide_pos\": false,\n    \"provide_chunk\": false,\n    \"iobes\": true\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"x_tokens\",\n          \"x_subword_tokens\",\n          \"x_subword_tok_ids\",\n          \"startofword_markers\",\n          \"attention_mask\",\n          \"tokens_offsets\"\n        ]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\n          \"O\"\n        ],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"in\": [\n          \"y\"\n        ],\n        \"out\": [\n          \"y_ind\"\n        ]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.5,\n        \"encoder_layer_ids\": [\n          -1\n        ],\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 1e-06,\n          \"betas\": [\n            0.9,\n            0.999\n          ],\n          \"eps\": 1e-06\n        },\n        \"clip_norm\": 1.0,\n        
\"min_learning_rate\": 1e-07,\n        \"learning_rate_drop_patience\": 30,\n        \"learning_rate_drop_div\": 1.5,\n        \"load_before_drop\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\n          \"x_subword_tok_ids\",\n          \"attention_mask\",\n          \"startofword_markers\"\n        ],\n        \"in_y\": [\n          \"y_ind\"\n        ],\n        \"out\": [\n          \"y_pred_ind\",\n          \"probas\"\n        ]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\n          \"y_pred_ind\"\n        ],\n        \"out\": [\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"out\": [\n      \"x_tokens\",\n      \"y_pred\"\n    ]\n  },\n  \"train\": {\n    \"epochs\": 30,\n    \"batch_size\": 10,\n    \"metrics\": [\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"validation_patience\": 100,\n    \"val_every_n_batches\": 20,\n    \"log_every_n_batches\": 20,\n    \"show_examples\": false,\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 8,\n    \"evaluation_targets\": [\n      \"valid\",\n      \"test\"\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"DeepPavlov/rubert-base-cased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/ner_rus_bert_coll3_torch\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/ner/ner_rus_bert_coll3_torch.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ner/ner_conll2003_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"conll2003_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/conll2003/\",\n    \"dataset_name\": \"conll2003\",\n    \"provide_pos\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"x_tokens\",\n          \"x_subword_tokens\",\n          \"x_subword_tok_ids\",\n          \"startofword_markers\",\n          \"attention_mask\",\n          \"tokens_offsets\"\n        ]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\n          \"O\"\n        ],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"in\": [\n          \"y\"\n        ],\n        \"out\": [\n          \"y_ind\"\n        ]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.5,\n        \"use_crf\": true,\n        \"encoder_layer_ids\": [\n          -1\n        ],\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 1e-06,\n          \"betas\": [\n            0.9,\n            0.999\n          ],\n          \"eps\": 1e-06\n        },\n        \"clip_norm\": 1.0,\n        \"min_learning_rate\": 1e-07,\n       
 \"learning_rate_drop_patience\": 30,\n        \"learning_rate_drop_div\": 1.5,\n        \"load_before_drop\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\n          \"x_subword_tok_ids\",\n          \"attention_mask\",\n          \"startofword_markers\"\n        ],\n        \"in_y\": [\n          \"y_ind\"\n        ],\n        \"out\": [\n          \"y_pred_ind\",\n          \"probas\"\n        ]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\n          \"y_pred_ind\"\n        ],\n        \"out\": [\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"out\": [\n      \"x_tokens\",\n      \"y_pred\"\n    ]\n  },\n  \"train\": {\n    \"epochs\": 30,\n    \"batch_size\": 16,\n    \"metrics\": [\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"validation_patience\": 100,\n    \"val_every_n_batches\": 20,\n    \"log_every_n_batches\": 20,\n    \"show_examples\": false,\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 8,\n    \"evaluation_targets\": [\n      \"valid\",\n      \"test\"\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"bert-base-cased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/ner_conll2003_torch_bert_crf\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/ner/ner_conll2003_bert_torch_crf.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ner/ner_conll2003_deberta_crf.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"conll2003_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/conll2003/\",\n    \"dataset_name\": \"conll2003\",\n    \"provide_pos\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"x_tokens\",\n          \"x_subword_tokens\",\n          \"x_subword_tok_ids\",\n          \"startofword_markers\",\n          \"attention_mask\",\n          \"tokens_offsets\"\n        ]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\n          \"O\"\n        ],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"in\": [\n          \"y\"\n        ],\n        \"out\": [\n          \"y_ind\"\n        ]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.5,\n        \"use_crf\": true,\n        \"encoder_layer_ids\": [\n          -1\n        ],\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\n          \"x_subword_tok_ids\",\n          \"attention_mask\",\n          \"startofword_markers\"\n        ],\n        \"in_y\": [\n          \"y_ind\"\n        ],\n        \"out\": [\n          
\"y_pred_ind\",\n          \"probas\"\n        ]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\n          \"y_pred_ind\"\n        ],\n        \"out\": [\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"out\": [\n      \"x_tokens\",\n      \"y_pred\"\n    ]\n  },\n  \"train\": {\n    \"metrics\": [\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"evaluation_targets\": [\n      \"valid\",\n      \"test\"\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"microsoft/deberta-v3-base\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/ner_conll2003_deberta_crf\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/ner/ner_conll2003_deberta_crf.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ner/ner_ontonotes_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"conll2003_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/ontonotes/\",\n    \"dataset_name\": \"ontonotes\",\n    \"provide_pos\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\"x\"],\n        \"out\": [\"x_tokens\", \"x_subword_tokens\", \"x_subword_tok_ids\", \"startofword_markers\", \"attention_mask\", \"tokens_offsets\"]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\"O\"],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\"y\"],\n        \"in\": [\"y\"],\n        \"out\": [\"y_ind\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.5,\n        \"use_crf\": true,\n        \"encoder_layer_ids\": [-1],\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 1e-06,\n          \"betas\": [0.9, 0.999],\n          \"eps\": 1e-06\n        },\n        \"clip_norm\": 1.0,\n        \"min_learning_rate\": 1e-07,\n        \"learning_rate_drop_patience\": 30,\n        \"learning_rate_drop_div\": 1.5,\n        \"load_before_drop\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\"x_subword_tok_ids\", \"attention_mask\", 
\"startofword_markers\"],\n        \"in_y\": [\"y_ind\"],\n        \"out\": [\"y_pred_ind\", \"probas\"]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\"y_pred_ind\"],\n        \"out\": [\"y_pred\"]\n      }\n    ],\n    \"out\": [\"x_tokens\", \"y_pred\"]\n  },\n  \"train\": {\n    \"epochs\": 30,\n    \"batch_size\": 60,\n    \"metrics\": [\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\"y\", \"y_pred\"]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\"y\", \"y_pred\"]\n      }\n    ],\n    \"validation_patience\": 100,\n    \"val_every_n_batches\": 20,\n    \"log_every_n_batches\": 20,\n    \"show_examples\": false,\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 8,\n    \"evaluation_targets\": [\"valid\", \"test\"],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"bert-base-cased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/ner_ontonotes_bert_torch_crf\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/ner/ner_ontonotes_bert_torch_crf.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ner/ner_ontonotes_bert_mult.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"conll2003_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/ontonotes/\",\n    \"dataset_name\": \"ontonotes\",\n    \"provide_pos\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\"x\"],\n        \"out\": [\"x_tokens\", \"x_subword_tokens\", \"x_subword_tok_ids\", \"startofword_markers\", \"attention_mask\", \"tokens_offsets\"]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\"O\"],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\"y\"],\n        \"in\": [\"y\"],\n        \"out\": [\"y_ind\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.5,\n        \"use_crf\": true,\n        \"encoder_layer_ids\": [-1],\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 1e-06,\n          \"betas\": [0.9, 0.999],\n          \"eps\": 1e-06\n        },\n        \"clip_norm\": 1.0,\n        \"min_learning_rate\": 1e-07,\n        \"learning_rate_drop_patience\": 30,\n        \"learning_rate_drop_div\": 1.5,\n        \"load_before_drop\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\"x_subword_tok_ids\", \"attention_mask\", 
\"startofword_markers\"],\n        \"in_y\": [\"y_ind\"],\n        \"out\": [\"y_pred_ind\", \"probas\"]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\"y_pred_ind\"],\n        \"out\": [\"y_pred\"]\n      }\n    ],\n    \"out\": [\"x_tokens\", \"y_pred\"]\n  },\n  \"train\": {\n    \"epochs\": 30,\n    \"batch_size\": 10,\n    \"metrics\": [\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\"y\", \"y_pred\"]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\"y\", \"y_pred\"]\n      }\n    ],\n    \"validation_patience\": 100,\n    \"val_every_n_batches\": 20,\n    \"log_every_n_batches\": 20,\n    \"show_examples\": false,\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 8,\n    \"evaluation_targets\": [\"valid\", \"test\"],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"bert-base-multilingual-cased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/ner_ontonotes_torch_bert_mult_crf\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/ner/ner_ontonotes_bert_mult_torch_crf.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ner/ner_ontonotes_deberta_crf.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"conll2003_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/ontonotes/\",\n    \"dataset_name\": \"ontonotes\",\n    \"provide_pos\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\"x\"],\n        \"out\": [\"x_tokens\", \"x_subword_tokens\", \"x_subword_tok_ids\", \"startofword_markers\", \"attention_mask\", \"tokens_offsets\"]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\"O\"],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\"y\"],\n        \"in\": [\"y\"],\n        \"out\": [\"y_ind\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.5,\n        \"use_crf\": true,\n        \"encoder_layer_ids\": [-1],\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\"x_subword_tok_ids\", \"attention_mask\", \"startofword_markers\"],\n        \"in_y\": [\"y_ind\"],\n        \"out\": [\"y_pred_ind\", \"probas\"]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\"y_pred_ind\"],\n        \"out\": [\"y_pred\"]\n      }\n    ],\n    \"out\": [\"x_tokens\", \"y_pred\"]\n  },\n  \"train\": {\n    \"metrics\": [\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\"y\", 
\"y_pred\"]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\"y\", \"y_pred\"]\n      }\n    ],\n    \"evaluation_targets\": [\"valid\", \"test\"],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"microsoft/deberta-v3-base\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/ner_ontonotes_deberta_crf\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/ner/ner_ontonotes_deberta_crf.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ner/ner_rus_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"conll2003_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/total_rus/\",\n    \"dataset_name\": \"collection_rus\",\n    \"provide_pos\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"x_tokens\",\n          \"x_subword_tokens\",\n          \"x_subword_tok_ids\",\n          \"startofword_markers\",\n          \"attention_mask\",\n          \"tokens_offsets\"\n        ]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\n          \"O\"\n        ],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"in\": [\n          \"y\"\n        ],\n        \"out\": [\n          \"y_ind\"\n        ]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.5,\n        \"encoder_layer_ids\": [\n          -1\n        ],\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 1e-06,\n          \"betas\": [\n            0.9,\n            0.999\n          ],\n          \"eps\": 1e-06\n        },\n        \"clip_norm\": 1.0,\n        \"min_learning_rate\": 1e-07,\n        
\"learning_rate_drop_patience\": 30,\n        \"learning_rate_drop_div\": 1.5,\n        \"load_before_drop\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\n          \"x_subword_tok_ids\",\n          \"attention_mask\",\n          \"startofword_markers\"\n        ],\n        \"in_y\": [\n          \"y_ind\"\n        ],\n        \"out\": [\n          \"y_pred_ind\",\n          \"probas\"\n        ]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\n          \"y_pred_ind\"\n        ],\n        \"out\": [\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"out\": [\n      \"x_tokens\",\n      \"y_pred\"\n    ]\n  },\n  \"train\": {\n    \"epochs\": 30,\n    \"batch_size\": 10,\n    \"metrics\": [\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"validation_patience\": 100,\n    \"val_every_n_batches\": 20,\n    \"log_every_n_batches\": 20,\n    \"show_examples\": false,\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 8,\n    \"evaluation_targets\": [\n      \"valid\",\n      \"test\"\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"DeepPavlov/rubert-base-cased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/ner_rus_bert_torch\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/ner/ner_rus_bert_torch_new.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ner/ner_rus_bert_probas.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"sq_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/wiki_ner_rus/wikipedia_dataset.pickle\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\"x\"],\n        \"out\": [\"x_tokens\", \"x_subword_tokens\", \"x_subword_tok_ids\", \"startofword_markers\", \"attention_mask\", \"tokens_offsets\"]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\"O\"],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\"y\"],\n        \"in\": [\"y\"],\n        \"out\": [\"y_ind\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.5,\n        \"encoder_layer_ids\": [-1],\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 1e-06,\n          \"betas\": [\n            0.9,\n            0.999\n          ],\n          \"eps\": 1e-06\n        },\n        \"clip_norm\": 1.0,\n        \"min_learning_rate\": 1e-07,\n        \"learning_rate_drop_patience\": 30,\n        \"learning_rate_drop_div\": 1.5,\n        \"load_before_drop\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\"x_subword_tok_ids\", \"attention_mask\", \"startofword_markers\"],\n        
\"in_y\": [\"y_ind\"],\n        \"out\": [\"y_pred_ind\", \"probas\"]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\"y_pred_ind\"],\n        \"out\": [\"y_pred\"]\n      }\n    ],\n    \"out\": [\"x_tokens\", \"tokens_offsets\", \"y_pred\", \"probas\"]\n  },\n  \"train\": {\n    \"epochs\": 30,\n    \"batch_size\": 10,\n    \"metrics\": [\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"validation_patience\": 100,\n    \"val_every_n_batches\": 20,\n    \"log_every_n_batches\": 20,\n    \"show_examples\": false,\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 8,\n    \"evaluation_targets\": [\n      \"valid\",\n      \"test\"\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"DeepPavlov/rubert-base-cased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/wiki_ner_rus_bert\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/rus_dream_entity_detection/wiki_ner_rus_bert.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/wiki_ner_rus_bert\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/wiki_ner_rus/wiki_ner_rus_dataset.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/wiki_ner_rus\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json",
    "content": " {\n  \"dataset_reader\": {\n    \"class_name\": \"conll2003_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/total_rus/\",\n    \"dataset_name\": \"collection_rus\",\n    \"provide_pos\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"x_tokens\",\n          \"x_subword_tokens\",\n          \"x_subword_tok_ids\",\n          \"startofword_markers\",\n          \"attention_mask\",\n          \"tokens_offsets\"\n        ]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\n          \"O\"\n        ],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"in\": [\n          \"y\"\n        ],\n        \"out\": [\n          \"y_ind\"\n        ]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.11,\n        \"hidden_keep_prob\": 0.11,\n        \"encoder_layer_ids\": [\n          -1\n        ],\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 5.45e-05,\n          \"weight_decay\": 1e-06,\n          \"betas\": [\n            0.9,\n            0.999\n          ],\n          \"eps\": 1e-06\n        },\n        \"clip_norm\": 1.0,\n        
\"min_learning_rate\": 1e-07,\n        \"learning_rate_drop_patience\": 30,\n        \"learning_rate_drop_div\": 1.5,\n        \"load_before_drop\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\n          \"x_subword_tok_ids\",\n          \"attention_mask\",\n          \"startofword_markers\"\n        ],\n        \"in_y\": [\n          \"y_ind\"\n        ],\n        \"out\": [\n          \"y_pred_ind\",\n          \"probas\"\n        ]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\n          \"y_pred_ind\"\n        ],\n        \"out\": [\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"out\": [\n      \"x_tokens\",\n      \"y_pred\"\n    ]\n  },\n  \"train\": {\n    \"epochs\": 30,\n    \"batch_size\": 10,\n    \"metrics\": [\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"validation_patience\": 100,\n    \"val_every_n_batches\": 20,\n    \"log_every_n_batches\": 20,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\n      \"valid\",\n      \"test\"\n    ],\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\", \n      \"MODEL_PATH\": \"{MODELS_PATH}/ner_rus_conversational_distilrubert_2L\",\n      \"TRANSFORMER\": \"DeepPavlov/distilrubert-tiny-cased-conversational\"\n    }, \n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_2L.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json",
    "content": " {\n  \"dataset_reader\": {\n    \"class_name\": \"conll2003_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/total_rus/\",\n    \"dataset_name\": \"collection_rus\",\n    \"provide_pos\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"x_tokens\",\n          \"x_subword_tokens\",\n          \"x_subword_tok_ids\",\n          \"startofword_markers\",\n          \"attention_mask\",\n          \"tokens_offsets\"\n        ]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\n          \"O\"\n        ],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\n          \"y\"\n        ],\n        \"in\": [\n          \"y\"\n        ],\n        \"out\": [\n          \"y_ind\"\n        ]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.44,\n        \"hidden_keep_prob\": 0.89,\n        \"encoder_layer_ids\": [\n          -1\n        ],\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2.78e-05,\n          \"weight_decay\": 1e-06,\n          \"betas\": [\n            0.9,\n            0.999\n          ],\n          \"eps\": 1e-06\n        },\n        \"clip_norm\": 1.0,\n        
\"min_learning_rate\": 1e-07,\n        \"learning_rate_drop_patience\": 30,\n        \"learning_rate_drop_div\": 1.5,\n        \"load_before_drop\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\n          \"x_subword_tok_ids\",\n          \"attention_mask\",\n          \"startofword_markers\"\n        ],\n        \"in_y\": [\n          \"y_ind\"\n        ],\n        \"out\": [\n          \"y_pred_ind\",\n          \"probas\"\n        ]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\n          \"y_pred_ind\"\n        ],\n        \"out\": [\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"out\": [\n      \"x_tokens\",\n      \"y_pred\"\n    ]\n  },\n  \"train\": {\n    \"epochs\": 30,\n    \"batch_size\": 10,\n    \"metrics\": [\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\n          \"y\",\n          \"y_pred\"\n        ]\n      }\n    ],\n    \"validation_patience\": 100,\n    \"val_every_n_batches\": 20,\n    \"log_every_n_batches\": 20,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\n      \"valid\",\n      \"test\"\n    ],\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\", \n      \"MODEL_PATH\": \"{MODELS_PATH}/ner_rus_conversational_distilrubert_6L\",\n      \"TRANSFORMER\": \"DeepPavlov/distilrubert-base-cased-conversational\"\n    }, \n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_6L.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/odqa/en_odqa_infer_wiki.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"question_raw\"],\n    \"out\": [\"answer\", \"answer_score\", \"answer_place\"],\n    \"pipe\": [\n      {\n        \"config_path\": \"{CONFIGS_PATH}/doc_retrieval/en_ranker_tfidf_wiki.json\",\n        \"in\": [\"question_raw\"],\n        \"out\": [\"tfidf_doc_ids\"]\n      },\n      {\n        \"class_name\": \"bpr\",\n        \"load_path\": \"{MODELS_PATH}/bpr/eng\",\n        \"query_encoder_file\": \"query_encoder_en.pth.tar\",\n        \"bpr_index\": \"bpr_finetuned_nq_adv.idx\",\n        \"pretrained_model\": \"bert-base-uncased\",\n        \"top_n\": 100,\n        \"in\": [\"question_raw\"],\n        \"out\": [\"bpr_doc_ids\"]\n      },\n      {\n        \"class_name\": \"concat_lists\",\n        \"in\": [\"tfidf_doc_ids\", \"bpr_doc_ids\"],\n        \"out\": [\"doc_ids\"]\n      },\n      {\n        \"class_name\": \"wiki_sqlite_vocab\",\n        \"in\": [\"doc_ids\"],\n        \"out\": [\"doc_text\"],\n        \"join_docs\": false,\n        \"shuffle\": false,\n        \"load_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_l100.db\"\n      },\n      {\n        \"class_name\": \"string_multiplier\",\n        \"in\": [\"question_raw\", \"doc_text\"],\n        \"out\":[\"questions\"]\n      },\n      {\n        \"class_name\": \"logit_ranker\",\n        \"batch_size\": 64,\n        \"squad_model\": {\"config_path\": \"{CONFIGS_PATH}/squad/qa_nq_psgcls_bert.json\"},\n        \"sort_noans\": true,\n        \"in\": [\"doc_text\", \"questions\"],\n        \"out\": [\"answer\", \"answer_score\", \"answer_place\"]\n      }\n    ]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"CONFIGS_PATH\": \"{DEEPPAVLOV_PATH}/configs\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/odqa/bpr_encoder_index_eng.tar.gz\",\n        
\"subdir\": \"{MODELS_PATH}/bpr/eng\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/odqa/en_odqa_pop_infer_wiki.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"question_raw\"],\n    \"out\": [\"answer\", \"answer_score\", \"answer_place\"],\n    \"pipe\": [\n      {\n        \"config_path\": \"{CONFIGS_PATH}/doc_retrieval/en_ranker_pop_wiki.json\",\n        \"in\": [\"question_raw\"],\n        \"out\": [\"tfidf_doc_ids\"]\n      },\n      {\n        \"class_name\": \"bpr\",\n        \"load_path\": \"{MODELS_PATH}/bpr/eng\",\n        \"query_encoder_file\": \"query_encoder_en.pth.tar\",\n        \"bpr_index\": \"bpr_finetuned_nq_adv.idx\",\n        \"pretrained_model\": \"bert-base-uncased\",\n        \"top_n\": 100,\n        \"in\": [\"question_raw\"],\n        \"out\": [\"bpr_doc_ids\"]\n      },\n      {\n        \"class_name\": \"concat_lists\",\n        \"in\": [\"tfidf_doc_ids\", \"bpr_doc_ids\"],\n        \"out\": [\"doc_ids\"]\n      },\n      {\n        \"class_name\": \"wiki_sqlite_vocab\",\n        \"in\": [\"doc_ids\"],\n        \"out\": [\"doc_text\"],\n        \"join_docs\": false,\n        \"shuffle\": false,\n        \"load_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_l100.db\"\n      },\n      {\n        \"class_name\": \"string_multiplier\",\n        \"in\": [\"question_raw\", \"doc_text\"],\n        \"out\":[\"questions\"]\n      },\n      {\n        \"class_name\": \"logit_ranker\",\n        \"batch_size\": 64,\n        \"squad_model\": {\"config_path\": \"{CONFIGS_PATH}/squad/qa_nq_psgcls_bert.json\"},\n        \"sort_noans\": true,\n        \"in\": [\"doc_text\", \"questions\"],\n        \"out\": [\"answer\", \"answer_score\", \"answer_place\"]\n      }\n    ]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"CONFIGS_PATH\": \"{DEEPPAVLOV_PATH}/configs\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/odqa/bpr_encoder_index_eng.tar.gz\",\n        \"subdir\": 
\"{MODELS_PATH}/bpr/eng\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/odqa/ru_odqa_infer_wiki.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"question_raw\"],\n    \"out\": [\"best_answer\"],\n    \"pipe\": [\n      {\n        \"config_path\": \"{CONFIGS_PATH}/doc_retrieval/ru_ranker_tfidf_wiki.json\",\n        \"in\": [\"question_raw\"],\n        \"out\": [\"tfidf_doc_ids\"]\n      },\n      {\n        \"class_name\": \"wiki_sqlite_vocab\",\n        \"in\": [\"tfidf_doc_ids\"],\n        \"out\": [\"tfidf_doc_text\"],\n        \"join_docs\": false,\n        \"shuffle\": false,\n        \"load_path\": \"{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db\"\n      },\n      {\n        \"class_name\": \"string_multiplier\",\n        \"in\": [\"question_raw\", \"tfidf_doc_text\"],\n        \"out\":[\"questions\"]\n      },\n      {\n        \"class_name\": \"logit_ranker\",\n        \"batch_size\": 64,\n        \"squad_model\": {\"config_path\": \"{CONFIGS_PATH}/squad/qa_multisberquad_bert.json\"},\n        \"sort_noans\": true,\n        \"in\": [\"tfidf_doc_text\", \"questions\"],\n        \"out\": [\"best_answer\", \"best_answer_score\"]\n      }\n    ]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"CONFIGS_PATH\": \"{DEEPPAVLOV_PATH}/configs\"\n    },\n    \"download\": [\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ranking/path_ranking_nll_roberta_en.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"question\", \"rels\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"path_ranking_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"additional_special_tokens\": [\"<one_rel>\", \"</one_rel>\", \"<double>\", \"</double>\", \"<first_rel>\", \"<mid>\", \"</second_rel>\"],\n        \"max_seq_length\": 96,\n        \"in\": [\"question\", \"rels\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_nll_ranker\",\n        \"in\": [\"bert_features\"],\n        \"out\": [\"model_output\"],\n        \"return_probas\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"encoder_save_path\": \"{MODEL_PATH}/encoder\",\n        \"linear_save_path\": \"{MODEL_PATH}/linear\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"learning_rate_drop_patience\": 5,\n        \"learning_rate_drop_div\": 1.5,\n        \"optimizer_parameters\": {\"lr\": 1e-5, \"weight_decay\": 0.01, \"eps\": 1e-6}\n      }\n    ],\n    \"out\": [\"model_output\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"TRANSFORMER\": \"haisongzhang/roberta-tiny-cased\",\n      \"MODEL_PATH\": \"~/.deeppavlov/models/classifiers/path_ranking_nll_roberta_lcquad2\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/models/path_ranking_nll_roberta_lcquad2.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ranking/ranking_ubuntu_v2_torch_bert_uncased.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"ubuntu_v2_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/ubuntu_v2_data\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"siamese_iterator\",\n    \"seed\": 243\n  },\n  \"chainer\": {\n    \"in\": [\n      \"x\"\n    ],\n    \"in_y\": [\n      \"y\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_bert_ranker_preprocessor\",\n        \"vocab_file\": \"bert-base-uncased\",\n        \"do_lower_case\": true,\n        \"max_seq_length\": 128,\n        \"in\": [\n          \"x\"\n        ],\n        \"out\": [\n          \"bert_features\"\n        ]\n      },\n      {\n        \"class_name\": \"torch_bert_ranker\",\n        \"pretrained_bert\": \"bert-base-uncased\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-5,\n          \"weight_decay\": 1e-2,\n          \"betas\": [\n            0.9,\n            0.999\n          ],\n          \"eps\": 1e-6\n        },\n        \"clip_norm\": 1.0,\n        \"in\": [\n          \"bert_features\"\n        ],\n        \"in_y\": [\n          \"y\"\n        ],\n        \"out\": [\n          \"predictions\"\n        ]\n      }\n    ],\n    \"out\": [\n      \"predictions\"\n    ]\n  },\n  \"train\": {\n    \"batch_size\": 32,\n    \"pytest_max_batches\": 2,\n    \"train_metrics\": [],\n    \"metrics\": [\n      \"r@1\",\n      \"r@2\",\n      \"r@5\"\n    ],\n    \"validation_patience\": 1,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"evaluation_targets\": [\n      \"valid\",\n      \"test\"\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": 
\"{MODELS_PATH}/ubuntu_v2_uncased_torch_bert_model\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/ubuntu_v2_data\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_uncased_torch_bert_model_v2.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ranking/rel_ranking_nll_bert_ru.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"question\", \"rels\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"path_ranking_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 96,\n        \"in\": [\"question\", \"rels\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_nll_ranker\",\n        \"in\": [\"bert_features\"],\n        \"out\": [\"model_output\"],\n        \"return_probas\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"encoder_save_path\": \"{MODEL_PATH}/encoder\",\n        \"linear_save_path\": \"{MODEL_PATH}/linear\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"learning_rate_drop_patience\": 4,\n        \"learning_rate_drop_div\": 1.5,\n        \"optimizer_parameters\": {\"lr\": 1e-5, \"weight_decay\": 0.01, \"eps\": 1e-6}\n      }\n    ],\n    \"out\": [\"model_output\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"DeepPavlov/rubert-base-cased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/rel_ranking_nll_bert_ru\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/models/rel_ranking_nll_bert_ru.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/ranking/rel_ranking_roberta_en.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"sq_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/rel_ranking_eng/lcquad_one_rel_ranking.json\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"question\", \"rel_list\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"rel_ranking_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 64,\n        \"in\": [\"question\", \"rel_list\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 1e-05},\n        \"learning_rate_drop_patience\": 5,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": 
\"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"epochs\": 3,\n    \"batch_size\": 30,\n    \"metrics\": [\n      \"accuracy\",\n      \"f1_macro\"\n    ],\n    \"validation_patience\": 10,\n    \"val_every_n_batches\": 100,\n    \"log_every_n_batches\": 100,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"valid\", \"test\"],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"haisongzhang/roberta-tiny-cased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/rel_ranking_roberta_en\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/models/rel_ranking_roberta_en.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/kbqa/wikidata/lcquad_rel_ranking.pickle\",\n        \"subdir\": \"{DOWNLOADS_PATH}/rel_ranking_eng\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/regressors/translation_ranker.json",
    "content": "{\n  \"metadata\":\n  {\n    \"variables\": {\n      \"BASE_MODEL\": \"cointegrated/LaBSE-en-ru\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/classifiers/ranker_labse\",\n      \"SEED\": 42\n    },\n    \"download\": [\n\t{\n\t\t\"url\": \"http://files.deeppavlov.ai/v1/tmp/translation_ranker.tar.gz\",\n\t\t\"subdir\": \"{MODELS_PATH}\"\n\t}\n    ]\n  },\n    \"dataset_iterator\": {\n      \"class_name\": \"huggingface_dataset_iterator\",\n      \"features\": [\n        \"source\",\n        \"hypothesis\"\n      ],\n    \"label\": \"agg_score\",\n    \"seed\": \"{SEED}\",\n    \"use_label_name\": false\n  },\n    \"chainer\": {\n      \"in\": [\n        \"source\",\n        \"hypothesis\"\n      ],\n      \"in_y\": [\n        \"score\"\n      ],\n      \"pipe\": [\n        {\n          \"class_name\": \"torch_transformers_preprocessor\",\n          \"vocab_file\": \"{BASE_MODEL}\",\n          \"do_lower_case\": false,\n          \"max_seq_length\": 256,\n          \"in\": [\n          \"source\",\n          \"hypothesis\"\n          ],\n          \"out\": [\n            \"bert_features\"\n          ]\n        },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": 1,\n        \"return_probas\": false,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-06,\n          \"weight_decay\": 0.1\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\n          \"bert_features\"\n        ],\n        \"in_y\": [\n          \"score\"\n        ],\n        \"out\": [\n          \"pred_score\"\n        ]\n      }\n      ],\n      \"out\": [\n  
      \"pred_score\"\n      ]\n    },\n    \"train\": {\n    \"batch_size\": 32,\n    \"metrics\": [\n      {\n        \"name\": \"mean_squared_error\",\n        \"inputs\": [\n          \"score\",\n          \"pred_score\"\n        ]\n      }\n    ],\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"class_name\": \"torch_trainer\",\n    \"evaluation_targets\": [\n      \"train\",\n      \"valid\"\n    ],\n    \"metric_optimization\": \"minimize\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 2\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/relation_extraction/re_docred.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"docred_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/docred/\",\n    \"rel2id_path\": \"{DOWNLOADS_PATH}/docred/meta/rel2id.json\",\n    \"rel_info_path\": \"{DOWNLOADS_PATH}/docred/rel_info.json\",\n    \"valid_test_data_size\": 150\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\"tokens\", \"entity_pos\", \"entity_tags\"],\n    \"in_y\": [\"y_ids\"],\n    \"pipe\": [\n      {\n        \"in\": [\"tokens\", \"entity_pos\", \"entity_tags\"],\n        \"out\": [\"input_ids\", \"attention_mask\", \"upd_entity_pos\", \"upd_entity_tags\", \"nf_samples\"],\n        \"class_name\": \"re_preprocessor\",\n        \"vocab_file\": \"bert-base-cased\",\n        \"default_tag\": \"PER\"\n      },\n      {\n        \"class_name\": \"re_classifier\",\n        \"in\": [\"input_ids\", \"attention_mask\", \"upd_entity_pos\", \"upd_entity_tags\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"model_output\"],\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer_parameters\": {\"lr\": 5e-5, \"weight_decay\": 0.01, \"eps\": 1e-6},\n        \"n_classes\": 97,\n        \"num_ner_tags\": 6,\n        \"pretrained_bert\": \"bert-base-cased\",\n        \"return_probas\": true\n      },\n      {\n        \"class_name\": \"re_postprocessor\",\n        \"rel2id_path\": \"{DOWNLOADS_PATH}/docred/meta/rel2id.json\",\n        \"rel2label_path\": \"{DOWNLOADS_PATH}/docred/rel_info.json\",\n        \"in\": [\"model_output\", \"nf_samples\"],\n        \"out\": [\"wikidata_relation_id\", \"relation_name\"]\n      }\n    ],\n    \"out\": [\"wikidata_relation_id\", \"relation_name\"]\n  },\n  \"train\": {\n    \"epochs\": 50,\n    \"batch_size\": 30,\n    \"log_every_n_batches\": 100,\n    \"train_metrics\": [\"f1_weighted\", \"acc\"],\n    \"evaluation_targets\": [\"valid\", 
\"train\"],\n    \"metrics\": [\"f1_weighted\", \"acc\"],\n    \"validation_patience\": 50,\n    \"val_every_n_batches\": 200,\n    \"show_examples\": false,\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/re_docred\"\n      },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/docred.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/docred\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/re_docred_model_v1.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/re_docred\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/rel2label.json\",\n        \"subdir\": \"{DOWNLOADS_PATH}/docred\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/relation_extraction/re_rured.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"rured_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/rured/\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"basic_classification_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\"tokens\", \"entity_pos\", \"entity_tags\"],\n    \"in_y\": [\"y_ids\"],\n    \"pipe\": [\n      {\n        \"in\": [\"tokens\", \"entity_pos\", \"entity_tags\"],\n        \"out\": [\"input_ids\", \"attention_mask\", \"upd_entity_pos\", \"upd_entity_tags\", \"nf_samples\"],\n        \"class_name\": \"re_preprocessor\",\n        \"ner_tags\": [\"WORK_OF_ART\", \"NORP\", \"GROUP\", \"LAW\", \"NATIONALITY\", \"EVENT\", \"DATE\", \"CURRENCY\", \"GPE\",\n                     \"QUANTITY\", \"FAMILY\", \"ORDINAL\", \"RELIGION\", \"CITY\", \"MONEY\", \"AGE\", \"LOCATION\", \"PERCENT\",\n                     \"BOROUGH\", \"STREET\", \"PERSON\", \"REGION\", \"COUNTRY\", \"PROFESSION\", \"ORGANIZATION\", \"FAC\",\n                     \"CARDINAL\", \"PRODUCT\", \"TIME\"],\n        \"max_seq_length\": 512,\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"default_tag\": \"PERSON\"\n      },\n      {\n        \"class_name\": \"re_classifier\",\n        \"in\": [\"input_ids\", \"attention_mask\", \"upd_entity_pos\", \"upd_entity_tags\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"model_output\"],\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer_parameters\": {\"lr\": 5e-5, \"weight_decay\": 0.01, \"eps\": 1e-6},\n        \"n_classes\": 30,\n        \"num_ner_tags\": 29,\n        \"pretrained_bert\": \"{TRANSFORMER}\"\n      },\n      {\n        \"class_name\": \"re_postprocessor\",\n        \"rel2id_path\": \"{DOWNLOADS_PATH}/rured/rel2id.json\",\n        \"rel2label_path\": \"{DOWNLOADS_PATH}/rured/rel2label.json\",\n        \"in\": [\"model_output\", \"nf_samples\"],\n        \"out\": [\"wikidata_relation_id\", \"relation_name\"]\n      }\n    
],\n    \"out\": [\"wikidata_relation_id\", \"relation_name\"]\n  },\n  \"train\": {\n    \"epochs\": 50,\n    \"batch_size\": 16,\n    \"train_metrics\": [\"acc\"],\n    \"metrics\": [\"acc\"],\n    \"validation_patience\": 50,\n    \"val_every_n_batches\": 100,\n    \"log_every_n_batches\": 100,\n    \"evaluation_targets\": [\"valid\", \"train\"],\n    \"show_examples\": false,\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"DeepPavlov/rubert-base-cased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/re_rured\"\n      },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/rured.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/rured\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/re_rured_model_v1.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}/re_rured\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/russian_super_glue/russian_superglue_danetqa_rubert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\",\n    \"data_url\": \"http://files.deeppavlov.ai/datasets/russian_super_glue/DaNetQA\",\n    \"ignore_verifications\": true\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"question\", \"passage\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"question\", \"passage\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"in\": [\"question\", \"passage\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 2e-05},\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        
\"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 4,\n    \"metrics\": [\"accuracy\"],\n    \"epochs\": 10,\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"DeepPavlov/rubert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"russian_super_glue\",\n      \"BINARY_CLASSIFICATION\": false,\n      \"TASK\": \"danetqa\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_danetqa_rubert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/russian_super_glue/russian_superglue_lidirus_rubert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"test\": \"test\",\n    \"data_url\": \"http://files.deeppavlov.ai/datasets/russian_super_glue/LiDiRus\",\n    \"ignore_verifications\": true\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"sentence1\", \"sentence2\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"sentence1\", \"sentence2\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 256,\n        \"in\": [\"sentence1\", \"sentence2\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 2e-05, \"weight_decay\": 0.1},\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        
\"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"confidence_threshold\": 0.5\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 16,\n    \"metrics\": [\"matthews_correlation\"],\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"test\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"DeepPavlov/rubert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"russian_super_glue\",\n      \"BINARY_CLASSIFICATION\": false,\n      \"TASK\": \"lidirus\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/terra/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_terra_rubert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/russian_super_glue/russian_superglue_muserc_rubert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\",\n    \"data_url\": \"http://files.deeppavlov.ai/datasets/russian_super_glue/MuSeRC\",\n    \"ignore_verifications\": true\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"context\", \"answer\", \"idx\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"context\", \"answer\", \"idx\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 512,\n        \"in\": [\"context\", \"answer\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"BINARY_CLASSIFICATION\": false,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 2e-05},\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        
\"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 8,\n    \"metrics\": [\"roc_auc\",\"f1\"],\n    \"epochs\": 10,\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"DeepPavlov/rubert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"russian_super_glue\",\n      \"BINARY_CLASSIFICATION\": false,\n      \"TASK\": \"muserc\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_muserc_rubert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/russian_super_glue/russian_superglue_parus_rubert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\",\n    \"data_url\": \"http://files.deeppavlov.ai/datasets/russian_super_glue/PARus\",\n    \"ignore_verifications\": true\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"contexts\", \"choices\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"contexts_list\", \"choices_list\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_multiplechoice_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 256,\n        \"in\": [\"contexts_list\", \"choices_list\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_multiplechoice\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 4e-05},\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": 
[\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 4,\n    \"metrics\": [\"accuracy\"],\n    \"validation_patience\": 10,\n    \"epochs\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"DeepPavlov/rubert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"russian_super_glue\",\n      \"TASK\": \"parus\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_parus_rubert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/russian_super_glue/russian_superglue_rcb_rubert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\",\n    \"data_url\": \"http://files.deeppavlov.ai/datasets/russian_super_glue/RCB\",\n    \"ignore_verifications\": true\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"premise\", \"hypothesis\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"premise\", \"hypothesis\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 256,\n        \"in\": [\"premise\", \"hypothesis\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 4e-05},\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": 
[\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 4,\n    \"metrics\": [\"accuracy\", \"f1_macro\"],\n    \"validation_patience\": 10,\n    \"epochs\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"DeepPavlov/rubert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"russian_super_glue\",\n      \"TASK\": \"rcb\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_rcb_rubert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/russian_super_glue/russian_superglue_rucos_rubert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\",\n    \"data_url\": \"http://files.deeppavlov.ai/datasets/russian_super_glue/RuCoS\",\n    \"ignore_verifications\": true,\n    \"downsample_ratio\": [1.8, 1.8, 1],\n    \"do_index_correction\": false\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"idx\", \"query\", \"passage\", \"entities\", \"num_examples\"],\n    \"label\": \"label\",\n    \"use_label_name\": false\n  },\n  \"chainer\": {\n    \"in\": [\"idx\", \"query\", \"passage\", \"entities\", \"num_examples\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 320,\n        \"in\": [\"query\", \"passage\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": 2,\n        \"return_probas\": true,\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 2e-05},\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"class_name\": \"proba2labels\",\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"max_proba\": true\n      },\n      {\n        \"class_name\": \"torch_record_postprocessor\",\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"in\": [\"idx\", \"y\", 
\"y_pred_probas\", \"entities\", \"num_examples\"],\n        \"out\": [\"record_examples\"]\n      }\n    ],\n    \"out\": [\"y_pred_probas\"]\n  },\n  \"train\": {\n    \"batch_size\": 12,\n    \"train_metrics\": [\n      {\n        \"name\": \"accuracy\",\n        \"inputs\": [\"y\", \"y_pred_ids\"]\n      }\n    ],\n    \"metrics\": [\n      {\n        \"name\": \"record_em_score\",\n        \"inputs\": [\"record_examples\"]\n      },\n      {\n        \"name\": \"record_f1_score\",\n        \"inputs\": [\"record_examples\"]\n      },\n      {\n        \"name\": \"accuracy\",\n        \"inputs\": [\"y\", \"y_pred_ids\"]\n      }\n    ],\n    \"epochs\": 10,\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"class_name\": \"torch_trainer\",\n    \"evaluation_targets\": [\"valid\"],\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"DeepPavlov/rubert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"russian_super_glue\",\n      \"BINARY_CLASSIFICATION\": false,\n      \"TASK\": \"rucos\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_rucos_rubert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/russian_super_glue/russian_superglue_russe_rubert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\",\n    \"data_url\": \"http://files.deeppavlov.ai/datasets/russian_super_glue/RUSSE\",\n    \"ignore_verifications\": true\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"sentence1\", \"sentence2\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"sentence1\", \"sentence2\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 256,\n        \"in\": [\"sentence1\", \"sentence2\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 2e-05},\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": 
[\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 4,\n    \"metrics\": [\"accuracy\"],\n    \"epochs\": 10,\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"val_every_n_batches\": 1000,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"DeepPavlov/rubert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"russian_super_glue\",\n      \"TASK\": \"russe\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_russe_rubert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/russian_super_glue/russian_superglue_rwsd_rubert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\",\n    \"data_url\": \"http://files.deeppavlov.ai/datasets/russian_super_glue/RWSD\",\n    \"ignore_verifications\": true\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"text\", \"answer\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"text\", \"answer\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": true,\n        \"max_seq_length\": 256,\n        \"in\": [\"text\", \"answer\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"Adam\",\n        \"optimizer_parameters\": {\"lr\": 2e-05},\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        \"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        
\"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 4,\n    \"metrics\": [\"accuracy\"],\n    \"epochs\": 10,\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"DeepPavlov/rubert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"COMPETITION\": \"russian_super_glue\",\n      \"TASK\": \"rwsd\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_rwsd_rubert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/russian_super_glue/russian_superglue_terra_rubert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"huggingface_dataset_reader\",\n    \"path\": \"{COMPETITION}\",\n    \"name\": \"{TASK}\",\n    \"train\": \"train\",\n    \"valid\": \"validation\",\n    \"test\": \"test\",\n    \"data_url\": \"http://files.deeppavlov.ai/datasets/russian_super_glue/TERRa\",\n    \"ignore_verifications\": true\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"huggingface_dataset_iterator\",\n    \"features\": [\"premise\", \"hypothesis\"],\n    \"label\": \"label\",\n    \"seed\": 42\n  },\n  \"chainer\": {\n    \"in\": [\"premise\", \"hypothesis\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_preprocessor\",\n        \"vocab_file\": \"{BASE_MODEL}\",\n        \"do_lower_case\": false,\n        \"max_seq_length\": 256,\n        \"in\": [\"premise\", \"hypothesis\"],\n        \"out\": [\"bert_features\"]\n      },\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"fit_on\": [\"y\"],\n        \"save_path\": \"{MODEL_PATH}/classes.dict\",\n        \"load_path\": \"{MODEL_PATH}/classes.dict\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_ids\"]\n      },\n      {\n        \"in\": [\"y_ids\"],\n        \"out\": [\"y_onehot\"],\n        \"class_name\": \"one_hotter\",\n        \"depth\": \"#classes_vocab.len\",\n        \"single_vector\": true\n      },\n      {\n        \"class_name\": \"torch_transformers_classifier\",\n        \"n_classes\": \"#classes_vocab.len\",\n        \"return_probas\": true,\n        \"pretrained_bert\": \"{BASE_MODEL}\",\n        \"is_binary\": \"{BINARY_CLASSIFICATION}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\"lr\": 2e-05},\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"y_ids\"],\n        \"out\": [\"y_pred_probas\"]\n      },\n      {\n        
\"in\": [\"y_pred_probas\"],\n        \"out\": [\"y_pred_ids\"],\n        \"class_name\": \"proba2labels\",\n        \"max_proba\": true\n      },\n      {\n        \"in\": [\"y_pred_ids\"],\n        \"out\": [\"y_pred_labels\"],\n        \"ref\": \"classes_vocab\"\n      }\n    ],\n    \"out\": [\"y_pred_labels\"]\n  },\n  \"train\": {\n    \"batch_size\": 4,\n    \"metrics\": [\"accuracy\"],\n    \"epochs\": 10,\n    \"validation_patience\": 10,\n    \"val_every_n_epochs\": 1,\n    \"log_every_n_epochs\": 1,\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"train\", \"valid\"],\n    \"class_name\": \"torch_trainer\",\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/\",\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 2\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"BASE_MODEL\": \"DeepPavlov/rubert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"COMPETITION\": \"russian_super_glue\",\n      \"TASK\": \"terra\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"BINARY_CLASSIFICATION\": false,\n      \"MODEL_PATH\": \"{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_terra_rubert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/sentence_segmentation/sentseg_dailydialog_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"conll2003_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/dailydialog/\",\n    \"dataset_name\": \"dailydialog\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"data_learning_iterator\"\n  },\n  \"chainer\": {\n    \"in\": [\"x\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_transformers_ner_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": true,\n        \"max_seq_length\": 512,\n        \"max_subword_length\": 15,\n        \"token_masking_prob\": 0.0,\n        \"in\": [\"x\"],\n        \"out\": [\"x_tokens\", \"x_subword_tokens\", \"x_subword_tok_ids\", \"startofword_markers\", \"attention_mask\", \"tokens_offsets\"]\n      },\n      {\n        \"id\": \"tag_vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"unk_token\": [\"O\"],\n        \"pad_with_zeros\": true,\n        \"save_path\": \"{MODEL_PATH}/tag.dict\",\n        \"load_path\": \"{MODEL_PATH}/tag.dict\",\n        \"fit_on\": [\"y\"],\n        \"in\": [\"y\"],\n        \"out\": [\"y_ind\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_sequence_tagger\",\n        \"n_tags\": \"#tag_vocab.len\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"attention_probs_keep_prob\": 0.5,\n        \"encoder_layer_ids\": [-1],\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 1e-06,\n          \"betas\": [0.9, 0.999],\n          \"eps\": 1e-06\n        },\n        \"clip_norm\": 1.0,\n        \"min_learning_rate\": 1e-07,\n        \"learning_rate_drop_patience\": 6,\n        \"learning_rate_drop_div\": 1.5,\n        \"load_before_drop\": true,\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"in\": [\"x_subword_tok_ids\", \"attention_mask\", \"startofword_markers\"],\n        \"in_y\": 
[\"y_ind\"],\n        \"out\": [\"y_pred_ind\", \"probas\"]\n      },\n      {\n        \"ref\": \"tag_vocab\",\n        \"in\": [\"y_pred_ind\"],\n        \"out\": [\"y_pred\"]\n      },\n      {\n        \"in\": [\"x_tokens\", \"y_pred\"],\n        \"out\": \"punctuated_sents\",\n        \"class_name\": \"sentseg_restore_sent\"\n      }\n    ],\n    \"out\": [\"x_tokens\", \"punctuated_sents\"]\n  },\n  \"train\": {\n    \"epochs\": 30,\n    \"batch_size\": 30,\n    \"metrics\": [\n      {\n        \"name\": \"ner_f1\",\n        \"inputs\": [\"y\", \"y_pred\"]\n      },\n      {\n        \"name\": \"ner_token_f1\",\n        \"inputs\": [\"y\", \"y_pred\"]\n      }\n    ],\n    \"validation_patience\": 20,\n    \"val_every_n_batches\": 100,\n    \"log_every_n_batches\": 100,\n    \"show_examples\": false,\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 8,\n    \"evaluation_targets\": [\"valid\", \"test\"],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"TRANSFORMER\": \"bert-base-uncased\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/sentseg_dailydialog_bert\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/sentseg_dailydialog_bert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"typos_wikipedia_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"typos_iterator\",\n    \"test_ratio\": 0.05\n  },\n  \"chainer\":{\n    \"in\": [\"x\"],\n    \"in_y\": [\"y\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"str_lower\",\n        \"id\": \"lower\",\n        \"in\": [\"x\"],\n        \"out\": [\"x_lower\"]\n      },\n      {\n        \"class_name\": \"nltk_moses_tokenizer\",\n        \"id\": \"tokenizer\",\n        \"in\": [\"x_lower\"],\n        \"out\": [\"x_tokens\"]\n      },\n      {\n        \"ref\": \"tokenizer\",\n        \"in\": [\"y\"],\n        \"out\": [\"y_tokens\"]\n      },\n      {\n        \"fit_on\": [\"x_tokens\", \"y_tokens\"],\n        \"in\": [\"x_tokens\"],\n        \"out\": [\"tokens_candidates\"],\n        \"class_name\": \"spelling_error_model\",\n        \"window\": 1,\n        \"candidates_count\": 4,\n        \"dictionary\": {\n          \"class_name\": \"wikitionary_100K_vocab\",\n          \"data_dir\": \"{DOWNLOADS_PATH}/vocabs\"\n        },\n        \"save_path\": \"{MODELS_PATH}/error_model/error_model.tsv\"\n      },\n      {\n        \"class_name\": \"kenlm_elector\",\n        \"in\": [\"tokens_candidates\"],\n        \"out\": [\"y_predicted_tokens\"],\n        \"load_path\": \"{DOWNLOADS_PATH}/language_models/en_wiki_no_punkt.arpa.binary\"\n      },\n      {\n        \"ref\": \"tokenizer\",\n        \"in\": [\"y_predicted_tokens\"],\n        \"out\": [\"y_predicted\"]\n      }\n    ],\n    \"out\": [\"y_predicted\"]\n  },\n  \"train\": {\n    \"evaluation_targets\": [\"test\"],\n    \"class_name\": \"fit_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\"\n    },\n    \"download\": [\n      {\n        \"url\": 
\"http://files.deeppavlov.ai/deeppavlov_data/error_model.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/lang_models/en_wiki_no_punkt.arpa.binary.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/language_models\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/wiktionary/wikipedia_100K_vocab.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/vocabs\"\n      }\n    ]\n  }\n}"
  },
  {
    "path": "deeppavlov/configs/spelling_correction/levenshtein_corrector_ru.json",
    "content": "{\n  \"chainer\":{\n    \"in\": [\"x\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"str_lower\",\n        \"id\": \"lower\",\n        \"in\": [\"x\"],\n        \"out\": [\"x_lower\"]\n      },\n      {\n        \"class_name\": \"nltk_moses_tokenizer\",\n        \"id\": \"tokenizer\",\n        \"in\": [\"x_lower\"],\n        \"out\": [\"x_tokens\"]\n      },\n      {\n        \"id\": \"vocab\",\n        \"class_name\": \"simple_vocab\",\n        \"save_path\": \"{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict\",\n        \"load_path\": \"{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict\"\n      },\n      {\n        \"in\": [\"x_tokens\"],\n        \"out\": [\"tokens_candidates\"],\n        \"class_name\": \"spelling_levenshtein\",\n        \"words\": \"#vocab.keys()\"\n      },\n      {\n        \"class_name\": \"kenlm_elector\",\n        \"in\": [\"tokens_candidates\"],\n        \"out\": [\"y_predicted_tokens\"],\n        \"load_path\": \"{DOWNLOADS_PATH}/language_models/ru_wiyalen_no_punkt.arpa.binary\"\n      },\n      {\n        \"ref\": \"tokenizer\",\n        \"in\": [\"y_predicted_tokens\"],\n        \"out\": [\"y_predicted\"]\n      }\n    ],\n    \"out\": [\"y_predicted\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/vocabs/russian_words_vocab.dict.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/vocabs\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/language_models\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/squad/qa_multisberquad_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"multi_squad_dataset_reader\",\n    \"dataset\": \"MultiSQuADRuRetrClean\",\n    \"url\": \"http://files.deeppavlov.ai/datasets/multi_squad_ru_retr_clean.tar.gz\",\n    \"data_path\": \"{DOWNLOADS_PATH}/multi_squad_ru_retr_clean/\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"multi_squad_retr_iterator\",\n    \"seed\": 1337,\n    \"shuffle\": false,\n    \"with_answer_rate\": 0.666\n  },\n  \"chainer\": {\n    \"in\": [\"context_raw\", \"question_raw\"],\n    \"in_y\": [\"ans_raw\", \"ans_raw_start\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_squad_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"max_seq_length\": 384,\n        \"in\": [\"question_raw\", \"context_raw\"],\n        \"out\": [\"bert_features\", \"subtokens\", \"split_context\"]\n      },\n      {\n        \"class_name\": \"squad_bert_mapping\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"in\": [\"split_context\", \"bert_features\", \"subtokens\"],\n        \"out\": [\"subtok2chars\", \"char2subtoks\"]\n      },\n      {\n        \"class_name\": \"squad_bert_ans_preprocessor\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"in\": [\"ans_raw\", \"ans_raw_start\", \"char2subtoks\"],\n        \"out\": [\"ans\", \"ans_start\", \"ans_end\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_squad\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 0.01,\n          \"betas\": [0.9, 0.999],\n          \"eps\": 1e-06\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"ans_start\", 
\"ans_end\"],\n        \"out\": [\"ans_start_predicted\", \"ans_end_predicted\", \"logits\", \"scores\", \"inds\"]\n      },\n      {\n        \"class_name\": \"squad_bert_ans_postprocessor\",\n        \"in\": [\"ans_start_predicted\", \"ans_end_predicted\", \"split_context\", \"subtok2chars\", \"subtokens\", \"inds\"],\n        \"out\": [\"ans_predicted\", \"ans_start_predicted\", \"ans_end_predicted\"]\n      }\n    ],\n    \"out\": [\"ans_predicted\", \"ans_start_predicted\", \"scores\"]\n  },\n  \"train\": {\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"valid\"],\n    \"log_every_n_batches\": 250,\n    \"val_every_n_batches\": 500,\n    \"batch_size\": 20,\n    \"valid_batch_size\": 64,\n    \"validation_patience\": 10,\n    \"metrics\": [\n      {\n        \"name\": \"squad_v1_f1\",\n        \"inputs\": [\"ans\", \"ans_predicted\"]\n      },\n      {\n        \"name\": \"squad_v1_em\",\n        \"inputs\": [\"ans\", \"ans_predicted\"]\n      },\n      {\n        \"name\": \"squad_v2_f1\",\n        \"inputs\": [\"ans\", \"ans_predicted\"]\n      },\n      {\n        \"name\": \"squad_v2_em\",\n        \"inputs\": [\"ans\", \"ans_predicted\"]\n      }\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"LOWERCASE\": false,\n      \"TRANSFORMER\": \"DeepPavlov/rubert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/multi_squad_ru_torch_bert_retr_noans/{TRANSFORMER}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/squad/multi_squad_ru_torch_bert_retr_noans.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/squad/qa_nq_psgcls_bert.json",
    "content": "{\n  \"chainer\": {\n    \"in\": [\"context_raw\", \"question_raw\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_squad_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"max_seq_length\": 384,\n        \"in\": [\"question_raw\", \"context_raw\"],\n        \"out\": [\"bert_features\", \"subtokens\", \"split_context\"]\n      },\n      {\n        \"class_name\": \"squad_bert_mapping\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"in\": [\"split_context\", \"bert_features\", \"subtokens\"],\n        \"out\": [\"subtok2chars\", \"char2subtoks\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_squad\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"torch_seed\": 1,\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 0.01,\n          \"betas\": [0.9, 0.999],\n          \"eps\": 1e-06\n        },\n        \"random_seed\": 1,\n        \"psg_cls\": true,\n        \"learning_rate_drop_patience\": 2,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\"bert_features\"],\n        \"out\": [\"ans_start_predicted\", \"ans_end_predicted\", \"logits\", \"scores\", \"inds\"]\n      },\n      {\n        \"class_name\": \"squad_bert_ans_postprocessor\",\n        \"in\": [\"ans_start_predicted\", \"ans_end_predicted\", \"split_context\", \"subtok2chars\", \"subtokens\", \"inds\"],\n        \"out\": [\"ans_predicted\", \"ans_start_predicted\", \"ans_end_predicted\"]\n      }\n    ],\n    \"out\": [\"ans_predicted\", \"ans_start_predicted\", \"scores\"]\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"LOWERCASE\": true,\n      \"TRANSFORMER\": \"bert-base-uncased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": 
\"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/passage_reader_classifier_eng\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/odqa/nq_psgcls_bert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/squad/qa_squad2_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"squad_dataset_reader\",\n    \"dataset\": \"SQuAD2.0\",\n    \"data_path\": \"{DOWNLOADS_PATH}/squad2/\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"squad_iterator\",\n    \"seed\": 1337,\n    \"shuffle\": true\n  },\n  \"chainer\": {\n    \"in\": [\n      \"context_raw\",\n      \"question_raw\"\n    ],\n    \"in_y\": [\n      \"ans_raw\",\n      \"ans_raw_start\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_squad_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"max_seq_length\": 384,\n        \"in\": [\n          \"question_raw\",\n          \"context_raw\"\n        ],\n        \"out\": [\n          \"bert_features\",\n          \"subtokens\",\n          \"split_context\"\n        ]\n      },\n      {\n        \"class_name\": \"squad_bert_mapping\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"in\": [\n          \"split_context\",\n          \"bert_features\",\n          \"subtokens\"\n        ],\n        \"out\": [\n          \"subtok2chars\",\n          \"char2subtoks\"\n        ]\n      },\n      {\n        \"class_name\": \"squad_bert_ans_preprocessor\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"in\": [\n          \"ans_raw\",\n          \"ans_raw_start\",\n          \"char2subtoks\"\n        ],\n        \"out\": [\n          \"ans\",\n          \"ans_start\",\n          \"ans_end\"\n        ]\n      },\n      {\n        \"class_name\": \"torch_transformers_squad\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"torch_seed\": 1,\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 0.01,\n          \"betas\": [\n            0.9,\n            0.999\n          ],\n          
\"eps\": 1e-06\n        },\n        \"random_seed\": 1,\n        \"learning_rate_drop_patience\": 2,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\n          \"bert_features\"\n        ],\n        \"in_y\": [\n          \"ans_start\",\n          \"ans_end\"\n        ],\n        \"out\": [\n          \"ans_start_predicted\",\n          \"ans_end_predicted\",\n          \"logits\",\n          \"scores\",\n          \"inds\"\n        ]\n      },\n      {\n        \"class_name\": \"squad_bert_ans_postprocessor\",\n        \"in\": [\n          \"ans_start_predicted\",\n          \"ans_end_predicted\",\n          \"split_context\",\n          \"subtok2chars\",\n          \"subtokens\",\n          \"inds\"\n        ],\n        \"out\": [\n          \"ans_predicted\",\n          \"ans_start_predicted\",\n          \"ans_end_predicted\"\n        ]\n      }\n    ],\n    \"out\": [\n      \"ans_predicted\",\n      \"ans_start_predicted\",\n      \"scores\"\n    ]\n  },\n  \"train\": {\n    \"show_examples\": false,\n    \"evaluation_targets\": [\n      \"valid\"\n    ],\n    \"log_every_n_batches\": 50,\n    \"val_every_n_batches\": 500,\n    \"batch_size\": 20,\n    \"valid_batch_size\": 60,\n    \"valid_batch_size\": 32,\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 5,\n    \"validation_patience\": 10,\n    \"metrics\": [\n      {\n        \"name\": \"squad_v1_f1\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      },\n      {\n        \"name\": \"squad_v1_em\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      },\n      {\n        \"name\": \"squad_v2_f1\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      },\n      {\n        \"name\": \"squad_v2_em\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      }\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    
\"variables\": {\n      \"LOWERCASE\": false,\n      \"TRANSFORMER\": \"bert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/squad2_bert\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/squad/squad2_bert.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/squad/squad_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"squad_dataset_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/squad/\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"squad_iterator\",\n    \"seed\": 1337,\n    \"shuffle\": true\n  },\n  \"chainer\": {\n    \"in\": [\"context_raw\", \"question_raw\"],\n    \"in_y\": [\"ans_raw\", \"ans_raw_start\"],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_squad_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"max_seq_length\": 384,\n        \"in\": [\"question_raw\", \"context_raw\"],\n        \"out\": [\"bert_features\", \"subtokens\", \"split_context\"]\n      },\n      {\n        \"class_name\": \"squad_bert_mapping\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"in\": [\"split_context\", \"bert_features\", \"subtokens\"],\n        \"out\": [\"subtok2chars\", \"char2subtoks\"]\n      },\n      {\n        \"class_name\": \"squad_bert_ans_preprocessor\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"in\": [\"ans_raw\", \"ans_raw_start\", \"char2subtoks\"],\n        \"out\": [\"ans\", \"ans_start\", \"ans_end\"]\n      },\n      {\n        \"class_name\": \"torch_transformers_squad\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 0.01,\n          \"betas\": [0.9, 0.999],\n          \"eps\": 1e-06\n        },\n        \"learning_rate_drop_patience\": 2,\n        \"learning_rate_drop_div\": 2.0,\n        \"batch_size\": 10,\n        \"in\": [\"bert_features\"],\n        \"in_y\": [\"ans_start\", \"ans_end\"],\n        \"out\": [\"ans_start_predicted\", \"ans_end_predicted\", \"logits\", \"scores\", \"inds\"]\n      },\n      {\n        \"class_name\": 
\"squad_bert_ans_postprocessor\",\n        \"in\": [\"ans_start_predicted\", \"ans_end_predicted\", \"split_context\", \"subtok2chars\", \"subtokens\", \"inds\"],\n        \"out\": [\"ans_predicted\", \"ans_start_predicted\", \"ans_end_predicted\"]\n      }\n    ],\n    \"out\": [\"ans_predicted\", \"ans_start_predicted\", \"scores\"]\n  },\n  \"train\": {\n    \"show_examples\": false,\n    \"evaluation_targets\": [\"valid\"],\n    \"log_every_n_batches\": 250,\n    \"val_every_n_batches\": 500,\n    \"batch_size\": 10,\n    \"pytest_max_batches\": 2,\n    \"pytest_batch_size\": 5,\n    \"validation_patience\": 10,\n    \"metrics\": [\n      {\n        \"name\": \"squad_v1_f1\",\n        \"inputs\": [\"ans\", \"ans_predicted\"]\n      },\n      {\n        \"name\": \"squad_v1_em\",\n        \"inputs\": [\"ans\", \"ans_predicted\"]\n      },\n      {\n        \"name\": \"squad_v2_f1\",\n        \"inputs\": [\"ans\", \"ans_predicted\"]\n      },\n      {\n        \"name\": \"squad_v2_em\",\n        \"inputs\": [\"ans\", \"ans_predicted\"]\n      }\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"LOWERCASE\": false,\n      \"TRANSFORMER\": \"bert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/squad_torch_bert/cased/{TRANSFORMER}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/squad/squad_torch_bert_cased.tar.gz\",\n        \"subdir\": \"{MODEL_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/squad/squad_ru_bert.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"squad_dataset_reader\",\n    \"dataset\": \"SberSQuADClean\",\n    \"url\": \"http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz\",\n    \"data_path\": \"{DOWNLOADS_PATH}/squad_ru_clean/\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"squad_iterator\",\n    \"seed\": 1337,\n    \"shuffle\": true\n  },\n  \"chainer\": {\n    \"in\": [\n      \"context_raw\",\n      \"question_raw\"\n    ],\n    \"in_y\": [\n      \"ans_raw\",\n      \"ans_raw_start\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_squad_transformers_preprocessor\",\n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"max_seq_length\": 384,\n        \"in\": [\n          \"question_raw\",\n          \"context_raw\"\n        ],\n        \"out\": [\n          \"bert_features\",\n          \"subtokens\",\n          \"split_context\"\n        ]\n      },\n      {\n        \"class_name\": \"squad_bert_mapping\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"in\": [\n          \"split_context\",\n          \"bert_features\",\n          \"subtokens\"\n        ],\n        \"out\": [\n          \"subtok2chars\",\n          \"char2subtoks\"\n        ]\n      },\n      {\n        \"class_name\": \"squad_bert_ans_preprocessor\",\n        \"do_lower_case\": \"{LOWERCASE}\",\n        \"in\": [\n          \"ans_raw\",\n          \"ans_raw_start\",\n          \"char2subtoks\"\n        ],\n        \"out\": [\n          \"ans\",\n          \"ans_start\",\n          \"ans_end\"\n        ]\n      },\n      {\n        \"class_name\": \"torch_transformers_squad\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"optimizer\": \"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 2e-05,\n          \"weight_decay\": 0.01,\n          \"betas\": 
[\n            0.9,\n            0.999\n          ],\n          \"eps\": 1e-06\n        },\n        \"learning_rate_drop_patience\": 3,\n        \"learning_rate_drop_div\": 2.0,\n        \"in\": [\n          \"bert_features\"\n        ],\n        \"in_y\": [\n          \"ans_start\",\n          \"ans_end\"\n        ],\n        \"out\": [\n          \"ans_start_predicted\",\n          \"ans_end_predicted\",\n          \"logits\",\n          \"scores\",\n          \"inds\"\n        ]\n      },\n      {\n        \"class_name\": \"squad_bert_ans_postprocessor\",\n        \"in\": [\n          \"ans_start_predicted\",\n          \"ans_end_predicted\",\n          \"split_context\",\n          \"subtok2chars\",\n          \"subtokens\",\n          \"inds\"\n        ],\n        \"out\": [\n          \"ans_predicted\",\n          \"ans_start_predicted\",\n          \"ans_end_predicted\"\n        ]\n      }\n    ],\n    \"out\": [\n      \"ans_predicted\",\n      \"ans_start_predicted\",\n      \"scores\"\n    ]\n  },\n  \"train\": {\n    \"show_examples\": false,\n    \"evaluation_targets\": [\n      \"valid\"\n    ],\n    \"log_every_n_batches\": 250,\n    \"val_every_n_batches\": 500,\n    \"batch_size\": 10,\n    \"validation_patience\": 10,\n    \"metrics\": [\n      {\n        \"name\": \"squad_v1_f1\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      },\n      {\n        \"name\": \"squad_v1_em\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      },\n      {\n        \"name\": \"squad_v2_f1\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      },\n      {\n        \"name\": \"squad_v2_em\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      }\n    ],\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"LOWERCASE\": false,\n      \"TRANSFORMER\": 
\"DeepPavlov/rubert-base-cased\",\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/squad_ru_torch_bert/{TRANSFORMER}\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/v1/squad/squad_ru_torch_bert.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"squad_dataset_reader\",\n    \"dataset\": \"SberSQuADClean\",\n    \"url\": \"http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz\",\n    \"data_path\": \"{DOWNLOADS_PATH}/squad_ru_clean/\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"squad_iterator\",\n    \"seed\": 1337,\n    \"shuffle\": true\n  },\n  \"chainer\": {\n    \"in\": [\n      \"context_raw\",\n      \"question_raw\"\n    ],\n    \"in_y\": [\n      \"ans_raw\",\n      \"ans_raw_start\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_squad_transformers_preprocessor\",\n        \"add_token_type_ids\": true, \n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": \"{lowercase}\",\n        \"max_seq_length\": 384,\n        \"in\": [\n          \"question_raw\",\n          \"context_raw\"\n        ],\n        \"out\": [\n          \"bert_features\",\n          \"subtokens\",\n          \"split_context\"\n        ]\n      },\n      {\n        \"class_name\": \"squad_bert_mapping\",\n        \"do_lower_case\": \"{lowercase}\",\n        \"in\": [\n          \"split_context\",\n          \"bert_features\",\n          \"subtokens\"\n        ],\n        \"out\": [\n          \"subtok2chars\",\n          \"char2subtoks\"\n        ]\n      },\n      {\n        \"class_name\": \"squad_bert_ans_preprocessor\",\n        \"do_lower_case\": \"{lowercase}\",\n        \"in\": [\n          \"ans_raw\",\n          \"ans_raw_start\",\n          \"char2subtoks\"\n        ],\n        \"out\": [\n          \"ans\",\n          \"ans_start\",\n          \"ans_end\"\n        ]\n      },\n      {\n        \"class_name\": \"torch_transformers_squad\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"attention_probs_keep_prob\": 0.11,\n        \"hidden_keep_prob\": 0.33, \n        \"optimizer\": 
\"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 9e-05\n        },\n        \"learning_rate_drop_patience\": 2,\n        \"learning_rate_drop_div\": 1.5,\n        \"in\": [\n          \"bert_features\"\n        ],\n        \"in_y\": [\n          \"ans_start\",\n          \"ans_end\"\n        ],\n        \"out\": [\n          \"ans_start_predicted\",\n          \"ans_end_predicted\",\n          \"logits\",\n          \"scores\",\n          \"inds\"\n        ]\n      },\n      {\n        \"class_name\": \"squad_bert_ans_postprocessor\",\n        \"in\": [\n          \"ans_start_predicted\",\n          \"ans_end_predicted\",\n          \"split_context\",\n          \"subtok2chars\",\n          \"subtokens\",\n          \"inds\"\n        ],\n        \"out\": [\n          \"ans_predicted\",\n          \"ans_start_predicted\",\n          \"ans_end_predicted\"\n        ]\n      }\n    ],\n    \"out\": [\n      \"ans_predicted\",\n      \"ans_start_predicted\",\n      \"scores\"\n    ]\n  },\n  \"train\": {\n    \"show_examples\": false,\n    \"evaluation_targets\": [\n      \"valid\"\n    ],\n    \"log_every_n_batches\": 250,\n    \"val_every_n_batches\": 500,\n    \"batch_size\": 10,\n    \"validation_patience\": 10,\n    \"metrics\": [\n      {\n        \"name\": \"squad_v2_f1\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      },\n      {\n        \"name\": \"squad_v2_em\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      },\n      {\n        \"name\": \"squad_v1_f1\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      },\n      {\n        \"name\": \"squad_v1_em\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      }\n    ],\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/logs\",\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"lowercase\": false, \n  
    \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"TRANSFORMER\": \"DeepPavlov/distilrubert-tiny-cased-conversational\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/squad_ru_convers_distilrubert_2L\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_2L.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}\"\n      }\n    ]\n  }\n} \n"
  },
  {
    "path": "deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"squad_dataset_reader\",\n    \"dataset\": \"SberSQuADClean\",\n    \"url\": \"http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz\",\n    \"data_path\": \"{DOWNLOADS_PATH}/squad_ru_clean/\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"squad_iterator\",\n    \"seed\": 1337,\n    \"shuffle\": true\n  },\n  \"chainer\": {\n    \"in\": [\n      \"context_raw\",\n      \"question_raw\"\n    ],\n    \"in_y\": [\n      \"ans_raw\",\n      \"ans_raw_start\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"torch_squad_transformers_preprocessor\", \n        \"add_token_type_ids\": true, \n        \"vocab_file\": \"{TRANSFORMER}\",\n        \"do_lower_case\": \"{lowercase}\",\n        \"max_seq_length\": 384,\n        \"in\": [\n          \"question_raw\",\n          \"context_raw\"\n        ],\n        \"out\": [\n          \"bert_features\",\n          \"subtokens\",\n          \"split_context\"\n        ]\n      },\n      {\n        \"class_name\": \"squad_bert_mapping\",\n        \"do_lower_case\": \"{lowercase}\",\n        \"in\": [\n          \"split_context\",\n          \"bert_features\",\n          \"subtokens\"\n        ],\n        \"out\": [\n          \"subtok2chars\",\n          \"char2subtoks\"\n        ]\n      },\n      {\n        \"class_name\": \"squad_bert_ans_preprocessor\",\n        \"do_lower_case\": \"{lowercase}\",\n        \"in\": [\n          \"ans_raw\",\n          \"ans_raw_start\",\n          \"char2subtoks\"\n        ],\n        \"out\": [\n          \"ans\",\n          \"ans_start\",\n          \"ans_end\"\n        ]\n      },\n      {\n        \"class_name\": \"torch_transformers_squad\",\n        \"pretrained_bert\": \"{TRANSFORMER}\",\n        \"save_path\": \"{MODEL_PATH}/model\",\n        \"load_path\": \"{MODEL_PATH}/model\",\n        \"attention_probs_keep_prob\": 0.0,\n        \"hidden_keep_prob\": 0.33, \n        \"optimizer\": 
\"AdamW\",\n        \"optimizer_parameters\": {\n          \"lr\": 3.67e-5\n        },\n        \"learning_rate_drop_patience\": 2,\n        \"learning_rate_drop_div\": 1.5,\n        \"in\": [\n          \"bert_features\"\n        ],\n        \"in_y\": [\n          \"ans_start\",\n          \"ans_end\"\n        ],\n        \"out\": [\n          \"ans_start_predicted\",\n          \"ans_end_predicted\",\n          \"logits\",\n          \"scores\",\n          \"inds\"\n        ]\n      },\n      {\n        \"class_name\": \"squad_bert_ans_postprocessor\",\n        \"in\": [\n          \"ans_start_predicted\",\n          \"ans_end_predicted\",\n          \"split_context\",\n          \"subtok2chars\",\n          \"subtokens\",\n          \"inds\"\n        ],\n        \"out\": [\n          \"ans_predicted\",\n          \"ans_start_predicted\",\n          \"ans_end_predicted\"\n        ]\n      }\n    ],\n    \"out\": [\n      \"ans_predicted\",\n      \"ans_start_predicted\",\n      \"scores\"\n    ]\n  },\n  \"train\": {\n    \"show_examples\": false,\n    \"evaluation_targets\": [\n      \"valid\"\n    ],\n    \"log_every_n_batches\": 250,\n    \"val_every_n_batches\": 500,\n    \"batch_size\": 10,\n    \"validation_patience\": 10,\n    \"metrics\": [\n      {\n        \"name\": \"squad_v2_f1\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      },\n      {\n        \"name\": \"squad_v2_em\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      },\n      {\n        \"name\": \"squad_v1_f1\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      },\n      {\n        \"name\": \"squad_v1_em\",\n        \"inputs\": [\n          \"ans\",\n          \"ans_predicted\"\n        ]\n      }\n    ],\n    \"tensorboard_log_dir\": \"{MODEL_PATH}/logs\",\n    \"class_name\": \"torch_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"lowercase\": false, 
\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"TRANSFORMER\": \"DeepPavlov/distilrubert-base-cased-conversational\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\",\n      \"MODEL_PATH\": \"{MODELS_PATH}/squad_ru_convers_distilrubert_6L\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_6L.tar.gz\",\n        \"subdir\": \"{MODELS_PATH}\"\n      }\n    ]\n  }\n} \n"
  },
  {
    "path": "deeppavlov/core/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/core/commands/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/core/commands/infer.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport json\nimport sys\nfrom itertools import islice\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Optional, Union\n\nfrom deeppavlov.core.commands.utils import import_packages, parse_config\nfrom deeppavlov.core.common.chainer import Chainer\nfrom deeppavlov.core.common.params import from_params\nfrom deeppavlov.core.data.utils import jsonify_data\nfrom deeppavlov.download import deep_download\nfrom deeppavlov.utils.pip_wrapper import install_from_config\n\nlog = getLogger(__name__)\n\n\ndef build_model(config: Union[str, Path, dict], mode: str = 'infer',\n                load_trained: bool = False, install: bool = False, download: bool = False) -> Chainer:\n    \"\"\"Build and return the model described in corresponding configuration file.\"\"\"\n    config = parse_config(config)\n\n    if install:\n        install_from_config(config)\n    if download:\n        deep_download(config)\n\n    import_packages(config.get('metadata', {}).get('imports', []))\n\n    model_config = config['chainer']\n\n    model = Chainer(model_config['in'], model_config['out'], model_config.get('in_y'))\n\n    for component_config in model_config['pipe']:\n        if load_trained and ('fit_on' in component_config or 'in_y' in component_config):\n            try:\n                component_config['load_path'] = 
component_config['save_path']\n            except KeyError:\n                log.warning('No \"save_path\" parameter for the {} component, so \"load_path\" will not be renewed'\n                            .format(component_config.get('class_name', component_config.get('ref', 'UNKNOWN'))))\n\n        component = from_params(component_config, mode=mode)\n\n        if 'id' in component_config:\n            model._components_dict[component_config['id']] = component\n\n        if 'in' in component_config:\n            c_in = component_config['in']\n            c_out = component_config['out']\n            in_y = component_config.get('in_y', None)\n            main = component_config.get('main', False)\n            model.append(component, c_in, c_out, in_y, main)\n\n    return model\n\n\ndef interact_model(config: Union[str, Path, dict]) -> None:\n    \"\"\"Start interaction with the model described in corresponding configuration file.\"\"\"\n    model = build_model(config)\n\n    while True:\n        args = []\n        for in_x in model.in_x:\n            args.append((input('{}::'.format(in_x)),))\n            # check for exit command\n            if args[-1][0] in {'exit', 'stop', 'quit', 'q'}:\n                return\n\n        pred = model(*args)\n        if len(model.out_params) > 1:\n            pred = zip(*pred)\n\n        print('>>', *pred)\n\n\ndef predict_on_stream(config: Union[str, Path, dict],\n                      batch_size: Optional[int] = None,\n                      file_path: Optional[str] = None) -> None:\n    \"\"\"Make a prediction with the component described in corresponding configuration file.\"\"\"\n\n    batch_size = batch_size or 1\n    if file_path is None or file_path == '-':\n        if sys.stdin.isatty():\n            raise RuntimeError('To process data from terminal please use interact mode')\n        f = sys.stdin\n    else:\n        f = open(file_path, encoding='utf8')\n\n    model: Chainer = build_model(config)\n\n    args_count = 
len(model.in_x)\n    while True:\n        batch = list((l.strip() for l in islice(f, batch_size * args_count)))\n\n        if not batch:\n            break\n\n        args = []\n        for i in range(args_count):\n            args.append(batch[i::args_count])\n\n        res = model(*args)\n        if len(model.out_params) == 1:\n            res = [res]\n        for res in zip(*res):\n            res = json.dumps(jsonify_data(res), ensure_ascii=False)\n            print(res, flush=True)\n\n    if f is not sys.stdin:\n        f.close()\n"
  },
  {
    "path": "deeppavlov/core/commands/train.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Dict, Union, Optional, Iterable\n\nfrom deeppavlov.core.commands.utils import expand_path, import_packages, parse_config\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.params import resolve\nfrom deeppavlov.core.common.registry import get_model\nfrom deeppavlov.core.data.data_fitting_iterator import DataFittingIterator\nfrom deeppavlov.core.data.data_learning_iterator import DataLearningIterator\nfrom deeppavlov.core.data.utils import get_all_elems_from_json\nfrom deeppavlov.download import deep_download\nfrom deeppavlov.utils.pip_wrapper import install_from_config\n\nlog = getLogger(__name__)\n\n\ndef read_data_by_config(config: dict):\n    \"\"\"Read data by dataset_reader from specified config.\"\"\"\n    dataset_config = config.get('dataset', None)\n\n    if dataset_config:\n        config.pop('dataset')\n        ds_type = dataset_config['type']\n        if ds_type == 'classification':\n            reader = {'class_name': 'basic_classification_reader'}\n            iterator = {'class_name': 'basic_classification_iterator'}\n            config['dataset_reader'] = {**dataset_config, **reader}\n            config['dataset_iterator'] = {**dataset_config, **iterator}\n        else:\n            raise Exception(\"Unsupported 
dataset type: {}\".format(ds_type))\n\n    try:\n        reader_config = dict(config['dataset_reader'])\n    except KeyError:\n        raise ConfigError(\"No dataset reader is provided in the JSON config.\")\n\n    reader = get_model(reader_config.pop('class_name'))()\n    data_path = reader_config.get('data_path')\n    if isinstance(data_path, list):\n        reader_config['data_path'] = [expand_path(path) for path in data_path]\n    elif data_path is not None:\n        reader_config['data_path'] = expand_path(data_path)\n    return reader.read(**reader_config)\n\n\ndef get_iterator_from_config(config: dict, data: dict):\n    \"\"\"Create iterator (from config) for specified data.\"\"\"\n    iterator_config = {k: resolve(v) for k, v in config['dataset_iterator'].items()}\n    iterator: Union[DataLearningIterator, DataFittingIterator] = get_model(iterator_config.pop('class_name'))(\n        **iterator_config, data=data)\n    return iterator\n\n\ndef train_evaluate_model_from_config(config: Union[str, Path, dict],\n                                     iterator: Union[DataLearningIterator, DataFittingIterator] = None, *,\n                                     to_train: bool = True,\n                                     evaluation_targets: Optional[Iterable[str]] = None,\n                                     install: bool = False,\n                                     download: bool = False,\n                                     start_epoch_num: Optional[int] = None,\n                                     recursive: bool = False) -> Dict[str, Dict[str, float]]:\n    \"\"\"Make training and evaluation of the model described in corresponding configuration file.\"\"\"\n    config = parse_config(config)\n\n    if install:\n        install_from_config(config)\n    if download:\n        deep_download(config)\n\n    if to_train and recursive:\n        for subconfig in get_all_elems_from_json(config['chainer'], 'config_path'):\n            log.info(f'Training 
\"{subconfig}\"')\n            train_evaluate_model_from_config(subconfig, download=False, recursive=True)\n\n    import_packages(config.get('metadata', {}).get('imports', []))\n\n    if iterator is None:\n        try:\n            data = read_data_by_config(config)\n            # TODO: check class objects, not strings\n            is_mtl = config['dataset_reader']['class_name'] == 'multitask_reader'\n            if config.get('train', {}).get('val_every_n_epochs') and not data.get('valid') and not is_mtl:\n                error_message = 'The value \"val_every_n_epochs\" is set in the config but no validation data is provided'\n                raise AttributeError(error_message)\n        except ConfigError as e:\n            to_train = False\n            log.warning(f'Skipping training. {e.message}')\n        else:\n            iterator = get_iterator_from_config(config, data)\n\n    if 'train' not in config:\n        log.warning('Train config is missing. Populating with default values')\n    train_config = config.get('train', {})\n\n    if start_epoch_num is not None:\n        train_config['start_epoch_num'] = start_epoch_num\n\n    trainer_class = get_model(train_config.pop('class_name', 'torch_trainer'))\n\n    trainer = trainer_class(config['chainer'], **train_config)\n\n    if to_train:\n        trainer.train(iterator)\n\n    res = {}\n\n    if iterator is not None:\n        res = trainer.evaluate(iterator, evaluation_targets)\n        trainer.get_chainer().destroy()\n\n    res = {k: v['metrics'] for k, v in res.items()}\n\n    return res\n"
  },
  {
    "path": "deeppavlov/core/commands/utils.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nfrom copy import deepcopy\nfrom pathlib import Path\nfrom typing import Any, Union, Dict, TypeVar, Optional\n\nfrom deeppavlov.core.common.file import read_json, find_config\nfrom deeppavlov.core.common.registry import inverted_registry\nfrom deeppavlov.core.data.utils import get_all_elems_from_json\n\n# noinspection PyShadowingBuiltins\n_T = TypeVar('_T', str, float, bool, list, dict)\n\n\ndef _parse_config_property(item: _T, variables: Dict[str, Union[str, Path, float, bool, int, None]],\n                           variables_exact: Dict[str, Union[str, Path, float, bool, int, None]]) -> _T:\n    \"\"\"Recursively apply config's variables values to its property\"\"\"\n    if isinstance(item, str):\n        if item in variables_exact:\n            return variables_exact[item]\n        return item.format(**variables)\n    elif isinstance(item, list):\n        return [_parse_config_property(item, variables, variables_exact) for item in item]\n    elif isinstance(item, dict):\n        return {k: _parse_config_property(v, variables, variables_exact) for k, v in item.items()}\n    else:\n        return item\n\n\ndef _get_variables_from_config(config: Union[str, Path, dict]):\n    \"\"\"Read config's variables\"\"\"\n    if isinstance(config, (str, Path)):\n        config = read_json(find_config(config))\n\n    variables = {\n       
 'DEEPPAVLOV_PATH': os.getenv(f'DP_DEEPPAVLOV_PATH', Path(__file__).parent.parent.parent)\n    }\n    variables_exact = {f'{{{k}}}': v for k, v in variables.items()}\n    for name, value in config.get('metadata', {}).get('variables', {}).items():\n        env_name = f'DP_{name}'\n        if env_name in os.environ:\n            value = os.getenv(env_name)\n        if value in variables_exact:\n            value = variables_exact[value]\n        elif isinstance(value, str):\n            value = value.format(**variables)\n        variables[name] = value\n        variables_exact[f'{{{name}}}'] = value\n\n    return variables, variables_exact\n\n\ndef _update_requirements(config: dict) -> dict:\n    \"\"\"\n    Generates requirements for DeepPavlov model and adds them as ``metadata.requirements`` field to the returned dict.\n\n    Searches for the ``class_name`` keys in the passed config at all nesting levels. For each found component,\n    function looks for dependencies in the requirements registry. Found dependencies are added to the returned copy of\n    the config as ``metadata.requirements``. 
If the config already has ``metadata.requirements``, the existing one\n    is complemented by the found requirements.\n\n    Args:\n        config: DeepPavlov model config\n    Returns:\n        config copy with updated ``metadata.requirements`` field according to the config components.\n    \"\"\"\n    components = get_all_elems_from_json(config, 'class_name')\n    components = {inverted_registry.get(component, component) for component in components}\n    requirements_registry_path = Path(__file__).parents[1] / 'common' / 'requirements_registry.json'\n    requirements_registry = read_json(requirements_registry_path)\n    requirements = []\n    for component in components:\n        requirements.extend(requirements_registry.get(component, []))\n    requirements.extend(config.get('metadata', {}).get('requirements', []))\n    response = deepcopy(config)\n    response['metadata'] = response.get('metadata', {})\n    response['metadata']['requirements'] = list(set(requirements))\n    return response\n\n\ndef _overwrite(data: Any, value: Any, nested_keys: list) -> None:\n    \"\"\"Changes ``data`` nested key value to ``value`` using ``nested_keys`` as nested keys list.\n\n    Example:\n        >>> x = {'a': [None, {'b': 2}]}\n        >>> _overwrite(x, 42, ['a', 1, 'b'])\n        >>> x\n        {'a': [None, {'b': 42}]}\n\n    \"\"\"\n    key = nested_keys.pop(0)\n    if not nested_keys:\n        data[key] = value\n    else:\n        _overwrite(data[key], value, nested_keys)\n\n\ndef parse_config(config: Union[str, Path, dict], overwrite: Optional[dict] = None) -> dict:\n    \"\"\"Apply metadata.variables values to placeholders inside config and update nested configs using overwrite parameter\n\n    Args:\n        config: Config to parse.\n        overwrite: If not None - key-value pairs of nested keys and values to overwrite config.\n            For {'chainer.pipe.0.class_name': 'simple_vocab'} it will update config\n            config['chainer']['pipe'][0]['class_name'] = 
'simple_vocab'.\n\n    \"\"\"\n    if isinstance(config, (str, Path)):\n        config = read_json(find_config(config))\n\n    if overwrite is not None:\n        for key, value in overwrite.items():\n            items = [int(item) if item.isdigit() else item for item in key.split('.')]\n            _overwrite(config, value, items)\n\n    updated_config = _update_requirements(config)\n\n    variables, variables_exact = _get_variables_from_config(updated_config)\n\n    return _parse_config_property(updated_config, variables, variables_exact)\n\n\ndef expand_path(path: Union[str, Path]) -> Path:\n    \"\"\"Convert relative paths to absolute with resolving user directory.\"\"\"\n    return Path(path).expanduser().resolve()\n\n\ndef import_packages(packages: list) -> None:\n    \"\"\"Import packages from list to execute their code.\"\"\"\n    for package in packages:\n        __import__(package)\n\n\ndef parse_value_with_config(value: Union[str, Path], config: Union[str, Path, dict]) -> Path:\n    \"\"\"Fill the variables in `value` with variables values from `config`.\n    `value` should be a string. If `value` is a string of only variable, `value` will be replaced with\n    variable's value from config (the variable's value could be anything then).\"\"\"\n    variables, variables_exact = _get_variables_from_config(config)\n\n    return _parse_config_property(str(value), variables, variables_exact)\n"
  },
  {
    "path": "deeppavlov/core/common/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/core/common/aliases.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nALIASES = {\n    'kbqa_cq': 'kbqa_cq_en',\n    'kbqa_cq_online': 'kbqa_cq_en',\n    'kbqa_cq_rus': 'kbqa_cq_ru',\n    'multi_squad_noans': 'qa_squad2_bert',\n    'multi_squad_noans_infer': 'qa_squad2_bert',\n    'multi_squad_retr_noans': 'qa_squad2_bert',\n    'ner_collection3_m1': 'ner_collection3_bert',\n    'ner_conll2003': 'ner_conll2003_bert',\n    'ner_conll2003_torch_bert': 'ner_conll2003_bert',\n    'ner_dstc2': 'ner_conll2003_bert',\n    'ner_few_shot_ru': 'ner_rus_bert',\n    'ner_few_shot_ru_simulate': 'ner_rus_bert',\n    'ner_ontonotes': 'ner_ontonotes_bert',\n    'ner_ontonotes_bert_emb': 'ner_ontonotes_bert',\n    'ner_ontonotes_bert_mult_torch': 'ner_ontonotes_bert_mult',\n    'ner_ontonotes_bert_torch': 'ner_ontonotes_bert',\n    'ner_rus': 'ner_rus_bert',\n    'paraphraser_bert': 'paraphraser_rubert',\n    'ru_odqa_infer_wiki_rubert': 'ru_odqa_infer_wiki',\n    'sentseg_dailydialog': 'sentseg_dailydialog_bert',\n    'squad': 'squad_bert',\n    'squad_bert_infer': 'squad_bert',\n    'squad_bert_multilingual_freezed_emb': 'squad_bert',\n    'squad_ru': 'squad_ru_bert',\n    'squad_ru_bert_infer': 'squad_ru_bert',\n    'squad_ru_convers_distilrubert_2L_infer': 'squad_ru_convers_distilrubert_2L',\n    'squad_ru_convers_distilrubert_6L_infer': 'squad_ru_convers_distilrubert_6L',\n    'squad_ru_rubert': 'squad_ru_bert',\n   
 'squad_ru_rubert_infer': 'squad_ru_bert',\n    'squad_torch_bert': 'squad_bert',\n    'squad_torch_bert_infer': 'squad_bert'\n}\n"
  },
  {
    "path": "deeppavlov/core/common/base.py",
    "content": "# Copyright 2021 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom types import FunctionType\nfrom typing import List, Optional, Union\n\nfrom deeppavlov.core.common.chainer import Chainer\nfrom deeppavlov.core.models.component import Component\n\n\nclass Element:\n    \"\"\"DeepPavlov model pipeline element.\"\"\"\n    def __init__(self, component: Union[Component, FunctionType],\n                 x: Optional[Union[str, list]] = None,\n                 out: Optional[Union[str, list]] = None,\n                 y: Optional[Union[str, list]] = None,\n                 main: bool = False) -> None:\n        \"\"\"\n        Args:\n            component: Pipeline component object.\n            x: Names of the component inference inputs. Output from other pipeline elements with such names will be fed\n                to the input of this component.\n            out: Names of the component inference outputs. Component outputs can be fed to other pipeline elements\n                using this names.\n            y: Names of additional inputs (targets) for component training and evaluation.\n            main: Set True if this is the main component. 
Main component is trained during model training process.\n        \"\"\"\n        self.component = component\n        self.x = x\n        self.y = y\n        self.out = out\n        self.main = main\n\n\nclass Model(Chainer):\n    \"\"\"Builds a component pipeline to train and infer models.\"\"\"\n    def __init__(self, x: Optional[Union[str, list]] = None,\n                 out: Optional[Union[str, list]] = None,\n                 y: Optional[Union[str, list]] = None,\n                 pipe: Optional[List[Element]] = None) -> None:\n        \"\"\"\n        Args:\n            x: Names of pipeline inference inputs.\n            out: Names of pipeline inference outputs.\n            y: Names of additional inputs (targets) for pipeline training and evaluation.\n            pipe: List of pipeline elements.\n        \"\"\"\n        super().__init__(in_x=x, out_params=out, in_y=y)\n        if pipe is not None:\n            for element in pipe:\n                self.append(element.component, element.x, element.out, element.y, element.main)\n"
  },
  {
    "path": "deeppavlov/core/common/chainer.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport pickle\nfrom itertools import islice\nfrom logging import getLogger\nfrom types import FunctionType\nfrom typing import Union, Tuple, List, Optional, Hashable, Reversible\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.core.models.nn_model import NNModel\nfrom deeppavlov.core.models.serializable import Serializable\n\nlog = getLogger(__name__)\n\n\nclass Chainer(Component):\n    \"\"\"\n    Builds a component pipeline from heterogeneous components (Rule-based/ML/DL). 
It allows to train\n    and infer models in a pipeline as a whole.\n\n    Attributes:\n        pipe: list of components and their input and output variable names for inference\n        train_pipe: list of components and their input and output variable names for training and evaluation\n        in_x: names of inputs for pipeline inference mode\n        out_params: names of pipeline inference outputs\n        in_y: names of additional inputs for pipeline training and evaluation modes\n        forward_map: list of all variables in chainer's memory after  running every component in ``self.pipe``\n        train_map: list of all variables in chainer's memory after  running every component in ``train_pipe.pipe``\n        main: reference to the main component\n\n    Args:\n        in_x: names of inputs for pipeline inference mode\n        out_params: names of pipeline inference outputs\n        in_y: names of additional inputs for pipeline training and evaluation modes\n    \"\"\"\n\n    def __init__(self, in_x: Union[str, list] = None, out_params: Union[str, list] = None,\n                 in_y: Union[str, list] = None, *args, **kwargs) -> None:\n        self.pipe: List[Tuple[Tuple[List[str], List[str]], List[str], Component]] = []\n        self.train_pipe = []\n        if isinstance(in_x, str):\n            in_x = [in_x]\n        if isinstance(in_y, str):\n            in_y = [in_y]\n        if isinstance(out_params, str):\n            out_params = [out_params]\n        self.in_x = in_x or ['x']\n        self.in_y = in_y or ['y']\n        self.out_params = out_params or self.in_x\n\n        self.forward_map = set(self.in_x)\n        self.train_map = self.forward_map.union(self.in_y)\n\n        self._components_dict = {}\n\n        self.main = None\n\n    def __getitem__(self, item):\n        if isinstance(item, int):\n            in_params, out_params, component = self.train_pipe[item]\n            return component\n        return self._components_dict[item]\n\n    def 
_ipython_key_completions_(self):\n        return self._components_dict.keys()\n\n    def __repr__(self):\n        reversed_components_dict = {v: f'{repr(k)}: ' for k, v in self._components_dict.items()\n                                    if isinstance(v, Hashable)}\n\n        components_list = []\n        for in_params, out_params, component in self.train_pipe:\n            component_repr = repr(component)\n            if isinstance(component, Hashable) and component in reversed_components_dict:\n                component_repr = reversed_components_dict[component] + component_repr\n            else:\n                for k, v in self._components_dict.items():\n                    if v is component:\n                        component_repr = f'{k}: {component_repr}'\n                        break\n            components_list.append(component_repr)\n\n        return f'Chainer[{\", \".join(components_list)}]'\n\n    def _repr_pretty_(self, p, cycle):\n        \"\"\"method that defines ``Struct``'s pretty printing rules for iPython\n\n        Args:\n            p (IPython.lib.pretty.RepresentationPrinter): pretty printer object\n            cycle (bool): is ``True`` if pretty detected a cycle\n        \"\"\"\n        if cycle:\n            p.text('Chainer(...)')\n        else:\n            with p.group(8, 'Chainer[', ']'):\n                reversed_components_dict = {v: k for k, v in self._components_dict.items()\n                                            if isinstance(v, Hashable)}\n                # p.pretty(self.__prepare_repr())\n                for i, (in_params, out_params, component) in enumerate(self.train_pipe):\n                    if i > 0:\n                        p.text(',')\n                        p.breakable()\n                    if isinstance(component, Hashable) and component in reversed_components_dict:\n                        p.pretty(reversed_components_dict[component])\n                        p.text(': ')\n                    else:\n           
             for k, v in self._components_dict.items():\n                            if v is component:\n                                p.pretty(k)\n                                p.text(': ')\n                                break\n                    p.pretty(component)\n\n    def append(self, component: Union[Component, FunctionType], in_x: [str, list, dict] = None,\n               out_params: [str, list] = None, in_y: [str, list, dict] = None, main: bool = False):\n        if isinstance(in_x, str):\n            in_x = [in_x]\n        if isinstance(in_y, str):\n            in_y = [in_y]\n        if isinstance(out_params, str):\n            out_params = [out_params]\n        in_x = in_x or self.in_x\n\n        if isinstance(in_x, dict):\n            x_keys, in_x = zip(*in_x.items())\n        else:\n            x_keys = []\n        out_params = out_params or in_x\n        if in_y is not None:\n            if isinstance(in_y, dict):\n                y_keys, in_y = zip(*in_y.items())\n            else:\n                y_keys = []\n            keys = x_keys + y_keys\n\n            if bool(x_keys) != bool(y_keys):\n                raise ConfigError('`in` and `in_y` for a component have to both be lists or dicts')\n\n            component: NNModel\n            main = True\n            assert self.train_map.issuperset(in_x + in_y), ('Arguments {} are expected but only {} are set'\n                                                            .format(in_x + in_y, self.train_map))\n            preprocessor = Chainer(self.in_x, in_x + in_y, self.in_y)\n            for (t_in_x_keys, t_in_x), t_out, t_component in self.train_pipe:\n                if t_in_x_keys:\n                    t_in_x = dict(zip(t_in_x_keys, t_in_x))\n                preprocessor.append(t_component, t_in_x, t_out)\n\n            def train_on_batch(*args, **kwargs):\n                preprocessed = preprocessor.compute(*args, **kwargs)\n                if len(in_x + in_y) == 1:\n                    
preprocessed = [preprocessed]\n                if keys:\n                    return component.train_on_batch(**dict(zip(keys, preprocessed)))\n                else:\n                    return component.train_on_batch(*preprocessed)\n\n            self.train_on_batch = train_on_batch\n            self.process_event = component.process_event\n        if main:\n            self.main = component\n        if self.forward_map.issuperset(in_x):\n            self.pipe.append(((x_keys, in_x), out_params, component))\n            self.forward_map = self.forward_map.union(out_params)\n\n        if self.train_map.issuperset(in_x):\n            self.train_pipe.append(((x_keys, in_x), out_params, component))\n            self.train_map = self.train_map.union(out_params)\n        else:\n            raise ConfigError('Arguments {} are expected but only {} are set'.format(in_x, self.train_map))\n\n    def compute(self, x, y=None, targets=None):\n        if targets is None:\n            targets = self.out_params\n        in_params = list(self.in_x)\n        if len(in_params) == 1:\n            args = [x]\n        else:\n            args = list(zip(*x))\n\n        if y is None:\n            pipe = self.pipe\n        else:\n            pipe = self.train_pipe\n            if len(self.in_y) == 1:\n                args.append(y)\n            else:\n                args += list(zip(*y))\n            in_params += self.in_y\n\n        return self._compute(*args, pipe=pipe, param_names=in_params, targets=targets)\n\n    def __call__(self, *args):\n        return self._compute(*args, param_names=self.in_x, pipe=self.pipe, targets=self.out_params)\n\n    @staticmethod\n    def _compute(*args, param_names, pipe, targets):\n        expected = set(targets)\n        final_pipe = []\n        for (in_keys, in_params), out_params, component in reversed(pipe):\n            if expected.intersection(out_params):\n                expected = expected - set(out_params) | set(in_params)\n                
final_pipe.append(((in_keys, in_params), out_params, component))\n        final_pipe.reverse()\n        if not expected.issubset(param_names):\n            raise RuntimeError(f'{expected} are required to compute {targets} but were not found in memory or inputs')\n        pipe = final_pipe\n\n        mem = dict(zip(param_names, args))\n        del args\n\n        for (in_keys, in_params), out_params, component in pipe:\n            x = [mem[k] for k in in_params]\n            if in_keys:\n                res = component.__call__(**dict(zip(in_keys, x)))\n            else:\n                res = component.__call__(*x)\n            if len(out_params) == 1:\n                mem[out_params[0]] = res\n            else:\n                mem.update(zip(out_params, res))\n\n        res = [mem[k] for k in targets]\n        if len(res) == 1:\n            res = res[0]\n        return res\n\n    def batched_call(self, *args: Reversible, batch_size: int = 16) -> Union[list, Tuple[list, ...]]:\n        \"\"\"\n        Partitions data into mini-batches and applies :meth:`__call__` to each batch.\n\n        Args:\n            args: input data, each element of the data corresponds to a single model inputs sequence.\n            batch_size: the size of a batch.\n\n        Returns:\n            the model output as if the data was passed to the :meth:`__call__` method.\n        \"\"\"\n        args = [iter(arg) for arg in args]\n        answer = [[] for _ in self.out_params]\n\n        while True:\n            batch = [list(islice(arg, batch_size)) for arg in args]\n            if not any(batch):  # empty batch, reached the end\n                break\n\n            curr_answer = self.__call__(*batch)\n            if len(self.out_params) == 1:\n                curr_answer = [curr_answer]\n\n            for y, curr_y in zip(answer, curr_answer):\n                y.extend(curr_y)\n\n        if len(self.out_params) == 1:\n            answer = answer[0]\n        return answer\n\n    def 
get_main_component(self) -> Optional[Serializable]:\n        try:\n            return self.main or self.pipe[-1][-1]\n        except IndexError:\n            log.warning('Cannot get a main component for an empty chainer')\n            return None\n\n    def save(self) -> None:\n        main_component = self.get_main_component()\n        if isinstance(main_component, Serializable):\n            main_component.save()\n\n    def load(self) -> None:\n        for in_params, out_params, component in self.train_pipe:\n            if callable(getattr(component, 'load', None)):\n                component.load()\n\n    def reset(self) -> None:\n        for in_params, out_params, component in self.train_pipe:\n            if callable(getattr(component, 'reset', None)):\n                component.reset()\n\n    def destroy(self):\n        if hasattr(self, 'train_pipe'):\n            for in_params, out_params, component in self.train_pipe:\n                if callable(getattr(component, 'destroy', None)):\n                    component.destroy()\n            self.train_pipe.clear()\n        if hasattr(self, 'pipe'):\n            self.pipe.clear()\n        super().destroy()\n"
  },
  {
    "path": "deeppavlov/core/common/cross_validation.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport shutil\nfrom collections import OrderedDict\nfrom logging import getLogger\nfrom pathlib import Path\n\nimport numpy as np\nfrom sklearn.model_selection import KFold\n\nfrom deeppavlov.core.commands.train import train_evaluate_model_from_config, get_iterator_from_config, \\\n    read_data_by_config\nfrom deeppavlov.core.commands.utils import expand_path, parse_config\nfrom deeppavlov.core.common.params_search import ParamsSearch\n\nSAVE_PATH_ELEMENT_NAME = 'save_path'\nTEMP_DIR_FOR_CV = 'cv_tmp'\nlog = getLogger(__name__)\n\n\ndef change_savepath_for_model(config):\n    params_helper = ParamsSearch()\n\n    dirs_for_saved_models = set()\n    for p in params_helper.find_model_path(config, SAVE_PATH_ELEMENT_NAME):\n        p.append(SAVE_PATH_ELEMENT_NAME)\n        save_path = Path(params_helper.get_value_from_config(config, p))\n        new_save_path = save_path.parent / TEMP_DIR_FOR_CV / save_path.name\n\n        dirs_for_saved_models.add(expand_path(new_save_path.parent))\n\n        params_helper.insert_value_or_dict_into_config(config, p, str(new_save_path))\n\n    return config, dirs_for_saved_models\n\n\ndef delete_dir_for_saved_models(dirs_for_saved_models):\n    for new_save_dir in dirs_for_saved_models:\n        shutil.rmtree(str(new_save_dir))\n\n\ndef create_dirs_to_save_models(dirs_for_saved_models):\n    for 
new_save_dir in dirs_for_saved_models:\n        new_save_dir.mkdir(exist_ok=True, parents=True)\n\n\ndef generate_train_valid(data, n_folds=5, is_loo=False):\n    all_data = data['train'] + data['valid']\n\n    if is_loo:\n        # for Leave One Out\n        for i in range(len(all_data)):\n            data_i = {\n                'train': all_data.copy(),\n                'test': data['test']\n            }\n            data_i['valid'] = [data_i['train'].pop(i)]\n\n            yield data_i\n    else:\n        # for Cross Validation\n        kf = KFold(n_splits=n_folds, shuffle=True)\n        for train_index, valid_index in kf.split(all_data):\n            data_i = {\n                'train': [all_data[i] for i in train_index],\n                'valid': [all_data[i] for i in valid_index],\n                'test': data['test']\n            }\n\n            yield data_i\n\n\ndef calc_cv_score(config, data=None, n_folds=5, is_loo=False):\n    config = parse_config(config)\n\n    if data is None:\n        data = read_data_by_config(config)\n\n    config, dirs_for_saved_models = change_savepath_for_model(config)\n\n    cv_score = OrderedDict()\n    for data_i in generate_train_valid(data, n_folds=n_folds, is_loo=is_loo):\n        iterator = get_iterator_from_config(config, data_i)\n        create_dirs_to_save_models(dirs_for_saved_models)\n        score = train_evaluate_model_from_config(config, iterator=iterator)\n        delete_dir_for_saved_models(dirs_for_saved_models)\n        for key, value in score['valid'].items():\n            if key not in cv_score:\n                cv_score[key] = []\n            cv_score[key].append(value)\n\n    for key, value in cv_score.items():\n        cv_score[key] = np.mean(value)\n        log.info('Cross-Validation \\\"{}\\\" is: {}'.format(key, cv_score[key]))\n\n    return cv_score\n"
  },
  {
    "path": "deeppavlov/core/common/errors.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport logging\n\nlogger = logging.getLogger(__name__)\n\n\nclass ConfigError(Exception):\n    \"\"\"Any configuration error.\"\"\"\n\n    def __init__(self, message):\n        super(ConfigError, self).__init__()\n        self.message = message\n\n    def __str__(self):\n        return repr(self.message)\n"
  },
  {
    "path": "deeppavlov/core/common/file.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport json\nimport pickle\nfrom collections import OrderedDict\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Union, Any, Iterable\n\nfrom deeppavlov.core.common.aliases import ALIASES\n\nlog = getLogger(__name__)\n\n_red_text, _reset_text_color, _sharp_line = \"\\x1b[31;20m\", \"\\x1b[0m\", '#'*80\nDEPRECATOIN_MSG = f\"{_red_text}\\n\\n{_sharp_line}\\n\" \\\n                  \"# The model '{0}' has been removed from the DeepPavlov configs.\\n\" \\\n                  \"# The model '{1}' is used instead.\\n\" \\\n                  \"# To disable this message please switch to '{1}'.\\n\" \\\n                  \"# Automatic name resolving will be disabled in the deeppavlov 1.2.0,\\n\" \\\n                  \"# and if you try to use '{0}' you will get an ERROR.\\n\" \\\n                  f\"{_sharp_line}{_reset_text_color}\\n\"\n\n\ndef find_config(pipeline_config_path: Union[str, Path]) -> Path:\n    if pipeline_config_path in ALIASES:\n        new_pipeline_config_path = ALIASES[pipeline_config_path]\n        log.warning(DEPRECATOIN_MSG.format(pipeline_config_path, new_pipeline_config_path))\n        pipeline_config_path = new_pipeline_config_path\n\n    if not Path(pipeline_config_path).is_file():\n        configs = [c for c in 
Path(__file__).parent.parent.parent.glob(f'configs/**/{pipeline_config_path}.json')\n                   if str(c.with_suffix('')).endswith(pipeline_config_path)]  # a simple way to not allow * and ?\n        if configs:\n            log.debug(f\"Interpreting '{pipeline_config_path}' as '{configs[0]}'\")\n            pipeline_config_path = configs[0]\n\n    return Path(pipeline_config_path)\n\n\ndef read_json(fpath: Union[str, Path]) -> dict:\n    with open(fpath, encoding='utf8') as fin:\n        return json.load(fin, object_pairs_hook=OrderedDict)\n\n\ndef save_json(data: dict, fpath: Union[str, Path]) -> None:\n    with open(fpath, 'w', encoding='utf8') as fout:\n        json.dump(data, fout, ensure_ascii=False, indent=2)\n\n\ndef save_pickle(data: dict, fpath: Union[str, Path]) -> None:\n    with open(fpath, 'wb') as fout:\n        pickle.dump(data, fout, protocol=4)\n\n\ndef load_pickle(fpath: Union[str, Path]) -> Any:\n    with open(fpath, 'rb') as fin:\n        return pickle.load(fin)\n\n\ndef save_jsonl(data: Iterable[dict], fpath: Union[str, Path]) -> None:\n    with open(fpath, 'w') as f:\n        for item in data:\n            f.write(f\"{json.dumps(item, ensure_ascii=False)}\\n\")\n"
  },
  {
    "path": "deeppavlov/core/common/log.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport json\nimport logging\nimport logging.config\nfrom pathlib import Path\n\nfrom .paths import get_settings_path\n\nLOG_CONFIG_FILENAME = 'log_config.json'\nTRACEBACK_LOGGER_ERRORS = True\n\nroot_path = Path(__file__).resolve().parents[3]\n\nlog_config_path = get_settings_path() / LOG_CONFIG_FILENAME\n\nwith log_config_path.open(encoding='utf8') as log_config_json:\n    log_config = json.load(log_config_json)\n\n\nclass ProbeFilter(logging.Filter):\n    \"\"\"ProbeFilter class is used to filter POST requests to /probe endpoint from logs.\"\"\"\n\n    def filter(self, record: logging.LogRecord) -> bool:\n        \"\"\"To log the record method should return True.\"\"\"\n        return 'POST /probe HTTP' not in record.getMessage()\n\n\ndef init_logger():\n    configured_loggers = [log_config.get('root', {})] + [logger for logger in\n                                                         log_config.get('loggers', {}).values()]\n\n    used_handlers = {handler for log in configured_loggers for handler in log.get('handlers', [])}\n\n    for handler_id, handler in list(log_config['handlers'].items()):\n        if handler_id not in used_handlers:\n            del log_config['handlers'][handler_id]\n        elif 'filename' in handler.keys():\n            filename = handler['filename']\n            logfile_path = 
Path(filename).expanduser().resolve()\n            handler['filename'] = str(logfile_path)\n\n    logging.config.dictConfig(log_config)\n"
  },
  {
    "path": "deeppavlov/core/common/log_events.py",
    "content": "# Copyright 2019 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom typing import Optional\nfrom deeppavlov.core.commands.utils import expand_path\n\nlog = getLogger(__name__)\n\n\nclass TBWriter:\n    def __init__(self, tensorboard_log_dir: str):\n        # TODO: After adding wandb logger, create common parent class for both loggers\n        from torch.utils.tensorboard import SummaryWriter\n        tensorboard_log_dir = expand_path(tensorboard_log_dir)\n        self.tb_train_writer = SummaryWriter(str(tensorboard_log_dir / 'train_log'))\n        self.tb_valid_writer = SummaryWriter(str(tensorboard_log_dir / 'valid_log'))\n\n    # TODO: find how to write Summary\n    def write_train(self, tag, scalar_value, global_step):\n        self.tb_train_writer.add_scalar(tag, scalar_value, global_step)\n\n    def write_valid(self, tag, scalar_value, global_step):\n        self.tb_valid_writer.add_scalar(tag, scalar_value, global_step)\n\n    def flush(self):\n        self.tb_train_writer.flush()\n        self.tb_valid_writer.flush()\n\n\ndef get_tb_writer(tensorboard_log_dir: Optional[str]) -> Optional[TBWriter]:\n    try:\n        if tensorboard_log_dir is not None:\n            tb_writer = TBWriter(tensorboard_log_dir)\n        else:\n            tb_writer = None\n    except ImportError:\n        log.error('Failed to import SummaryWriter from torch.utils.tensorboard. 
Failed to initialize Tensorboard '\n                  'logger. Install appropriate Pytorch version to use this logger or remove tensorboard_log_dir '\n                  'parameter from the train parameters list in the configuration file.')\n        tb_writer = None\n    return tb_writer\n"
  },
  {
    "path": "deeppavlov/core/common/metrics_registry.json",
    "content": "{\n  \"acc\": \"deeppavlov.metrics.accuracy:round_accuracy\",\n  \"accuracy\": \"deeppavlov.metrics.accuracy:accuracy\",\n  \"average__ner_f1__f1_macro__f1\": \"deeppavlov.metrics.fmeasure:ner_f1__f1_macro__f1\",\n  \"average__roc_auc__roc_auc__ner_f1\": \"deeppavlov.metrics.fmeasure:roc_auc__roc_auc__ner_f1\",\n  \"bleu\": \"deeppavlov.metrics.bleu:bleu\",\n  \"bleu_advanced\": \"deeppavlov.metrics.bleu:bleu_advanced\",\n  \"elmo_loss2ppl\": \"deeppavlov.metrics.elmo_metrics:elmo_loss2ppl\",\n  \"f1\": \"deeppavlov.metrics.fmeasure:round_f1\",\n  \"f1_macro\": \"deeppavlov.metrics.fmeasure:round_f1_macro\",\n  \"f1_weighted\": \"deeppavlov.metrics.fmeasure:round_f1_weighted\",\n  \"google_bleu\": \"deeppavlov.metrics.bleu:google_bleu\",\n  \"kbqa_accuracy\": \"deeppavlov.metrics.accuracy:kbqa_accuracy\",\n  \"log_loss\": \"deeppavlov.metrics.log_loss:sk_log_loss\",\n  \"matthews_correlation\": \"deeppavlov.metrics.correlation:matthews_correlation\",\n  \"mean_squared_error\": \"deeppavlov.metrics.mse:mse\",\n  \"multitask_accuracy\": \"deeppavlov.metrics.accuracy:multitask_accuracy\",\n  \"multitask_sequence_accuracy\": \"deeppavlov.metrics.accuracy:multitask_sequence_accuracy\",\n  \"multitask_token_accuracy\": \"deeppavlov.metrics.accuracy:multitask_token_accuracy\",\n  \"ner_f1\": \"deeppavlov.metrics.fmeasure:ner_f1\",\n  \"ner_token_f1\": \"deeppavlov.metrics.fmeasure:ner_token_f1\",\n  \"pearson_correlation\": \"deeppavlov.metrics.correlation:pearson_correlation\",\n  \"per_item_bleu\": \"deeppavlov.metrics.bleu:per_item_bleu\",\n  \"per_item_dialog_accuracy\": \"deeppavlov.metrics.accuracy:per_item_dialog_accuracy\",\n  \"per_item_dialog_bleu\": \"deeppavlov.metrics.bleu:per_item_dialog_bleu\",\n  \"per_token_accuracy\": \"deeppavlov.metrics.accuracy:per_token_accuracy\",\n  \"r@1\": \"deeppavlov.metrics.recall_at_k:r_at_1\",\n  \"r@10\": \"deeppavlov.metrics.recall_at_k:r_at_10\",\n  \"r@1_insQA\": 
\"deeppavlov.models.ranking.metrics:r_at_1_insQA\",\n  \"r@2\": \"deeppavlov.metrics.recall_at_k:r_at_2\",\n  \"r@5\": \"deeppavlov.metrics.recall_at_k:r_at_5\",\n  \"rank_response\": \"deeppavlov.models.ranking.metrics:rank_response\",\n  \"roc_auc\": \"deeppavlov.metrics.roc_auc_score:roc_auc_score\",\n  \"sets_accuracy\": \"deeppavlov.metrics.accuracy:sets_accuracy\",\n  \"slots_accuracy\": \"deeppavlov.metrics.accuracy:slots_accuracy\",\n  \"spearman_correlation\": \"deeppavlov.metrics.correlation:spearman_correlation\",\n  \"squad_v1_em\": \"deeppavlov.metrics.squad_metrics:squad_v1_exact_match\",\n  \"squad_v1_f1\": \"deeppavlov.metrics.squad_metrics:squad_v1_f1\",\n  \"squad_v2_em\": \"deeppavlov.metrics.squad_metrics:squad_v2_exact_match\",\n  \"squad_v2_f1\": \"deeppavlov.metrics.squad_metrics:squad_v2_f1\",\n  \"record_f1_score\": \"deeppavlov.metrics.record_metrics:record_f1_score\",\n  \"record_em_score\": \"deeppavlov.metrics.record_metrics:record_em_score\"\n}\n"
  },
  {
    "path": "deeppavlov/core/common/metrics_registry.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport importlib\nimport json\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Callable, Any\n\nfrom deeppavlov.core.common.errors import ConfigError\n\nlog = getLogger(__name__)\n\n_registry_path = Path(__file__).parent / 'metrics_registry.json'\nif _registry_path.exists():\n    with _registry_path.open(encoding='utf-8') as f:\n        _REGISTRY = json.load(f)\nelse:\n    _REGISTRY = {}\n\n\ndef fn_from_str(name: str) -> Callable[..., Any]:\n    \"\"\"Returns a function object with the name given in string.\"\"\"\n    try:\n        module_name, fn_name = name.split(':')\n        return getattr(importlib.import_module(module_name), fn_name)\n    except ValueError:\n        raise ConfigError('Expected function description in a `module.submodules:function_name` form, but got `{}`'\n                          .format(name))\n    except AttributeError:\n        # noinspection PyUnboundLocalVariable\n        raise ConfigError(f\"Incorrect metric: '{module_name}' has no attribute '{fn_name}'.\")\n\n\ndef register_metric(metric_name: str) -> Callable[..., Any]:\n    \"\"\"Decorator for metric registration.\"\"\"\n\n    def decorate(fn):\n        fn_name = fn.__module__ + ':' + fn.__name__\n        if metric_name in _REGISTRY and _REGISTRY[metric_name] != fn_name:\n            log.warning('\"{}\" is already 
registered as a metric name, the old function will be ignored'\n                        .format(metric_name))\n        _REGISTRY[metric_name] = fn_name\n        return fn\n\n    return decorate\n\n\ndef get_metric_by_name(name: str) -> Callable[..., Any]:\n    \"\"\"Returns a metric callable with a corresponding name.\"\"\"\n    name = _REGISTRY.get(name, name)\n    return fn_from_str(name)\n"
  },
  {
    "path": "deeppavlov/core/common/params.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport inspect\nfrom logging import getLogger\nfrom types import FunctionType\nfrom typing import Any, Dict, Union\n\nfrom deeppavlov.core.commands.utils import expand_path, parse_config\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import get_model\nfrom deeppavlov.core.models.component import Component\n\nlog = getLogger(__name__)\n\n_refs = {}\n\n\ndef resolve(val):\n    if isinstance(val, str) and val.startswith('#'):\n        component_id, *attributes = val[1:].split('.')\n        try:\n            val = _refs[component_id]\n        except KeyError:\n            e = ConfigError('Component with id \"{id}\" was referenced but not initialized'\n                            .format(id=component_id))\n            log.exception(e)\n            raise e\n        attributes = ['val'] + attributes\n        val = eval('.'.join(attributes))\n    return val\n\n\ndef _init_param(param, mode):\n    if isinstance(param, str):\n        param = resolve(param)\n    elif isinstance(param, (list, tuple)):\n        param = [_init_param(p, mode) for p in param]\n    elif isinstance(param, dict):\n        if {'ref', 'class_name', 'config_path'}.intersection(param.keys()):\n            param = from_params(param, mode=mode)\n        else:\n            param = {k: _init_param(v, mode) for k, v in 
param.items()}\n    return param\n\n\ndef from_params(params: Dict, mode: str = 'infer', **kwargs) -> Union[Component, FunctionType]:\n    \"\"\"Builds and returns the Component from corresponding dictionary of parameters.\"\"\"\n    # what is passed in json:\n    config_params = {k: resolve(v) for k, v in params.items()}\n\n    # get component by reference (if any)\n    if 'ref' in config_params:\n        try:\n            return _refs[config_params['ref']]\n        except KeyError:\n            e = ConfigError('Component with id \"{id}\" was referenced but not initialized'\n                            .format(id=config_params['ref']))\n            log.exception(e)\n            raise e\n\n    elif 'config_path' in config_params:\n        from deeppavlov.core.commands.infer import build_model\n        refs = _refs.copy()\n        _refs.clear()\n        config = parse_config(expand_path(config_params['config_path']), config_params.get('overwrite'))\n        model = build_model(config)\n        _refs.clear()\n        _refs.update(refs)\n        try:\n            _refs[config_params['id']] = model\n        except KeyError:\n            pass\n        return model\n\n    cls_name = config_params.pop('class_name', None)\n    if not cls_name:\n        e = ConfigError('Component config has no `class_name` nor `ref` fields')\n        log.exception(e)\n        raise e\n    obj = get_model(cls_name)\n\n    if inspect.isclass(obj):\n        # find the submodels params recursively\n        config_params = {k: _init_param(v, mode) for k, v in config_params.items()}\n        try:\n            spec = inspect.getfullargspec(obj)\n            if 'mode' in spec.args + spec.kwonlyargs or spec.varkw is not None:\n                kwargs['mode'] = mode\n\n            component = obj(**dict(config_params, **kwargs))\n            try:\n                _refs[config_params['id']] = component\n            except KeyError:\n                pass\n        except Exception:\n            
log.exception(\"Exception in {}\".format(obj))\n            raise\n    else:\n        component = obj\n\n    return component\n"
  },
  {
    "path": "deeppavlov/core/common/params_search.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport random\nfrom copy import deepcopy\nfrom logging import getLogger\nfrom typing import List, Generator, Any, Tuple\n\nimport numpy as np\n\nfrom deeppavlov.core.common.registry import register\n\nlog = getLogger(__name__)\n\n\n@register('params_search')\nclass ParamsSearch:\n    \"\"\"\n    Class determine the main operations for parameters search\n    like finding all changing parameters.\n\n    Args:\n        prefix: prefix to determine special keys like \"`prefix`_range\", \"`prefix`_bool\", \"`prefix`_choice\"\n        seed: random seed for initialization\n        **kwargs: basic config with parameters\n\n    Attributes:\n        basic_config: dictionary with initial config with possible values of searched parameters\n        prefix: prefix to determine special keys like \"`prefix`_range\", \"`prefix`_bool\", \"`prefix`_choice\"\n        paths_to_params: list of lists of keys and/or integers (for list)\n                with relative paths to searched parameters\n        n_params: number of searched parameters\n        eps: threshold value\n    \"\"\"\n\n    def __init__(self,\n                 prefix=\"search\",\n                 seed: int = None,\n                 **kwargs):\n        \"\"\"\n        Initialize evolution with random population\n        \"\"\"\n\n        self.basic_config = deepcopy(kwargs)\n        self.prefix 
= prefix\n\n        self.paths_to_params = []\n        for search_type in [prefix + \"_range\", prefix + \"_choice\", prefix + \"_bool\"]:\n            for path_ in self.find_model_path(self.basic_config, search_type):\n                self.paths_to_params.append(path_)\n\n        self.n_params = len(self.paths_to_params)\n\n        self.eps = 1e-6\n\n        if seed is None:\n            pass\n        else:\n            np.random.seed(seed)\n            random.seed(seed)\n\n    def find_model_path(self, config: dict, key_model: str, path: list = []) -> Generator:\n        \"\"\"\n        Find paths to all dictionaries in config that contain key 'key_model'\n\n        Args:\n            config: dictionary\n            key_model: key of sub-dictionary to be found\n            path: list of keys and/or integers (for list) with relative path (needed for recursion)\n\n        Returns:\n            path in config -- list of keys (strings and integers)\n        \"\"\"\n        config_pointer = config\n        if isinstance(config_pointer, dict) and key_model in config_pointer.keys():\n            yield path\n        else:\n            if isinstance(config_pointer, dict):\n                for key in list(config_pointer.keys()):\n                    for path_ in self.find_model_path(config_pointer[key], key_model, path + [key]):\n                        yield path_\n            elif isinstance(config_pointer, list):\n                for i in range(len(config_pointer)):\n                    for path_ in self.find_model_path(config_pointer[i], key_model, path + [i]):\n                        yield path_\n\n    @staticmethod\n    def insert_value_or_dict_into_config(config: dict, path: list,\n                                         value: [int, float, str, bool, list, dict, np.ndarray]) -> None:\n        \"\"\"\n        Insert value to dictionary determined by path[:-1] in field with key path[-1]\n\n        Args:\n            config: dictionary\n            path: list of 
keys and/or integers (for list)\n            value: value to be inserted\n\n        Returns:\n            config with inserted value\n        \"\"\"\n        config_pointer = config\n        for el in path[:-1]:\n            if isinstance(config_pointer, dict):\n                config_pointer = config_pointer.setdefault(el, {})\n            elif isinstance(config_pointer, list):\n                config_pointer = config_pointer[el]\n            else:\n                pass\n        config_pointer[path[-1]] = value\n\n    @staticmethod\n    def get_value_from_config(config: dict, path: list) -> Any:\n        \"\"\"\n        Return value of config element determined by path\n\n        Args:\n            config: dictionary\n            path: list of keys and/or integers (for list)\n\n        Returns:\n            value\n        \"\"\"\n        config_copy = deepcopy(config)\n        config_pointer = config_copy\n        for el in path[:-1]:\n            if isinstance(config_pointer, dict):\n                config_pointer = config_pointer.setdefault(el, {})\n            elif isinstance(config_pointer, list):\n                config_pointer = config_pointer[el]\n            else:\n                pass\n        return config_pointer[path[-1]]\n\n    @staticmethod\n    def remove_key_from_config(config: dict, path: list) -> Tuple[dict, Any]:\n        \"\"\"\n        Remove config element determined by path\n\n        Args:\n            config: dictionary\n            path: list of keys and/or integers (for list)\n\n        Returns:\n            dictionary without value from path, value from path\n        \"\"\"\n        config_copy = deepcopy(config)\n        config_pointer = config_copy\n        for el in path[:-1]:\n            if isinstance(config_pointer, dict):\n                config_pointer = config_pointer.setdefault(el, {})\n            elif isinstance(config_pointer, list):\n                config_pointer = config_pointer[el]\n            else:\n                
pass\n        value = config_pointer.pop(path[-1])\n        return config_copy, value\n\n    def initialize_params_in_config(self, basic_config: dict, paths: List[list]) -> dict:\n        \"\"\"\n        Randomly initialize all the changable parameters in config\n\n        Args:\n            basic_config: config where changable parameters are dictionaries with keys\n                ``prefix`_range`, ``prefix`_bool`, ``prefix`_choice`\n            paths: list of paths to changable parameters\n\n        Returns:\n            config\n        \"\"\"\n        config = deepcopy(basic_config)\n        for path_ in paths:\n            param_name = path_[-1]\n            value = self.get_value_from_config(basic_config, path_)\n            if isinstance(value, dict):\n                if (value.get(self.prefix + \"_choice\") or\n                        value.get(self.prefix + \"_range\") or\n                        value.get(self.prefix + \"_bool\")):\n                    self.insert_value_or_dict_into_config(\n                        config, path_,\n                        self.sample_params(**{param_name: deepcopy(value)})[param_name])\n\n        return config\n\n    def sample_params(self, **params) -> dict:\n        \"\"\"\n        Sample parameters according to the given possible values\n\n        Args:\n            **params: dictionary like {\"param_0\": {\"`prefix`_range\": [0, 10]},\n                                       \"param_1\": {\"`prefix`_range\": [0, 10], \"discrete\": true},\n                                       \"param_2\": {\"`prefix`_range\": [0, 1], \"scale\": \"log\"},\n                                       \"param_3\": {\"`prefix`_bool\": true},\n                                       \"param_4\": {\"`prefix`_choice\": [0, 1, 2, 3]}}\n\n        Returns:\n            dictionary with randomly sampled parameters\n        \"\"\"\n        if not params:\n            return {}\n        else:\n            params_copy = deepcopy(params)\n        
params_sample = dict()\n        for param, param_val in params_copy.items():\n            if isinstance(param_val, dict):\n                if self.prefix + '_bool' in param_val and param_val[self.prefix + '_bool']:\n                    sample = bool(random.choice([True, False]))\n                elif self.prefix + '_range' in param_val:\n                    sample = self._sample_from_ranges(param_val)\n                elif self.prefix + '_choice' in param_val:\n                    sample = random.choice(param_val[self.prefix + '_choice'])\n                else:\n                    sample = param_val\n                params_sample[param] = sample\n            else:\n                params_sample[param] = params_copy[param]\n        return params_sample\n\n    def _sample_from_ranges(self, opts: dict) -> [int, float]:\n        \"\"\"\n        Sample parameters from ranges\n\n        Args:\n            opts: dictionary  {\"`prefix`_range\": [0, 10]} or \\\n                              {\"`prefix`_range\": [0, 10], \"discrete\": true} or \\\n                              {\"`prefix`_range\": [0, 1], \"scale\": \"log\"}\n\n        Returns:\n            random parameter value from range\n        \"\"\"\n        from_ = opts[self.prefix + '_range'][0]\n        to_ = opts[self.prefix + '_range'][1]\n        if opts.get('scale', None) == 'log':\n            sample = self._sample_log(from_, to_)\n        else:\n            sample = np.random.uniform(from_, to_)\n        if opts.get('discrete', False):\n            sample = int(np.round(sample))\n        return sample\n\n    @staticmethod\n    def _sample_log(from_: float = 0., to_: float = 1.) 
-> float:\n        \"\"\"\n        Sample parameters from ranges with log scale\n\n        Args:\n            from_: lower boundary of values\n            to_:  upper boundary of values\n\n        Returns:\n            random parameters value from range with log scale\n        \"\"\"\n        sample = np.exp(np.random.uniform(np.log(from_), np.log(to_)))\n        return float(sample)\n"
  },
  {
    "path": "deeppavlov/core/common/paths.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport shutil\n\nfrom pathlib import Path\n\n_root_path = Path(__file__).resolve().parents[3]\n_default_settings_path: Path = _root_path / 'deeppavlov' / 'utils' / 'settings'\n_settings_path = Path(os.getenv('DP_SETTINGS_PATH', _default_settings_path)).expanduser().resolve()\nif _settings_path.is_file():\n    raise FileExistsError(f'DP_SETTINGS_PATH={_settings_path} is a file and not a directory')\n\nif _default_settings_path in _settings_path.parents:\n    raise RecursionError(f'DP_SETTINGS_PATH={_settings_path} is relative'\n                         f' to the default settings path {_default_settings_path}')\n\n\ndef get_settings_path() -> Path:\n    \"\"\"Return an absolute path to the DeepPavlov settings directory\"\"\"\n    populate_settings_dir()\n    return _settings_path\n\n\ndef populate_settings_dir(force: bool = False) -> bool:\n    \"\"\"\n    Populate settings directory with default settings files\n\n    Args:\n        force: if ``True``, replace existing settings files with default ones\n\n    Returns:\n        ``True`` if any files were copied and ``False`` otherwise\n    \"\"\"\n    res = False\n    if _default_settings_path == _settings_path:\n        return res\n\n    for src in list(_default_settings_path.glob('**/*.json')):\n        dest = _settings_path / src.relative_to(_default_settings_path)\n        if 
not force and dest.exists():\n            continue\n        res = True\n        dest.parent.mkdir(parents=True, exist_ok=True)\n        shutil.copy(src, dest)\n    return res\n"
  },
  {
    "path": "deeppavlov/core/common/prints.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nfrom contextlib import redirect_stdout\n\n\nclass RedirectedPrints(redirect_stdout):\n    \"\"\"Context manager for temporarily redirecting stdout to another stream \"\"\"\n\n    def __init__(self, new_target=sys.stderr):\n        super().__init__(new_target=new_target)\n"
  },
  {
    "path": "deeppavlov/core/common/registry.json",
    "content": "{\n  \"answer_types_extractor\": \"deeppavlov.models.kbqa.type_define:AnswerTypesExtractor\",\n  \"api_requester\": \"deeppavlov.models.api_requester.api_requester:ApiRequester\",\n  \"api_router\": \"deeppavlov.models.api_requester.api_router:ApiRouter\",\n  \"basic_classification_iterator\": \"deeppavlov.dataset_iterators.basic_classification_iterator:BasicClassificationDatasetIterator\",\n  \"basic_classification_reader\": \"deeppavlov.dataset_readers.basic_classification_reader:BasicClassificationDatasetReader\",\n  \"boolqa_reader\": \"deeppavlov.dataset_readers.boolqa_reader:BoolqaReader\",\n  \"bpr\": \"deeppavlov.models.doc_retrieval.bpr:BPR\",\n  \"chu_liu_edmonds_transformer\": \"deeppavlov.models.morpho_syntax_parser.dependency_decoding:ChuLiuEdmonds\",\n  \"concat_lists\": \"deeppavlov.models.doc_retrieval.utils:concat_lists\",\n  \"conll2003_reader\": \"deeppavlov.dataset_readers.conll2003_reader:Conll2003DatasetReader\",\n  \"cos_sim_classifier\": \"deeppavlov.models.classifiers.cos_sim_classifier:CosineSimilarityClassifier\",\n  \"data_fitting_iterator\": \"deeppavlov.core.data.data_fitting_iterator:DataFittingIterator\",\n  \"data_learning_iterator\": \"deeppavlov.core.data.data_learning_iterator:DataLearningIterator\",\n  \"dependency_output_prettifier\": \"deeppavlov.models.morpho_syntax_parser.syntax_parsing:DependencyOutputPrettifier\",\n  \"dirty_comments_preprocessor\": \"deeppavlov.models.preprocessors.dirty_comments_preprocessor:DirtyCommentsPreprocessor\",\n  \"docred_reader\": \"deeppavlov.dataset_readers.docred_reader:DocREDDatasetReader\",\n  \"document_chunker\": \"deeppavlov.models.preprocessors.odqa_preprocessors:DocumentChunker\",\n  \"dnnc_pair_generator\": \"deeppavlov.models.preprocessors.dnnc_preprocessor:PairGenerator\",\n  \"dnnc_proba2labels\": \"deeppavlov.models.classifiers.dnnc_proba2labels:Proba2Labels\",\n  \"entity_detection_parser\": 
\"deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser\",\n  \"entity_linker\": \"deeppavlov.models.entity_extraction.entity_linking:EntityLinker\",\n  \"entity_type_split\": \"deeppavlov.models.entity_extraction.entity_detection_parser:entity_type_split\",\n  \"faq_reader\": \"deeppavlov.dataset_readers.faq_reader:FaqDatasetReader\",\n  \"fasttext\": \"deeppavlov.models.embedders.fasttext_embedder:FasttextEmbedder\",\n  \"fit_trainer\": \"deeppavlov.core.trainers.fit_trainer:FitTrainer\",\n  \"hashing_tfidf_vectorizer\": \"deeppavlov.models.vectorizers.hashing_tfidf_vectorizer:HashingTfIdfVectorizer\",\n  \"huggingface_dataset_iterator\": \"deeppavlov.dataset_iterators.huggingface_dataset_iterator:HuggingFaceDatasetIterator\",\n  \"huggingface_dataset_reader\": \"deeppavlov.dataset_readers.huggingface_dataset_reader:HuggingFaceDatasetReader\",\n  \"imdb_reader\": \"deeppavlov.dataset_readers.imdb_reader:ImdbReader\",\n  \"joint_tagger_parser\": \"deeppavlov.models.morpho_syntax_parser.joint:JointTaggerParser\",\n  \"kenlm_elector\": \"deeppavlov.models.spelling_correction.electors.kenlm_elector:KenlmElector\",\n  \"lazy_tokenizer\": \"deeppavlov.models.tokenizers.lazy_tokenizer:lazy_tokenizer\",\n  \"lcquad_reader\": \"deeppavlov.dataset_readers.sq_reader:LCQuADReader\",\n  \"lemmatized_output_prettifier\": \"deeppavlov.models.morpho_syntax_parser.syntax_parsing:LemmatizedOutputPrettifier\",\n  \"line_reader\": \"deeppavlov.dataset_readers.line_reader:LineReader\",\n  \"logit_ranker\": \"deeppavlov.models.doc_retrieval.logit_ranker:LogitRanker\",\n  \"mask\": \"deeppavlov.models.preprocessors.mask:Mask\",\n  \"morphotagger_dataset_iterator\": \"deeppavlov.dataset_iterators.morphotagger_iterator:MorphoTaggerDatasetIterator\",\n  \"morphotagger_dataset_reader\": \"deeppavlov.dataset_readers.morphotagging_dataset_reader:MorphotaggerDatasetReader\",\n  \"multitask_reader\":\"deeppavlov.dataset_readers.multitask_reader:MultiTaskReader\",\n  
\"multitask_pipeline_preprocessor\":\"deeppavlov.models.preprocessors.multitask_preprocessor:MultiTaskPipelinePreprocessor\",\n  \"multitask_transformer\":\"deeppavlov.models.torch_bert.multitask_transformer:MultiTaskTransformer\",\n  \"multitask_iterator\":\"deeppavlov.dataset_iterators.multitask_iterator:MultiTaskIterator\",\n  \"multi_squad_dataset_reader\": \"deeppavlov.dataset_readers.squad_dataset_reader:MultiSquadDatasetReader\",\n  \"multi_squad_iterator\": \"deeppavlov.dataset_iterators.squad_iterator:MultiSquadIterator\",\n  \"multi_squad_retr_iterator\": \"deeppavlov.dataset_iterators.squad_iterator:MultiSquadRetrIterator\",\n  \"ner_chunk_model\": \"deeppavlov.models.entity_extraction.ner_chunker:NerChunkModel\",\n  \"ner_chunker\": \"deeppavlov.models.entity_extraction.ner_chunker:NerChunker\",\n  \"ner_vocab\": \"deeppavlov.models.preprocessors.ner_preprocessor:NerVocab\",\n  \"nltk_moses_tokenizer\": \"deeppavlov.models.tokenizers.nltk_moses_tokenizer:NLTKMosesTokenizer\",\n  \"nltk_tokenizer\": \"deeppavlov.models.tokenizers.nltk_tokenizer:NLTKTokenizer\",\n  \"nn_trainer\": \"deeppavlov.core.trainers.nn_trainer:NNTrainer\",\n  \"odqa_reader\": \"deeppavlov.dataset_readers.odqa_reader:ODQADataReader\",\n  \"one_hotter\": \"deeppavlov.models.preprocessors.one_hotter:OneHotter\",\n  \"params_search\": \"deeppavlov.core.common.params_search:ParamsSearch\",\n  \"paraphraser_reader\": \"deeppavlov.dataset_readers.paraphraser_reader:ParaphraserReader\",\n  \"path_ranking_preprocessor\": \"deeppavlov.models.preprocessors.torch_transformers_preprocessor:PathRankingPreprocessor\",\n  \"pop_ranker\": \"deeppavlov.models.doc_retrieval.pop_ranker:PopRanker\",\n  \"proba2labels\": \"deeppavlov.models.classifiers.proba2labels:Proba2Labels\",\n  \"query_formatter\": \"deeppavlov.models.kbqa.query_generator:QueryFormatter\",\n  \"query_generator\": \"deeppavlov.models.kbqa.query_generator:QueryGenerator\",\n  \"question_sign_checker\": 
\"deeppavlov.models.entity_extraction.entity_detection_parser:QuestionSignChecker\",\n  \"re_classifier\": \"deeppavlov.models.relation_extraction.relation_extraction_bert:REBertModel\",\n  \"re_postprocessor\": \"deeppavlov.models.preprocessors.re_preprocessor:REPostprocessor\",\n  \"re_preprocessor\": \"deeppavlov.models.preprocessors.re_preprocessor:REPreprocessor\",\n  \"rel_ranking_infer\": \"deeppavlov.models.kbqa.rel_ranking_infer:RelRankerInfer\",\n  \"rel_ranking_preprocessor\": \"deeppavlov.models.preprocessors.torch_transformers_preprocessor:RelRankingPreprocessor\",\n  \"rel_ranking_reader\": \"deeppavlov.dataset_readers.rel_ranking_reader:ParaphraserReader\",\n  \"response_base_loader\": \"deeppavlov.models.preprocessors.response_base_loader:ResponseBaseLoader\",\n  \"ru_adj_to_noun\": \"deeppavlov.models.kbqa.ru_adj_to_noun:RuAdjToNoun\",\n  \"rubq_reader\": \"deeppavlov.dataset_readers.sq_reader:RuBQReader\",\n  \"rured_reader\": \"deeppavlov.dataset_readers.rured_reader:RuREDDatasetReader\",\n  \"russian_words_vocab\": \"deeppavlov.vocabs.typos:RussianWordsVocab\",\n  \"sanitizer\": \"deeppavlov.models.preprocessors.sanitizer:Sanitizer\",\n  \"sentseg_restore_sent\": \"deeppavlov.models.preprocessors.sentseg_preprocessor:SentSegRestoreSent\",\n  \"siamese_iterator\": \"deeppavlov.dataset_iterators.siamese_iterator:SiameseIterator\",\n  \"simple_vocab\": \"deeppavlov.core.data.simple_vocab:SimpleVocabulary\",\n  \"sklearn_component\": \"deeppavlov.models.sklearn.sklearn_component:SklearnComponent\",\n  \"slovnet_syntax_parser\": \"deeppavlov.models.kbqa.tree_to_sparql:SlovnetSyntaxParser\",\n  \"spacy_lemmatizer\": \"deeppavlov.models.morpho_syntax_parser.spacy_lemmatizer:SpacyLemmatizer\",\n  \"spelling_error_model\": \"deeppavlov.models.spelling_correction.brillmoore.error_model:ErrorModel\",\n  \"spelling_levenshtein\": \"deeppavlov.models.spelling_correction.levenshtein.searcher_component:LevenshteinSearcherComponent\",\n  \"split_tokenizer\": 
\"deeppavlov.models.tokenizers.split_tokenizer:SplitTokenizer\",\n  \"sq_reader\": \"deeppavlov.dataset_readers.sq_reader:SQReader\",\n  \"sqlite_iterator\": \"deeppavlov.dataset_iterators.sqlite_iterator:SQLiteDataIterator\",\n  \"squad_bert_ans_postprocessor\": \"deeppavlov.models.preprocessors.squad_preprocessor:SquadBertAnsPostprocessor\",\n  \"squad_bert_ans_preprocessor\": \"deeppavlov.models.preprocessors.squad_preprocessor:SquadBertAnsPreprocessor\",\n  \"squad_bert_mapping\": \"deeppavlov.models.preprocessors.squad_preprocessor:SquadBertMappingPreprocessor\",\n  \"squad_dataset_reader\": \"deeppavlov.dataset_readers.squad_dataset_reader:SquadDatasetReader\",\n  \"squad_iterator\": \"deeppavlov.dataset_iterators.squad_iterator:SquadIterator\",\n  \"static_dictionary\": \"deeppavlov.vocabs.typos:StaticDictionary\",\n  \"str_lower\": \"deeppavlov.models.preprocessors.str_lower:str_lower\",\n  \"str_token_reverser\": \"deeppavlov.models.preprocessors.str_token_reverser:StrTokenReverser\",\n  \"str_utf8_encoder\": \"deeppavlov.models.preprocessors.str_utf8_encoder:StrUTF8Encoder\",\n  \"stream_spacy_tokenizer\": \"deeppavlov.models.tokenizers.spacy_tokenizer:StreamSpacyTokenizer\",\n  \"string_multiplier\": \"deeppavlov.models.preprocessors.odqa_preprocessors:StringMultiplier\",\n  \"template_matcher\": \"deeppavlov.models.kbqa.template_matcher:TemplateMatcher\",\n  \"tfidf_ranker\": \"deeppavlov.models.doc_retrieval.tfidf_ranker:TfidfRanker\",\n  \"tfidf_weighted\": \"deeppavlov.models.embedders.tfidf_weighted_embedder:TfidfWeightedEmbedder\",\n  \"top1_elector\": \"deeppavlov.models.spelling_correction.electors.top1_elector:TopOneElector\",\n  \"torch_bert_ranker\": \"deeppavlov.models.torch_bert.torch_bert_ranker:TorchBertRankerModel\",\n  \"torch_bert_ranker_preprocessor\": \"deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchBertRankerPreprocessor\",\n  \"torch_record_postprocessor\": 
\"deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchRecordPostprocessor\",\n  \"torch_squad_transformers_preprocessor\": \"deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchSquadTransformersPreprocessor\",\n  \"torch_text_classification_model\": \"deeppavlov.models.classifiers.torch_classification_model:TorchTextClassificationModel\",\n  \"torch_trainer\": \"deeppavlov.core.trainers.torch_trainer:TorchTrainer\",\n  \"torch_transformers_classifier\": \"deeppavlov.models.torch_bert.torch_transformers_classifier:TorchTransformersClassifierModel\",\n  \"torch_transformers_el_ranker\": \"deeppavlov.models.torch_bert.torch_transformers_el_ranker:TorchTransformersElRanker\",\n  \"torch_transformers_entity_ranker_infer\": \"deeppavlov.models.torch_bert.torch_transformers_el_ranker:TorchTransformersEntityRankerInfer\",\n  \"torch_transformers_entity_ranker_preprocessor\": \"deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersEntityRankerPreprocessor\",\n  \"torch_transformers_multiplechoice\": \"deeppavlov.models.torch_bert.torch_transformers_multiplechoice:TorchTransformersMultiplechoiceModel\",\n  \"torch_transformers_multiplechoice_preprocessor\": \"deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersMultiplechoicePreprocessor\",\n  \"torch_transformers_ner_preprocessor\": \"deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersNerPreprocessor\",\n  \"torch_transformers_nll_ranker\": \"deeppavlov.models.torch_bert.torch_transformers_nll_ranking:TorchTransformersNLLRanker\",\n  \"torch_transformers_preprocessor\": \"deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersPreprocessor\",\n  \"torch_transformers_sequence_tagger\": \"deeppavlov.models.torch_bert.torch_transformers_sequence_tagger:TorchTransformersSequenceTagger\",\n  \"torch_transformers_squad\": 
\"deeppavlov.models.torch_bert.torch_transformers_squad:TorchTransformersSquad\",\n  \"torch_transformers_syntax_parser\": \"deeppavlov.models.torch_bert.torch_transformers_syntax_parser:TorchTransformersSyntaxParser\",\n  \"transformers_bert_embedder\": \"deeppavlov.models.embedders.transformers_embedder:TransformersBertEmbedder\",\n  \"transformers_bert_preprocessor\": \"deeppavlov.models.preprocessors.transformers_preprocessor:TransformersBertPreprocessor\",\n  \"tree_to_sparql\": \"deeppavlov.models.kbqa.tree_to_sparql:TreeToSparql\",\n  \"typos_custom_reader\": \"deeppavlov.dataset_readers.typos_reader:TyposCustom\",\n  \"typos_iterator\": \"deeppavlov.dataset_iterators.typos_iterator:TyposDatasetIterator\",\n  \"typos_kartaslov_reader\": \"deeppavlov.dataset_readers.typos_reader:TyposKartaslov\",\n  \"typos_wikipedia_reader\": \"deeppavlov.dataset_readers.typos_reader:TyposWikipedia\",\n  \"ubuntu_v2_reader\": \"deeppavlov.dataset_readers.ubuntu_v2_reader:UbuntuV2Reader\",\n  \"wiki_parser\": \"deeppavlov.models.kbqa.wiki_parser:WikiParser\",\n  \"wiki_sqlite_vocab\": \"deeppavlov.vocabs.wiki_sqlite:WikiSQLiteVocab\",\n  \"wikitionary_100K_vocab\": \"deeppavlov.vocabs.typos:Wiki100KDictionary\"\n}\n"
  },
  {
    "path": "deeppavlov/core/common/registry.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport importlib\nimport json\nfrom logging import getLogger\nfrom pathlib import Path\n\nfrom deeppavlov.core.common.errors import ConfigError\n\nlogger = getLogger(__name__)\n\n_registry_path = Path(__file__).parent / 'registry.json'\nif _registry_path.exists():\n    with _registry_path.open(encoding='utf-8') as f:\n        _REGISTRY = json.load(f)\nelse:\n    _REGISTRY = {}\n\ninverted_registry = {val: key for key, val in _REGISTRY.items()}\n\n\ndef cls_from_str(name: str) -> type:\n    \"\"\"Returns a class object with the name given as a string.\"\"\"\n    try:\n        module_name, cls_name = name.split(':')\n    except ValueError:\n        raise ConfigError('Expected class description in a `module.submodules:ClassName` form, but got `{}`'\n                          .format(name))\n\n    return getattr(importlib.import_module(module_name), cls_name)\n\n\ndef register(name: str = None) -> type:\n    \"\"\"\n    Register classes that could be initialized from JSON configuration file.\n    If name is not passed, the class name is converted to snake-case.\n    \"\"\"\n\n    def decorate(model_cls: type, reg_name: str = None) -> type:\n        model_name = reg_name or short_name(model_cls)\n        global _REGISTRY\n        cls_name = model_cls.__module__ + ':' + model_cls.__name__\n        if model_name in _REGISTRY and 
_REGISTRY[model_name] != cls_name:\n            logger.warning('Registry name \"{}\" has been already registered and will be overwritten.'.format(model_name))\n        _REGISTRY[model_name] = cls_name\n        return model_cls\n\n    return lambda model_cls_name: decorate(model_cls_name, name)\n\n\ndef short_name(cls: type) -> str:\n    \"\"\"Returns just a class name (without package and module specification).\"\"\"\n    return cls.__name__.split('.')[-1]\n\n\ndef get_model(name: str) -> type:\n    \"\"\"Returns a registered class object with the name given in the string.\"\"\"\n    if name not in _REGISTRY:\n        if ':' not in name:\n            raise ConfigError(\"Model {} is not registered.\".format(name))\n        return cls_from_str(name)\n    return cls_from_str(_REGISTRY[name])\n\n\ndef list_models() -> list:\n    \"\"\"Returns a list of names of registered classes.\"\"\"\n    return list(_REGISTRY)\n"
  },
  {
    "path": "deeppavlov/core/common/requirements_registry.json",
    "content": "{\n  \"answer_types_extractor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt\"\n  ],\n  \"chu_liu_edmonds_transformer\": [\n    \"{DEEPPAVLOV_PATH}/requirements/dependency_decoding.txt\"\n  ],\n  \"bpr\": [\n    \"{DEEPPAVLOV_PATH}/requirements/faiss.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"entity_linker\": [\n    \"{DEEPPAVLOV_PATH}/requirements/hdt.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/rapidfuzz.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt\"\n  ],\n  \"fasttext\": [\n    \"{DEEPPAVLOV_PATH}/requirements/fasttext.txt\"\n  ],\n  \"huggingface_dataset_iterator\": [\n    \"{DEEPPAVLOV_PATH}/requirements/datasets.txt\"\n  ],\n  \"huggingface_dataset_reader\": [\n    \"{DEEPPAVLOV_PATH}/requirements/datasets.txt\"\n  ],\n  \"kenlm_elector\": [\n    \"{DEEPPAVLOV_PATH}/requirements/kenlm.txt\"\n  ],\n  \"ner_chunk_model\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"ner_chunker\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"nltk_moses_tokenizer\": [\n    \"{DEEPPAVLOV_PATH}/requirements/sacremoses.txt\"\n  ],\n  \"path_ranking_preprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"query_generator\": [\n    \"{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/hdt.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/rapidfuzz.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/whapi.txt\"\n  ],\n  \"re_classifier\": [\n    \"{DEEPPAVLOV_PATH}/requirements/opt_einsum.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    
\"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"re_postprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"re_preprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"rel_ranking_infer\": [\n    \"{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/hdt.txt\"\n  ],\n  \"rel_ranking_preprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"ru_adj_to_noun\": [\n    \"{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt\"\n  ],\n  \"russian_words_vocab\": [\n    \"{DEEPPAVLOV_PATH}/requirements/lxml.txt\"\n  ],\n  \"slovnet_syntax_parser\": [\n    \"{DEEPPAVLOV_PATH}/requirements/slovnet.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/razdel.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt\"\n  ],\n  \"spacy_lemmatizer\": [\n    \"{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt\"\n  ],\n  \"spelling_error_model\": [\n    \"{DEEPPAVLOV_PATH}/requirements/lxml.txt\"\n  ],\n  \"spelling_levenshtein\": [\n    \"{DEEPPAVLOV_PATH}/requirements/sortedcontainers.txt\"\n  ],\n  \"static_dictionary\": [\n    \"{DEEPPAVLOV_PATH}/requirements/lxml.txt\"\n  ],\n  \"stream_spacy_tokenizer\": [\n    \"{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt\"\n  ],\n  \"torch_bert_ranker\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_bert_ranker_preprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_record_postprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    
\"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_squad_transformers_preprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_text_classification_model\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\"\n  ],\n  \"torch_transformers_classifier\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"multitask_transformer\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_transformers_el_ranker\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_transformers_entity_ranker_infer\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_transformers_entity_ranker_preprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_transformers_multiplechoice\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_transformers_multiplechoice_preprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_transformers_ner_preprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/sentencepiece.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/protobuf.txt\"\n  ],\n  \"torch_transformers_nll_ranker\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_transformers_preprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    
\"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_transformers_sequence_tagger\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/torchcrf.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"torch_transformers_syntax_parser\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/torchcrf.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"multitask_pipeline_preprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/torchcrf.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],  \n  \"torch_transformers_squad\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"transformers_bert_embedder\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"transformers_bert_preprocessor\": [\n    \"{DEEPPAVLOV_PATH}/requirements/pytorch.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/transformers.txt\"\n  ],\n  \"tree_to_sparql\": [\n    \"{DEEPPAVLOV_PATH}/requirements/udapi.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/razdel.txt\",\n    \"{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt\"\n  ],\n  \"typos_custom_reader\": [\n    \"{DEEPPAVLOV_PATH}/requirements/lxml.txt\"\n  ],\n  \"typos_kartaslov_reader\": [\n    \"{DEEPPAVLOV_PATH}/requirements/lxml.txt\"\n  ],\n  \"typos_wikipedia_reader\": [\n    \"{DEEPPAVLOV_PATH}/requirements/lxml.txt\"\n  ],\n  \"wiki_parser\": [\n    \"{DEEPPAVLOV_PATH}/requirements/hdt.txt\"\n  ],\n  \"wikitionary_100K_vocab\": [\n    \"{DEEPPAVLOV_PATH}/requirements/lxml.txt\"\n  ]\n}\n"
  },
  {
    "path": "deeppavlov/core/data/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/core/data/data_fitting_iterator.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom random import Random\nfrom typing import List, Generator, Tuple, Any, Optional\n\nfrom deeppavlov.core.common.registry import register\n\nlogger = getLogger(__name__)\n\n\n@register('data_fitting_iterator')\nclass DataFittingIterator:\n    \"\"\"Dataset iterator for fitting estimator models, like vocabs, kNN, vectorizers.\n    Data is passed as a list of strings(documents).\n    Generate batches (for large datasets).\n\n    Args:\n        data: list of documents\n        doc_ids: provided document ids\n        seed: random seed for data shuffling\n        shuffle: whether to shuffle data during batching\n\n    Attributes:\n        shuffle: whether to shuffle data during batching\n        random: instance of :class:`Random` initialized with a seed\n        data: list of documents\n        doc_ids: provided by a user ids or generated automatically ids\n\n    \"\"\"\n\n    def __init__(self, data: List[str], doc_ids: List[Any] = None,\n                 seed: int = None, shuffle: bool = True,\n                 *args, **kwargs) -> None:\n\n        self.shuffle = shuffle\n        self.random = Random(seed)\n        self.data = data\n        self.doc_ids = doc_ids or self.get_doc_ids()\n\n    def get_doc_ids(self):\n        \"\"\"Generate doc ids.\n\n        Returns: doc ids\n\n        \"\"\"\n        
return list(range(len(self.data)))\n\n    def get_doc_content(self, doc_id: Any) -> Optional[str]:\n        \"\"\"Get doc content by id.\n\n        Args:\n            doc_id: an id for a doc which content should be extracted\n\n        Returns:\n            doc content as a string if id exists or raise an error\n\n        \"\"\"\n        return self.data[doc_id]\n\n    def gen_batches(self, batch_size: int, shuffle: bool = None) \\\n            -> Generator[Tuple[List[str], List[int]], Any, None]:\n        \"\"\"Gen batches of documents.\n\n        Args:\n            batch_size: a number of samples in a single batch\n            shuffle: whether to shuffle data during batching\n\n        Yields:\n            generated tuple of documents and their ids\n\n        \"\"\"\n        if shuffle is None:\n            shuffle = self.shuffle\n\n        if shuffle:\n            _doc_ids = self.random.sample(self.doc_ids, len(self.doc_ids))\n        else:\n            _doc_ids = self.doc_ids\n\n        if batch_size > 0:\n            batches = [_doc_ids[i:i + batch_size] for i in\n                       range(0, len(_doc_ids), batch_size)]\n        else:\n            batches = [_doc_ids]\n\n        # DEBUG\n        # len_batches = len(batches)\n\n        for i, doc_ids in enumerate(batches):\n            # DEBUG\n            # logger.info(\n            #     \"Processing batch # {} of {} ({} documents)\".format(i, len_batches, len(doc_index)))\n            docs = [self.get_doc_content(doc_id) for doc_id in doc_ids]\n            yield docs, doc_ids\n\n    def get_instances(self):\n        \"\"\"Get all data\"\"\"\n        doc_ids = list(self.doc_ids)\n        docs = [self.get_doc_content(doc_id) for doc_id in doc_ids]\n        return docs, doc_ids\n"
  },
  {
    "path": "deeppavlov/core/data/data_learning_iterator.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom random import Random\nfrom typing import List, Dict, Tuple, Any, Iterator\n\nfrom deeppavlov.core.common.registry import register\n\n\n@register('data_learning_iterator')\nclass DataLearningIterator:\n    \"\"\"Dataset iterator for learning models, e. g. neural networks.\n\n    Args:\n        data: list of (x, y) pairs for every data type in ``'train'``, ``'valid'`` and ``'test'``\n        seed: random seed for data shuffling\n        shuffle: whether to shuffle data during batching\n\n    Attributes:\n        shuffle: whether to shuffle data during batching\n        random: instance of ``Random`` initialized with a seed\n    \"\"\"\n\n    def split(self, *args, **kwargs):\n        \"\"\" Manipulate self.train, self.valid, and self.test into their final form. \"\"\"\n        pass\n\n    def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple[Any, Any]]:\n        \"\"\" Transform the data for a specific data type (e.g. ``'train'``). 
\"\"\"\n        return data\n\n    def __init__(self, data: Dict[str, List[Tuple[Any, Any]]], seed: int = None, shuffle: bool = True,\n                 *args, **kwargs) -> None:\n        self.shuffle = shuffle\n\n        self.random = Random(seed)\n\n        self.train = self.preprocess(data.get('train', []), *args, **kwargs)\n        self.valid = self.preprocess(data.get('valid', []), *args, **kwargs)\n        self.test = self.preprocess(data.get('test', []), *args, **kwargs)\n        self.split(*args, **kwargs)\n        self.data = {\n            'train': self.train,\n            'valid': self.valid,\n            'test': self.test,\n            'all': self.train + self.test + self.valid\n        }\n\n    def gen_batches(self, batch_size: int, data_type: str = 'train',\n                    shuffle: bool = None) -> Iterator[Tuple[tuple, tuple]]:\n        \"\"\"Generate batches of inputs and expected output to train neural networks\n\n        Args:\n            batch_size: number of samples in batch\n            data_type: can be either 'train', 'test', or 'valid'\n            shuffle: whether to shuffle dataset before batching\n\n        Yields:\n             a tuple of a batch of inputs and a batch of expected outputs\n        \"\"\"\n        if shuffle is None:\n            shuffle = self.shuffle\n\n        data = self.data[data_type]\n        data_len = len(data)\n\n        if data_len == 0:\n            return\n\n        order = list(range(data_len))\n        if shuffle:\n            self.random.shuffle(order)\n\n        if batch_size < 0:\n            batch_size = data_len\n\n        for i in range((data_len - 1) // batch_size + 1):\n            yield tuple(zip(*[data[o] for o in order[i * batch_size:(i + 1) * batch_size]]))\n\n    def get_instances(self, data_type: str = 'train') -> Tuple[tuple, tuple]:\n        \"\"\"Get all data for a selected data type\n\n        Args:\n            data_type (str): can be either ``'train'``, ``'test'``, ``'valid'`` or 
``'all'``\n\n        Returns:\n             a tuple of all inputs for a data type and all expected outputs for a data type\n        \"\"\"\n        data = self.data[data_type]\n        return tuple(zip(*data))\n"
  },
  {
    "path": "deeppavlov/core/data/dataset_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List, Dict, Tuple, Any\n\n\nclass DatasetReader:\n    \"\"\"An abstract class for reading data from some location and construction of a dataset.\"\"\"\n\n    def read(self, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:\n        \"\"\"Reads a file from a path and returns data as a list of tuples of inputs and correct outputs\n         for every data type in ``train``, ``valid`` and ``test``.\n        \"\"\"\n        raise NotImplementedError\n"
  },
  {
    "path": "deeppavlov/core/data/simple_vocab.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom collections import Counter, defaultdict\nfrom itertools import chain\nfrom logging import getLogger\nfrom typing import Iterable, Optional, Tuple\n\nimport numpy as np\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.utils import zero_pad, is_str_batch, flatten_str_batch\nfrom deeppavlov.core.models.estimator import Estimator\n\nlog = getLogger(__name__)\n\n\n@register('simple_vocab')\nclass SimpleVocabulary(Estimator):\n    \"\"\"Implements simple vocabulary.\n\n    Parameters:\n        special_tokens: tuple of tokens that shouldn't be counted.\n        max_tokens: upper bound for number of tokens in the vocabulary.\n        min_freq: minimal count of a token (except special tokens).\n        pad_with_zeros: if True, then batch of elements will be padded with zeros up to length of\n            the longest element in batch.\n        unk_token: label assigned to unknown tokens.\n        freq_drop_load: if True, then frequencies of tokens are set to min_freq on the model load.\n        \"\"\"\n\n    def __init__(self,\n                 special_tokens: Tuple[str, ...] 
= tuple(),\n                 max_tokens: int = 2 ** 30,\n                 min_freq: int = 0,\n                 pad_with_zeros: bool = False,\n                 unk_token: Optional[str] = None,\n                 freq_drop_load: Optional[bool] = None,\n                 *args,\n                 **kwargs):\n        super().__init__(**kwargs)\n        self.special_tokens = special_tokens\n        self._max_tokens = max_tokens\n        self._min_freq = min_freq\n        self._pad_with_zeros = pad_with_zeros\n        self.unk_token = unk_token\n        self.freq_drop_load = freq_drop_load\n        self.reset()\n        if self.load_path:\n            self.load()\n\n    def fit(self, *args):\n        self.reset()\n        tokens = chain(*args)\n        # filter(None, <>) -- to filter empty tokens\n        self.freqs = Counter(filter(None, flatten_str_batch(tokens)))\n        for special_token in self.special_tokens:\n            self._t2i[special_token] = self.count\n            self._i2t.append(special_token)\n            self.count += 1\n        for token, freq in self.freqs.most_common()[:self._max_tokens]:\n            if token in self.special_tokens:\n                continue\n            if freq >= self._min_freq:\n                self._t2i[token] = self.count\n                self._i2t.append(token)\n                self.count += 1\n\n    def _add_tokens_with_freqs(self, tokens, freqs):\n        self.freqs = Counter()\n        self.freqs.update(dict(zip(tokens, freqs)))\n        for token, freq in zip(tokens, freqs):\n            if freq >= self._min_freq or token in self.special_tokens:\n                self._t2i[token] = self.count\n                self._i2t.append(token)\n                self.count += 1\n\n    def __call__(self, batch, is_top=True, **kwargs):\n        if isinstance(batch, Iterable) and not isinstance(batch, str):\n            if all([k is None for k in batch]):\n                return batch\n            else:\n                looked_up_batch = 
[self(sample, is_top=False) for sample in batch]\n        else:\n            return self[batch]\n        if self._pad_with_zeros and is_top and not is_str_batch(looked_up_batch):\n            looked_up_batch = zero_pad(looked_up_batch)\n\n        return looked_up_batch\n\n    def save(self):\n        log.info(\"[saving vocabulary to {}]\".format(self.save_path))\n        with self.save_path.open('wt', encoding='utf8') as f:\n            for n in range(len(self)):\n                token = self._i2t[n]\n                cnt = self.freqs[token]\n                f.write('{}\\t{:d}\\n'.format(token, cnt))\n\n    def load(self):\n        self.reset()\n        if self.load_path:\n            if self.load_path.is_file():\n                log.debug(\"[loading vocabulary from {}]\".format(self.load_path))\n                tokens, counts = [], []\n                for ln in self.load_path.open('r', encoding='utf8'):\n                    token, cnt = self.load_line(ln)\n                    tokens.append(token)\n                    counts.append(int(cnt))\n                self._add_tokens_with_freqs(tokens, counts)\n            elif not self.load_path.parent.is_dir():\n                raise ConfigError(\"Provided `load_path` for {} doesn't exist!\".format(\n                    self.__class__.__name__))\n        else:\n            raise ConfigError(\"`load_path` for {} is not provided!\".format(self))\n\n    def load_line(self, ln):\n        if self.freq_drop_load:\n            token = ln.strip().split()[0]\n            cnt = self._min_freq\n        else:\n            token, cnt = ln.rsplit('\\t', 1)\n        return token, cnt\n\n    @property\n    def len(self):\n        return len(self)\n\n    def keys(self):\n        return (self[n] for n in range(self.len))\n\n    def values(self):\n        return list(range(self.len))\n\n    def items(self):\n        return zip(self.keys(), self.values())\n\n    def __getitem__(self, key):\n        if isinstance(key, (int, np.integer)):\n     
       return self._i2t[key]\n        elif isinstance(key, str):\n            return self._t2i[key]\n        else:\n            raise NotImplementedError(\"not implemented for type `{}`\".format(type(key)))\n\n    def __contains__(self, item):\n        return item in self._t2i\n\n    def __len__(self):\n        return len(self._i2t)\n\n    def reset(self):\n        self.freqs = None\n        unk_index = 0\n        if self.unk_token in self.special_tokens:\n            unk_index = self.special_tokens.index(self.unk_token)\n        self._t2i = defaultdict(lambda: unk_index)\n        self._i2t = []\n        self.count = 0\n\n    def idxs2toks(self, idxs):\n        return [self[idx] for idx in idxs]\n"
  },
  {
    "path": "deeppavlov/core/data/utils.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport collections\nimport gzip\nimport os\nimport secrets\nimport shutil\nimport tarfile\nimport zipfile\nfrom hashlib import md5\nfrom itertools import chain\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Any, Generator, Iterable, List, Mapping, Optional, Sequence, Sized, Union, Collection\nfrom urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit, urlparse\n\nimport numpy as np\nimport requests\nfrom tqdm import tqdm\n\nlog = getLogger(__name__)\n\n_MARK_DONE = '.done'\n\ntqdm.monitor_interval = 0\n\n\ndef get_download_token() -> str:\n    \"\"\"Return a download token from ~/.deeppavlov/token file.\n\n    If token file does not exists, creates the file and writes to it a random URL-safe text string\n    containing 32 random bytes.\n\n    Returns:\n        32 byte URL-safe text string from ~/.deeppavlov/token.\n\n    \"\"\"\n    token_file = Path.home() / '.deeppavlov' / 'token'\n    if not token_file.exists():\n        if token_file.parent.is_file():\n            token_file.parent.unlink()\n        token_file.parent.mkdir(parents=True, exist_ok=True)\n        token_file.write_text(secrets.token_urlsafe(32), encoding='utf8')\n\n    return token_file.read_text(encoding='utf8').strip()\n\n\ndef s3_download(url: str, destination: str) -> None:\n    \"\"\"Download a file from an Amazon S3 path 
`s3://<bucket_name>/<key>`\n\n    Requires the boto3 library to be installed and AWS credentials being set\n    via environment variables or a credentials file\n\n    Args:\n        url: The source URL.\n        destination: Path to the file destination (including file name).\n    \"\"\"\n    import boto3\n\n    s3 = boto3.resource('s3', endpoint_url=os.environ.get('AWS_ENDPOINT_URL'))\n\n    bucket, key = url[5:].split('/', maxsplit=1)\n    file_object = s3.Object(bucket, key)\n    file_size = file_object.content_length\n    with tqdm(total=file_size, unit='B', unit_scale=True) as pbar:\n        file_object.download_file(destination, Callback=pbar.update)\n\n\ndef simple_download(url: str, destination: Union[Path, str], headers: Optional[dict] = None, n_tries: int = 3) -> None:\n    \"\"\"Download a file from URL to target location.\n\n    Displays a progress bar to the terminal during the download process.\n\n    Args:\n        url: The source URL.\n        destination: Path to the file destination (including file name).\n        headers: Headers for file server.\n        n_tries: Number of retries if download fails.\n\n    \"\"\"\n    try:\n        destination = Path(destination)\n        destination.parent.mkdir(parents=True, exist_ok=True)\n\n        log.info('Downloading from {} to {}'.format(url, destination))\n\n        if url.startswith('s3://'):\n            return s3_download(url, str(destination))\n\n        chunk_size = 32 * 1024\n        temporary = destination.with_suffix(destination.suffix + '.part')\n\n        r = requests.get(url, stream=True, headers=headers)\n        if r.status_code != 200:\n            raise RuntimeError(f'Got status code {r.status_code} when trying to download {url}')\n        total_length = int(r.headers.get('content-length', 0))\n\n        if temporary.exists() and temporary.stat().st_size > total_length:\n            temporary.write_bytes(b'')  # clearing temporary file when total_length is inconsistent\n\n        with 
temporary.open('ab') as f:\n            downloaded = f.tell()\n            if downloaded != 0:\n                log.warning(f'Found a partial download {temporary}')\n            with tqdm(initial=downloaded, total=total_length, unit='B', unit_scale=True) as pbar:\n                while True:\n                    if downloaded != 0:\n                        log.warning(f'Download stopped abruptly, trying to resume from {downloaded} '\n                                    f'to reach {total_length}')\n                        headers['Range'] = f'bytes={downloaded}-'\n                        r = requests.get(url, headers=headers, stream=True)\n                        if 'content-length' not in r.headers or \\\n                                total_length - downloaded != int(r.headers['content-length']):\n                            raise RuntimeError('It looks like the server does not support resuming downloads.')\n\n                    try:\n                        for chunk in r.iter_content(chunk_size=chunk_size):\n                            if chunk:  # filter out keep-alive new chunks\n                                downloaded += len(chunk)\n                                pbar.update(len(chunk))\n                                f.write(chunk)\n                    except requests.exceptions.ChunkedEncodingError:\n                        if downloaded == 0:\n                            r = requests.get(url, stream=True, headers=headers)\n\n                    if downloaded >= total_length:\n                        # Note that total_length is 0 if the server didn't return the content length,\n                        # in this case we perform just one iteration and assume that we are done.\n                        break\n\n        temporary.rename(destination)\n    except Exception as e:\n        if n_tries > 0:\n            log.warning(f'Download failed: {e}, retrying')\n            simple_download(url, destination, headers, n_tries - 1)\n        else:\n            
raise e\n\n\ndef download(dest_file_path: [List[Union[str, Path]]], source_url: str, force_download: bool = True,\n             headers: Optional[dict] = None) -> None:\n    \"\"\"Download a file from URL to one or several target locations.\n\n    Args:\n        dest_file_path: Path or list of paths to the file destination (including file name).\n        source_url: The source URL.\n        force_download: Download file if it already exists, or not.\n        headers: Headers for file server.\n\n    \"\"\"\n\n    if isinstance(dest_file_path, list):\n        dest_file_paths = [Path(path) for path in dest_file_path]\n    else:\n        dest_file_paths = [Path(dest_file_path).absolute()]\n\n    if not force_download:\n        to_check = list(dest_file_paths)\n        dest_file_paths = []\n        for p in to_check:\n            if p.exists():\n                log.info(f'File already exists in {p}')\n            else:\n                dest_file_paths.append(p)\n\n    if dest_file_paths:\n        cache_dir = os.getenv('DP_CACHE_DIR')\n        cached_exists = False\n        if cache_dir:\n            first_dest_path = Path(cache_dir) / md5(source_url.encode('utf8')).hexdigest()[:15]\n            cached_exists = first_dest_path.exists()\n        else:\n            first_dest_path = dest_file_paths.pop()\n\n        if not cached_exists:\n            first_dest_path.parent.mkdir(parents=True, exist_ok=True)\n\n            simple_download(source_url, first_dest_path, headers)\n        else:\n            log.info(f'Found cached {source_url} in {first_dest_path}')\n\n        for dest_path in dest_file_paths:\n            dest_path.parent.mkdir(parents=True, exist_ok=True)\n            shutil.copy(str(first_dest_path), str(dest_path))\n\n\ndef untar(file_path: Union[Path, str], extract_folder: Optional[Union[Path, str]] = None) -> None:\n    \"\"\"Simple tar archive extractor.\n\n    Args:\n        file_path: Path to the tar file to be extracted.\n        extract_folder: Folder 
to which the files will be extracted.\n\n    \"\"\"\n    file_path = Path(file_path)\n    if extract_folder is None:\n        extract_folder = file_path.parent\n    extract_folder = Path(extract_folder)\n    tar = tarfile.open(file_path)\n    tar.extractall(extract_folder)\n    tar.close()\n\n\ndef ungzip(file_path: Union[Path, str], extract_path: Optional[Union[Path, str]] = None) -> None:\n    \"\"\"Simple .gz archive extractor.\n\n    Args:\n        file_path: Path to the gzip file to be extracted.\n        extract_path: Path where the file will be extracted.\n\n    \"\"\"\n    chunk_size = 16 * 1024\n    file_path = Path(file_path)\n    if extract_path is None:\n        extract_path = file_path.with_suffix('')\n    extract_path = Path(extract_path)\n\n    with gzip.open(file_path, 'rb') as fin, extract_path.open('wb') as fout:\n        while True:\n            block = fin.read(chunk_size)\n            if not block:\n                break\n            fout.write(block)\n\n\ndef download_decompress(url: str,\n                        download_path: Union[Path, str],\n                        extract_paths: Optional[Union[List[Union[Path, str]], Path, str]] = None,\n                        headers: Optional[dict] = None) -> None:\n    \"\"\"Download and extract .tar.gz or .gz file to one or several target locations.\n\n    The archive is deleted if extraction was successful.\n\n    Args:\n        url: URL for file downloading.\n        download_path: Path to the directory where downloaded file will be stored until the end of extraction.\n        extract_paths: Path or list of paths where contents of archive will be extracted.\n        headers: Headers for file server.\n\n    \"\"\"\n    file_name = Path(urlparse(url).path).name\n    download_path = Path(download_path)\n\n    if extract_paths is None:\n        extract_paths = [download_path]\n    elif isinstance(extract_paths, list):\n        extract_paths = [Path(path) for path in extract_paths]\n    else:\n        
extract_paths = [Path(extract_paths)]\n\n    cache_dir = os.getenv('DP_CACHE_DIR')\n    extracted = False\n    if cache_dir:\n        cache_dir = Path(cache_dir)\n        url_hash = md5(url.encode('utf8')).hexdigest()[:15]\n        arch_file_path = cache_dir / url_hash\n        extracted_path = cache_dir / (url_hash + '_extracted')\n        extracted = extracted_path.exists()\n        if not extracted and not arch_file_path.exists():\n            simple_download(url, arch_file_path, headers)\n        else:\n            if extracted:\n                log.info(f'Found cached and extracted {url} in {extracted_path}')\n            else:\n                log.info(f'Found cached {url} in {arch_file_path}')\n    else:\n        arch_file_path = download_path / file_name\n        simple_download(url, arch_file_path, headers)\n        extracted_path = extract_paths.pop()\n\n    if not extracted:\n        log.info('Extracting {} archive into {}'.format(arch_file_path, extracted_path))\n        extracted_path.mkdir(parents=True, exist_ok=True)\n\n        if file_name.endswith('.tar.gz'):\n            untar(arch_file_path, extracted_path)\n        elif file_name.endswith('.gz'):\n            ungzip(arch_file_path, extracted_path / Path(file_name).with_suffix('').name)\n        elif file_name.endswith('.zip'):\n            with zipfile.ZipFile(arch_file_path, 'r') as zip_ref:\n                zip_ref.extractall(extracted_path)\n        else:\n            raise RuntimeError(f'Trying to extract an unknown type of archive {file_name}')\n\n        if not cache_dir:\n            arch_file_path.unlink()\n\n    for extract_path in extract_paths:\n        for src in extracted_path.iterdir():\n            dest = extract_path / src.name\n            if src.is_dir():\n                _copytree(src, dest)\n            else:\n                extract_path.mkdir(parents=True, exist_ok=True)\n                shutil.copy(str(src), str(dest))\n\n\ndef _copytree(src: Path, dest: Path) -> None:\n   
 \"\"\"Recursively copies directory.\n\n    Destination directory could exist (unlike if we used shutil.copytree).\n\n    Args:\n        src: Path to copied directory.\n        dest: Path to destination directory.\n\n    \"\"\"\n    dest.mkdir(parents=True, exist_ok=True)\n    for f in src.iterdir():\n        f_dest = dest / f.name\n        if f.is_dir():\n            _copytree(f, f_dest)\n        else:\n            shutil.copy(str(f), str(f_dest))\n\n\ndef file_md5(fpath: Union[str, Path], chunk_size: int = 2 ** 16) -> Optional[str]:\n    \"\"\"Return md5 hash value for file contents.\n\n    Args:\n        fpath: Path to file.\n        chunk_size: md5 object updated by ``chunk_size`` bytes from file.\n\n    Returns:\n        None if ``fpath`` does not point to a file, else returns md5 hash value as string.\n\n    \"\"\"\n    fpath = Path(fpath)\n    if not fpath.is_file():\n        return None\n    file_hash = md5()\n    with fpath.open('rb') as f:\n        for chunk in iter(lambda: f.read(chunk_size), b\"\"):\n            file_hash.update(chunk)\n    return file_hash.hexdigest()\n\n\ndef mark_done(path: Union[Path, str]) -> None:\n    \"\"\"Create ``.done`` empty file in the directory.\n\n    Args:\n        path: Path to directory.\n\n    Raises:\n        NotADirectoryError: If ``path`` does not point to a directory.\n\n    \"\"\"\n    path = Path(path)\n    if not path.is_dir():\n        raise NotADirectoryError(f\"Not a directory: '{path}'\")\n    mark = path / _MARK_DONE\n    mark.touch(exist_ok=True)\n\n\ndef is_done(path: Union[Path, str]) -> bool:\n    \"\"\"Check if ``.done`` file exists in directory.\n\n    Args:\n        path: Path to directory.\n\n    Returns:\n        True if directory contains ``.done`` file, False otherwise.\n\n    \"\"\"\n    mark = Path(path) / _MARK_DONE\n    return mark.is_file()\n\n\ndef _get_all_dimensions(batch: Sequence, level: int = 0, res: Optional[List[List[int]]] = None) -> List[List[int]]:\n    \"\"\"Return all presented 
element sizes of each dimension.\n\n    Args:\n        batch: Data array.\n        level: Recursion level.\n        res: List containing element sizes of each dimension.\n\n    Return:\n        List, i-th element of which is list containing all presented sized of batch's i-th dimension.\n\n    Examples:\n        >>> x = [[[1], [2, 3]], [[4], [5, 6, 7], [8, 9]]]\n        >>> _get_all_dimensions(x)\n        [[2], [2, 3], [1, 2, 1, 3, 2]]\n\n    \"\"\"\n    if not level:\n        res = [[len(batch)]]\n    if len(batch) and isinstance(batch[0], Sized) and not isinstance(batch[0], str):\n        level += 1\n        if len(res) <= level:\n            res.append([])\n        for item in batch:\n            res[level].append(len(item))\n            _get_all_dimensions(item, level, res)\n    return res\n\n\ndef get_dimensions(batch: Sequence) -> List[int]:\n    \"\"\"Return maximal size of each batch dimension.\"\"\"\n    return list(map(max, _get_all_dimensions(batch)))\n\n\ndef zero_pad(batch: Sequence,\n             zp_batch: Optional[np.ndarray] = None,\n             dtype: type = np.float32,\n             padding: Union[int, float] = 0) -> np.ndarray:\n    \"\"\"Fills the end of each array item to make its length maximal along each dimension.\n\n    Args:\n        batch: Initial array.\n        zp_batch: Padded array.\n        dtype = Type of padded array.\n        padding = Number to will initial array with.\n\n    Returns:\n        Padded array.\n\n    Examples:\n        >>> x = np.array([[1, 2, 3], [4], [5, 6]])\n        >>> zero_pad(x)\n        array([[1., 2., 3.],\n               [4., 0., 0.],\n               [5., 6., 0.]], dtype=float32)\n\n    \"\"\"\n    if zp_batch is None:\n        dims = get_dimensions(batch)\n        zp_batch = np.ones(dims, dtype=dtype) * padding\n    if zp_batch.ndim == 1:\n        zp_batch[:len(batch)] = batch\n    else:\n        for b, zp in zip(batch, zp_batch):\n            zero_pad(b, zp)\n    return zp_batch\n\n\ndef 
is_str_batch(batch: Iterable) -> bool:\n    \"\"\"Checks if iterable argument contains string at any nesting level.\"\"\"\n    while True:\n        if isinstance(batch, Iterable):\n            if isinstance(batch, str):\n                return True\n            elif isinstance(batch, np.ndarray):\n                return batch.dtype.kind == 'U'\n            else:\n                if len(batch) > 0:\n                    batch = batch[0]\n                else:\n                    return True\n        else:\n            return False\n\n\ndef flatten_str_batch(batch: Union[str, Iterable]) -> Union[list, chain]:\n    \"\"\"Joins all strings from nested lists to one ``itertools.chain``.\n\n    Args:\n        batch: List with nested lists to flatten.\n\n    Returns:\n        Generator of flat List[str]. For str ``batch`` returns [``batch``].\n\n    Examples:\n        >>> [string for string in flatten_str_batch(['a', ['b'], [['c', 'd']]])]\n        ['a', 'b', 'c', 'd']\n\n    \"\"\"\n    if isinstance(batch, str):\n        return [batch]\n    else:\n        return chain(*[flatten_str_batch(sample) for sample in batch])\n\n\ndef zero_pad_truncate(batch: Sequence[Sequence[Union[int, float, np.integer, np.floating,\n                                                     Sequence[Union[int, float, np.integer, np.floating]]]]],\n                      max_len: int, pad: str = 'post', trunc: str = 'post',\n                      dtype: Optional[Union[type, str]] = None) -> np.ndarray:\n    \"\"\"\n\n    Args:\n        batch: assumes a batch of lists of word indexes or their vector representations\n        max_len: resulting length of every batch item\n        pad: how to pad shorter batch items: can be ``'post'`` or ``'pre'``\n        trunc: how to truncate a batch item: can be ``'post'`` or ``'pre'``\n        dtype: overrides dtype for the resulting ``ndarray`` if specified,\n         otherwise ``np.int32`` is used for 2-d arrays and ``np.float32`` — for 3-d arrays\n\n    
Returns:\n        a 2-d array of size ``(len(batch), max_len)`` or a 3-d array of size ``(len(batch), max_len, len(batch[0][0]))``\n    \"\"\"\n    if isinstance(batch[0][0], Collection):  # ndarray behaves like a Sequence without actually being one\n        size = (len(batch), max_len, len(batch[0][0]))\n        dtype = dtype or np.float32\n    else:\n        size = (len(batch), max_len)\n        dtype = dtype or np.int32\n\n    padded_batch = np.zeros(size, dtype=dtype)\n    for i, batch_item in enumerate(batch):\n        if len(batch_item) > max_len:  # trunc\n            padded_batch[i] = batch_item[slice(max_len) if trunc == 'post' else slice(-max_len, None)]\n        else:  # pad\n            padded_batch[i, slice(len(batch_item)) if pad == 'post' else slice(-len(batch_item), None)] = batch_item\n\n    return np.asarray(padded_batch)\n\n\ndef get_all_elems_from_json(search_json: dict, search_key: str) -> list:\n    \"\"\"Returns values by key in all nested dicts.\n\n    Args:\n        search_json: Dictionary in which one needs to find all values by specific key.\n        search_key: Key for search.\n\n    Returns:\n        List of values stored in nested structures by ``search_key``.\n\n    Examples:\n        >>> get_all_elems_from_json({'a':{'b': [1,2,3]}, 'b':42}, 'b')\n        [[1, 2, 3], 42]\n\n    \"\"\"\n    result = []\n    if isinstance(search_json, dict):\n        for key in search_json:\n            if key == search_key:\n                result.append(search_json[key])\n            else:\n                result.extend(get_all_elems_from_json(search_json[key], search_key))\n    elif isinstance(search_json, list):\n        for item in search_json:\n            result.extend(get_all_elems_from_json(item, search_key))\n\n    return result\n\n\ndef check_nested_dict_keys(check_dict: dict, keys: list) -> bool:\n    \"\"\"Checks if dictionary contains nested keys from keys list.\n\n    Args:\n        check_dict: Dictionary to check.\n        keys: Keys 
list. i-th nested dict of ``check_dict`` should contain dict containing (i+1)-th key\n        from the ``keys`` list by i-th key.\n\n    Returns:\n        True if dictionary contains nested keys from keys list, False otherwise.\n\n    Examples:\n        >>> check_nested_dict_keys({'x': {'y': {'z': 42}}}, ['x', 'y', 'z'])\n        True\n        >>> check_nested_dict_keys({'x': {'y': {'z': 42}}}, ['x', 'z', 'y'])\n        False\n        >>> check_nested_dict_keys({'x': {'y': 1, 'z': 42}}, ['x', 'y', 'z'])\n        False\n\n    \"\"\"\n    if isinstance(keys, list) and len(keys) > 0:\n        element = check_dict\n        for key in keys:\n            if isinstance(element, dict) and key in element.keys():\n                element = element[key]\n            else:\n                return False\n        return True\n    else:\n        return False\n\n\ndef jsonify_data(data: Any) -> Any:\n    \"\"\"Replaces JSON-non-serializable objects with JSON-serializable.\n\n    Function replaces numpy arrays and numbers with python lists and numbers, tuples is replaces with lists. 
All other\n    object types remain the same.\n\n    Args:\n        data: Object to make JSON-serializable.\n\n    Returns:\n        Modified input data.\n\n    \"\"\"\n    if isinstance(data, (list, tuple)):\n        result = [jsonify_data(item) for item in data]\n    elif isinstance(data, dict):\n        result = {}\n        for key in data.keys():\n            result[key] = jsonify_data(data[key])\n    elif isinstance(data, np.ndarray):\n        result = data.tolist()\n    elif isinstance(data, np.integer):\n        result = int(data)\n    elif isinstance(data, np.floating):\n        result = float(data)\n    elif callable(getattr(data, \"to_serializable_dict\", None)):\n        result = data.to_serializable_dict()\n    else:\n        result = data\n    return result\n\n\ndef chunk_generator(items_list: list, chunk_size: int) -> Generator[list, None, None]:\n    \"\"\"Yields consecutive slices of list.\n\n    Args:\n        items_list: List to slice.\n        chunk_size: Length of slice.\n\n    Yields:\n        list: ``items_list`` consecutive slices.\n\n    \"\"\"\n    for i in range(0, len(items_list), chunk_size):\n        yield items_list[i:i + chunk_size]\n\n\ndef update_dict_recursive(editable_dict: dict, editing_dict: Mapping) -> None:\n    \"\"\"Updates dict recursively.\n\n    You need to use this function to update dictionary if depth of editing_dict is more then 1.\n\n    Args:\n        editable_dict: Dictionary to edit.\n        editing_dict: Dictionary containing edits.\n\n    \"\"\"\n    for k, v in editing_dict.items():\n        if isinstance(v, collections.Mapping):\n            update_dict_recursive(editable_dict.get(k, {}), v)\n        else:\n            editable_dict[k] = v\n\n\ndef path_set_md5(url: str) -> str:\n    \"\"\"Given a file URL, return a md5 query of the file.\n\n    Args:\n        url: A given URL.\n\n    Returns:\n        URL of the md5 file.\n\n    \"\"\"\n    scheme, netloc, path, query_string, fragment = urlsplit(url)\n    
path += '.md5'\n\n    return urlunsplit((scheme, netloc, path, query_string, fragment))\n\n\ndef set_query_parameter(url: str, param_name: str, param_value: str) -> str:\n    \"\"\"Given a URL, set or replace a query parameter and return the modified URL.\n\n    Args:\n        url: A given  URL.\n        param_name: The parameter name to add.\n        param_value: The parameter value.\n\n    Returns:\n        URL with the added parameter.\n\n    \"\"\"\n    scheme, netloc, path, query_string, fragment = urlsplit(url)\n    query_params = parse_qs(query_string)\n\n    query_params[param_name] = [param_value]\n    new_query_string = urlencode(query_params, doseq=True)\n\n    return urlunsplit((scheme, netloc, path, new_query_string, fragment))\n"
  },
  {
    "path": "deeppavlov/core/models/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/core/models/component.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom abc import ABCMeta, abstractmethod\n\nfrom logging import getLogger\n\nlog = getLogger(__name__)\n\n\nclass Component(metaclass=ABCMeta):\n    \"\"\"Abstract class for all callables that could be used in Chainer's pipe.\"\"\"\n\n    @abstractmethod\n    def __call__(self, *args, **kwargs):\n        pass\n\n    def reset(self):\n        pass\n\n    def destroy(self):\n        attr_list = list(self.__dict__.keys())\n        for attr_name in attr_list:\n            attr = getattr(self, attr_name)\n            if hasattr(attr, 'destroy'):\n                attr.destroy()\n            delattr(self, attr_name)\n"
  },
  {
    "path": "deeppavlov/core/models/estimator.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom abc import abstractmethod\n\nfrom .component import Component\nfrom .serializable import Serializable\n\n\nclass Estimator(Component, Serializable):\n    \"\"\"Abstract class for components that could be fitted on the data as a whole.\"\"\"\n\n    @abstractmethod\n    def fit(self, *args, **kwargs):\n        pass\n"
  },
  {
    "path": "deeppavlov/core/models/nn_model.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom abc import abstractmethod\n\nfrom .component import Component\nfrom .serializable import Serializable\n\n\nclass NNModel(Component, Serializable):\n    \"\"\"Abstract class for deep learning components.\"\"\"\n\n    @abstractmethod\n    def train_on_batch(self, x: list, y: list):\n        pass\n\n    def process_event(self, event_name, data):\n        pass\n"
  },
  {
    "path": "deeppavlov/core/models/serializable.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom abc import ABCMeta, abstractmethod\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Union, Optional\n\nfrom deeppavlov.core.commands.utils import expand_path\n\nlog = getLogger(__name__)\n\n\nclass Serializable(metaclass=ABCMeta):\n    \"\"\"Abstract base class that expresses the interface for all models that can serialize data to a path.\"\"\"\n\n    def __init__(self, save_path: Optional[Union[str, Path]], load_path: Optional[Union[str, Path]] = None,\n                 mode: str = 'infer',\n                 *args, **kwargs) -> None:\n\n        if save_path:\n            self.save_path = expand_path(save_path)\n            self.save_path.parent.mkdir(parents=True, exist_ok=True)\n        else:\n            self.save_path = None\n\n        if load_path:\n            self.load_path = expand_path(load_path)\n            if mode != 'train' and self.save_path and self.load_path != self.save_path:\n                log.warning(\"Load path '{}' differs from save path '{}' in '{}' mode for {}.\"\n                            .format(self.load_path, self.save_path, mode, self.__class__.__name__))\n        elif mode != 'train' and self.save_path:\n            self.load_path = self.save_path\n            log.warning(\"No load path is set for {} in '{}' mode. 
Using save path instead\"\n                        .format(self.__class__.__name__, mode))\n        else:\n            self.load_path = None\n            log.warning(\"No load path is set for {}!\".format(self.__class__.__name__))\n\n    @abstractmethod\n    def save(self, *args, **kwargs):\n        pass\n\n    @abstractmethod\n    def load(self, *args, **kwargs):\n        pass\n"
  },
  {
    "path": "deeppavlov/core/models/torch_model.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom abc import abstractmethod\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Optional, Union\n\nimport torch\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.models.nn_model import NNModel\n\nlog = getLogger(__name__)\n\n\nclass TorchModel(NNModel):\n    \"\"\"Class implements torch model's main methods.\n\n    Args:\n        model: torch.nn.Model-based neural network model\n        device: device to use\n        optimizer: name of `torch.optim` optimizer\n        optimizer_parameters: dictionary with optimizer parameters\n        learning_rate_drop_patience: how many validations with no improvements to wait\n        learning_rate_drop_div: the divider of the learning rate after `learning_rate_drop_patience` unsuccessful\n            validations\n        load_before_drop: whether to load best model before dropping learning rate or not\n        min_learning_rate: min value of learning rate if learning rate decay is used\n        args:\n        kwargs: dictionary with other model parameters\n\n    Attributes:\n        device: `cpu` or `cuda` device to use\n        opt: dictionary with all model parameters\n        model: torch model\n        epochs_done: number of epochs that were done\n        optimizer: `torch.optim` instance\n        learning_rate_drop_patience: how many 
validations with no improvements to wait\n        learning_rate_drop_div: the divider of the learning rate after `learning_rate_drop_patience` unsuccessful\n            validations\n        load_before_drop: whether to load best model before dropping learning rate or not\n        min_learning_rate: min value of learning rate if learning rate decay is used\n        clip_norm: clip gradients by norm coefficient\n    \"\"\"\n\n    def __init__(self, model: torch.nn.Module,\n                 device: Union[torch.device, str] = \"cuda\",\n                 optimizer: str = \"AdamW\",\n                 optimizer_parameters: Optional[dict] = None,\n                 learning_rate_drop_patience: Optional[int] = None,\n                 learning_rate_drop_div: Optional[float] = None,\n                 load_before_drop: bool = True,\n                 min_learning_rate: float = 1e-07,\n                 clip_norm: Optional[float] = None,\n                 *args, **kwargs):\n\n        super().__init__(*args, **kwargs)\n        self.model = model\n        self.device = self._init_device(device)\n        self.model.to(self.device)\n        if self.device.type == \"cuda\" and torch.cuda.device_count() > 1:\n            self.model = torch.nn.DataParallel(self.model)\n        if optimizer_parameters is None:\n            optimizer_parameters = {\"lr\": 0.01}\n        self.optimizer = getattr(torch.optim, optimizer)(self.model.parameters(), **optimizer_parameters)\n        self.epochs_done = 0\n        self.learning_rate_drop_patience = learning_rate_drop_patience\n        self.learning_rate_drop_div = learning_rate_drop_div\n        self.load_before_drop = load_before_drop\n        self.min_learning_rate = min_learning_rate\n        self.clip_norm = clip_norm\n        self.load()\n        # we need to switch to eval mode here because by default it's in `train` mode.\n        # But in case of `interact/build_model` usage, we need to have model in eval mode.\n        self.model.eval()\n   
     log.debug(f\"Model was successfully initialized! Model summary:\\n {self.model}\")\n\n    def _init_device(self, device: Union[torch.device, str]) -> torch.device:\n        if device == \"gpu\":\n            device = \"cuda\"\n        if isinstance(device, str):\n            device = torch.device(device)\n        if device.type == \"cuda\" and not torch.cuda.is_available():\n            log.warning(f\"Unable to place component {self.__class__.__name__} on GPU, \"\n                        \"since no CUDA GPUs are available. Using CPU.\")\n            device = torch.device('cpu')\n        return device\n\n    @property\n    def is_data_parallel(self) -> bool:\n        return isinstance(self.model, torch.nn.DataParallel)\n\n    def load(self, fname: Optional[str] = None, *args, **kwargs) -> None:\n        \"\"\"Load model from `fname` (if `fname` is not given, use `self.load_path`) to `self.model` along with\n            the optimizer `self.optimizer`.\n            If `fname` (if `fname` is not given, use `self.load_path`) does not exist, initialize model from scratch.\n\n        Args:\n            fname: string path to checkpoint\n            *args:\n            **kwargs:\n\n        Returns:\n            None\n        \"\"\"\n        if fname is not None:\n            self.load_path = fname\n\n        if self.load_path:\n            log.debug(f\"Load path {self.load_path} is given.\")\n            if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir():\n                raise ConfigError(\"Provided load path is incorrect!\")\n\n            weights_path = Path(self.load_path.resolve())\n            weights_path = weights_path.with_suffix(f\".pth.tar\")\n            if weights_path.exists():\n                log.debug(f\"Load path {weights_path} exists.\")\n                log.debug(f\"Initializing `{self.__class__.__name__}` from saved.\")\n\n                # now load the weights, optimizer from saved\n                log.debug(f\"Loading 
weights from {weights_path}.\")\n                checkpoint = torch.load(weights_path, map_location=self.device)\n                model_state = checkpoint[\"model_state_dict\"]\n                optimizer_state = checkpoint[\"optimizer_state_dict\"]\n                # load a multi-gpu model on a single device\n                if all([key.startswith(\"module.\") for key in list(model_state.keys())]):\n                    model_state = {key.replace(\"module.\", \"\", 1): val for key, val in model_state.items()}\n\n                if self.is_data_parallel:\n                    self.model.module.load_state_dict(model_state)\n                else:\n                    self.model.load_state_dict(model_state)\n                try:  # TODO: remove this try-except after hf models deep update\n                    self.optimizer.load_state_dict(optimizer_state)\n                except ValueError as e:\n                    log.error(f'Failed to load optimizer state due to {repr(e)}')\n                self.epochs_done = checkpoint.get(\"epochs_done\", 0)\n            else:\n                log.warning(f\"Init from scratch. Load path {weights_path} does not exist.\")\n        else:\n            log.warning(f\"Init from scratch. Load path {self.load_path} is not provided.\")\n        self.model.to(self.device)\n\n    def save(self, fname: Optional[str] = None, *args, **kwargs) -> None:\n        \"\"\"Save torch model to `fname` (if `fname` is not given, use `self.save_path`). 
Checkpoint includes\n            `model_state_dict`, `optimizer_state_dict`, and `epochs_done` (number of training epochs).\n\n        Args:\n            fname:\n            *args:\n            **kwargs:\n\n        Returns:\n\n        \"\"\"\n        if fname is None:\n            fname = self.save_path\n\n        if not fname.parent.is_dir():\n            raise ConfigError(\"Provided save path is incorrect!\")\n\n        weights_path = Path(fname).with_suffix(f\".pth.tar\")\n        log.info(f\"Saving model to {weights_path}.\")\n        # move the model to `cpu` before saving to provide consistency\n        if self.is_data_parallel:\n            model_state_dict = self.model.module.cpu().state_dict()\n        else:\n            model_state_dict = self.model.cpu().state_dict()\n        torch.save({\n            \"model_state_dict\": model_state_dict,\n            \"optimizer_state_dict\": self.optimizer.state_dict(),\n            \"epochs_done\": self.epochs_done\n        }, weights_path)\n        # return it back to device (necessary if it was on `cuda`)\n        self.model.to(self.device)\n\n    def process_event(self, event_name: str, data: dict) -> None:\n        \"\"\"Process event. After epoch, increase `self.epochs_done`. 
After validation, decrease learning rate in\n            `self.learning_rate_drop_div` times (not lower than `self.min_learning_rate`)\n            if given `self.learning_rate_drop_patience`.\n\n        Args:\n            event_name: whether event is send after epoch or batch.\n                    Set of values: ``\"after_epoch\", \"after_batch\"``\n            data: event data (dictionary)\n        Returns:\n            None\n        \"\"\"\n        if event_name == \"after_epoch\":\n            self.epochs_done += 1\n\n        if event_name == \"after_validation\" and 'impatience' in data and self.learning_rate_drop_patience:\n            if data['impatience'] == self.learning_rate_drop_patience:\n                log.info(f\"----------Current LR is decreased in {self.learning_rate_drop_div} times----------\")\n                if self.load_before_drop:\n                    self.load(self.save_path)\n                    self.model.eval()\n                for param_group in self.optimizer.param_groups:\n                    param_group['lr'] = max(param_group['lr'] / self.learning_rate_drop_div, self.min_learning_rate)\n\n    @abstractmethod\n    def train_on_batch(self, x: list, y: list):\n        pass\n\n    def _make_step(self, loss: torch.Tensor) -> None:\n        loss.backward()\n        if self.clip_norm is not None:\n            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm)\n        self.optimizer.step()\n"
  },
  {
    "path": "deeppavlov/core/trainers/__init__.py",
    "content": "# Copyright 2019 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .fit_trainer import FitTrainer\nfrom .nn_trainer import NNTrainer\nfrom .torch_trainer import TorchTrainer\n"
  },
  {
    "path": "deeppavlov/core/trainers/fit_trainer.py",
    "content": "# Copyright 2019 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport datetime\nimport json\nimport time\nfrom itertools import islice\nfrom logging import getLogger\nfrom typing import Tuple, Dict, Union, Optional, Iterable, Any, Collection\n\nfrom tqdm import tqdm\n\nfrom deeppavlov.core.commands.infer import build_model\nfrom deeppavlov.core.common.chainer import Chainer\nfrom deeppavlov.core.common.params import from_params\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.data_fitting_iterator import DataFittingIterator\nfrom deeppavlov.core.data.data_learning_iterator import DataLearningIterator\nfrom deeppavlov.core.models.estimator import Estimator\nfrom deeppavlov.core.trainers.utils import Metric, parse_metrics, prettify_metrics, NumpyArrayEncoder\n\nlog = getLogger(__name__)\nreport_log = getLogger('train_report')\n\n\n@register('fit_trainer')\nclass FitTrainer:\n    \"\"\"\n    Trainer class for fitting and evaluating :class:`Estimators <deeppavlov.core.models.estimator.Estimator>`\n\n    Args:\n        chainer_config: ``\"chainer\"`` block of a configuration file\n        batch_size: batch_size to use for partial fitting (if available) and evaluation,\n            the whole dataset is used if ``batch_size`` is negative or zero (default is ``-1``)\n        metrics: iterable of metrics where each metric can be a registered metric name or a dict of ``name`` 
and\n            ``inputs`` where ``name`` is a registered metric name and ``inputs`` is a collection of parameter names\n            from chainer’s inner memory that will be passed to the metric function;\n            default value for ``inputs`` parameter is a concatenation of chainer’s ``in_y`` and ``out`` fields\n            (default is ``('accuracy',)``)\n        evaluation_targets: data types on which to evaluate trained pipeline (default is ``('valid', 'test')``)\n        show_examples: a flag used to print inputs, expected outputs and predicted outputs for the last batch\n            in evaluation logs (default is ``False``)\n        max_test_batches: maximum batches count for pipeline testing and evaluation, ignored if negative\n            (default is ``-1``)\n        **kwargs: additional parameters whose names will be logged but otherwise ignored\n    \"\"\"\n\n    def __init__(self, chainer_config: dict, *, batch_size: int = -1,\n                 metrics: Iterable[Union[str, dict]] = ('accuracy',),\n                 evaluation_targets: Iterable[str] = ('valid', 'test'),\n                 show_examples: bool = False,\n                 max_test_batches: int = -1,\n                 **kwargs) -> None:\n        if kwargs:\n            log.warning(f'{self.__class__.__name__} got additional init parameters {list(kwargs)} that will be ignored:')\n        self.chainer_config = chainer_config\n        self._chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))\n        self.batch_size = batch_size\n        self.metrics = parse_metrics(metrics, self._chainer.in_y, self._chainer.out_params)\n        self.evaluation_targets = tuple(evaluation_targets)\n        self.show_examples = show_examples\n        self.max_test_batches = None if max_test_batches < 0 else max_test_batches\n        self._built = False\n        self._saved = False\n        self._loaded = False\n\n    def fit_chainer(self, iterator: Union[DataFittingIterator, 
DataLearningIterator]) -> None:\n        \"\"\"\n        Build the pipeline :class:`~deeppavlov.core.common.chainer.Chainer` and successively fit\n        :class:`Estimator <deeppavlov.core.models.estimator.Estimator>` components using a provided data iterator\n        \"\"\"\n        if self._built:\n            raise RuntimeError('Cannot fit already built chainer')\n        for component_index, component_config in enumerate(self.chainer_config['pipe'], 1):\n            component = from_params(component_config, mode='train')\n            if 'fit_on' in component_config:\n                component: Estimator\n\n                targets = component_config['fit_on']\n                if isinstance(targets, str):\n                    targets = [targets]\n\n                if self.batch_size > 0 and callable(getattr(component, 'partial_fit', None)):\n                    for i, (x, y) in tqdm(enumerate(iterator.gen_batches(self.batch_size, shuffle=False))):\n                        preprocessed = self._chainer.compute(x, y, targets=targets)\n                        # noinspection PyUnresolvedReferences\n                        component.partial_fit(*preprocessed)\n                else:\n                    preprocessed = self._chainer.compute(*iterator.get_instances(), targets=targets)\n                    if len(targets) == 1:\n                        preprocessed = [preprocessed]\n                    component.fit(*preprocessed)\n\n                component.save()\n\n            if 'in' in component_config:\n                c_in = component_config['in']\n                c_out = component_config['out']\n                in_y = component_config.get('in_y', None)\n                main = component_config.get('main', False)\n                self._chainer.append(component, c_in, c_out, in_y, main)\n        self._built = True\n\n    def _load(self) -> None:\n        if not self._loaded:\n            self._chainer.destroy()\n            self._chainer = build_model({'chainer': 
self.chainer_config}, load_trained=self._saved)\n            self._loaded = True\n\n    def get_chainer(self) -> Chainer:\n        \"\"\"Returns a :class:`~deeppavlov.core.common.chainer.Chainer` built from ``self.chainer_config`` for inference\"\"\"\n        self._load()\n        return self._chainer\n\n    def train(self, iterator: Union[DataFittingIterator, DataLearningIterator]) -> None:\n        \"\"\"Calls :meth:`~fit_chainer` with provided data iterator as an argument\"\"\"\n        self.fit_chainer(iterator)\n        self._saved = True\n\n    def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]],\n             metrics: Optional[Collection[Metric]] = None, *,\n             start_time: Optional[float] = None, show_examples: Optional[bool] = None) -> dict:\n        \"\"\"\n        Calculate metrics and return reports on provided data for currently stored\n        :class:`~deeppavlov.core.common.chainer.Chainer`\n\n        Args:\n            data: iterable of batches of inputs and expected outputs\n            metrics: collection of metrics namedtuples containing names for report, metric functions\n                and their inputs names (if omitted, ``self.metrics`` is used)\n            start_time: start time for test report\n            show_examples: a flag used to return inputs, expected outputs and predicted outputs for the last batch\n                in a result report (if omitted, ``self.show_examples`` is used)\n\n        Returns:\n            a report dict containing calculated metrics, spent time value, examples count in tested data\n            and maybe examples\n        \"\"\"\n\n        if start_time is None:\n            start_time = time.time()\n        if show_examples is None:\n            show_examples = self.show_examples\n        if metrics is None:\n            metrics = self.metrics\n\n        expected_outputs = list(set().union(self._chainer.out_params, *[m.inputs for m in metrics]))\n\n        outputs = {out: [] for out 
in expected_outputs}\n        examples = 0\n\n        data = islice(data, self.max_test_batches)\n\n        for x, y_true in tqdm(data):\n            examples += len(x)\n            y_predicted = list(self._chainer.compute(list(x), list(y_true), targets=expected_outputs))\n            if len(expected_outputs) == 1:\n                y_predicted = [y_predicted]\n            for out, val in zip(outputs.values(), y_predicted):\n                out += list(val)\n        if examples == 0:\n            log.warning('Got empty data iterable for scoring')\n            return {'eval_examples_count': 0, 'metrics': None, 'time_spent': str(datetime.timedelta(seconds=0))}\n\n        # metrics_values = [(m.name, m.fn(*[outputs[i] for i in m.inputs])) for m in metrics]\n        metrics_values = []\n        for metric in metrics:\n            calculate_metric = True\n            for i in metric.inputs:\n                outputs[i] = [k for k in outputs[i] if k is not None]\n                if len(outputs[i]) == 0:\n                    log.info(f'Metric {metric.alias} is not calculated due to absense of true and predicted samples')\n                    calculate_metric = False\n                    value = -1\n            if calculate_metric:\n                value = metric.fn(*[outputs[i] for i in metric.inputs])\n            metrics_values.append((metric.alias, value))\n\n        report = {\n            'eval_examples_count': examples,\n            'metrics': prettify_metrics(metrics_values),\n            'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))\n        }\n\n        if show_examples:\n            y_predicted = zip(*[y_predicted_group\n                                for out_name, y_predicted_group in zip(expected_outputs, y_predicted)\n                                if out_name in self._chainer.out_params])\n            if len(self._chainer.out_params) == 1:\n                y_predicted = [y_predicted_item[0] for y_predicted_item in 
y_predicted]\n            report['examples'] = [{\n                'x': x_item,\n                'y_predicted': y_predicted_item,\n                'y_true': y_true_item\n            } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]\n\n        return report\n\n    def evaluate(self, iterator: DataLearningIterator,\n                 evaluation_targets: Optional[Iterable[str]] = None) -> Dict[str, dict]:\n        \"\"\"\n        Run :meth:`test` on multiple data types using provided data iterator\n\n        Args:\n            iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation\n            evaluation_targets: iterable of data types to evaluate on\n\n        Returns:\n            a dictionary with data types as keys and evaluation reports as values\n        \"\"\"\n        self._load()\n        if evaluation_targets is None:\n            evaluation_targets = self.evaluation_targets\n\n        res = {}\n\n        for data_type in evaluation_targets:\n            data_gen = iterator.gen_batches(self.batch_size, data_type=data_type, shuffle=False)\n            report = self.test(data_gen)\n            res[data_type] = report\n            report_log.info(json.dumps({data_type: report}, ensure_ascii=False, cls=NumpyArrayEncoder))\n\n        return res\n"
  },
  {
    "path": "deeppavlov/core/trainers/nn_trainer.py",
    "content": "# Copyright 2019 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport datetime\nimport json\nimport time\nfrom itertools import islice\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import List, Tuple, Union, Optional, Iterable\n\nfrom tqdm import tqdm\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.log_events import get_tb_writer\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.data_learning_iterator import DataLearningIterator\nfrom deeppavlov.core.trainers.fit_trainer import FitTrainer\nfrom deeppavlov.core.trainers.utils import parse_metrics, NumpyArrayEncoder\n\nlog = getLogger(__name__)\nreport_log = getLogger('train_report')\n\n\n@register('nn_trainer')\nclass NNTrainer(FitTrainer):\n    \"\"\"\n    | Bases :class:`~deeppavlov.core.trainers.FitTrainer`\n    | Trainer class for training and evaluating pipelines containing\n      :class:`Estimators <deeppavlov.core.models.estimator.Estimator>`\n      and an :class:`~deeppavlov.core.models.nn_model.NNModel`\n\n    Args:\n        chainer_config: ``\"chainer\"`` block of a configuration file\n        batch_size: batch_size to use for partial fitting (if available) and evaluation,\n            the whole dataset is used if ``batch_size`` is negative or zero (default is ``1``)\n        epochs: maximum epochs number to train the pipeline, ignored if negative 
or zero (default is ``-1``)\n        start_epoch_num: starting epoch number for reports (default is ``0``)\n        max_batches: maximum batches number to train the pipeline, ignored if negative or zero (default is ``-1``)\n        metrics: iterable of metrics where each metric can be a registered metric name or a dict of ``name`` and\n            ``inputs`` where ``name`` is a registered metric name and ``inputs`` is a collection of parameter names\n            from chainer’s inner memory that will be passed to the metric function;\n            default value for ``inputs`` parameter is a concatenation of chainer’s ``in_y`` and ``out`` fields;\n            the first metric is used for early stopping (default is ``('accuracy',)``)\n        train_metrics: metrics calculated for train logs (if omitted, ``metrics`` argument is used)\n        metric_optimization: one of ``'maximize'`` or ``'minimize'`` — strategy for metric optimization used in early\n            stopping (default is ``'maximize'``)\n        evaluation_targets: data types on which to evaluate a trained pipeline (default is ``('valid', 'test')``)\n        show_examples: a flag used to print inputs, expected outputs and predicted outputs for the last batch\n            in evaluation logs (default is ``False``)\n        tensorboard_log_dir: path to a directory where tensorboard logs can be stored, ignored if None\n            (default is ``None``)\n        validate_first: flag used to calculate metrics on the ``'valid'`` data type before starting training\n            (default is ``True``)\n        validation_patience: how many times in a row the validation metric has to not improve for early stopping,\n            ignored if negative or zero (default is ``5``)\n        val_every_n_epochs: how often (in epochs) to validate the pipeline, ignored if negative or zero\n            (default is ``-1``)\n        val_every_n_batches: how often (in batches) to validate the pipeline, ignored if negative or zero\n    
        (default is ``-1``)\n        log_every_n_epochs: how often (in epochs) to calculate metrics on train data, ignored if negative or zero\n            (default is ``-1``)\n        log_every_n_batches: how often (in batches) to calculate metrics on train data, ignored if negative or zero\n            (default is ``-1``)\n        log_on_k_batches: count of random train batches to calculate metrics in log (default is ``1``)\n        max_test_batches: maximum batches count for pipeline testing and evaluation, overrides ``log_on_k_batches``,\n            ignored if negative (default is ``-1``)\n        **kwargs: additional parameters whose names will be logged but otherwise ignored\n\n\n    Trainer saves the model if it sees progress in scores. The full rules look like following:\n\n    - For the validation savepoint:\n        * 0-th validation (optional). Don't save model, establish a baseline.\n        * 1-th validation.\n             + If we have a baseline, save the model if we see an improvement, don't save otherwise.\n             + If we don't have a baseline, save the model.\n        * 2nd and later validations. 
Save the model if we see an improvement\n    - For the at-train-exit savepoint:\n        * Save the model if it happened before 1st validation (to capture early training results), don't save otherwise.\n\n    \"\"\"\n\n    def __init__(self, chainer_config: dict, *, \n                 batch_size: int = 1,\n                 epochs: int = -1,\n                 start_epoch_num: int = 0,\n                 max_batches: int = -1,\n                 metrics: Iterable[Union[str, dict]] = ('accuracy',),\n                 train_metrics: Optional[Iterable[Union[str, dict]]] = None,\n                 metric_optimization: str = 'maximize',\n                 evaluation_targets: Iterable[str] = ('valid', 'test'),\n                 show_examples: bool = False,\n                 tensorboard_log_dir: Optional[Union[str, Path]] = None,\n                 max_test_batches: int = -1,\n                 validate_first: bool = True,\n                 validation_patience: int = 5, val_every_n_epochs: int = -1, val_every_n_batches: int = -1,\n                 log_every_n_batches: int = -1, log_every_n_epochs: int = -1, log_on_k_batches: int = 1,\n                 **kwargs) -> None:\n        super().__init__(chainer_config, batch_size=batch_size, metrics=metrics, evaluation_targets=evaluation_targets,\n                         show_examples=show_examples, max_test_batches=max_test_batches, **kwargs)\n        if train_metrics is None:\n            self.train_metrics = self.metrics\n        else:\n            self.train_metrics = parse_metrics(train_metrics, self._chainer.in_y, self._chainer.out_params)\n\n        metric_optimization = metric_optimization.strip().lower()\n        self.score_best = None\n\n        def _improved(op):\n            return lambda score, baseline: False if baseline is None or score is None \\\n                else op(score, baseline)\n\n        if metric_optimization == 'maximize':\n            self.improved = _improved(lambda a, b: a > b)\n        elif 
metric_optimization == 'minimize':\n            self.improved = _improved(lambda a, b: a < b)\n        else:\n            raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize']))\n\n        self.validate_first = validate_first\n        self.validation_number = 0 if validate_first else 1\n        self.validation_patience = validation_patience\n        self.val_every_n_epochs = val_every_n_epochs\n        self.val_every_n_batches = val_every_n_batches\n        self.log_every_n_epochs = log_every_n_epochs\n        self.log_every_n_batches = log_every_n_batches\n        self.log_on_k_batches = log_on_k_batches if log_on_k_batches >= 0 else None\n\n        self.max_epochs = epochs\n        self.epoch = start_epoch_num\n        self.max_batches = max_batches\n\n        self.train_batches_seen = 0\n        self.examples = 0\n        self.patience = 0\n        self.last_result = {}\n        self.losses = []\n        self.start_time: Optional[float] = None\n        self.tb_writer = get_tb_writer(tensorboard_log_dir)\n\n    def save(self) -> None:\n        if self._loaded:\n            raise RuntimeError('Cannot save already finalized chainer')\n\n        self._chainer.save()\n\n    def _is_initial_validation(self):\n        return self.validation_number == 0\n\n    def _is_first_validation(self):\n        return self.validation_number == 1\n\n    def _validate(self, iterator: DataLearningIterator,\n                  tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None:\n        self._send_event(event_name='before_validation')\n        report = self.test(iterator.gen_batches(self.batch_size, data_type='valid', shuffle=False),\n                           start_time=self.start_time)\n\n        report['epochs_done'] = self.epoch\n        report['batches_seen'] = self.train_batches_seen\n        report['train_examples_seen'] = self.examples\n\n        metrics = list(report['metrics'].items())\n\n        if 
tensorboard_tag is not None and self.tb_writer is not None:\n            if tensorboard_index is None:\n                tensorboard_index = self.train_batches_seen\n            for name, score in metrics:\n                self.tb_writer.write_valid(tag=f'{tensorboard_tag}/{name}', scalar_value=score,\n                                           global_step=tensorboard_index)\n            self.tb_writer.flush()\n\n        m_name, score = metrics[0]\n\n        # Update the patience\n        if self.score_best is None:\n            self.patience = 0\n        else:\n            if self.improved(score, self.score_best):\n                self.patience = 0\n            else:\n                self.patience += 1\n\n        # Run the validation model-saving logic\n        if self._is_initial_validation():\n            log.info('Initial best {} of {}'.format(m_name, score))\n            self.score_best = score\n        elif self._is_first_validation() and self.score_best is None:\n            log.info('First best {} of {}'.format(m_name, score))\n            self.score_best = score\n            log.info('Saving model')\n            self.save()\n        elif self.improved(score, self.score_best):\n            log.info(f'Improved best {m_name} from {self.score_best} to {score}')\n            self.score_best = score\n            log.info('Saving model')\n            self.save()\n        else:\n            log.info('Did not improve on the {} of {}'.format(m_name, self.score_best))\n\n        report['impatience'] = self.patience\n        if self.validation_patience > 0:\n            report['patience_limit'] = self.validation_patience\n\n        self._send_event(event_name='after_validation', data=report)\n        report = {'valid': report}\n        report_log.info(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder))\n        self.validation_number += 1\n\n    def _log(self, iterator: DataLearningIterator,\n             tensorboard_tag: Optional[str] = None, 
tensorboard_index: Optional[int] = None) -> None:\n        self._send_event(event_name='before_log')\n        if self.log_on_k_batches == 0:\n            report = {\n                'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5)))\n            }\n        else:\n            data = islice(iterator.gen_batches(self.batch_size, data_type='train', shuffle=True),\n                          self.log_on_k_batches)\n            report = self.test(data, self.train_metrics, start_time=self.start_time)\n\n        report.update({\n            'epochs_done': self.epoch,\n            'batches_seen': self.train_batches_seen,\n            'train_examples_seen': self.examples\n        })\n\n        metrics: List[Tuple[str, float]] = list(report.get('metrics', {}).items()) + list(self.last_result.items())\n\n        report.update(self.last_result)\n        if self.losses:\n            report['loss'] = sum(self.losses) / len(self.losses)\n            self.losses.clear()\n            metrics.append(('loss', report['loss']))\n\n        if metrics and self.tb_writer is not None:\n            for name, score in metrics:\n                self.tb_writer.write_train(tag=f'{tensorboard_tag}/{name}', scalar_value=score,\n                                           global_step=tensorboard_index)\n            self.tb_writer.flush()\n\n        self._send_event(event_name='after_train_log', data=report)\n\n        report = {'train': report}\n        report_log.info(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder))\n\n    def _send_event(self, event_name: str, data: Optional[dict] = None) -> None:\n        report = {\n            'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5))),\n            'epochs_done': self.epoch,\n            'batches_seen': self.train_batches_seen,\n            'train_examples_seen': self.examples\n        }\n        if data is not None:\n            report.update(data)\n        
self._chainer.process_event(event_name=event_name, data=report)\n\n    def train_on_batches(self, iterator: DataLearningIterator) -> None:\n        \"\"\"Train pipeline on batches using provided data iterator and initialization parameters\"\"\"\n        self.start_time = time.time()\n        if self.validate_first:\n            self._validate(iterator)\n\n        while True:\n            impatient = False\n            self._send_event(event_name='before_train')\n            for x, y_true in tqdm(iterator.gen_batches(self.batch_size, data_type='train')):\n                self.last_result = self._chainer.train_on_batch(x, y_true)\n                if self.last_result is None:\n                    self.last_result = {}\n                elif not isinstance(self.last_result, dict):\n                    self.last_result = {'loss': self.last_result}\n                if 'loss' in self.last_result:\n                    self.losses.append(self.last_result.pop('loss'))\n\n                self.train_batches_seen += 1\n                self.examples += len(x)\n\n                if self.log_every_n_batches > 0 and self.train_batches_seen % self.log_every_n_batches == 0:\n                    self._log(iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen)\n\n                if self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0:\n                    self._validate(iterator,\n                                   tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen)\n\n                self._send_event(event_name='after_batch')\n\n                if 0 < self.max_batches <= self.train_batches_seen:\n                    impatient = True\n                    break\n\n                if 0 < self.validation_patience <= self.patience:\n                    log.info('Ran out of patience')\n                    impatient = True\n                    break\n\n            if impatient:\n                
break\n\n            self.epoch += 1\n\n            if self.log_every_n_epochs > 0 and self.epoch % self.log_every_n_epochs == 0:\n                self._log(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch)\n\n            if self.val_every_n_epochs > 0 and self.epoch % self.val_every_n_epochs == 0:\n                self._validate(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch)\n\n            self._send_event(event_name='after_epoch')\n\n            if 0 < self.max_epochs <= self.epoch:\n                break\n\n            if 0 < self.validation_patience <= self.patience:\n                log.info('Ran out of patience')\n                break\n\n    def train(self, iterator: DataLearningIterator) -> None:\n        \"\"\"Call :meth:`~fit_chainer` and then :meth:`~train_on_batches` with provided data iterator as an argument\"\"\"\n        self.fit_chainer(iterator)\n        if callable(getattr(self._chainer, 'train_on_batch', None)):\n            try:\n                self.train_on_batches(iterator)\n            except KeyboardInterrupt:\n                log.info('Stopped training')\n        else:\n            log.warning(f'Using {self.__class__.__name__} for a pipeline without batched training')\n\n        # Run the at-train-exit model-saving logic\n        if self.validation_number < 1:\n            log.info('Save model to capture early training results')\n            self.save()\n"
  },
  {
    "path": "deeppavlov/core/trainers/torch_trainer.py",
    "content": "# Copyright 2019 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom typing import Tuple, Optional, Iterable, Collection, Any\n\nfrom deeppavlov.core.trainers.utils import Metric\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.data_learning_iterator import DataLearningIterator\nfrom deeppavlov.core.trainers.nn_trainer import NNTrainer\n\nlog = getLogger(__name__)\n\n\n@register('torch_trainer')\nclass TorchTrainer(NNTrainer):\n\n    def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]],\n             metrics: Optional[Collection[Metric]] = None, *,\n             start_time: Optional[float] = None, show_examples: Optional[bool] = None) -> dict:\n        self._chainer.get_main_component().model.eval()\n\n        report = super(TorchTrainer, self).test(data=data, metrics=metrics, start_time=start_time,\n                                                show_examples=show_examples)\n        self._chainer.get_main_component().model.train()\n        return report\n\n    def train_on_batches(self, iterator: DataLearningIterator) -> None:\n        self._chainer.get_main_component().model.train()\n        super(TorchTrainer, self).train_on_batches(iterator=iterator)\n        self._chainer.get_main_component().model.eval()\n"
  },
  {
    "path": "deeppavlov/core/trainers/utils.py",
    "content": "# Copyright 2019 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom collections import OrderedDict, namedtuple\nfrom dataclasses import is_dataclass\nfrom functools import partial\nfrom json import JSONEncoder\nfrom typing import List, Tuple, Union, Iterable\n\nimport numpy as np\n\nfrom deeppavlov.core.common.metrics_registry import get_metric_by_name\n\nMetric = namedtuple('Metric', ['name', 'fn', 'inputs', 'alias'])\n\n\ndef parse_metrics(metrics: Iterable[Union[str, dict]], in_y: List[str], out_vars: List[str]) -> List[Metric]:\n    metrics_functions = []\n    for metric in metrics:\n        if isinstance(metric, str):\n            metric = {'name': metric, 'alias': metric}\n\n        metric_name = metric.pop('name')\n        alias = metric.pop('alias', metric_name)\n\n        f = get_metric_by_name(metric_name)\n\n        inputs = metric.pop('inputs', in_y + out_vars)\n        if isinstance(inputs, str):\n            inputs = [inputs]\n\n        metrics_functions.append(Metric(metric_name, partial(f, **metric), inputs, alias))\n\n    return metrics_functions\n\n\ndef prettify_metrics(metrics: List[Tuple[str, float]], precision: int = 4) -> OrderedDict:\n    \"\"\"Prettifies the dictionary of metrics.\"\"\"\n    prettified_metrics = OrderedDict()\n    for key, value in metrics:\n        if key in prettified_metrics:\n            Warning(\"Multiple metrics with the same name 
{}.\".format(key))\n        if isinstance(value, float):\n            value = round(value, precision)\n        prettified_metrics[key] = value\n    return prettified_metrics\n\n\nclass NumpyArrayEncoder(JSONEncoder):\n    def default(self, obj):\n        if isinstance(obj, np.ndarray):\n            return obj.tolist()\n        elif isinstance(obj, np.integer):\n            return int(obj)\n        elif isinstance(obj, np.floating):\n            return float(obj)\n        elif is_dataclass(obj):\n            return obj.__dict__\n        return JSONEncoder.default(self, obj)\n"
  },
  {
    "path": "deeppavlov/dataset_iterators/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/dataset_iterators/basic_classification_iterator.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom collections import defaultdict\nfrom logging import getLogger\nfrom typing import List\n\nfrom sklearn.model_selection import train_test_split\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.data_learning_iterator import DataLearningIterator\n\nlog = getLogger(__name__)\n\n\n@register('basic_classification_iterator')\nclass BasicClassificationDatasetIterator(DataLearningIterator):\n    \"\"\"\n    Class gets data dictionary from DatasetReader instance, merge fields if necessary, split a field if necessary\n\n    Args:\n        data: dictionary of data with fields \"train\", \"valid\" and \"test\" (or some of them)\n        fields_to_merge: list of fields (out of ``\"train\", \"valid\", \"test\"``) to merge\n        merged_field: name of field (out of ``\"train\", \"valid\", \"test\"``) to which save merged fields\n        field_to_split: name of field (out of ``\"train\", \"valid\", \"test\"``) to split\n        split_fields: list of fields (out of ``\"train\", \"valid\", \"test\"``) to which save splitted field\n        split_proportions: list of corresponding proportions for splitting\n        seed: random seed for iterating\n        shuffle: whether to shuffle examples in batches\n        split_seed: random seed for splitting dataset, if ``split_seed`` is None, division is based on `seed`.\n 
       stratify: whether to use stratified split\n        shot: number of examples to sample for each class in training data. If None, all examples will remain in data.\n        *args: arguments\n        **kwargs: arguments\n\n    Attributes:\n        data: dictionary of data with fields \"train\", \"valid\" and \"test\" (or some of them)\n    \"\"\"\n\n    def __init__(self, data: dict,\n                 fields_to_merge: List[str] = None, merged_field: str = None,\n                 field_to_split: str = None, split_fields: List[str] = None, split_proportions: List[float] = None,\n                 seed: int = None, shuffle: bool = True, split_seed: int = None,\n                 stratify: bool = None,\n                 shot: int = None,\n                 *args, **kwargs):\n        \"\"\"\n        Initialize dataset using data from DatasetReader,\n        merges and splits fields according to the given parameters.\n        \"\"\"\n        super().__init__(data, seed=seed, shuffle=shuffle)\n\n        if fields_to_merge is not None:\n            if merged_field is not None:\n                log.info(\"Merging fields <<{}>> to new field <<{}>>\".format(fields_to_merge,\n                                                                            merged_field))\n                self._merge_data(fields_to_merge=fields_to_merge,\n                                 merged_field=merged_field)\n            else:\n                raise IOError(\"Given fields to merge BUT not given name of merged field\")\n\n        if field_to_split is not None:\n            if split_fields is not None:\n                log.info(\"Splitting field <<{}>> to new fields <<{}>>\".format(field_to_split,\n                                                                              split_fields))\n                self._split_data(field_to_split=field_to_split,\n                                 split_fields=split_fields,\n                                 split_proportions=[float(s) for s in\n            
                                        split_proportions],\n                                 split_seed=split_seed,\n                                 stratify=stratify)\n            else:\n                raise IOError(\"Given field to split BUT not given names of split fields\")\n        \n        if shot is not None:\n            train_data = self.data['train']\n            self.random.shuffle(train_data)\n            self.random.seed(seed)\n\n            data_dict = defaultdict(list)\n            for text, label in train_data:\n                if len(data_dict[label]) < shot:\n                    data_dict[label].append(text)\n            \n            if min(len(x) for x in data_dict.values()) < shot:\n                log.warning(f\"Some labels have less than {shot} examples\")\n\n            self.data['train'] = [(text, label) for label in data_dict for text in data_dict[label]]\n\n    def _split_data(self, field_to_split: str = None, split_fields: List[str] = None,\n                    split_proportions: List[float] = None, split_seed: int = None, stratify: bool = None) -> bool:\n        \"\"\"\n        Split given field of dataset to the given list of fields with corresponding proportions\n\n        Args:\n            field_to_split: field name (out of ``\"train\", \"valid\", \"test\"``) which to split\n            split_fields: list of names (out of ``\"train\", \"valid\", \"test\"``) of fields to which split\n            split_proportions: corresponding proportions\n            split_seed: random seed for splitting dataset\n            stratify: whether to use stratified split\n\n        Returns:\n            None\n        \"\"\"\n        if split_seed is None:\n            split_seed = self.random.randint(0, 10000)\n        data_to_div = self.data[field_to_split].copy()\n        data_size = len(self.data[field_to_split])\n\n        for i in range(len(split_fields) - 1):\n            if stratify:\n                stratify = [sample[1] for sample in 
data_to_div]\n            self.data[split_fields[i]], data_to_div = train_test_split(\n                data_to_div,\n                test_size=len(data_to_div) - int(data_size * split_proportions[i]),\n                random_state=split_seed,\n                stratify=stratify)\n            self.data[split_fields[-1]] = data_to_div\n        return True\n\n    def _merge_data(self, fields_to_merge: List[str] = None, merged_field: str = None) -> bool:\n        \"\"\"\n        Merge given fields of dataset\n\n        Args:\n            fields_to_merge: list of fields (out of ``\"train\", \"valid\", \"test\"``) to merge\n            merged_field: name of field (out of ``\"train\", \"valid\", \"test\"``) to which save merged fields\n\n        Returns:\n            None\n        \"\"\"\n        data = self.data.copy()\n        data[merged_field] = []\n        for name in fields_to_merge:\n            data[merged_field] += self.data[name]\n        self.data = data\n        return True\n"
  },
  {
    "path": "deeppavlov/dataset_iterators/huggingface_dataset_iterator.py",
    "content": "# Copyright 2020 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List, Tuple, Any, Union\n\nfrom datasets import Dataset\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.data_learning_iterator import DataLearningIterator\n\n\n@register('huggingface_dataset_iterator')\nclass HuggingFaceDatasetIterator(DataLearningIterator):\n    \"\"\"Dataset iterator for HuggingFace Datasets.\"\"\"\n\n    def preprocess(self,\n                   data: Dataset,\n                   features: Union[str, List[str]],\n                   label: str = 'label',\n                   use_label_name: bool = True,\n                   *args, **kwargs) -> List[Tuple[Any, Any]]:\n        \"\"\"Extracts features and labels from HuggingFace Dataset\n\n        Args:\n            data: instance of HuggingFace Dataset\n            features: Dataset fields names to be extracted as features\n            label: Dataset field name to be used as label.\n            use_label_name: Use actual label name instead of its index (0, 1, ...). 
Defaults to True.\n\n        Returns:\n            List[Tuple[Any, Any]]: list of pairs of extracted features and labels\n        \"\"\"\n\n        dataset = []\n        for i in range(len(data)):  # for example in data\n            example = data[i]\n            if isinstance(features, str):\n                feat = example[features]\n            elif isinstance(features, list):\n                try:\n                    feat = tuple(example[f] for f in features)\n                except Exception as e:\n                    raise Exception(f\"{e} for example {example} while trying to find keys {features}\")\n            else:\n                raise RuntimeError(f\"features should be str or list, but found: {features}\")\n            lb = example[label]\n            if use_label_name and lb != -1:\n                # -1 label is used if there is no label (test set)\n                lb = data.info.features[label].names[lb]\n            dataset += [(feat, lb)]\n        return dataset\n"
  },
  {
    "path": "deeppavlov/dataset_iterators/morphotagger_iterator.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Tuple, List, Dict, Any, Iterator\n\nimport numpy as np\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.data_learning_iterator import DataLearningIterator\n\n\n@register('morphotagger_dataset_iterator')\nclass MorphoTaggerDatasetIterator(DataLearningIterator):\n    \"\"\"\n    Iterates over data for Morphological Tagging.\n    A subclass of :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator`.\n\n    Args:\n        seed: random seed for data shuffling\n        shuffle: whether to shuffle data during batching\n        validation_split: the fraction of validation data\n            (is used only if there is no `valid` subset in `data`)\n    \"\"\"\n\n    def __init__(self, data: Dict[str, List[Tuple[Any, Any]]], seed: int = None,\n                 shuffle: bool = True, validation_split: float = 0.2) -> None:\n        self.validation_split = validation_split\n        super().__init__(data, seed, shuffle)\n\n    def split(self, *args, **kwargs) -> None:\n        \"\"\"\n        Splits the `train` part to `train` and `valid`, if no `valid` part is specified.\n        Moves deficient data from `valid` to `train` if both parts are given,\n        but `train` subset is too small.\n        \"\"\"\n        if len(self.valid) == 0:\n            if self.shuffle:\n            
    self.random.shuffle(self.train)\n            L = int(len(self.train) * (1.0 - self.validation_split))\n            self.train, self.valid = self.train[:L], self.train[L:]\n\n    def gen_batches(self, batch_size: int, data_type: str = 'train',\n                    shuffle: bool = None, return_indexes: bool = False) -> Iterator[tuple]:\n        \"\"\"Generate batches of inputs and expected output to train neural networks\n        Args:\n            batch_size: number of samples in batch\n            data_type: can be either 'train', 'test', or 'valid'\n            shuffle: whether to shuffle dataset before batching\n            return_indexes: whether to return indexes of batch elements in initial dataset\n        Yields:\n            a tuple of a batch of inputs and a batch of expected outputs.\n            If `return_indexes` is True, also yields indexes of batch elements.\n        \"\"\"\n        if shuffle is None:\n            shuffle = self.shuffle\n        data = self.data[data_type]\n        lengths = [len(x[0]) for x in data]\n        indexes = np.argsort(lengths)\n        L = len(data)\n        if batch_size < 0:\n            batch_size = L\n        starts = list(range(0, L, batch_size))\n        if shuffle:\n            self.random.shuffle(starts)\n        for start in starts:\n            indexes_to_yield = indexes[start:start + batch_size]\n            data_to_yield = tuple(list(x) for x in zip(*([data[i] for i in indexes_to_yield])))\n            if return_indexes:\n                yield indexes_to_yield, data_to_yield\n            else:\n                yield data_to_yield\n"
  },
  {
    "path": "deeppavlov/dataset_iterators/multitask_iterator.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport copy\nimport math\nimport random\nfrom logging import getLogger\nfrom typing import Iterator, Optional, Tuple, Union\n\nimport numpy as np\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.params import from_params\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.data_learning_iterator import DataLearningIterator\n\nlog = getLogger(__name__)\n\n\n@register('multitask_iterator')\nclass MultiTaskIterator:\n    \"\"\"\n    Class merges data from several dataset iterators. When used for batch generation batches from\n    merged dataset iterators are united into one batch. If sizes of merged datasets are different\n    smaller datasets are repeated until their size becomes equal to the largest dataset.\n\n    Args:\n        data: dictionary which keys are task names and values are dictionaries with fields\n            ``\"train\", \"valid\", \"test\"``.\n        num_train_epochs: number of training epochs\n        tasks: dictionary which keys are task names and values are init params of dataset iterators. If task has\n            key-value pair ``'use_task_defaults': False`` task_defaults for this task dataset iterator will be ignored.\n        batch_size: batch_size\n        sampling_mode: mode of sampling we use. 
It can be plain, uniform or anneal.\n        gradient_accumulation_steps: number of gradient accumulation steps. Default is 1\n        steps_per_epoch: number of steps per epoch. Necessary if gradient_accumulation_steps > 1\n        iterator_class_name: name of iterator class.\n        use_label_name, seed, features - parameters for the iterator class\n        one_element_tuples: if True, tuple of x consisting of one element is returned in this element. Default: True\n        task_defaults: default task parameters.\n        seed - random seed for sampling\n\n    Attributes:\n        data: dictionary of data with fields \"train\", \"valid\" and \"test\" (or some of them)\n    \"\"\"\n\n    def __init__(\n            self,\n            data: dict,\n            num_train_epochs: int,\n            tasks: dict,\n            batch_size: int = 8,\n            sampling_mode: str = 'plain',\n            gradient_accumulation_steps: int = 1,\n            steps_per_epoch: int = 0,\n            one_element_tuples: bool = True,\n            task_defaults: dict = None,\n            seed: int = 42,\n            **kwargs\n    ):\n        if data.keys() != tasks.keys():\n            raise ConfigError(\"Task names from dataset reader don't mach task names from dataset iterator: \"\n                              f\"{data.keys()} != {tasks.keys()}.\")\n        self.task_iterators = {}\n        if task_defaults is None:\n            task_defaults = dict()\n        for task_name, task_params in tasks.items():\n            if task_params.pop('use_task_defaults', True) is True:\n                task_config = copy.deepcopy(task_defaults)\n                task_config.update(task_params)\n            else:\n                task_config = task_params\n            try:\n                self.task_iterators[task_name] = from_params(task_config, data=data[task_name])\n            except Exception as e:\n                log.error(f'Failed to initialize dataset_iterator for \"{task_name}\" task. 
Make sure that all parameters'\n                          'from `task_defaults` and task parameters are correct.')\n                raise e\n        self.n_tasks = len(tasks.keys())\n        self.num_train_epochs = num_train_epochs\n        self.steps_per_epoch = steps_per_epoch\n        self.gradient_accumulation_steps = gradient_accumulation_steps\n        self.epochs_done = 0\n        self.steps_taken = 0\n        self.task_id = None\n        self.sampling_mode = sampling_mode\n        self.data = {\n            \"train\": self._extract_data_type(\"train\"),\n            \"valid\": self._extract_data_type(\"valid\"),\n            \"test\": self._extract_data_type(\"test\"),\n        }\n        for mode in [\"train\", \"valid\", \"test\"]:\n            log.info(f'For {mode}')\n            for task_name in self.data[mode]:\n                log.info(f'{task_name} has {len(self.data[mode][task_name])} examples')\n        self.train_sizes = self._get_data_size(\"train\")\n        if steps_per_epoch == 0:\n            self.steps_per_epoch = sum(self.train_sizes) // batch_size\n        else:\n            self.steps_per_epoch = steps_per_epoch\n\n        def is_nan(a):\n            return a != a\n\n        for mode in ['train', 'valid', 'test']:\n            for task in self.data[mode]:\n                for i in range(len(self.data[mode][task]) - 1, -1, -1):\n                    x = self.data[mode][task][i][0]\n                    y = self.data[mode][task][i][1]\n                    if is_nan(x) or any([is_nan(z) for z in x]) or is_nan(y):\n                        log.info(f'NAN detected {self.data[mode][task][i - 1:i]}')\n                        del self.data[mode][task][i]\n                        log.info(f'NAN for mode {mode} task {task} element {i} CLEARED')\n                    elif isinstance(x, tuple) and len(x) == 1 and one_element_tuples:\n                        # x is a tuple consisting of 1 element. 
return it as string\n                        self.data[mode][task][i] = (x[0], y)\n        self.max_task_data_len = dict()\n        for data_type in self.data:\n            sizes = self._get_data_size(data_type)\n            self.max_task_data_len[data_type] = max(sizes)\n        random.seed(seed)\n\n    def _get_data_size(self, data_type):\n        \"\"\"Returns list of sizes of each dataset for the given data_type: train,test or valid.\"\"\"\n        return [len(self.data[data_type][key]) for key in self.data[data_type]]\n\n    def _get_probs(self, data_type):\n        \"\"\"Returns sampling probabilities for different sampling modes - plain, uniform or anneal\"\"\"\n        if self.sampling_mode == 'uniform':\n            sizes = [1 for _ in self._get_data_size(data_type)]\n            # as we sample uniformly\n            s = sum(sizes)\n            probs = [p / s for p in sizes]\n        elif self.sampling_mode == 'plain':\n            sizes = self._get_data_size(data_type)\n            n_samples = sum(sizes)\n            probs = [p / n_samples for p in sizes]\n        elif self.sampling_mode == 'anneal':\n            alpha = 1.0 - 0.8 * (self.epochs_done / self.num_train_epochs)\n            annealed_sizes = [p ** alpha for p in self._get_data_size(data_type)]\n            n_samples = sum(annealed_sizes)\n            probs = [p / n_samples for p in annealed_sizes]\n        else:\n            raise ValueError(f'Unsupported sampling mode {self.sampling_mode}')\n        return probs\n\n    def _extract_data_type(self, data_type):\n        \"\"\"Function that merges data of the current data_type (e.g. 
train) from all task_iterators into one dict\"\"\"\n        dataset_part = {}\n        for task, iterator in self.task_iterators.items():\n            dataset_part[task] = getattr(iterator, data_type)\n        return dataset_part\n\n    def _transform_before_yielding(self, x, y, batch_size):\n        \"\"\"Function that transforms data from dataset before yielding\"\"\"\n\n        if len(x) != len(y):\n            raise Exception(f'x has len {len(x)} but y has len {len(y)}')\n        new_x, new_y = [], []\n        for i in range(batch_size):\n            x_tuple = tuple([x[t_id][i] for t_id in range(self.n_tasks)])\n            y_tuple = tuple([y[t_id][i] for t_id in range(self.n_tasks)])\n            if self.n_tasks == 1:\n                x_tuple = x_tuple[0]\n                y_tuple = y_tuple[0]\n            new_x.append(x_tuple)\n            new_y.append(y_tuple)\n        batches = (tuple(new_x), tuple(new_y))\n        return batches\n\n    def gen_batches(self, batch_size: int, data_type: str = \"train\",\n                    shuffle: bool = None) -> Iterator[Tuple[tuple, tuple]]:\n        \"\"\"\n        Generates batches and expected output to train neural networks.\n        If there are not enough samples from any task, samples are padded with None\n        Args:\n            batch_size: number of samples in batch\n            data_type: can be either 'train', 'test', or 'valid'\n            shuffle: whether to shuffle dataset before batching\n        Yields:\n            A tuple of a batch of inputs and a batch of expected outputs.\n            Inputs and outputs are tuples. 
Element of inputs or outputs is a tuple which\n            elements are x values of merged tasks in the order tasks are present in\n            `tasks` argument of `__init__` method.\n        \"\"\"\n\n        max_task_data_len = self.max_task_data_len[data_type]\n        log.info(f'Batch size {batch_size} with gradient accumulation steps {self.gradient_accumulation_steps}')\n        log.info(f'Efficient batch size {batch_size // self.gradient_accumulation_steps}')\n        batch_size = batch_size // self.gradient_accumulation_steps\n\n        if data_type == \"train\":\n            generators = [\n                SingleTaskBatchGenerator(iter_, batch_size, data_type, shuffle)\n                for iter_ in self.task_iterators.values()\n            ]\n            # probs only required while training\n            probs = self._get_probs(\"train\")\n            for step in range(self.steps_per_epoch):\n                if (self.steps_taken + 1) % self.gradient_accumulation_steps == 0 or self.task_id is None:\n                    self.task_id = np.random.choice(self.n_tasks, p=probs)\n                x = [[None for _ in range(batch_size)] for _ in range(self.n_tasks)]\n                y = [[None for _ in range(batch_size)] for _ in range(self.n_tasks)]\n                x[self.task_id], y[self.task_id] = generators[self.task_id].__next__()\n                if not all([s is None for s in x[self.task_id]]):\n                    batch_to_yield = self._transform_before_yielding(\n                        x, y, batch_size)\n                    yield batch_to_yield\n\n            self.epochs_done += 1\n            # one additional step is taken while logging training metrics\n            self.steps_taken -= 1\n        else:\n            eval_batch_size = 1\n            x = [[None for _ in range(eval_batch_size)] for _ in range(self.n_tasks)]\n            y = [[None for _ in range(eval_batch_size)] for _ in range(self.n_tasks)]\n            generators = [\n                
SingleTaskBatchGenerator(\n                    iter_, batch_size=eval_batch_size, data_type=data_type, shuffle=shuffle)\n                for iter_ in self.task_iterators.values()\n            ]\n            for step in range(max_task_data_len):\n                for task_id in range(self.n_tasks):\n                    x[task_id], y[task_id] = generators[task_id].__next__()\n\n                batches = self._transform_before_yielding(x, y, eval_batch_size)\n                yield batches\n\n    def get_instances(self, data_type: str = \"train\"):\n        \"\"\"\n        Returns a tuple of inputs and outputs from all datasets. Lengths of inputs\n        and outputs are equal to the size of the largest dataset. Smaller\n        datasets are repeated until their sizes are equal to the size of the\n        largest dataset.\n        Args:\n            data_type: can be either 'train', 'test', or 'valid'\n        Returns:\n            A tuple of all inputs for a data type and all expected outputs\n            for a data type.\n        \"\"\"\n\n        max_task_data_len = max(\n            [\n                len(iter_.get_instances(data_type)[0])\n                for iter_ in self.task_iterators.values()\n            ]\n        )\n        x_instances = []\n        y_instances = []\n        for task_name, iter_ in self.task_iterators.items():\n            x, y = iter_.get_instances(data_type)\n            n_repeats = math.ceil(max_task_data_len / len(x))\n            x *= n_repeats\n            y *= n_repeats\n            x_instances.append(x[:max_task_data_len])\n            y_instances.append(y[:max_task_data_len])\n        error_msg = f'Len of x_instances {len(x_instances)} and y_instances {len(y_instances)} dont match'\n        if len(x_instances) != len(y_instances):\n            raise Exception(error_msg)\n        instances = (tuple(zip(*x_instances)), tuple(zip(*y_instances)))\n        return instances\n\n\nclass SingleTaskBatchGenerator:\n    \"\"\"\n    Batch 
generator for a single task.\n    If there are no elements in the dataset to form another batch, Nones are returned.\n    Args:\n        dataset_iterator: dataset iterator from which batches are drawn.\n        batch_size: size of the batch.\n        data_type: \"train\", \"valid\", or \"test\"\n        shuffle: whether dataset will be shuffled.\n        n_batches: the number of batches that will be generated.\n    \"\"\"\n\n    def __init__(\n            self,\n            dataset_iterator: Union[DataLearningIterator],\n            batch_size: int,\n            data_type: str,\n            shuffle: bool,\n            n_batches: Optional[int] = None,\n            size_of_last_batch: Optional[int] = None,\n    ):\n        self.dataset_iterator = dataset_iterator\n        self.batch_size = batch_size\n        self.data_type = data_type\n        self.shuffle = shuffle\n        self.n_batches = n_batches\n        self.size_of_last_batch = (\n            self.batch_size if size_of_last_batch is None else size_of_last_batch)\n\n        self.inner_batch_size = math.gcd(\n            len(self.dataset_iterator.data[data_type]), batch_size\n        )\n        self.gen = self.dataset_iterator.gen_batches(\n            self.inner_batch_size, self.data_type, self.shuffle\n        )\n        self.batch_count = 0\n\n    def __iter__(self):\n        return self\n\n    def __next__(self):\n        if self.n_batches is not None and self.batch_count > self.n_batches:\n            raise StopIteration\n        x, y = (), ()\n        while len(x) < self.batch_size or len(y) < self.batch_size:\n            try:\n                xx, yy = next(self.gen)\n                x += xx\n                y += yy\n            except StopIteration:\n                x_nones = tuple([None for _ in range(self.batch_size)])\n                y_nones = x_nones\n                return x_nones, y_nones\n\n        self.batch_count += 1\n        if self.batch_count == self.n_batches:\n            x = 
x[:self.size_of_last_batch]\n            y = y[:self.size_of_last_batch]\n        return x, y\n"
  },
  {
    "path": "deeppavlov/dataset_iterators/siamese_iterator.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom typing import Dict, List, Tuple\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.data_learning_iterator import DataLearningIterator\n\nlog = getLogger(__name__)\n\n\n@register('siamese_iterator')\nclass SiameseIterator(DataLearningIterator):\n    \"\"\"The class contains methods for iterating over a dataset for ranking in training, validation and test mode.\"\"\"\n\n    def split(self, *args, len_valid=1000, len_test=1000, **kwargs) -> None:\n        if len(self.valid) == 0 and len_valid != 0:\n            self.random.shuffle(self.train)\n            self.valid = self.train[-len_valid:]\n            self.train = self.train[:-len_valid]\n        if len(self.test) == 0 and len_test != 0:\n            self.random.shuffle(self.train)\n            self.test = self.train[-len_test:]\n            self.train = self.train[:-len_test]\n"
  },
  {
    "path": "deeppavlov/dataset_iterators/sqlite_iterator.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sqlite3\nfrom logging import getLogger\nfrom pathlib import Path\nfrom random import Random\nfrom typing import List, Any, Dict, Optional, Union, Generator, Tuple\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.data_fitting_iterator import DataFittingIterator\n\nlogger = getLogger(__name__)\n\n\n@register('sqlite_iterator')\nclass SQLiteDataIterator(DataFittingIterator):\n    \"\"\"Iterate over SQLite database.\n    Gen batches from SQLite data.\n    Get document ids and document.\n\n    Args:\n        load_path: a path to local DB file\n        batch_size: a number of samples in a single batch\n        shuffle: whether to shuffle data during batching\n        seed: random seed for data shuffling\n\n    Attributes:\n        connect: a DB connection\n        db_name: a DB name\n        doc_ids: DB document ids\n        doc2index: a dictionary of document indices and their titles\n        batch_size: a number of samples in a single batch\n        shuffle: whether to shuffle data during batching\n        random: an instance of :class:`Random` class.\n\n    \"\"\"\n\n    def __init__(self, load_path: Union[str, Path], batch_size: Optional[int] = None,\n                 shuffle: Optional[bool] = None, seed: Optional[int] = None, **kwargs) -> 
None:\n\n        load_path = str(expand_path(load_path))\n        logger.info(\"Connecting to database, path: {}\".format(load_path))\n        try:\n            self.connect = sqlite3.connect(load_path, check_same_thread=False)\n        except sqlite3.OperationalError as e:\n            e.args = e.args + (\"Check that DB path exists and is a valid DB file\",)\n            raise e\n        try:\n            self.db_name = self.get_db_name()\n        except TypeError as e:\n            e.args = e.args + (\n                'Check that DB path was created correctly and is not empty. '\n                'Check that a correct dataset_format is passed to the ODQAReader config',)\n            raise e\n        self.doc_ids = self.get_doc_ids()\n        self.doc2index = self.map_doc2idx()\n        self.batch_size = batch_size\n        self.shuffle = shuffle\n        self.random = Random(seed)\n\n    def get_doc_ids(self) -> List[Any]:\n        \"\"\"Get document ids.\n\n        Returns:\n            document ids\n        \"\"\"\n        cursor = self.connect.cursor()\n        cursor.execute('SELECT id FROM {}'.format(self.db_name))\n        ids = [ids[0] for ids in cursor.fetchall()]\n        cursor.close()\n        return ids\n\n    def get_db_name(self) -> str:\n        \"\"\"Get DB name.\n\n        Returns:\n            DB name\n\n        \"\"\"\n        cursor = self.connect.cursor()\n        cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n        assert cursor.arraysize == 1\n        name = cursor.fetchone()[0]\n        cursor.close()\n        return name\n\n    def map_doc2idx(self) -> Dict[int, Any]:\n        \"\"\"Map DB ids to integer ids.\n\n        Returns:\n            a dictionary of document titles and correspondent integer indices\n\n        \"\"\"\n        doc2idx = {doc_id: i for i, doc_id in enumerate(self.doc_ids)}\n        logger.info(\n            \"SQLite iterator: The size of the database is {} documents\".format(len(doc2idx)))\n 
       return doc2idx\n\n    def get_doc_content(self, doc_id: Any) -> Optional[str]:\n        \"\"\"Get document content by id.\n\n        Args:\n            doc_id: a document id\n\n        Returns:\n            document content if a row with this id is found, else None\n\n        \"\"\"\n        cursor = self.connect.cursor()\n        cursor.execute(\n            \"SELECT text FROM {} WHERE id = ?\".format(self.db_name),\n            (doc_id,)\n        )\n        result = cursor.fetchone()\n        cursor.close()\n        return result if result is None else result[0]\n\n    def gen_batches(self, batch_size: int, shuffle: bool = None) \\\n            -> Generator[Tuple[List[str], List[int]], Any, None]:\n        \"\"\"Gen batches of documents.\n\n        Args:\n            batch_size: a number of samples in a single batch\n            shuffle: whether to shuffle data during batching\n\n        Yields:\n            generated tuple of documents and their ids\n\n        \"\"\"\n        if shuffle is None:\n            shuffle = self.shuffle\n\n        if shuffle:\n            _doc_ids = self.random.sample(self.doc_ids, len(self.doc_ids))\n        else:\n            _doc_ids = self.doc_ids\n\n        if batch_size > 0:\n            batches = [_doc_ids[i:i + batch_size] for i in\n                       range(0, len(_doc_ids), batch_size)]\n        else:\n            batches = [_doc_ids]\n\n        for i, doc_ids in enumerate(batches):\n            docs = [self.get_doc_content(doc_id) for doc_id in doc_ids]\n            doc_nums = [self.doc2index[doc_id] for doc_id in doc_ids]\n            yield docs, zip(doc_ids, doc_nums)\n\n    def get_instances(self):\n        \"\"\"Get all data\"\"\"\n        doc_ids = list(self.doc_ids)\n        docs = [self.get_doc_content(doc_id) for doc_id in doc_ids]\n        doc_nums = [self.doc2index[doc_id] for doc_id in doc_ids]\n        return docs, zip(doc_ids, doc_nums)\n"
  },
  {
    "path": "deeppavlov/dataset_iterators/squad_iterator.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport json\nfrom typing import Dict, Any, List, Tuple, Generator, Optional\n\nimport numpy as np\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.data_learning_iterator import DataLearningIterator\n\n\n@register('squad_iterator')\nclass SquadIterator(DataLearningIterator):\n    \"\"\"SquadIterator allows to iterate over examples in SQuAD-like datasets.\n    SquadIterator is used to train \n    :class:`~deeppavlov.models.torch_bert.torch_transformers_squad:TorchTransformersSquad`.\n\n    It extracts ``context``, ``question``, ``answer_text`` and ``answer_start`` position from dataset.\n    Example from a dataset is a tuple of ``(context, question)`` and ``(answer_text, answer_start)``\n\n    Attributes:\n        train: train examples\n        valid: validation examples\n        test: test examples\n\n    \"\"\"\n\n    def preprocess(self, data: Dict[str, Any], *args, **kwargs) -> \\\n            List[Tuple[Tuple[str, str], Tuple[List[str], List[int]]]]:\n        \"\"\"Extracts context, question, answer, answer_start from SQuAD data\n\n        Args:\n            data: data in squad format\n\n        Returns:\n            list of (context, question), (answer_text, answer_start)\n            answer text and answer_start are lists\n\n        \"\"\"\n        cqas = []\n        if data:\n            for 
article in data['data']:\n                for par in article['paragraphs']:\n                    context = par['context']\n                    for qa in par['qas']:\n                        q = qa['question']\n                        ans_text = []\n                        ans_start = []\n                        if qa['answers']:\n                            for answer in qa['answers']:\n                                ans_text.append(answer['text'])\n                                ans_start.append(answer['answer_start'])\n                        else:\n                            ans_text = ['']\n                            ans_start = [-1]\n                        cqas.append(((context, q), (ans_text, ans_start)))\n        return cqas\n\n\n@register('multi_squad_iterator')\nclass MultiSquadIterator(DataLearningIterator):\n    \"\"\"Dataset iterator for multiparagraph-SQuAD dataset.\n\n    With ``with_answer_rate`` rate samples context with answer and with ``1 - with_answer_rate`` samples context\n    from the same article, but without an answer. Contexts without an answer are sampled according to\n    their tfidf scores (tfidf score between question and context).\n\n    It extracts ``context``, ``question``, ``answer_text`` and ``answer_start`` position from dataset.\n    Example from a dataset is a tuple of ``(context, question)`` and ``(answer_text, answer_start)``. 
If there is\n    no answer in context, then ``answer_text`` is empty string and `answer_start` is equal to -1.\n\n    Args:\n        data: dict with keys ``'train'``, ``'valid'`` and ``'test'`` and values\n        seed: random seed for data shuffling\n        shuffle: whether to shuffle data during batching\n        with_answer_rate: sampling rate of contexts with answer\n\n    Attributes:\n        shuffle: whether to shuffle data during batching\n        random: instance of ``Random`` initialized with a seed\n    \"\"\"\n\n    def __init__(self, data, seed: Optional[int] = None, shuffle: bool = True, with_answer_rate: float = 0.666,\n                 *args, **kwargs) -> None:\n        self.with_answer_rate = with_answer_rate\n        self.seed = seed\n        self.np_random = np.random.RandomState(seed)\n        super().__init__(data, seed, shuffle, *args, **kwargs)\n\n    def gen_batches(self, batch_size: int, data_type: str = 'train', shuffle: bool = None) \\\n            -> Generator[Tuple[Tuple[Tuple[str, str]], Tuple[List[str], List[int]]], None, None]:\n\n        if shuffle is None:\n            shuffle = self.shuffle\n\n        if data_type == 'train':\n            random = self.np_random\n        else:\n            random = np.random.RandomState(self.seed)\n\n        if shuffle:\n            random.shuffle(self.data[data_type])\n\n        data = self.data[data_type]\n        data_len = len(data)\n\n        for i in range((data_len - 1) // batch_size + 1):\n            batch = []\n            for j in range(i * batch_size, min((i + 1) * batch_size, data_len)):\n                q = data[j]['question']\n                contexts = data[j]['contexts']\n                ans_contexts = [c for c in contexts if len(c['answer']) > 0]\n                noans_contexts = [c for c in contexts if len(c['answer']) == 0]\n                # sample context with answer or without answer\n                if random.rand() < self.with_answer_rate or len(noans_contexts) == 0:\n     
               # select random context with answer\n                    context = random.choice(ans_contexts)\n                else:\n                    # select random context without answer\n                    # prob ~ context tfidf score\n                    noans_scores = np.array([x['score'] for x in noans_contexts])\n                    noans_scores = noans_scores / np.sum(noans_scores)\n                    context = noans_contexts[np.argmax(random.multinomial(1, noans_scores))]\n\n                answer_text = [ans['text'] for ans in context['answer']] if len(context['answer']) > 0 else ['']\n                answer_start = [ans['answer_start']\n                                for ans in context['answer']] if len(context['answer']) > 0 else [-1]\n                batch.append(((context['context'], q), (answer_text, answer_start)))\n            yield tuple(zip(*batch))\n\n    def get_instances(self, data_type: str = 'train') -> Tuple[Tuple[Tuple[str, str]], Tuple[List[str], List[int]]]:\n        data_examples = []\n        for qcas in self.data[data_type]:  # question, contexts, answers\n            question = qcas['question']\n            for context in qcas['contexts']:\n                answer_text = [x['text'] for x in context['answer']]\n                answer_start = [x['answer_start'] for x in context['answer']]\n                data_examples.append(((context['context'], question), (answer_text, answer_start)))\n        return tuple(zip(*data_examples))\n\n\n@register('multi_squad_retr_iterator')\nclass MultiSquadRetrIterator(DataLearningIterator):\n    \"\"\"Dataset iterator for multiparagraph-SQuAD dataset.\n\n    reads data from jsonl files\n\n    With ``with_answer_rate`` rate samples context with answer and with ``1 - with_answer_rate`` samples context\n    from the same article, but without an answer. 
Contexts without an answer are sampled from uniform distribution.\n    If ``with_answer_rate`` is None then we compute actual ratio for each data example.\n\n    It extracts ``context``, ``question``, ``answer_text`` and ``answer_start`` position from dataset.\n    Example from a dataset is a tuple of ``(context, question)`` and ``(answer_text, answer_start)``. If there is\n    no answer in context, then ``answer_text`` is empty string and `answer_start` is equal to -1.\n\n    Args:\n        data: dict with keys ``'train'``, ``'valid'`` and ``'test'`` and values\n        seed: random seed for data shuffling\n        shuffle: whether to shuffle data during batching\n        with_answer_rate: sampling rate of contexts with answer\n        squad_rate: sampling rate of context from squad dataset (actual rate would be with_answer_rate * squad_rate)\n\n    Attributes:\n        shuffle: whether to shuffle data during batching\n        random: instance of ``Random`` initialized with a seed\n    \"\"\"\n\n    def __init__(self, data, seed: Optional[int] = None, shuffle: bool = False,\n                 with_answer_rate: Optional[float] = None,\n                 squad_rate: Optional[float] = None, *args, **kwargs) -> None:\n        self.with_answer_rate = with_answer_rate\n        self.squad_rate = squad_rate\n        self.seed = seed\n        self.np_random = np.random.RandomState(seed)\n        self.shuffle = shuffle\n\n        self.train = data.get('train', [])\n        self.valid = data.get('valid', [])\n        self.test = data.get('test', [])\n\n        self.data = {\n            'train': self.train,\n            'valid': self.valid,\n            'test': self.test,\n        }\n\n        if self.shuffle:\n            raise RuntimeError('MultiSquadIterator doesn\\'t support shuffling.')\n\n    def gen_batches(self, batch_size: int, data_type: str = 'train', shuffle: bool = None) \\\n            -> Generator[Tuple[Tuple[Tuple[str, str]], Tuple[List[str], List[int]]], None, 
None]:\n\n        if shuffle is None:\n            shuffle = self.shuffle\n\n        if data_type == 'train':\n            random = self.np_random\n        else:\n            random = np.random.RandomState(self.seed)\n\n        if shuffle:\n            raise RuntimeError('MultiSquadIterator doesn\\'t support shuffling.')\n\n        datafile = self.data[data_type]\n        with datafile.open('r', encoding='utf8') as fin:\n            end_of_file = False\n            while not end_of_file:\n                batch = []\n                for i in range(batch_size):\n                    line = fin.readline()\n                    if len(line) == 0:\n                        end_of_file = True\n                        break\n\n                    qcas = json.loads(line)\n                    q = qcas['question']\n                    contexts = qcas['contexts']\n                    ans_contexts = [c for c in contexts if len(c['answer']) > 0]\n                    noans_contexts = [c for c in contexts if len(c['answer']) == 0]\n                    ans_clen = len(ans_contexts)\n                    noans_clen = len(noans_contexts)\n                    # sample context with answer or without answer\n                    with_answer_rate = self.with_answer_rate\n                    if with_answer_rate is None:\n                        with_answer_rate = 1.0 if noans_clen == 0 else ans_clen / (ans_clen + noans_clen)\n\n                    if random.rand() < with_answer_rate or noans_clen == 0:\n                        # select random context with answer\n                        if self.squad_rate is not None:\n                            if random.rand() < self.squad_rate or len(ans_contexts) == 1:\n                                # first context is always from squad dataset\n                                context = ans_contexts[0]\n                            else:\n                                context = random.choice(ans_contexts[1:])\n                        else:\n             
               context = random.choice(ans_contexts)\n                    else:\n                        # select random context without answer\n                        # prob ~ context tfidf score\n                        # noans_scores = np.array([x['score'] for x in noans_contexts])\n                        # noans_scores = noans_scores / np.sum(noans_scores)\n                        # context = noans_contexts[np.argmax(random.multinomial(1, noans_scores))]\n                        context = random.choice(noans_contexts)\n\n                    answer_text = [ans['text'] for ans in context['answer']] if len(context['answer']) > 0 else ['']\n                    answer_start = [ans['answer_start']\n                                    for ans in context['answer']] if len(context['answer']) > 0 else [-1]\n                    batch.append(((context['context'], q), (answer_text, answer_start)))\n                if batch:\n                    yield tuple(zip(*batch))\n\n    def get_instances(self, data_type: str = 'train') -> Tuple[Tuple[Tuple[str, str]], Tuple[List[str], List[int]]]:\n        data_examples = []\n        for f in self.data[data_type]:  # question, contexts, answers\n            for line in f.open('r', encoding='utf8'):\n                qcas = json.loads(line)\n                question = qcas['question']\n                for context in qcas['contexts']:\n                    answer_text = [x['text'] for x in context['answer']]\n                    answer_start = [x['answer_start'] for x in context['answer']]\n                    data_examples.append(((context['context'], question), (answer_text, answer_start)))\n        return tuple(zip(*data_examples))\n"
  },
  {
    "path": "deeppavlov/dataset_iterators/typos_iterator.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.data_learning_iterator import DataLearningIterator\n\n\n@register('typos_iterator')\nclass TyposDatasetIterator(DataLearningIterator):\n    \"\"\"Implementation of :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for training\n     :class:`~deeppavlov.models.spelling_correction.brillmoore.ErrorModel`\n\n    \"\"\"\n\n    def split(self, test_ratio: float = 0., *args, **kwargs):\n        \"\"\"Split all data into train and test\n\n        Args:\n            test_ratio: ratio of test data to train, from 0. to 1.\n        \"\"\"\n        self.train += self.valid + self.test\n\n        split = int(len(self.train) * test_ratio)\n\n        self.random.shuffle(self.train)\n\n        self.test = self.train[:split]\n        self.train = self.train[split:]\n        self.valid = []\n"
  },
  {
    "path": "deeppavlov/dataset_readers/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/dataset_readers/basic_classification_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom logging import getLogger\nfrom pathlib import Path\n\nimport pandas as pd\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\nfrom deeppavlov.core.data.utils import download\n\nlog = getLogger(__name__)\n\n\n@register('basic_classification_reader')\nclass BasicClassificationDatasetReader(DatasetReader):\n    \"\"\"\n    Class provides reading dataset in .csv format\n    \"\"\"\n\n    def read(self, data_path: str, url: str = None,\n             format: str = \"csv\", class_sep: str = None,\n             *args, **kwargs) -> dict:\n        \"\"\"\n        Read dataset from data_path directory.\n        Reading files are all data_types + extension\n        (i.e for data_types=[\"train\", \"valid\"] files \"train.csv\" and \"valid.csv\" form\n        data_path will be read)\n\n        Args:\n            data_path: directory with files\n            url: download data files if data_path not exists or empty\n            format: extension of files. Set of Values: ``\"csv\", \"json\"``\n            class_sep: string separator of labels in column with labels\n            sep (str): delimeter for ``\"csv\"`` files. 
Default: None -> only one class per sample\n            header (int): row number to use as the column names\n            names (array): list of column names to use\n            orient (str): indication of expected JSON string format\n            lines (boolean): read the file as a json object per line. Default: ``False``\n\n        Returns:\n            dictionary with types from data_types.\n            Each field of dictionary is a list of tuples (x_i, y_i)\n        \"\"\"\n        data_types = [\"train\", \"valid\", \"test\"]\n\n        train_file = kwargs.get('train', 'train.csv')\n\n        if not Path(data_path, train_file).exists():\n            if url is None:\n                raise Exception(\n                    \"data path {} does not exist or is empty, and download url parameter not specified!\".format(\n                        data_path))\n            log.info(\"Loading train data from {} to {}\".format(url, data_path))\n            download(source_url=url, dest_file_path=Path(data_path, train_file))\n\n        data = {\"train\": [],\n                \"valid\": [],\n                \"test\": []}\n        for data_type in data_types:\n            file_name = kwargs.get(data_type, '{}.{}'.format(data_type, format))\n            if file_name is None:\n                continue\n\n            file = Path(data_path).joinpath(file_name)\n            if file.exists():\n                if format == 'csv':\n                    keys = ('sep', 'header', 'names')\n                    options = {k: kwargs[k] for k in keys if k in kwargs}\n                    df = pd.read_csv(file, **options)\n                elif format == 'json':\n                    keys = ('orient', 'lines')\n                    options = {k: kwargs[k] for k in keys if k in kwargs}\n                    df = pd.read_json(file, **options)\n                else:\n                    raise Exception('Unsupported file format: {}'.format(format))\n\n                x = kwargs.get(\"x\", \"text\")\n     
           y = kwargs.get('y', 'labels')\n                if isinstance(x, list):\n                    if class_sep is None:\n                        # each sample is a tuple (\"text\", \"label\")\n                        data[data_type] = [([row[x_] for x_ in x], str(row[y]))\n                                           for _, row in df.iterrows()]\n                    else:\n                        # each sample is a tuple (\"text\", [\"label\", \"label\", ...])\n                        data[data_type] = [([row[x_] for x_ in x], str(row[y]).split(class_sep))\n                                           for _, row in df.iterrows()]\n                else:\n                    if class_sep is None:\n                        # each sample is a tuple (\"text\", \"label\")\n                        data[data_type] = [(row[x], str(row[y])) for _, row in df.iterrows()]\n                    else:\n                        # each sample is a tuple (\"text\", [\"label\", \"label\", ...])\n                        data[data_type] = [(row[x], str(row[y]).split(class_sep)) for _, row in df.iterrows()]\n            else:\n                log.warning(\"Cannot find {} file\".format(file))\n\n        return data\n"
  },
  {
    "path": "deeppavlov/dataset_readers/boolqa_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport json\nfrom pathlib import Path\nfrom typing import Dict, List, Tuple\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\nfrom deeppavlov.core.data.utils import download_decompress\n\n\n@register('boolqa_reader')\nclass BoolqaReader(DatasetReader):\n    \"\"\"\n    The class to read the BoolQ dataset from files. \n    BoolQ is a question answering dataset for yes/no questions containing 15942 examples. 
\n    Each example is a triplet of (question, passage, answer).\n\n    More details about the English BoolQ are available in https://arxiv.org/abs/1905.10044\n    https://github.com/google-research-datasets/boolean-questions\n\n    The details about the Russian DaNetQA are available in \n    https://russiansuperglue.com/ru/tasks/task_info/DaNetQA\n\n    The reader supports English and Russian variants of the dataset.\n    The config example is boolqa_rubert.json.\n    \"\"\"\n\n    urls = { \n            'en': 'http://files.deeppavlov.ai/datasets/BoolQ.tar.gz',\n            'ru': 'http://files.deeppavlov.ai/datasets/DaNetQA.tar.gz'\n           }\n\n    def read(self,\n             data_path: str,\n             language: str = 'en',\n             *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:\n\n        \"\"\"\n        Reads BoolQ dataset from files.\n\n        Args:\n            data_path: A path to a folder with dataset files.\n            language: The dataset language ('ru', 'en' are available)\n\n        Returns:\n            dataset: items of the dataset [(question, passage), label]\n        \"\"\"\n\n        if language in self.urls:\n            self.url = self.urls[language]\n        else:\n            raise RuntimeError(f'The dataset for {language} is unavailable')\n\n        data_path = expand_path(data_path)\n        if not data_path.exists():\n            data_path.mkdir(parents=True)\n\n        download_decompress(self.url, data_path)\n        dataset = {}\n\n        for filename in ['train.jsonl', 'valid.jsonl']:\n            dataset[filename.split('.')[0]] = self._build_data(language, data_path / filename)\n\n        return dataset\n\n    @staticmethod\n    def _build_data(ln: str, data_path: Path) -> List[Tuple[Tuple[str, str], int]]:\n\n        data = {}\n        with open(data_path, 'r') as f:\n            for line in f:\n                jline = json.loads(line)\n                if ln == 'ru':\n                    if 'label' in 
jline:\n                        data[jline['question'], jline['passage']] = int(jline['label'])\n                if ln == 'en':\n                    if 'answer' in jline:\n                        data[jline['question'], jline['passage']] = int(jline['answer'])\n\n        return list(data.items())\n"
  },
  {
    "path": "deeppavlov/dataset_readers/conll2003_reader.py",
    "content": "from logging import getLogger\nfrom pathlib import Path\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\nfrom deeppavlov.core.data.utils import download_decompress\n\nlog = getLogger(__name__)\n\n\n@register('conll2003_reader')\nclass Conll2003DatasetReader(DatasetReader):\n    \"\"\"Class to read training datasets in CoNLL-2003 format\"\"\"\n\n    def read(self,\n             data_path: str,\n             dataset_name: str = None,\n             provide_pos: bool = False,\n             provide_chunk: bool = False,\n             provide_doc_ids: bool = False,\n             iob: bool = False,\n             iobes: bool = False,\n             docstart_token: str = None,\n            *args, **kwargs):\n        self.provide_pos = provide_pos\n        self.provide_chunk = provide_chunk\n        self.provide_doc_ids = provide_doc_ids\n        self.iob = iob\n        self.iobes = iobes\n        self.docstart_token = docstart_token\n        self.num_docs = 0\n        self.x_is_tuple = self.provide_pos or self.provide_doc_ids\n        data_path = Path(data_path)\n        files = list(data_path.glob('*.txt'))\n        if 'train.txt' not in {file_path.name for file_path in files}:\n            if dataset_name == 'conll2003':\n                url = 'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz'\n            elif dataset_name == 'collection_rus':\n                url = 'http://files.deeppavlov.ai/deeppavlov_data/collection3_v2.tar.gz'\n            elif dataset_name == 'ontonotes':\n                url = 'http://files.deeppavlov.ai/deeppavlov_data/ontonotes_ner.tar.gz'\n            elif dataset_name == 'vlsp2016':\n                url = 'http://files.deeppavlov.ai/deeppavlov_data/vlsp2016.tar.gz'\n            elif dataset_name == 'dailydialog':\n                url = 'http://files.deeppavlov.ai/deeppavlov_data/dailydialog.tar.gz'\n            elif dataset_name == 
'collection3':\n                url = 'http://files.deeppavlov.ai/deeppavlov_data/collection3_anh.tar.gz'\n            else:\n                raise RuntimeError('train.txt not found in \"{}\"'.format(data_path))\n            data_path.mkdir(exist_ok=True, parents=True)\n            download_decompress(url, data_path)\n            files = list(data_path.glob('*.txt'))\n        dataset = {}\n\n        for file_name in files:\n            name = file_name.with_suffix('').name\n            dataset[name] = self.parse_ner_file(file_name)\n        return dataset\n\n    def parse_ner_file(self, file_name: Path):\n        samples = []\n        with file_name.open(encoding='utf8') as f:\n            tokens = []\n            pos_tags = []\n            chunk_tags = []\n            tags = []\n            expected_items = 2 + int(self.provide_pos) + int(self.provide_chunk)\n            for line in f:\n                # Check end of the document\n                if 'DOCSTART' in line:\n                    if len(tokens) > 1:\n                        x = tokens if not self.x_is_tuple else (tokens,)\n                        if self.provide_pos:\n                            x = x + (pos_tags,)\n                        if self.provide_chunk:\n                            x = x + (chunk_tags,)\n                        if self.provide_doc_ids:\n                            x = x + (self.num_docs,)\n                        samples.append((x, tags))\n                        tokens = []\n                        pos_tags = []\n                        chunk_tags = []\n                        tags = []\n                    self.num_docs += 1\n                    if self.docstart_token is not None:\n                        tokens = [self.docstart_token]\n                        pos_tags = ['O']\n                        chunk_tags = ['O']\n                        tags = ['O']\n                elif len(line) < 2:\n                    if (len(tokens) > 0) and (tokens != [self.docstart_token]):\n   
                     x = tokens if not self.x_is_tuple else (tokens,)\n                        if self.provide_pos:\n                            x = x + (pos_tags,)\n                        if self.provide_chunk:\n                            x = x + (chunk_tags,)\n                        if self.provide_doc_ids:\n                            x = x + (self.num_docs,)\n                        samples.append((x, tags))\n                        tokens = []\n                        pos_tags = []\n                        chunk_tags = []\n                        tags = []\n                else:\n                    items = line.split()\n                    if len(items) < expected_items:\n                        raise Exception(f\"Input is not valid {line}\")\n                    tokens.append(items[0])\n                    tags.append(items[-1])\n                    if self.provide_pos:\n                        pos_tags.append(items[1])\n                    if self.provide_chunk:\n                        chunk_tags.append(items[2])\n            if tokens:\n                x = tokens if not self.x_is_tuple else (tokens,)\n                if self.provide_pos:\n                    x = x + (pos_tags,)\n                if self.provide_chunk:\n                    x = x + (chunk_tags,)\n                if self.provide_doc_ids:\n                    x = x + (self.num_docs,)\n                samples.append((x, tags))\n                self.num_docs += 1\n\n            if self.iob:\n                return [(x, self._iob2_to_iob(tags)) for x, tags in samples]\n            if self.iobes:\n                return [(x, self._iob2_to_iobes(tags)) for x, tags in samples]\n\n        return samples\n\n    @staticmethod\n    def _iob2_to_iob(tags):\n        iob_tags = []\n\n        for n, tag in enumerate(tags):\n            if tag.startswith('B-') and (not n or (tags[n - 1][2:] != tag[2:])):\n                tag = tag.replace(\"B-\", \"I-\")\n            iob_tags.append(tag)\n\n        return 
iob_tags\n\n    @staticmethod\n    def _iob2_to_iobes(tags):\n        tag_map = {\"BB\": \"S\", \"BO\": \"S\", \"IB\": \"E\", \"IO\": \"E\"}\n        tags = tags + [\"O\"]\n        iobes_tags = []\n        for i in range(len(tags) - 1):\n            tagtag = tags[i][0] + tags[i + 1][0]\n            if tagtag in tag_map:\n                iobes_tags.append(tag_map[tagtag] + tags[i][1:])\n            else:\n                iobes_tags.append(tags[i])\n        return iobes_tags\n"
  },
  {
    "path": "deeppavlov/dataset_readers/docred_reader.py",
    "content": "# Copyright 2021 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport itertools\nimport json\nimport os\nimport random\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Dict, List, Tuple, Union\n\nimport numpy as np\nimport pandas as pd\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\n\nlogger = getLogger(__name__)\n\n\n@register('docred_reader')\nclass DocREDDatasetReader(DatasetReader):\n    \"\"\" Class to read the datasets in DocRED format\"\"\"\n\n    def read(\n            self,\n            data_path: str,\n            rel2id_path: str,\n            rel_info_path: str,\n            negative_label: str = \"Na\",\n            train_valid_test_proportion: int = None,\n            valid_test_data_size: int = None,\n            generate_additional_neg_samples: bool = False,\n            num_neg_samples: int = None\n    ) -> Dict[str, List[Tuple]]:\n        \"\"\"\n        This class processes the DocRED relation extraction dataset (https://arxiv.org/abs/1906.06127v3).\n        Args:\n            data_path: a path to a folder with dataset files.\n            rel2id_path: a path to a file where information about relation to relation id corresponding is stored.\n            rel_info_path: a path to a file where information about relations and 
their real names is stored\n            negative_label: a label which will be used as a negative one (by default in DocRED: \"Na\")\n            train_valid_test_proportion: a proportion in which the data will be splitted into train, valid and test sets\n            valid_test_data_size: absolute amount of dev & test sets\n            generate_additional_neg_samples: boolean; whether to generate additional negative samples or not.\n            num_neg_samples: a number of additional negative samples that will be generated for each positive sample.\n        Returns:\n            DocRED output dictionary in the following format:\n            {\"data_type\":\n                List[\n                    Tuple(\n                        List[\n                            List[all tokens of the document],\n                            List[\n                                List[Tuple(start pos of mention 1 of ent 1, end pos of mention 1 of ent 1), ...],\n                                List[Tuple(start position of entity 2, end position of entity 2), ...],\n                                List[str(NER tag of entity 1), str(NER tag of entity 2)]\n                            ],\n                        List(int(one-hot encoded relation label))\n                    )\n                ]\n            }\n        \"\"\"\n\n        with open(str(expand_path(rel2id_path))) as file:\n            self.rel2id = json.load(file)\n        self.id2rel = {value: key for key, value in self.rel2id.items()}\n\n        with open(str(expand_path(rel_info_path))) as file:\n            self.relid2rel = json.load(file)\n        self.rel2relid = {value: key for key, value in self.relid2rel.items()}\n\n        self.negative_label = negative_label\n        self.if_add_neg_samples = generate_additional_neg_samples\n        self.num_neg_samples = num_neg_samples\n\n        if self.if_add_neg_samples and not self.num_neg_samples:\n            raise ValueError(\"Please provide a number of negative samples 
to be generated!\")\n\n        if train_valid_test_proportion and valid_test_data_size:\n            raise ValueError(\n                f\"The train, valid and test splitting should be done either basing on their proportional values to each\"\n                f\"other (train_valid_test_proportion parameter), or on the absolute size of valid and test data \"\n                f\"(valid_test_data_size parameter). They can't be used simultaneously.\"\n            )\n\n        self.train_valid_test_proportion = train_valid_test_proportion\n        self.valid_test_data_size = valid_test_data_size\n\n        data_path = Path(data_path).resolve()\n\n        with open(os.path.join(data_path, \"train_annotated.json\")) as file_ann:\n            train_data = json.load(file_ann)\n\n        with open(os.path.join(data_path, \"dev.json\")) as file:\n            valid_data = json.load(file)\n\n        # if you want to use test data from the original docred without labels (e.g. as negatives...),\n        # uncomment lines below\n        # with open(os.path.join(data_path, \"test.json\")) as file:\n        #     test_data = json.load(file)\n        #     test_processed = self.process_docred_file(test_data, neg_samples=None)\n\n        # merge valid and train data and split them again into train, valid & test\n        if self.train_valid_test_proportion:\n            train_data, test_data, valid_data = self.split_by_relative(list(train_data + valid_data))\n        elif self.valid_test_data_size:\n            train_data, test_data, valid_data = self.split_by_absolute(list(train_data + valid_data))\n\n        else:\n            raise ValueError(\n                f\"The train, valid and test splitting should be done either basing on their proportional values to each\"\n                f\"other (train_valid_test_proportion parameter), or on the absolute size of valid and test data \"\n                f\"(valid_test_data_size parameter). 
One of them should be set to the not-None value.\"\n            )\n\n        logger.info(\"Train data processing...\")\n        train_data, train_stat = self.process_docred_file(train_data, neg_samples=\"twice\")\n\n        logger.info(\"Valid data processing...\")\n        valid_data, valid_stat = self.process_docred_file(valid_data, neg_samples=\"equal\")\n\n        logger.info(\"Test data processing...\")\n        test_data, test_stat = self.process_docred_file(test_data, neg_samples=\"equal\")\n\n        self.print_statistics(train_stat, valid_stat, test_stat)\n\n        data = {\"train\": train_data, \"valid\": valid_data, \"test\": test_data}\n\n        return data\n\n    def split_by_absolute(self, all_labeled_data: List) -> Tuple[List, List, List]:\n        \"\"\"\n        All annotated data from DocRED is splitted into train, valid and test sets in following proportions:\n          len(valid_data) = len(test_data) = self.valid_test_data_size\n          len(train_data) = len(all data) - 2 * self.valid_test_data_size\n        Args:\n            all_labeled_data: List of all annotated data samples\n        Return:\n            Lists of train, valid and test data\n        \"\"\"\n        if (int(self.valid_test_data_size) * 3) > len(all_labeled_data):\n            raise ValueError(\n                f\"The dataset size {len(all_labeled_data)} is too small for taking {self.valid_test_data_size} samples\"\n                f\"for valid and test. 
Reduce the size of valid and test set.\"\n            )\n\n        random.shuffle(all_labeled_data)\n        valid_data = all_labeled_data[:int(self.valid_test_data_size)]\n        test_data = all_labeled_data[int(self.valid_test_data_size) + 1: 2 * int(self.valid_test_data_size)]\n        train_data = all_labeled_data[2 * int(self.valid_test_data_size) + 1:]\n        return train_data, valid_data, test_data\n\n    def split_by_relative(self, all_labeled_data: List) -> Tuple[List, List, List]:\n        \"\"\"\n        All annotated data from DocRED is split into train, valid and test sets in the following proportions:\n          len(train_data) = train_valid_test_proportion * len(valid_data) = train_valid_test_proportion * len(test_data)\n        \"\"\"\n        random.shuffle(all_labeled_data)\n        one_prop = int(len(all_labeled_data)/int(self.train_valid_test_proportion))\n\n        valid_data = all_labeled_data[:one_prop]\n        test_data = all_labeled_data[one_prop + 1: 2 * one_prop]\n        train_data = all_labeled_data[2 * one_prop + 1:]\n        return train_data, valid_data, test_data\n\n    def process_docred_file(self, data: List[Dict], neg_samples: str = None) -> Tuple[List, Dict]:\n        \"\"\"\n        Processes DocRED data and returns DeepPavlov-relevant output\n\n        Args:\n            data: List of data units\n            neg_samples: how many negative samples are to be generated\n                Possible values:\n                    - None: no negative samples will be generated\n                        (relevant to the test set which has from neg samples only)\n                    - equal: there will be one negative sample per positive sample\n                    - twice: there will be twice as many negative samples as positive ones\n                    - thrice: there will be thrice as many negative samples as positive ones\n        Returns:\n            one list of processed documents\n        \"\"\"\n        stat_rel_name = 
{rel_name: 0 for _, rel_name in self.relid2rel.items()}\n        self.stat = {\"POS_REL\": 0, \"NEG_REL\": 0}  # collect statistics of positive and negative samples\n        processed_data_samples = []\n\n        for data_unit in data:\n            ent_ids2ent_pos, ent_ids2ent_text, ent_ids2ent_tag = {}, {}, {}\n\n            # get list of all tokens from the document\n            doc = [token for sent in data_unit[\"sents\"] for token in sent]\n\n            # the sentence start indices are needed for entities' indices recalculation to the whole text\n            sents_begins = list(np.cumsum([0] + [len(sent) for sent in data_unit[\"sents\"]]))\n\n            for ent_set_id, ent_set in enumerate(data_unit[\"vertexSet\"]):\n                ent_ids2ent_pos[ent_set_id], ent_ids2ent_text[ent_set_id], ent_ids2ent_tag[ent_set_id] = [], [], []\n                for ent in ent_set:\n                    # the list of tuples with each entity's new indices (recalculated regarding to the whole doc)\n                    ent_ids2ent_pos[ent_set_id].append(\n                        ((ent[\"pos\"][0] + sents_begins[ent[\"sent_id\"]]),\n                         (ent[\"pos\"][1] + sents_begins[ent[\"sent_id\"]]))\n                    )\n                    # also save entity id to entity as exact text mentions correspondence\n                    ent_ids2ent_text[ent_set_id].append(ent[\"name\"])\n                # get the sample NER tag (logically, the same for all entity mentions)\n                ent_ids2ent_tag[ent_set_id] = ent_set[0][\"type\"]\n                ent_ids2ent_text[ent_set_id] = list(set(ent_ids2ent_text[ent_set_id]))\n\n            # if no labels are provided for the data, handle all samples as negative ones\n            if \"labels\" not in data_unit:\n                processed_data_samples += self.construct_neg_samples(ent_ids2ent_pos, ent_ids2ent_tag, doc)\n\n            # if labels are provided, save samples as positive samples and generate negatives\n          
  else:\n                labels = data_unit[\"labels\"]\n                curr_processed_data_samples, stat_rel_name = self.construct_pos_neg_samples(\n                    labels, ent_ids2ent_pos, ent_ids2ent_tag, doc, stat_rel_name, neg_samples=neg_samples,\n                )\n                processed_data_samples += curr_processed_data_samples\n\n        logger.info(f\"Pos samples: {self.stat['POS_REL']}  Neg samples: {self.stat['NEG_REL']}.\")\n        self.stat.pop(\"POS_REL\")\n        self.stat.pop(\"NEG_REL\")\n\n        return processed_data_samples, stat_rel_name\n\n    def construct_pos_neg_samples(\n            self, labels: List, ent_id2ent: Dict, ent_id2ent_tag: Dict, doc: List, stat_rel: Dict, neg_samples: str,\n    ) -> Tuple[List, Dict]:\n        \"\"\"\n        Transforms the relevant information into an entry of the DocRED reader output. The entities between which\n        the relation is hold will serve as an annotation for positive samples, while all other entity pairs will be\n        used to construct the negative samples.\n\n        Args:\n            labels: information about relation found in a document (whole labels list of the original DocRED)\n            ent_id2ent: a dictionary {entity id: [entity mentions' positions]}\n            stat_rel: a dictionary with relation statistics (will be updated)\n            neg_samples: amount of negative samples that are to be generated\n            ent_id2ent_tag: a dictionary {entity id: entity NER tag}\n            doc: list of all tokens of the document\n        Returns:\n            a tuple with list of all doc tokens, entity information (positions & NER tags) and relation.\n        \"\"\"\n\n        num_pos_samples, num_neg_samples = 0, 0\n\n        data_samples = []\n        rel_triples = {}\n        for label_info in labels:\n            entity1_id, entity2_id = label_info[\"h\"], label_info[\"t\"]\n            if (entity1_id, entity2_id) in rel_triples:\n                
rel_triples[(entity1_id, entity2_id)].append(self.rel2id[label_info['r']])\n            else:\n                rel_triples[(entity1_id, entity2_id)] = [self.rel2id[label_info['r']]]\n\n        # the one hot encoding of the negative label\n        neg_label_one_hot = self.label_to_one_hot([self.rel2id[self.negative_label]])\n\n        # iterate over all entities\n        for (ent1, ent2) in itertools.permutations(ent_id2ent, 2):\n\n            # if there is a relation hold between entities, save them (and a corresponding sample) as positive one\n            if (ent1, ent2) in rel_triples:\n                num_pos_samples += 1\n                labels = rel_triples[(ent1, ent2)]\n                label_one_hot = self.label_to_one_hot(labels)\n                data_samples.append(\n                    self.generate_data_sample(doc, ent1, ent2, label_one_hot, ent_id2ent, ent_id2ent_tag)\n                )\n                self.stat[\"POS_REL\"] += 1\n\n                for label in labels:\n                    rel_name = self.relid2rel[self.id2rel[label]]\n                    stat_rel[rel_name] += 1\n\n            else:\n                if not neg_samples:         # if no negative samples should be generated, skip\n                    continue\n\n                # if there is no relation hold between entities, save them (and a corresponding sample) as negative one\n                if neg_samples == \"equal\" and num_neg_samples < num_pos_samples:\n                    num_neg_samples += 1\n                    data_samples.append(\n                        self.generate_data_sample(doc, ent1, ent2, neg_label_one_hot, ent_id2ent, ent_id2ent_tag)\n                    )\n                    self.stat[\"NEG_REL\"] += 1\n\n                elif neg_samples == \"twice\" and num_neg_samples < 2 * num_pos_samples:\n                    num_neg_samples += 1\n                    data_samples.append(\n                        self.generate_data_sample(doc, ent1, ent2, neg_label_one_hot, 
ent_id2ent, ent_id2ent_tag)\n                    )\n                    self.stat[\"NEG_REL\"] += 1\n\n                elif neg_samples == \"thrice\" and num_neg_samples < 3 * num_pos_samples:\n                    num_neg_samples += 1\n                    data_samples.append(\n                        self.generate_data_sample(doc, ent1, ent2, neg_label_one_hot, ent_id2ent, ent_id2ent_tag)\n                    )\n                    self.stat[\"NEG_REL\"] += 1\n\n        return data_samples, stat_rel\n\n    def construct_neg_samples(\n            self, ent_id2ent: Dict, ent_id2ent_tag: Dict, doc: List\n    ) -> List[Tuple[Tuple[List, List], List]]:\n        \"\"\"\n        Turn the annotated documents but without any positive relation label to the negative samples in a format of\n            the DocRED reader output.\n\n        Args:\n            ent_id2ent: a dictionary {entity id: [entity mentions' positions]}\n            ent_id2ent_tag: a dictionary {entity id: entity NER tag}\n            doc: list of all tokens of the document\n        Returns:\n            a tuple with list of all doc tokens, entity information (positions & NER tags) and relation (=neg_label).\n        \"\"\"\n        neg_data_samples = []\n        neg_label_one_hot = self.label_to_one_hot([self.rel2id[self.negative_label]])\n        for ent1, ent2 in itertools.permutations(ent_id2ent.keys(), 2):\n            neg_data_samples.append(\n                self.generate_data_sample(doc, ent1, ent2, neg_label_one_hot, ent_id2ent, ent_id2ent_tag)\n            )\n\n            self.stat[\"NEG_REL\"] += 1\n        return neg_data_samples\n\n    @staticmethod\n    def generate_data_sample(\n            doc: List, ent1: int, ent2: int, label: List, ent_id2ent: Dict, ent_id2ent_tag: Dict\n    ) -> Tuple[List[Union[List, List]], List]:\n        \"\"\" Creates an entry of processed docred corpus \"\"\"\n        return (\n                    [\n                        doc,\n                        
[ent_id2ent[ent1], ent_id2ent[ent2]],\n                        [ent_id2ent_tag[ent1], ent_id2ent_tag[ent2]]\n                    ],\n                    label\n                )\n\n    def generate_additional_neg_samples(self, doc: List, forbidden_entities: List, num_neg_samples: int):\n        \"\"\"\n        <CURRENTLY NOT USED>\n        Generated negative samples, i.e. the same document that is used for positive samples, but labeled with\n        \"no_relation\" label and with entities, that are not connected with any relation, marked as such.\n\n        Args:\n             doc: list of positive sentences\n             forbidden_entities: list of entities that participate in any of the relations (and, therefore, cannot be\n                chosen for negative sample)\n             num_neg_samples: number of negative samples that are to be generated out of this document\n        Returns:\n             a tuple with list of all doc tokens, entity information (positions & NER tags) and relation (=neg_label).\n        \"\"\"\n        # ATTENTION! 
To make it work, please run the following command: python3 -m deeppavlov install ner_ontonotes_bert\n\n        from deeppavlov import build_model, configs\n        ner = build_model(configs.ner.ner_ontonotes_bert_mult, download=True)\n        neg_data_samples = []\n        analysed_sentences = ner([\" \".join(doc)])  # returns [[[tokens]], [[ner tags]]]\n\n        # select ids of tokens that were not part of any relation so far\n        neg_entities_idx = random.sample(\n            [ent_idx for ent_idx in range(len(analysed_sentences[0][0]))\n             if analysed_sentences[0][0][ent_idx] not in forbidden_entities],\n            num_neg_samples * 2\n        )\n\n        # the one hot encoding of the negative label\n        neg_label_one_hot = self.label_to_one_hot([self.rel2id[self.negative_label]])\n\n        for n_ent_1_idx, n_ent_2_idx in itertools.permutations(neg_entities_idx, 2):\n            # if already sufficient number of negative samples have been generated\n            if len(neg_data_samples) == num_neg_samples:\n                break\n            neg_entity_1 = analysed_sentences[0][0][n_ent_1_idx]\n            neg_entity_2 = analysed_sentences[0][0][n_ent_2_idx]\n            neg_entity_1_tag = analysed_sentences[1][0][n_ent_1_idx]\n            neg_entity_2_tag = analysed_sentences[1][0][n_ent_2_idx]\n            neg_data_samples.append(\n                (doc, [[neg_entity_1], [neg_entity_2], neg_entity_1_tag, neg_entity_2_tag], neg_label_one_hot)\n            )\n            self.stat[\"NEG_REL\"] += 1\n\n        return neg_data_samples\n\n    def label_to_one_hot(self, labels: List[int]) -> List:\n        \"\"\" Turn labels to one hot encodings \"\"\"\n        relation = [0] * len(self.rel2id)\n        for label in labels:\n            relation[label] = 1\n        return relation\n\n    def print_statistics(self, train_stat: Dict, valid_stat: Dict, test_stat: Dict) -> None:\n        \"\"\" Print out the relation statistics as a markdown table 
\"\"\"\n        df = pd.DataFrame([self.rel2relid, train_stat, valid_stat, test_stat]).T\n        df.columns = ['d{}'.format(i) for i, col in enumerate(df, 1)]\n        logger.info(\"\\n\")\n        logger.info(df)\n"
  },
  {
    "path": "deeppavlov/dataset_readers/faq_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Dict\n\nfrom pandas import read_csv\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\n\n\n@register('faq_reader')\nclass FaqDatasetReader(DatasetReader):\n    \"\"\"Reader for FAQ dataset\"\"\"\n\n    def read(self, data_path: str = None, data_url: str = None, x_col_name: str = 'x', y_col_name: str = 'y') -> Dict:\n        \"\"\"\n        Read FAQ dataset from specified csv file or remote url\n\n        Parameters:\n            data_path: path to csv file of FAQ\n            data_url: url to csv file of FAQ\n            x_col_name: name of Question column in csv file\n            y_col_name: name of Answer column in csv file\n\n        Returns:\n            A dictionary containing training, validation and test parts of the dataset obtainable via\n            ``train``, ``valid`` and ``test`` keys.\n        \"\"\"\n\n        if data_url is not None:\n            data = read_csv(data_url)\n        elif data_path is not None:\n            data = read_csv(data_path)\n        else:\n            raise ValueError(\"Please specify data_path or data_url parameter\")\n\n        x = data[x_col_name]\n        y = data[y_col_name]\n\n        train_xy_tuples = [(x[i].strip(), y[i].strip()) for i in range(len(x))]\n\n        dataset = dict()\n        
dataset[\"train\"] = train_xy_tuples\n        dataset[\"valid\"] = []\n        dataset[\"test\"] = []\n\n        return dataset\n"
  },
  {
    "path": "deeppavlov/dataset_readers/huggingface_dataset_reader.py",
    "content": "# Copyright 2020 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport re\nfrom collections import Counter\nfrom math import floor\nfrom typing import Dict, Optional, List, Union\n\nfrom datasets import load_dataset, Dataset, Features, ClassLabel, concatenate_datasets\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\n\n\n@register('huggingface_dataset_reader')\nclass HuggingFaceDatasetReader(DatasetReader):\n    \"\"\"Adds HuggingFace Datasets https://huggingface.co/datasets/ to DeepPavlov\n    \"\"\"\n\n    def read(self,\n             path: str,\n             name: Optional[str] = None,\n             train: Optional[str] = None,  # for lidirus with no train\n             valid: Optional[str] = None,\n             test: Optional[str] = None,\n             **kwargs) -> Dict[str, Dataset]:\n        \"\"\"Wraps datasets.load_dataset method\n\n        Args:\n            path: datasets.load_dataset path argument (e.g., `glue`)\n            name: datasets.load_dataset name argument (e.g., `mrpc`)\n            train: split name to use as training data.\n            valid: split name to use as validation data.\n            test: split name to use as test data.\n\n        Returns:\n            Dict[str, List[Dict]]: Dictionary with train, valid, test datasets\n        \"\"\"\n        if 'split' in kwargs:\n            raise 
RuntimeError('Split argument was used. Use train, valid, test arguments instead of split.')\n\n        # pop elements not relevant to BuilderConfig\n        downsample_ratio: Union[List[float], float] = kwargs.pop(\"downsample_ratio\", 1.)\n        seed = kwargs.pop(\"seed\", 42)\n        percentage = kwargs.pop(\"dev_percentage\", 50)\n        do_index_correction = kwargs.pop(\"do_index_correction\", True)\n\n        split_mapping = {'train': train, 'valid': valid, 'test': test}\n        # filter unused splits\n        split_mapping = {el: split_mapping[el] for el in split_mapping if split_mapping[el]}\n\n        if isinstance(downsample_ratio, float):\n            downsample_ratio = [downsample_ratio] * len(split_mapping)\n        elif isinstance(downsample_ratio, list) and len(downsample_ratio) != len(split_mapping):\n            raise ValueError(\"The number of downsample ratios must be the same as the number of splits\")\n\n        if path == \"russian_super_glue\" and \"_mixed\" in name:\n            name = name.replace(\"_mixed\", \"\")\n\n        dataset = load_dataset(path=path, name=name, split=list(split_mapping.values()), **kwargs)\n\n        if (path == \"super_glue\" and name == \"copa\") or (path == \"russian_super_glue\" and name == \"parus\"):\n            lang = \"en\" if name == \"copa\" else \"ru\"\n            dataset = [\n                dataset_split.map(preprocess_copa, batched=True, fn_kwargs={\"lang\": lang}) for dataset_split in dataset\n            ]\n        elif path == \"super_glue\" and name == \"boolq\":\n            # danetqa doesn't require the same preprocessing\n            dataset = load_dataset(path=path,\n                                   name=name,\n                                   split=interleave_splits(splits=list(split_mapping.values()),\n                                                           percentage=percentage),\n                                   **kwargs)\n            dataset = 
[dataset_split.map(preprocess_boolq, batched=True) for dataset_split in dataset]\n        elif (path == \"super_glue\" and name == \"record\") or (path == \"russian_super_glue\" and name == \"rucos\"):\n            label_column = \"label\"\n            dataset = [\n                binary_downsample(\n                    add_label_names(\n                        dataset_split.map(preprocess_record,\n                                          batched=True,\n                                          remove_columns=[\"answers\"]),\n                        label_column=label_column,\n                        label_names=[\"False\", \"True\"]\n                    ),\n                    ratio=ratio,\n                    seed=seed,\n                    label_column=label_column,\n                    do_correction=do_index_correction\n                ).map(add_num_examples, batched=True, batch_size=None)\n                for dataset_split, ratio\n                in zip(dataset, downsample_ratio)\n            ]\n        elif (path == \"super_glue\" and name == \"multirc\") or (path == \"russian_super_glue\" and name == \"muserc\"):\n            dataset = [\n                dataset_split.map(\n                    preprocess_multirc, batched=True, remove_columns=[\"paragraph\", \"question\"]\n                ) for dataset_split in dataset\n            ]\n        elif (path == \"super_glue\" and name == \"wsc\") or (path == \"russian_super_glue\" and name == \"rwsd\"):\n            dataset = [\n                dataset_split.map(\n                    preprocess_wsc,\n                    batched=True,\n                    remove_columns=[\"span1_index\", \"span2_index\", \"span1_text\", \"span2_text\"],\n                ) for dataset_split in dataset\n            ]\n        elif path == \"russian_super_glue\" and name == \"terra_mixed\" and \"train\" in list(split_mapping.values()):\n            tmp_dataset = []\n            for d, split in zip(dataset, split_mapping.values()):\n  
              if split == \"train\":\n                    to_mix = load_dataset(\"super_glue\", \"rte\", split=\"train\")\n                    combined_train = concatenate_datasets([to_mix, d])\n                    tmp_dataset.append(combined_train)\n                else:\n                    tmp_dataset.append(d)\n            dataset = tmp_dataset\n\n        elif path == \"russian_super_glue\" and name == \"rcb_mixed\" and \"train\" in list(split_mapping.values()):\n            tmp_dataset = []\n            for d, split in zip(dataset, split_mapping.values()):\n                if split == \"train\":\n                    to_mix = load_dataset(\"super_glue\", \"cb\", split=\"train\")\n                    combined_train = concatenate_datasets([to_mix, d.remove_columns([\"verb\", \"negation\"])])\n                    tmp_dataset.append(combined_train)\n                else:\n                    tmp_dataset.append(d.remove_columns([\"verb\", \"negation\"]))\n            dataset = tmp_dataset\n        elif path == \"russian_super_glue\" and name == \"danetqa_mixed\" and \"train\" in list(split_mapping.values()):\n            tmp_dataset = []\n            for d, split in zip(dataset, split_mapping.values()):\n                if split == \"train\":\n                    to_mix = load_dataset(\n                        \"super_glue\", \"boolq\", split=\"train\"\n                    ).map(\n                        preprocess_boolq, batched=True\n                    ).cast(d.features)\n                    combined_train = concatenate_datasets([to_mix, d])\n                    tmp_dataset.append(combined_train)\n                else:\n                    tmp_dataset.append(d)\n            dataset = tmp_dataset\n        return dict(zip(split_mapping.keys(), dataset))\n\n\ndef interleave_splits(splits: List[str], percentage: int = 50) -> List[str]:\n    \"\"\"Adds a portion of `dev` (or, `test` if there's only `train` and `test`) set to the `train` set.\n    Assumes that there 
are at least two splits passed, ordered as (train, dev, test).\n    Args:\n        splits: list of strings\n        percentage: percentage (represented as an integer value between 0 and 100)\n                    of samples to extract from `dev` and add to `train`\n    Returns:\n        List[str] containing mixing instructions (e.g. ['train+validation[:50%]', 'validation[-50%:]'])\n    \"\"\"\n    if len(splits) < 2:\n        raise ValueError(\"At least two splits should be passed to this function\")\n    mixed_splits = [f\"{splits[0]}+{splits[1]}[:{percentage}%]\", f\"{splits[1]}[-{percentage}%:]\"]\n    if len(splits) == 3:\n        mixed_splits += [splits[2]]\n    return mixed_splits\n\n\ndef preprocess_copa(examples: Dataset, *, lang: str = \"en\") -> Dict[str, List[List[str]]]:\n    \"\"\"COPA preprocessing to be applied by the map function.\n    Args:\n        examples: an instance of Dataset class\n        lang: task language. Either `en` or `ru`.\n    Returns:\n        Dict[str, List[List[str]]]: processed features represented as nested\n        list with number of elements corresponding to the number of choices\n        (2 in this case)\n    \"\"\"\n    if lang == \"en\":\n        question_dict = {\n            \"cause\": \"What was the cause of this?\",\n            \"effect\": \"What happened as a result?\",\n        }\n    elif lang == \"ru\":\n        question_dict = {\n            \"cause\": \"Что было причиной этого?\",\n            \"effect\": \"Что случилось в результате?\",\n        }\n    else:\n        raise ValueError(f\"Incorrect `lang` value '{lang}'. 
Should be either 'en' or 'ru'.\")\n\n    num_choices = 2\n\n    questions = [question_dict[question] for question in examples[\"question\"]]\n    premises = examples[\"premise\"]\n\n    contexts = [f\"{premise} {question}\" for premise, question in zip(premises, questions)]\n    contexts = [[context] * num_choices for context in contexts]\n\n    choices = [[choice1, choice2] for choice1, choice2 in zip(examples[\"choice1\"], examples[\"choice2\"])]\n\n    return {\"contexts\": contexts,\n            \"choices\": choices}\n\n\ndef preprocess_boolq(examples: Dataset) -> Dict[str, List[str]]:\n    \"\"\"BoolQ preprocessing to be applied by the map function. The preprocessing boils down\n    to removing redundant titles from the passages.\n    Args:\n        examples: an instance of Dataset class\n    Returns:\n        Dict[str, List[str]]: processed features (just the passage in this case)\n    \"\"\"\n\n    def remove_passage_title(passage: str) -> str:\n        \"\"\"Removes the title of a given passage. The motivation is that the title duplicates\n        the beginning of the text body, which means that it's redundant. We remove to save space.\n        Args:\n            passage: a single `passage` string\n        Returns:\n            str: the same `passage` string with the title removed\n        \"\"\"\n        return re.sub(r\"^.+-- \", \"\", passage)\n\n    passages = [remove_passage_title(passage) for passage in examples[\"passage\"]]\n\n    return {\"passage\": passages}\n\n\ndef preprocess_record(examples: Dataset, *, clean_entities: bool = True) -> Dict[str, Union[List[str], List[int]]]:\n    \"\"\"ReCoRD preprocessing to be applied by the map function. This transforms the original\n    nested structure of the dataset into a flat one. New indices are generated to allow for\n    the restoration of the original structure. 
The resulting dataset amounts to a binary\n    classification problem.\n    Args:\n        examples: an instance of Dataset class\n        clean_entities: a boolean flag indicating whether to clean-up given entities\n    Returns:\n        Dict[str, Union[List[str], List[int]]]: flattened features of the dataset\n    \"\"\"\n\n    def fill_placeholder(sentence: str, candidate: str) -> str:\n        \"\"\"Fills `@placeholder` of a given query with the provided entity\n        Args:\n            sentence: query to fill\n            candidate: entity candidate for the query\n        Returns:\n            str: `sentence` with `@placeholder` replaced with `candidate`\n        \"\"\"\n        return re.sub(r\"@placeholder\", candidate.replace(\"\\\\\", \"\"), sentence)\n\n    def remove_highlight(context: str) -> str:\n        \"\"\"Removes highlights from a given passage\n        Args:\n            context: a passage to remove highlights from\n        Returns:\n            str: `context` with highlights removed\n        \"\"\"\n        return re.sub(r\"\\n@highlight\\n\", \". 
\", context)\n\n    queries: List[str] = examples[\"query\"]\n    passages: List[str] = [remove_highlight(passage) for passage in examples[\"passage\"]]\n    answers: List[List[str]] = examples[\"answers\"]\n    entities: List[List[str]] = examples[\"entities\"]\n    indices: List[Dict[str, int]] = examples[\"idx\"]\n\n    if clean_entities:\n        tmp_entities = []\n        for list_of_entities in entities:\n            tmp_entities.append(\n                list(set([entity.strip(\"\\n ,.!\") for entity in list_of_entities]))\n            )\n        entities = tmp_entities\n\n        tmp_answers = []\n        for list_of_answers in answers:\n            tmp_answers.append(\n                list(set([answer.strip(\"\\n ,.!\") for answer in list_of_answers]))\n            )\n        answers = tmp_answers\n\n    # new indices for flat examples\n    merged_indices: List[str] = []\n    # queries with placeholders filled\n    filled_queries: List[str] = []\n    # duplicated passages\n    extended_passages: List[str] = []\n    # contains one entity per flat example\n    flat_entities: List[str] = []\n    # whether the entity in this example is found in the answers (0 or 1)\n    labels: List[int] = []\n\n    for query, passage, list_of_answers, list_of_entities, index in zip(queries,\n                                                                        passages,\n                                                                        answers,\n                                                                        entities,\n                                                                        indices):\n        num_candidates: int = len(list_of_entities)\n\n        candidate_queries: List[str] = [fill_placeholder(query, entity) for entity in list_of_entities]\n        cur_labels: List[int] = [\n            int(entity in list_of_answers) if list_of_answers else -1 for entity in list_of_entities\n        ]\n        cur_passages: List[str] = [passage] * 
num_candidates\n\n        # keep track of the indices to be able to use target metrics\n        passage_index: int = index[\"passage\"]\n        query_index: int = index[\"query\"]\n        example_indices: List[str] = [f\"{passage_index}-{query_index}-{num_candidates}\"] * num_candidates\n\n        if sum(cur_labels) != 0:\n            merged_indices.extend(example_indices)\n            filled_queries.extend(candidate_queries)\n            extended_passages.extend(cur_passages)\n            flat_entities.extend(list_of_entities)\n            labels.extend(cur_labels)\n\n    return {\"idx\": merged_indices,\n            \"query\": filled_queries,\n            \"passage\": extended_passages,\n            \"entities\": flat_entities,\n            \"label\": labels}\n\n\ndef add_label_names(dataset: Dataset, label_column: str, label_names: List[str]):\n    \"\"\"Adds `names` to a specified `label` column.\n    All labels (i.e. integers) in the dataset should be < than the number of label names.\n    Args:\n        dataset: a Dataset to add label names to\n        label_column: the name of the label column (such as `label` or `labels`) in the dataset\n        label_names: a list of label names\n    Returns:\n        Dataset: A copy of the passed `dataset` with added label names\n    \"\"\"\n    new_features: Features = dataset.features.copy()\n    new_features[label_column] = ClassLabel(names=label_names)\n    return dataset.cast(new_features)\n\n\ndef binary_downsample(dataset: Dataset,\n                      ratio: float = 0.,\n                      seed: int = 42,\n                      label_column: str = \"label\",\n                      *,\n                      do_correction: bool = True) -> Dataset:\n    \"\"\"Downsamples a given dataset to the specified negative to positive examples ratio. 
Only works with\n    binary classification datasets with labels denoted as `0` and `1`.\n    Args:\n        dataset: a Dataset to downsample\n        ratio: negative to positive examples ratio to maintain\n        seed: a seed for shuffling\n        label_column: the name of `label` column such as 'label' or 'labels'\n        do_correction: correct resampled indices. If indices aren't corrected then examples with mismatched\n        indices will not be accounted for by ReCoRD metrics. This is not necessarily undesirable because\n        examples with such indices will have fewer negative examples (or even none), which makes them easier\n        for the model, thus inflating the resulting metrics.\n    Returns:\n        Dataset: a downsampled dataset\n    \"\"\"\n\n    def replace_indices(data: Dataset, index_map: Dict[str, str]) -> Dict[str, List[str]]:\n        idx: List[str] = [index_map.get(el, el) for el in data[\"idx\"]]\n        return {\"idx\": idx}\n\n    def get_correct_indices_map(data: Dataset) -> Dict[str, str]:\n        \"\"\"Generate a dictionary with replacements for indices that\n        are no longer correct due to downsampling (i.e. 
the total number\n        of elements denoted by the last part of an index has changed)\n        Args:\n            data: a downsampled Dataset\n        Returns:\n            Dict[str, str]: a dictionary containing replacement indices\n        \"\"\"\n        actual_n_elements: Counter = Counter(data[\"idx\"])\n        corrected_index_map: Dict[str, str] = dict()\n        for idx, n_elements in actual_n_elements.items():\n            expected_n_elements: int = int(idx.split(\"-\")[-1])\n            if expected_n_elements != n_elements:\n                new_idx: List[str] = idx.split(\"-\")\n                new_idx[-1]: str = str(n_elements)\n                new_idx: str = \"-\".join(new_idx)\n                corrected_index_map[idx] = new_idx\n        return corrected_index_map\n\n    def correct_indices(data: Dataset) -> Dataset:\n        \"\"\"Sets correct number of examples in downsampled indices\n        Args:\n            data: a downsampled dataset\n        Returns:\n            Dataset: the same dataset with correct indices\n        \"\"\"\n        index_map: Dict[str, str] = get_correct_indices_map(data)\n        return data.map(replace_indices, batched=True, fn_kwargs={\"index_map\": index_map})\n\n    dataset_labels = dataset.unique(label_column)\n    # `test` split shouldn't be downsampled\n    if dataset_labels == [-1]:\n        return dataset\n    elif set(dataset_labels) == {0, 1}:\n        # positive examples are denoted with `1`\n        num_positive: int = sum(dataset[label_column])\n        num_total: int = len(dataset)\n        # the original number of negative examples is returned if `ratio` is not explicitly specified\n        num_negative: int = floor(num_positive * ratio if ratio > 0 else num_total - num_positive)\n        # first `num_positive` examples in a sorted dataset are labeled with `1`\n        # while the rest are labeled with `0`\n        sorted_dataset: Dataset = dataset.sort(label_column, reverse=True)\n        # but we need to 
reshuffle the dataset before returning it\n        shuffled_dataset: Dataset = sorted_dataset.select(range(num_positive + num_negative)).shuffle(seed=seed)\n        if do_correction:\n            shuffled_dataset = correct_indices(shuffled_dataset)\n        return shuffled_dataset\n    # the same logic is not applicable to cases with != 2 classes\n    else:\n        raise ValueError(f\"Only binary classification labels are supported (i.e. [0, 1]), but {dataset_labels} were given\")\n\n\ndef add_num_examples(dataset: Dataset) -> Dict[str, List[int]]:\n    \"\"\"Adds the total number of examples in a given dataset to\n    each individual example. Must be applied to the whole dataset (i.e. `batched=True, batch_size=None`),\n    otherwise the number will be incorrect.\n    Args:\n        dataset: a Dataset to add number of examples to\n    Returns:\n        Dict[str, List[int]]: total number of examples repeated for each example\n    \"\"\"\n    num_examples = len(dataset[next(iter(dataset))])\n    return {\"num_examples\": [num_examples] * num_examples}\n\n\ndef preprocess_multirc(examples: Dataset, *, clean_paragraphs: bool = True) -> Dict[str, List[str]]:\n    \"\"\"Compose strings in form of paragraphs and the folllowing questions.\n\n    Args:\n        examples: A given dataset.\n        clean_paragraphs: Whether replace spaces and digits with a single space.\n\n    Returns:\n        Dict[str, List[str]]: Composed strings.\n\n    \"\"\"\n    paragraphs: List[str] = examples[\"paragraph\"]\n    questions: List[str] = examples[\"question\"]\n\n    if clean_paragraphs:\n        paragraphs = [re.sub(r\"\\s+\", \" \", re.sub(r\"\\(\\d{1,2}\\)\", \"\", paragraph).strip()) for paragraph in paragraphs]\n\n    contexts = [f\"{paragraph} {question}\" for paragraph, question in zip(paragraphs, questions)]\n\n    return {\"context\": contexts}\n\n\ndef preprocess_wsc(dataset: Dataset) -> Dict[str, List[str]]:\n    \"\"\"Forms proper sentences from spans1 that are always 
entities and spans2 that describe these entities.\n\n    Args:\n        dataset: A given dataset.\n\n    Returns:\n        Dict[str, List[str]]: Answers that form proper sentences from capitalized spans1 and spans2.\n\n    \"\"\"\n    spans1: List[str] = dataset[\"span1_text\"]\n    spans2: List[str] = dataset[\"span2_text\"]\n    answers = [f\"{s2.capitalize()} {s1}\" for s1, s2 in zip(spans1, spans2)]\n    return {\"answer\": answers}\n"
  },
  {
    "path": "deeppavlov/dataset_readers/imdb_reader.py",
    "content": "# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom logging import getLogger\nfrom typing import List, Dict, Any, Optional, Tuple\nfrom pathlib import Path\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\nfrom deeppavlov.core.data.utils import download_decompress, mark_done, is_done\n\nlog = getLogger(__name__)\n\n\n@register('imdb_reader')\nclass ImdbReader(DatasetReader):\n    \"\"\"This class downloads and reads the IMDb sentiment classification dataset.\n\n    https://ai.stanford.edu/~amaas/data/sentiment/\n\n    Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts.\n    (2011). Learning Word Vectors for Sentiment Analysis. 
The 49th Annual Meeting of the Association\n    for Computational Linguistics (ACL 2011).\n    \"\"\"\n\n    def read(self, data_path: str, url: Optional[str] = None,\n             *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:\n        \"\"\"\n        Args:\n            data_path: A path to a folder with dataset files.\n            url: A url to the archive with the dataset to download if the data folder is empty.\n        \"\"\"\n        data_path = Path(data_path)\n\n        if url is None:\n            url = \"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n\n        if not is_done(data_path):\n            log.info('[downloading data from {} to {}]'.format(url, data_path))\n            download_decompress(url, data_path)\n            mark_done(data_path)\n\n        alternative_data_path = data_path / \"aclImdb\"\n        if alternative_data_path.exists():\n            data_path = alternative_data_path\n\n        data = {\"train\": [],\n                \"test\": []}\n        for data_type in data.keys():\n            for label in [\"neg\", \"pos\"]:\n                labelpath = data_path / data_type / label\n                if not labelpath.exists():\n                    raise RuntimeError(f\"Cannot load data: {labelpath} does not exist\")\n                for filename in labelpath.glob(\"*.txt\"):\n                    with filename.open(encoding='utf-8') as f:\n                        text = f.read()\n                    data[data_type].append((text, [label]))\n\n            if not data[data_type]:\n                raise RuntimeError(f\"Could not load the '{data_type}' dataset, \"\n                                   \"probably data dirs are empty\")\n\n        return data\n"
  },
  {
    "path": "deeppavlov/dataset_readers/line_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, softwaredata\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Dict\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\n\n\n@register('line_reader')\nclass LineReader(DatasetReader):\n    \"\"\"Read txt file by lines\"\"\"\n\n    def read(self, data_path: str = None, *args, **kwargs) -> Dict:\n        \"\"\"Read lines from txt file\n\n        Args:\n            data_path: path to txt file\n\n        Returns:\n            A dictionary containing training, validation and test parts of the dataset obtainable via ``train``, ``valid`` and ``test`` keys.\n        \"\"\"\n\n        with open(data_path) as f:\n            content = f.readlines()\n\n        dataset = dict()\n        dataset[\"train\"] = [(line,) for line in content]\n        dataset[\"valid\"] = []\n        dataset[\"test\"] = []\n\n        return dataset\n"
  },
  {
    "path": "deeppavlov/dataset_readers/morphotagging_dataset_reader.py",
    "content": "# Copyright 2018 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Dict, List, Union, Tuple, Optional\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\nfrom deeppavlov.core.data.utils import download_decompress, mark_done\n\nWORD_COLUMN, POS_COLUMN, TAG_COLUMN = 1, 3, 5\nHEAD_COLUMN, DEP_COLUMN = 6, 7\n\nlog = getLogger(__name__)\n\n\ndef get_language(filepath: str) -> str:\n    \"\"\"Extracts language from typical UD filename\n    \"\"\"\n    return filepath.split(\"-\")[0]\n\n\ndef read_infile(infile: Union[Path, str], *, from_words=False,\n                word_column: int = WORD_COLUMN, pos_column: int = POS_COLUMN,\n                tag_column: int = TAG_COLUMN, head_column: int = HEAD_COLUMN,\n                dep_column: int = DEP_COLUMN, max_sents: int = -1,\n                read_only_words: bool = False, read_syntax: bool = False) -> List[Tuple[List, Union[List, None]]]:\n    \"\"\"Reads input file in CONLL-U format\n\n    Args:\n        infile: a path to a file\n        word_column: column containing words (default=1)\n        pos_column: column containing part-of-speech labels (default=3)\n        tag_column: column containing fine-grained tags (default=5)\n        head_column: column containing syntactic head position (default=6)\n       
 dep_column: column containing syntactic dependency label (default=7)\n        max_sents: maximal number of sentences to read\n        read_only_words: whether to read only words\n        read_syntax: whether to return ``heads`` and ``deps`` alongside ``tags``. Ignored if read_only_words is ``True``\n\n    Returns:\n        a list of sentences. Each item contains a word sequence and an output sequence.\n        The output sentence is ``None``, if ``read_only_words`` is ``True``,\n        a single list of word tags if ``read_syntax`` is False,\n        and a list of the form [``tags``, ``heads``, ``deps``] in case ``read_syntax`` is ``True``.\n\n    \"\"\"\n    answer, curr_word_sent, curr_tag_sent = [], [], []\n    curr_head_sent, curr_dep_sent = [], []\n    # read_syntax = read_syntax and read_only_words\n    if from_words:\n        word_column, read_only_words = 0, True\n    if infile is not sys.stdin:\n        fin = open(infile, \"r\", encoding=\"utf8\")\n    else:\n        fin = sys.stdin\n    for line in fin:\n        line = line.strip()\n        if line.startswith(\"#\"):\n            continue\n        if line == \"\":\n            if len(curr_word_sent) > 0:\n                if read_only_words:\n                    curr_tag_sent = None\n                elif read_syntax:\n                    curr_tag_sent = [curr_tag_sent, curr_head_sent, curr_dep_sent]\n                answer.append((curr_word_sent, curr_tag_sent))\n            curr_tag_sent, curr_word_sent = [], []\n            curr_head_sent, curr_dep_sent = [], []\n            if len(answer) == max_sents:\n                break\n            continue\n        splitted = line.split(\"\\t\")\n        index = splitted[0]\n        if not from_words and not index.isdigit():\n            continue\n        curr_word_sent.append(splitted[word_column])\n        if not read_only_words:\n            pos, tag = splitted[pos_column], splitted[tag_column]\n            tag = pos if tag == \"_\" else \"{},{}\".format(pos, 
tag)\n            curr_tag_sent.append(tag)\n            if read_syntax:\n                curr_head_sent.append(int(splitted[head_column]))\n                curr_dep_sent.append(splitted[dep_column])\n    if len(curr_word_sent) > 0:\n        if read_only_words:\n            curr_tag_sent = None\n        elif read_syntax:\n            curr_tag_sent = [curr_tag_sent, curr_head_sent, curr_dep_sent]\n        answer.append((curr_word_sent, curr_tag_sent))\n    if infile is not sys.stdin:\n        fin.close()\n    return answer\n\n\n@register('morphotagger_dataset_reader')\nclass MorphotaggerDatasetReader(DatasetReader):\n    \"\"\"Class to read training datasets in UD format\"\"\"\n\n    URL = 'http://files.deeppavlov.ai/datasets/UD2.0_source/'\n\n    def read(self, data_path: Union[List, str],\n             language: Optional[str] = None,\n             data_types: Optional[List[str]] = None,\n             **kwargs) -> Dict[str, List]:\n        \"\"\"Reads UD dataset from data_path.\n\n        Args:\n            data_path: can be either\n                1. a directory containing files. The file for data_type 'mode'\n                is then data_path / {language}-ud-{mode}.conllu\n                2. 
a list of files, containing the same number of items as data_types\n            language: a language to detect filename when it is not given\n            data_types: which dataset parts among 'train', 'dev', 'test' are returned\n\n        Returns:\n            a dictionary containing dataset fragments (see ``read_infile``) for given data types\n        \"\"\"\n        if data_types is None:\n            data_types = [\"train\", \"dev\"]\n        elif isinstance(data_types, str):\n            data_types = list(data_types)\n        for data_type in data_types:\n            if data_type not in [\"train\", \"dev\", \"test\"]:\n                raise ValueError(\"Unknown data_type: {}, only train, dev and test \"\n                                 \"datatypes are allowed\".format(data_type))\n        if isinstance(data_path, str):\n            data_path = Path(data_path)\n        if isinstance(data_path, Path):\n            if data_path.exists():\n                is_file = data_path.is_file()\n            else:\n                is_file = (len(data_types) == 1)\n            if is_file:\n                # path to a single file\n                data_path, reserve_data_path = [data_path], None\n            else:\n                # path to data directory\n                if language is None:\n                    raise ValueError(\"You must implicitly provide language \"\n                                     \"when providing data directory as source\")\n                reserve_data_path = data_path\n                data_path = [data_path / \"{}-ud-{}.conllu\".format(language, mode)\n                             for mode in data_types]\n                reserve_data_path = [\n                    reserve_data_path / language / \"{}-ud-{}.conllu\".format(language, mode)\n                    for mode in data_types]\n        else:\n            data_path = [Path(data_path) for data_path in data_path]\n            reserve_data_path = None\n        if len(data_path) != 
len(data_types):\n            raise ValueError(\"The number of input files in data_path and data types \"\n                             \"in data_types must be equal\")\n        has_missing_files = any(not filepath.exists() for filepath in data_path)\n        if has_missing_files and reserve_data_path is not None:\n            has_missing_files = any(not filepath.exists() for filepath in reserve_data_path)\n            if not has_missing_files:\n                data_path = reserve_data_path\n        if has_missing_files:\n            # Files are downloaded from the Web repository\n            dir_path = data_path[0].parent\n            language = language or get_language(data_path[0].parts[-1])\n            url = self.URL + \"{}.tar.gz\".format(language)\n            log.info('[downloading data from {} to {}]'.format(url, dir_path))\n            dir_path.mkdir(exist_ok=True, parents=True)\n            download_decompress(url, dir_path)\n            mark_done(dir_path)\n        data = {}\n        for mode, filepath in zip(data_types, data_path):\n            if mode == \"dev\":\n                mode = \"valid\"\n            data[mode] = read_infile(filepath, **kwargs)\n        return data\n"
  },
  {
    "path": "deeppavlov/dataset_readers/multitask_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport copy\nfrom logging import getLogger\nfrom typing import Dict\n\nfrom deeppavlov.core.common.registry import get_model, register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\n\nlog = getLogger(__name__)\n\n\n@register('multitask_reader')\nclass MultiTaskReader(DatasetReader):\n    \"\"\"Class to read several datasets simultaneously.\"\"\"\n\n    def read(self, tasks: Dict[str, Dict[str, dict]], task_defaults: dict = None, **kwargs):\n        \"\"\"Creates dataset readers for tasks and returns what task dataset readers `read()` methods return.\n\n        Args:\n            tasks: dictionary which keys are task names and values are dictionaries with param name - value pairs for\n                nested dataset readers initialization. 
If task has key-value pair ``'use_task_defaults': False``,\n                task_defaults for this task dataset reader will be ignored.\n            task_defaults: default task parameters.\n\n        Returns:\n            dictionary which keys are task names and values are what task readers `read()` methods returned.\n        \"\"\"\n        data = dict()\n        if task_defaults is None:\n            task_defaults = dict()\n        for task_name, task_params in tasks.items():\n            if task_params.pop('use_task_defaults', True) is True:\n                task_config = copy.deepcopy(task_defaults)\n                task_config.update(task_params)\n            else:\n                task_config = task_params\n            reader = get_model(task_config.pop('class_name'))()\n            data[task_name] = reader.read(**task_config)\n        return data\n"
  },
  {
    "path": "deeppavlov/dataset_readers/odqa_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport json\nimport logging\nimport sqlite3\nimport unicodedata\nfrom multiprocessing import Pool\nfrom pathlib import Path\nfrom typing import Union, List, Tuple, Generator, Any, Optional\n\nfrom tqdm import tqdm\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\nfrom deeppavlov.core.data.utils import download\n\nlogger = logging.getLogger(__name__)\n\n\n@register('odqa_reader')\nclass ODQADataReader(DatasetReader):\n    \"\"\"Build a SQLite database from folder with txt files, json files or\n    `Wiki Extractor <https://github.com/attardi/wikiextractor>`_ files.\n\n    \"\"\"\n\n    def read(self, data_path: Union[Path, str], db_url: Optional[str] = None, *args,\n             **kwargs) -> None:\n        \"\"\"Build a SQLite database from provided files, download SQLite database from a provided URL,\n         or do nothing.\n\n        Args:\n            data_path: a directory/file with texts to create a database from\n            db_url: path to a database url\n            kwargs:\n                save_path: a path where a database should be saved to, or path to a ready database\n                dataset_format: initial data format; should be selected from 
['txt', 'wiki', 'json']\n\n        Returns:\n            None\n\n        \"\"\"\n        logger.info('Reading files...')\n        try:\n            save_path = expand_path(kwargs['save_path'])\n        except KeyError:\n            raise ConfigError(\n                f'\\\"save_path\\\" attribute should be set for {self.__class__.__name__}\\\n                 in the JSON config.')\n        if save_path.exists() and save_path.with_suffix(f'{save_path.suffix}.done').exists():\n            return\n        try:\n            dataset_format = kwargs['dataset_format']\n        except KeyError:\n            raise ConfigError(\n                f'\\\"dataset_format\\\" attribute should be set for {self.__class__.__name__}\\\n                 in the JSON config.')\n\n        save_path.parent.mkdir(parents=True, exist_ok=True)\n\n        if db_url:\n            download_dir = save_path.parent\n            logger.info(f'Downloading database from {db_url} to {download_dir}')\n            download(download_dir, db_url, force_download=False)\n            return\n\n        self._build_db(save_path, dataset_format, expand_path(data_path))\n\n    def iter_files(self, path: Union[Path, str]) -> Generator[Path, Any, Any]:\n        \"\"\"Iterate over folder with files or a single file and generate file paths.\n\n        Args:\n            path: path to a folder or a file\n\n        Raises:\n            RuntimeError if the provided `path` doesn't exist\n\n        Yields:\n            file paths one by one\n\n        Returns:\n            None\n\n        \"\"\"\n        path = Path(path)\n        if path.is_file():\n            yield path\n        elif path.is_dir():\n            for item in path.iterdir():\n                yield from self.iter_files(item)\n        else:\n            raise RuntimeError(\"Path doesn't exist: {}\".format(path))\n\n    def _build_db(self, save_path: Union[Path, str], dataset_format: str,\n                  data_path: Union[Path, str],\n                  
num_workers: int = 8) -> None:\n        \"\"\"Build a SQLite database in parallel and save it to a pointed path.\n\n        Args:\n            save_path: a path where the ready database should be saved\n            dataset_format: a data format, should be selected from ['txt', 'json', 'wiki']\n            data_path: path to a folder/file from which to build a database\n            num_workers: a number of workers for parallel database building\n\n        Raises:\n            sqlite3.OperationalError if `save_path` doesn't exist.\n            RuntimeError if dataset_format is not in ['txt', 'json', 'wiki']\n\n        Returns:\n            None\n\n        \"\"\"\n        done_path = save_path.with_suffix(f'{save_path.suffix}.done')\n\n        if Path(save_path).exists():\n            Path(save_path).unlink()\n        if done_path.exists():\n            done_path.unlink()\n\n        logger.info('Building the database...')\n\n        try:\n            conn = sqlite3.connect(str(save_path))\n        except sqlite3.OperationalError as e:\n            e.args = e.args + (\"Check that DB path exists.\",)\n            raise e\n        c = conn.cursor()\n        sql_table = \"CREATE TABLE documents (id PRIMARY KEY, text);\"\n        c.execute(sql_table)\n\n        files = [f for f in self.iter_files(data_path)]\n        workers = Pool(num_workers)\n\n        if dataset_format == 'txt':\n            fn = self._get_file_contents\n        elif dataset_format == 'json':\n            fn = self._get_json_contents\n        elif dataset_format == 'wiki':\n            fn = self._get_wiki_contents\n        else:\n            raise RuntimeError('Unknown dataset format.')\n\n        with tqdm(total=len(files)) as pbar:\n            for data in tqdm(workers.imap_unordered(fn, files)):\n                try:\n                    c.executemany(\"INSERT INTO documents VALUES (?,?)\", data)\n                    pbar.update()\n                except sqlite3.IntegrityError as e:\n                
    logger.warning(e)\n\n        conn.commit()\n        conn.close()\n        done_path.touch()\n\n    @staticmethod\n    def _get_file_contents(fpath: Union[Path, str]) -> List[Tuple[str, str]]:\n        \"\"\"Extract file contents from '.txt' file.\n\n        Args:\n            fpath: path to a '.txt' file.\n\n        Returns:\n             a list with tuple of normalized file name and file contents\n\n        \"\"\"\n        with open(fpath, encoding='utf-8') as fin:\n            text = fin.read()\n            normalized_text = unicodedata.normalize('NFD', text)\n            return [(fpath.name, normalized_text)]\n\n    @staticmethod\n    def _get_json_contents(fpath: Union[Path, str]) -> List[Tuple[str, str]]:\n        \"\"\"Extract file contents from '.json' file. JSON files should be formatted as list with dicts\n        which contain 'title' and 'doc' keywords.\n\n        Args:\n            fpath: path to a '.json' file.\n\n        Returns:\n            a list with tuples of normalized file name and file contents\n\n        \"\"\"\n        docs = []\n        with open(fpath, encoding='utf-8') as fin:\n            for line in fin:\n                data = json.loads(line)\n                for doc in data:\n                    if not doc:\n                        continue\n                    text = doc['text']\n                    normalized_text = unicodedata.normalize('NFD', text)\n                    docs.append((doc['title'], normalized_text))\n        return docs\n\n    @staticmethod\n    def _get_wiki_contents(fpath: Union[Path, str]) -> List[Tuple[str, str]]:\n        \"\"\"Extract file contents from wiki extractor formatted files.\n\n        Args:\n            fpath: path to a '.txt' file in wiki extractor format\n\n        Returns:\n            a list with tuples of normalized file name and file contents\n\n        \"\"\"\n        docs = []\n        with open(fpath, encoding='utf-8') as fin:\n            for line in fin:\n                doc = 
json.loads(line)\n                if not doc:\n                    continue\n                text = doc['text']\n                normalized_text = unicodedata.normalize('NFD', text)\n                docs.append((doc['title'], normalized_text))\n        return docs\n"
  },
  {
    "path": "deeppavlov/dataset_readers/paraphraser_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport xml.etree.ElementTree as ET\nfrom pathlib import Path\nfrom typing import Dict, List, Tuple\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\n\n\n@register('paraphraser_reader')\nclass ParaphraserReader(DatasetReader):\n    \"\"\"The class to read the paraphraser.ru dataset from files.\n\n    Please, see https://paraphraser.ru.\n    \"\"\"\n\n    def read(self,\n             data_path: str,\n             do_lower_case: bool = True,\n             *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:\n        \"\"\"Read the paraphraser.ru dataset from files.\n\n        Args:\n            data_path: A path to a folder with dataset files.\n            do_lower_case: Do you want to lowercase all texts\n        \"\"\"\n\n        data_path = expand_path(data_path)\n        train_fname = data_path / 'paraphrases.xml'\n        test_fname = data_path / 'paraphrases_gold.xml'\n\n        train_data = self._build_data(train_fname, do_lower_case)\n        test_data = self._build_data(test_fname, do_lower_case)\n        return {\"train\": train_data, \"valid\": [], \"test\": test_data}\n\n    @staticmethod\n    def _build_data(data_path: Path, do_lower_case: bool) -> List[Tuple[Tuple[str, str], int]]:\n      
  root = ET.fromstring(data_path.read_text(encoding='utf8'))\n        data = {}\n        for paraphrase in root.findall('corpus/paraphrase'):\n            key = (paraphrase.find('value[@name=\"text_1\"]').text,\n                   paraphrase.find('value[@name=\"text_2\"]').text)\n            if do_lower_case:\n                key = tuple([t.lower() for t in key])\n\n            data[key] = 1 if int(paraphrase.find('value[@name=\"class\"]').text) >= 0 else 0\n        return list(data.items())\n"
  },
  {
    "path": "deeppavlov/dataset_readers/rel_ranking_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport xml.etree.ElementTree as ET\nfrom pathlib import Path\nfrom typing import Dict, List, Tuple\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\n\n\n@register('rel_ranking_reader')\nclass ParaphraserReader(DatasetReader):\n    \"\"\"The class to read the paraphraser.ru dataset from files.\n​\n    Please, see https://paraphraser.ru.\n    \"\"\"\n\n    def read(self,\n             data_path: str,\n             do_lower_case: bool = True,\n             *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:\n        \"\"\"Read the paraphraser.ru dataset from files.\n​\n        Args:\n            data_path: A path to a folder with dataset files.\n            do_lower_case: Do you want to lowercase all texts\n        \"\"\"\n\n        data_path = expand_path(data_path)\n        train_fname = data_path / 'paraphrases.xml'\n        test_fname = data_path / 'paraphrases_gold.xml'\n\n        train_data = self._build_data(train_fname, do_lower_case)\n        test_data = self._build_data(test_fname, do_lower_case)\n        return {\"train\": train_data, \"valid\": [], \"test\": test_data}\n\n    @staticmethod\n    def _build_data(data_path: Path, do_lower_case: bool) -> List[Tuple[Tuple[str, str], int]]:\n    
    root = ET.fromstring(data_path.read_text(encoding='utf8'))\n        data = []\n        for paraphrase in root.findall('corpus/paraphrase'):\n            key = (paraphrase.find('value[@name=\"text_1\"]').text,\n                   paraphrase.find('value[@name=\"text_2\"]').text)\n            if do_lower_case:\n                key = tuple([t.lower() for t in key])\n\n            pos_or_neg = int(paraphrase.find('value[@name=\"class\"]').text)\n            data.append((key, pos_or_neg))\n        return data\n"
  },
  {
    "path": "deeppavlov/dataset_readers/rured_reader.py",
    "content": "import json\nimport os\nimport random\nfrom typing import Dict, List, Tuple\nfrom pathlib import Path\nfrom logging import getLogger\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\n\nlogger = getLogger(__name__)\n\n\n@register('rured_reader')\nclass RuREDDatasetReader(DatasetReader):\n    \"\"\" Class to read the datasets in RuRED format\"\"\"\n\n    def read(self, data_path: str, rel2id: Dict = None) -> Dict[str, List[Tuple]]:\n        \"\"\"\n        This class processes the RuRED relation extraction dataset\n        (http://www.dialog-21.ru/media/5093/gordeevdiplusetal-031.pdf).\n        Args:\n            data_path: a path to a folder with dataset files.\n            rel2id: a path to a file where information about relation to relation id corresponding is stored.\n        Returns:\n            RuRED output dictionary in the following format:\n            DocRED output dictionary in the following format:\n            {\"data_type\":\n                List[\n                    Tuple(\n                        List[\n                            List[all tokens of the document],\n                            List[\n                                List[Tuple(start pos of mention 1 of ent 1, end pos of mention 1 of ent 1), ...],\n                                List[Tuple(start position of entity 2, end position of entity 2), ...],\n                                List[str(NER tag of entity 1), str(NER tag of entity 2)]\n                            ],\n                        List(int(one-hot encoded relation label))\n                    )\n                ]\n            }\n        \"\"\"\n\n        data_path = Path(data_path).resolve()\n\n        if not rel2id:\n            self.rel2id = self.add_default_rel_dict()\n        else:\n            self.rel2id = rel2id\n        self.stat = {}\n        self.ner_stat = {}\n\n        with open(os.path.join(data_path, \"train.json\"), 
encoding='utf-8') as file:\n            train_data = json.load(file)\n\n        with open(os.path.join(data_path, \"dev.json\"), encoding='utf-8') as file:\n            dev_data = json.load(file)\n\n        with open(os.path.join(data_path, \"test.json\"), encoding='utf-8') as file:\n            test_data = json.load(file)\n\n        train_data, self.stat[\"train\"] = self.process_rured_file(train_data, num_neg_samples=\"twice\")\n        dev_data, self.stat[\"dev\"] = self.process_rured_file(dev_data, num_neg_samples=\"equal\")\n        test_data, self.stat[\"test\"] = self.process_rured_file(test_data, num_neg_samples=\"equal\")\n\n        data = {\"train\": train_data, \"valid\": dev_data, \"test\": test_data}\n\n        return data\n\n    def process_rured_file(self, data: List[Dict], num_neg_samples: str) -> Tuple[List, Dict]:\n        \"\"\"\n        Processes RuRED data and returns a DeepPavlov relevant output\n\n        Args:\n            data: List of data units\n            num_neg_samples: how many negative samples will be included alongside the positive ones\n                Possible values:\n                    - none: no negative samples will be included\n                        (relevant to the test set which has from neg samples only)\n                    - equal: there will be one negative sample per positive sample\n                    - twice: there will be twice as many negative samples as positive ones\n                    - all: take all negative samples from the dataset\n        Returns:\n            a list of processed samples and a dictionary with the number of samples per relation id\n        \"\"\"\n        processed_samples = []\n        neg_samples = []        # list of indices of negative samples\n        pos_samples = 0         # counter of positive samples\n\n        for sample in data:\n            # record negative sample ids\n            if sample[\"relation\"] == \"no_relation\":\n                neg_samples.append(len(processed_samples))\n            else:\n                pos_samples += 
1\n\n            if sample[\"subj_type\"] in self.ner_stat:\n                self.ner_stat[sample[\"subj_type\"]] += 1\n            else:\n                self.ner_stat[sample[\"subj_type\"]] = 1\n\n            if sample[\"obj_type\"] in self.ner_stat:\n                self.ner_stat[sample[\"obj_type\"]] += 1\n            else:\n                self.ner_stat[sample[\"obj_type\"]] = 1\n\n            processed_samples.append(\n                (\n                    [\n                        sample[\"token\"],\n                        [[(sample[\"subj_start\"], sample[\"subj_end\"])], [(sample[\"obj_start\"], sample[\"obj_end\"])]],\n                        [sample[\"subj_type\"], sample[\"obj_type\"]]\n                    ],\n                    self.label_to_one_hot(self.rel2id[sample[\"relation\"]])\n                )\n            )\n\n        # filter out some of negative sample if relevant\n        if num_neg_samples == \"equal\":\n            # include the same amount of negative samples as positive ones\n            neg_to_eliminate = random.sample(neg_samples, (len(neg_samples) - pos_samples))\n            processed_samples = [\n                sample for sample_idx, sample in enumerate(processed_samples) if sample_idx not in neg_to_eliminate\n            ]\n        elif num_neg_samples == \"twice\":\n            # include twice as much negative samples as positive ones\n            neg_to_eliminate = random.sample(neg_samples, (len(neg_samples) - 2 * pos_samples))\n            processed_samples = [\n                sample for sample_idx, sample in enumerate(processed_samples) if sample_idx not in neg_to_eliminate\n            ]\n        elif num_neg_samples == \"none\":\n            # eliminate all negative samples\n            processed_samples = [\n                sample for sample_idx, sample in enumerate(processed_samples) if sample_idx not in neg_samples\n            ]\n        else:\n            raise ValueError(\"Unknown negative samples amount! 
Currently available are 'equal', 'twice' and 'none\")\n\n        # collect statistics\n        stat = {}\n        for sample in processed_samples:\n            rel = [rel for rel, sample_log in enumerate(sample[1]) if sample_log == 1][0]\n            if rel in stat:\n                stat[rel] += 1\n            else:\n                stat[rel] = 1\n\n        return processed_samples, stat\n\n    def label_to_one_hot(self, label: int) -> List[int]:\n        \"\"\" Turn a label into a one-hot encoding \"\"\"\n        relation = [0] * len(self.rel2id)\n        relation[label] = 1\n        return relation\n\n    @staticmethod\n    def add_default_rel_dict():\n        \"\"\" Creates a default relation to relation id dictionary with RuRED relations \"\"\"\n        return dict(no_relation=0, MEMBER=1, WORKS_AS=2, WORKPLACE=3, OWNERSHIP=4, SUBORDINATE_OF=5, TAKES_PLACE_IN=6,\n                    EVENT_TAKES_PART_IN=7, SELLS_TO=8, ALTERNATIVE_NAME=9, HEADQUARTERED_IN=10, PRODUCES=11,\n                    ABBREVIATION=12, DATE_DEFUNCT_IN=13, SUBEVENT_OF=14, DATE_FOUNDED_IN=15, DATE_TAKES_PLACE_ON=16,\n                    NUMBER_OF_EMPLOYEES_FIRED=17, ORIGINS_FROM=18, ACQUINTANCE_OF=19, PARENT_OF=20, ORGANIZES=21,\n                    FOUNDED_BY=22, PLACE_RESIDES_IN=23, BORN_IN=24, AGE_IS=25, RELATIVE=26, NUMBER_OF_EMPLOYEES=27,\n                    SIBLING=28, DATE_OF_BIRTH=29)\n"
  },
  {
    "path": "deeppavlov/dataset_readers/sq_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport json\nimport pickle\nfrom typing import List\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\nfrom deeppavlov.core.common.file import load_pickle\nfrom deeppavlov.core.common.file import read_json\n\n\n@register('sq_reader')\nclass SQReader(DatasetReader):\n    \"\"\"Class to read training datasets\"\"\"\n\n    def read(self, data_path: str, valid_size: int = None):\n        if str(data_path).endswith(\".pickle\"):\n            dataset = load_pickle(data_path)\n        elif str(data_path).endswith(\".json\"):\n            dataset = read_json(data_path)\n        else:\n            raise TypeError(f'Unsupported file type: {data_path}')\n        if valid_size:\n            dataset[\"valid\"] = dataset[\"valid\"][:valid_size]\n\n        return dataset\n\n\n@register('rubq_reader')\nclass RuBQReader(SQReader):\n    \"\"\"Class to read RuBQ datasets\"\"\"\n\n    def read(self, data_path: str, version: str = \"2.0\", question_types: List[str] = [\"all\"],\n                   not_include_question_types: List[str] = None, num_samples: int = -1):\n        dataset = super().read(data_path)\n        for data_type in [\"valid\", \"test\"]:\n            samples = dataset[data_type]\n            samples = [sample for sample in samples if float(sample[\"RuBQ_version\"]) <= 
float(version) and\n                       (any(tp in sample[\"tags\"] for tp in question_types) or question_types == [\"all\"])]\n            if not_include_question_types:\n                samples = [sample for sample in samples if all([tp not in sample[\"tags\"]\n                           for tp in not_include_question_types])]\n            samples = [self.preprocess(sample) for sample in samples]\n            if num_samples > 0:\n                samples = samples[:num_samples]\n            dataset[data_type] = samples\n        return dataset\n\n    def preprocess(self, sample):\n        question = sample.get(\"question_text\", \"\")\n        answers = sample.get(\"answers\", [])\n        answer_ids = [elem.get(\"value\", \"\").split(\"/\")[-1] for elem in answers]\n        answer_labels = [elem.get(\"label\", \"\").split(\"/\")[-1] for elem in answers]\n        query = sample.get(\"query\", \"\")\n        if query is None:\n            query = \"\"\n        else:\n            query = query.replace(\"\\n\", \" \").replace(\"  \", \" \")\n        return [question, [answer_ids, answer_labels, query]]\n\n\n@register('lcquad_reader')\nclass LCQuADReader(SQReader):\n    \"\"\"Class to read LCQuAD dataset\"\"\"\n\n    def read(self, data_path: str, question_types: List[str] = \"all\",\n                   not_include_question_types: List[str] = None, num_samples: int = -1):\n        dataset = super().read(data_path)\n        for data_type in [\"valid\", \"test\"]:\n            samples = dataset[data_type]\n            samples = [sample for sample in samples if (any(tp == sample[\"subgraph\"] for tp in question_types) \\\n                                                        or question_types == [\"all\"])]\n            if not_include_question_types:\n                samples = [sample for sample in samples\n                           if sample[\"subgraph\"] not in not_include_question_types]\n            samples = [self.preprocess(sample) for sample in samples]\n     
       if num_samples > 0:\n                samples = samples[:num_samples]\n            dataset[data_type] = samples\n        return dataset\n\n    def preprocess(self, sample):\n        question = sample.get(\"question\", \"\")\n        answers = sample.get(\"answer\", [])\n        answer_labels = sample.get(\"answer_label\", [])\n        query = sample.get(\"sparql_wikidata\", \"\")\n        return [question, [answers, answer_labels, query]]\n"
  },
  {
    "path": "deeppavlov/dataset_readers/squad_dataset_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport json\nfrom pathlib import Path\nfrom typing import Dict, Any, Optional\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\nfrom deeppavlov.core.data.utils import download_decompress\n\n\n@register('squad_dataset_reader')\nclass SquadDatasetReader(DatasetReader):\n    \"\"\"\n    Downloads dataset files and prepares train/valid split.\n\n    SQuAD:\n    Stanford Question Answering Dataset\n    https://rajpurkar.github.io/SQuAD-explorer/\n    \n    SQuAD2.0:\n    Stanford Question Answering Dataset, version 2.0\n    https://rajpurkar.github.io/SQuAD-explorer/\n\n    SberSQuAD:\n    Dataset from SDSJ Task B\n    https://www.sdsj.ru/ru/contest.html\n\n    MultiSQuAD:\n    SQuAD dataset with additional contexts retrieved (by tfidf) from original Wikipedia article.\n\n    MultiSQuADRetr:\n    SQuAD dataset with additional contexts retrieved by tfidf document ranker from full Wikipedia.\n\n    \"\"\"\n\n    url_squad = 'http://files.deeppavlov.ai/datasets/squad-v1.1.tar.gz'\n    url_sber_squad = 'http://files.deeppavlov.ai/datasets/sber_squad-v1.1.tar.gz'\n    url_multi_squad = 'http://files.deeppavlov.ai/datasets/multiparagraph_squad.tar.gz'\n    url_squad2 = 'http://files.deeppavlov.ai/datasets/squad-v2.0.tar.gz'\n\n    def read(self, data_path: str, dataset: 
Optional[str] = 'SQuAD', url: Optional[str] = None, *args, **kwargs) \\\n            -> Dict[str, Dict[str, Any]]:\n        \"\"\"\n\n        Args:\n            data_path: path to save data\n            dataset: default dataset names: ``'SQuAD'``, ``'SberSQuAD'`` or ``'MultiSQuAD'``\n            url: link to archive with dataset, use url argument if non-default dataset is used\n\n        Returns:\n            dataset split on train/valid\n\n        Raises:\n            RuntimeError: if `dataset` is not one of these: ``'SQuAD'``, ``'SberSQuAD'``, ``'MultiSQuAD'``.\n        \"\"\"\n        if url is not None:\n            self.url = url\n        elif dataset == 'SQuAD':\n            self.url = self.url_squad\n        elif dataset == 'SberSQuAD':\n            self.url = self.url_sber_squad\n        elif dataset == 'MultiSQuAD':\n            self.url = self.url_multi_squad\n        elif dataset == 'SQuAD2.0':\n            self.url = self.url_squad2\n        else:\n            raise RuntimeError(f'Dataset {dataset} is unknown')\n\n        data_path = Path(data_path)\n        if dataset == \"SQuAD2.0\":\n            required_files = [f'{dt}-v2.0.json' for dt in ['train', 'dev']]\n        else:\n            required_files = [f'{dt}-v1.1.json' for dt in ['train', 'dev']]\n        data_path.mkdir(parents=True, exist_ok=True)\n\n        if not all((data_path / f).exists() for f in required_files):\n            download_decompress(self.url, data_path)\n\n        dataset = {}\n        for f in required_files:\n            with data_path.joinpath(f).open('r', encoding='utf8') as fp:\n                data = json.load(fp)\n            if f in {'dev-v1.1.json', 'dev-v2.0.json'}:\n                dataset['valid'] = data\n            else:\n                dataset['train'] = data\n\n        return dataset\n\n\n@register('multi_squad_dataset_reader')\nclass MultiSquadDatasetReader(DatasetReader):\n    \"\"\"\n    Downloads dataset files and prepares train/valid split.\n\n    
MultiSQuADRetr:\n    Multiparagraph SQuAD dataset with additional contexts retrieved by tfidf document ranker from full En Wikipedia.\n\n    MultiSQuADRuRetr:\n    Multiparagraph SberSQuAD dataset with additional contexts retrieved by tfidf document ranker from  Ru Wikipedia.\n\n    \"\"\"\n\n    url_multi_squad_retr = 'http://files.deeppavlov.ai/datasets/multi_squad_retr_enwiki20161221.tar.gz'\n    url_multi_squad_ru_retr = 'http://files.deeppavlov.ai/datasets/multi_squad_ru_retr.tar.gz'\n\n    def read(self, data_path: str, dataset: Optional[str] = 'MultiSQuADRetr', url: Optional[str] = None, *args,\n             **kwargs) -> Dict[str, Dict[str, Any]]:\n        \"\"\"\n\n        Args:\n            data_path: path to save data\n            dataset: default dataset names: ``'MultiSQuADRetr'``, ``'MultiSQuADRuRetr'``\n            url: link to archive with dataset, use url argument if non-default dataset is used\n\n        Returns:\n            dataset split on train/valid\n\n        Raises:\n            RuntimeError: if `dataset` is not one of these: ``'MultiSQuADRetr'``, ``'MultiSQuADRuRetr'``.\n        \"\"\"\n        if url is not None:\n            self.url = url\n        elif dataset == 'MultiSQuADRetr':\n            self.url = self.url_multi_squad_retr\n        elif dataset == 'MultiSQuADRuRetr':\n            self.url = self.url_multi_squad_ru_retr\n        else:\n            raise RuntimeError(f'Dataset {dataset} is unknown')\n\n        data_path = Path(data_path)\n        required_files = [f'{dt}.jsonl' for dt in ['train', 'dev']]\n        if not data_path.exists():\n            data_path.mkdir(parents=True)\n\n        if not all((data_path / f).exists() for f in required_files):\n            download_decompress(self.url, data_path)\n\n        dataset = {}\n        for f in required_files:\n            if 'dev' in f:\n                dataset['valid'] = data_path.joinpath(f)\n            else:\n                dataset['train'] = data_path.joinpath(f)\n\n      
  return dataset\n"
  },
  {
    "path": "deeppavlov/dataset_readers/typos_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport csv\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Dict, List, Tuple\n\nimport requests\nfrom lxml import html\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\nfrom deeppavlov.core.data.utils import is_done, download, mark_done\n\nlog = getLogger(__name__)\n\n\n@register('typos_custom_reader')\nclass TyposCustom(DatasetReader):\n    \"\"\"Base class for reading spelling corrections dataset files\n\n    \"\"\"\n\n    def __init__(self):\n        pass\n\n    @staticmethod\n    def build(data_path: str) -> Path:\n        \"\"\"Base method that interprets ``data_path`` argument.\n\n        Args:\n            data_path: path to the tsv-file containing erroneous and corrected words\n\n        Returns:\n            the same path as a :class:`~pathlib.Path` object\n        \"\"\"\n        return Path(data_path)\n\n    @classmethod\n    def read(cls, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str]]]:\n        \"\"\"Read train data for spelling corrections algorithms\n\n        Args:\n            data_path: path that needs to be interpreted with :meth:`~deeppavlov.dataset_readers.typos_reader.TyposCustom.build`\n\n        Returns:\n            train data to pass to a 
:class:`~deeppavlov.dataset_iterators.typos_iterator.TyposDatasetIterator`\n        \"\"\"\n        fname = cls.build(data_path)\n        with fname.open(newline='', encoding='utf8') as tsvfile:\n            reader = csv.reader(tsvfile, delimiter='\\t')\n            next(reader)\n            res = [(mistake, correct) for mistake, correct in reader]\n        return {'train': res}\n\n\n@register('typos_wikipedia_reader')\nclass TyposWikipedia(TyposCustom):\n    \"\"\"Implementation of :class:`~deeppavlov.dataset_readers.typos_reader.TyposCustom` that works with\n     English Wikipedia's list of common misspellings\n\n    \"\"\"\n\n    @staticmethod\n    def build(data_path: str) -> Path:\n        \"\"\"Download and parse common misspellings list from `Wikipedia <https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines>`_\n\n        Args:\n            data_path: target directory to download the data to\n\n        Returns:\n            path to the resulting tsv-file\n        \"\"\"\n        data_path = Path(data_path) / 'typos_wiki'\n\n        fname = data_path / 'misspelings.tsv'\n\n        if not is_done(data_path):\n            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'\n\n            page = requests.get(url)\n            tree = html.fromstring(page.content)\n            raw = tree.xpath('//pre/text()')[0].splitlines()\n            data = []\n            for pair in raw:\n                typo, corrects = pair.strip().split('->')\n                for correct in corrects.split(','):\n                    data.append([typo.strip(), correct.strip()])\n\n            fname.parent.mkdir(parents=True, exist_ok=True)\n            with fname.open('w', newline='', encoding='utf8') as tsvfile:\n                writer = csv.writer(tsvfile, delimiter='\\t')\n                for line in data:\n                    writer.writerow(line)\n\n            mark_done(data_path)\n\n            log.info('Built')\n        
return fname\n\n\n@register('typos_kartaslov_reader')\nclass TyposKartaslov(DatasetReader):\n    \"\"\"Implementation of :class:`~deeppavlov.dataset_readers.typos_reader.TyposCustom` that works with\n     a Russian misspellings dataset from `kartaslov <https://github.com/dkulagin/kartaslov>`_\n\n    \"\"\"\n\n    def __init__(self):\n        pass\n\n    @staticmethod\n    def build(data_path: str) -> Path:\n        \"\"\"Download misspellings list from `github <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_\n\n        Args:\n            data_path: target directory to download the data to\n\n        Returns:\n            path to the resulting csv-file\n        \"\"\"\n        data_path = Path(data_path) / 'kartaslov'\n\n        fname = data_path / 'orfo_and_typos.L1_5.csv'\n\n        if not is_done(data_path):\n            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'\n\n            download(fname, url)\n\n            mark_done(data_path)\n\n            log.info('Built')\n        return fname\n\n    @staticmethod\n    def read(data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str]]]:\n        \"\"\"Read train data for spelling corrections algorithms\n\n        Args:\n            data_path: path that needs to be interpreted with :meth:`~deeppavlov.dataset_readers.typos_reader.TyposKartaslov.build`\n\n        Returns:\n            train data to pass to a :class:`~deeppavlov.dataset_iterators.typos_iterator.TyposDatasetIterator`\n        \"\"\"\n        fname = TyposKartaslov.build(data_path)\n        with open(str(fname), newline='', encoding='utf8') as csvfile:\n            reader = csv.reader(csvfile, delimiter=';')\n            next(reader)\n            res = [(mistake, correct) for correct, mistake, weight in reader]\n        return {'train': res}\n"
  },
  {
    "path": "deeppavlov/dataset_readers/ubuntu_v2_reader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport csv\nfrom pathlib import Path\nfrom typing import List, Dict, Tuple, Union\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.dataset_reader import DatasetReader\n\n\n@register('ubuntu_v2_reader')\nclass UbuntuV2Reader(DatasetReader):\n    \"\"\"The class to read the Ubuntu V2 dataset from csv files.\n\n    Please, see https://github.com/rkadlec/ubuntu-ranking-dataset-creator.\n    \"\"\"\n\n    def read(self, data_path: str,\n             positive_samples=False,\n             *args, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:\n        \"\"\"Read the Ubuntu V2 dataset from csv files.\n\n        Args:\n            data_path: A path to a folder with dataset csv files.\n            positive_samples: if `True`, only positive context-response pairs will be taken for train\n        \"\"\"\n\n        data_path = expand_path(data_path)\n        dataset = {'train': None, 'valid': None, 'test': None}\n        train_fname = Path(data_path) / 'train.csv'\n        valid_fname = Path(data_path) / 'valid.csv'\n        test_fname = Path(data_path) / 'test.csv'\n        self.positive_samples = positive_samples\n        self.sen2int_vocab = {}\n        self.classes_vocab_train = {}\n        self.classes_vocab_valid = {}\n        
self.classes_vocab_test = {}\n        dataset[\"train\"] = self.preprocess_data_train(train_fname)\n        dataset[\"valid\"] = self.preprocess_data_validation(valid_fname)\n        dataset[\"test\"] = self.preprocess_data_validation(test_fname)\n        return dataset\n\n    def preprocess_data_train(self, train_fname: Union[Path, str]) -> List[Tuple[List[str], int]]:\n        contexts = []\n        responses = []\n        labels = []\n        with open(train_fname, 'r') as f:\n            reader = csv.reader(f)\n            next(reader)\n            for el in reader:\n                contexts.append(el[0])\n                responses.append(el[1])\n                labels.append(int(el[2]))\n            data = list(zip(contexts, responses))\n            data = list(zip(data, labels))\n            if self.positive_samples:\n                data = [el[0] for el in data if el[1] == 1]\n                data = list(zip(data, range(len(data))))\n        return data\n\n    def preprocess_data_validation(self, fname: Union[Path, str]) -> List[Tuple[List[str], int]]:\n        contexts = []\n        responses = []\n        with open(fname, 'r') as f:\n            reader = csv.reader(f)\n            next(reader)\n            for el in reader:\n                contexts.append(el[0])\n                responses.append(el[1:])\n        data = [[el[0]] + el[1] for el in zip(contexts, responses)]\n        data = [(el, 1) for el in data]\n        return data\n"
  },
  {
    "path": "deeppavlov/deep.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\nfrom logging import getLogger\n\nfrom deeppavlov.core.commands.infer import interact_model, predict_on_stream\nfrom deeppavlov.core.commands.train import train_evaluate_model_from_config\nfrom deeppavlov.core.common.cross_validation import calc_cv_score\nfrom deeppavlov.core.common.file import find_config\nfrom deeppavlov.download import deep_download\nfrom deeppavlov.utils.pip_wrapper import install_from_config\nfrom deeppavlov.utils.server import start_model_server\nfrom deeppavlov.utils.socket import start_socket_server\n\nlog = getLogger(__name__)\n\nparser = argparse.ArgumentParser()\n\nparser.add_argument(\"mode\", help=\"select a mode, train or interact\", type=str,\n                    choices={'train', 'evaluate', 'interact', 'predict', 'riseapi', 'risesocket', 'download', 'install',\n                             'crossval'})\nparser.add_argument(\"config_path\", help=\"path to a pipeline json config\", type=str)\n\nparser.add_argument(\"-e\", \"--start-epoch-num\", dest=\"start_epoch_num\", default=None,\n                    help=\"Start epoch number\", type=int)\nparser.add_argument(\"--recursive\", action=\"store_true\", help=\"Train nested configs\")\n\nparser.add_argument(\"-b\", \"--batch-size\", dest=\"batch_size\", default=None, help=\"inference batch size\", type=int)\nparser.add_argument(\"-f\", 
\"--input-file\", dest=\"file_path\", default=None, help=\"Path to the input file\", type=str)\nparser.add_argument(\"-d\", \"--download\", action=\"store_true\", help=\"download model components\")\nparser.add_argument(\"-i\", \"--install\", action=\"store_true\", help=\"install model requirements\")\n\nparser.add_argument(\"--folds\", help=\"number of folds\", type=int, default=5)\n\nparser.add_argument(\"--https\", action=\"store_true\", default=None, help=\"run model in https mode\")\nparser.add_argument(\"--key\", default=None, help=\"ssl key\", type=str)\nparser.add_argument(\"--cert\", default=None, help=\"ssl certificate\", type=str)\n\nparser.add_argument(\"-p\", \"--port\", default=None, help=\"api port\", type=int)\n\nparser.add_argument(\"--socket-type\", default=\"TCP\", type=str, choices={\"TCP\", \"UNIX\"})\nparser.add_argument(\"--socket-file\", default=\"/tmp/deeppavlov_socket.s\", type=str)\n\n\ndef main():\n    args = parser.parse_args()\n    pipeline_config_path = find_config(args.config_path)\n\n    if args.install or args.mode == 'install':\n        install_from_config(pipeline_config_path)\n    if args.download or args.mode == 'download':\n        deep_download(pipeline_config_path)\n\n    if args.mode == 'train':\n        train_evaluate_model_from_config(pipeline_config_path,\n                                         recursive=args.recursive,\n                                         start_epoch_num=args.start_epoch_num)\n    elif args.mode == 'evaluate':\n        train_evaluate_model_from_config(pipeline_config_path, to_train=False, start_epoch_num=args.start_epoch_num)\n    elif args.mode == 'interact':\n        interact_model(pipeline_config_path)\n    elif args.mode == 'riseapi':\n        start_model_server(pipeline_config_path, args.https, args.key, args.cert, port=args.port)\n    elif args.mode == 'risesocket':\n        start_socket_server(pipeline_config_path, args.socket_type, port=args.port, socket_file=args.socket_file)\n    elif 
args.mode == 'predict':\n        predict_on_stream(pipeline_config_path, args.batch_size, args.file_path)\n    elif args.mode == 'crossval':\n        if args.folds < 2:\n            log.error('Minimum number of Folds is 2')\n        else:\n            calc_cv_score(pipeline_config_path, n_folds=args.folds, is_loo=False)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "deeppavlov/download.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport secrets\nimport shutil\nimport sys\nfrom argparse import ArgumentParser, Namespace\nfrom collections import defaultdict\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Union, Optional, Dict, Iterable, Set, Tuple, List\nfrom urllib.parse import urlparse\nimport requests\nfrom filelock import FileLock\n\nimport deeppavlov\nfrom deeppavlov.core.commands.utils import expand_path, parse_config\nfrom deeppavlov.core.data.utils import download, download_decompress, get_all_elems_from_json, file_md5, \\\n    set_query_parameter, path_set_md5, get_download_token\n\nlog = getLogger(__name__)\n\nparser = ArgumentParser()\n\nparser.add_argument('--config', '-c', help=\"path to a pipeline json config\", type=str,\n                    default=None)\nparser.add_argument('-all', action='store_true',\n                    help=\"Download everything. Warning! 
There should be at least 10 GB space\"\n                         \" available on disk.\")\n\n\ndef get_config_downloads(config: Union[str, Path, dict]) -> Set[Tuple[str, Path]]:\n    config = parse_config(config)\n\n    downloads = set()\n    if 'metadata' in config and 'download' in config['metadata']:\n        for resource in config['metadata']['download']:\n            if isinstance(resource, str):\n                resource = {\n                    'url': resource\n                }\n\n            url = resource['url']\n            dest = expand_path(resource.get('subdir', ''))\n\n            downloads.add((url, dest))\n\n    config_references = [expand_path(config_ref) for config_ref in get_all_elems_from_json(config, 'config_path')]\n\n    downloads |= {(url, dest) for config in config_references for url, dest in get_config_downloads(config)}\n\n    return downloads\n\n\ndef get_configs_downloads(config: Optional[Union[str, Path, dict]] = None) -> Dict[str, Set[Path]]:\n    all_downloads = defaultdict(set)\n    if config:\n        configs = [config]\n    else:\n        configs = list(Path(deeppavlov.__path__[0], 'configs').glob('**/*.json'))\n\n    for config in configs:\n        for url, dest in get_config_downloads(config):\n            all_downloads[url].add(dest)\n\n    return all_downloads\n\n\ndef check_md5(url: str, dest_paths: List[Path], headers: Optional[dict] = None) -> bool:\n    url_md5 = path_set_md5(url)\n\n    try:\n        if url_md5.startswith('s3://'):\n            import boto3\n\n            s3 = boto3.resource('s3')\n            bucket, key = url_md5[5:].split('/', maxsplit=1)\n            obj = s3.Object(bucket, key)\n            data = obj.get()['Body'].read().decode('utf8')\n        else:\n            r = requests.get(url_md5, headers=headers)\n            if r.status_code != 200:\n                return False\n            data = r.text\n    except Exception as e:\n        log.debug(f'Could not download {url_md5} because of an exception 
{type(e)}: {e}')\n        return False\n\n    expected = {}\n    for line in data.splitlines():\n        _md5, fname = line.split(' ', maxsplit=1)\n        if fname[0] != '*':\n            if fname[0] == ' ':\n                log.warning(f'Hash generated in text mode for {fname}, comparison could be incorrect')\n            else:\n                log.error(f'Unknown hash content format in {url + \".md5\"}')\n                return False\n        expected[fname[1:]] = _md5\n\n    done = None\n    not_done = []\n    for base_path in dest_paths:\n        if all(file_md5(base_path / p) == _md5 for p, _md5 in expected.items()):\n            done = base_path\n        else:\n            not_done.append(base_path)\n\n    if done is None:\n        return False\n\n    for base_path in not_done:\n        log.info(f'Copying data from {done} to {base_path}')\n        for p in expected.keys():\n            shutil.copy(done / p, base_path / p)\n    return True\n\n\ndef download_resource(url: str, dest_paths: Iterable[Union[Path, str]], headers: Optional[dict] = None) -> None:\n    dest_paths = [Path(dest) for dest in dest_paths]\n    download_path = dest_paths[0].parent\n    download_path.mkdir(parents=True, exist_ok=True)\n    file_name = urlparse(url).path.split('/')[-1]\n    lockfile = download_path / f'.{file_name}.lock'\n\n    with FileLock(lockfile).acquire(poll_intervall=10):\n        if check_md5(url, dest_paths, headers):\n            log.info(f'Skipped {url} download because of matching hashes')\n        elif any(ext in url for ext in ('.tar.gz', '.gz', '.zip')):\n            download_decompress(url, download_path, dest_paths, headers=headers)\n        else:\n            dest_files = [dest_path / file_name for dest_path in dest_paths]\n            download(dest_files, url, headers=headers)\n\n\ndef download_resources(args: Namespace) -> None:\n    if not args.all and not args.config:\n        log.error('You should provide either model config path or -all flag')\n        
sys.exit(1)\n    elif args.all:\n        downloads = get_configs_downloads()\n    else:\n        config_path = Path(args.config).resolve()\n        downloads = get_configs_downloads(config=config_path)\n\n    for url, dest_paths in downloads.items():\n        download_resource(url, dest_paths)\n\n\ndef deep_download(config: Union[str, Path, dict]) -> None:\n    downloads = get_configs_downloads(config)\n    last_id = len(downloads) - 1\n    session_id = secrets.token_urlsafe(32)\n\n    for file_id, (url, dest_paths) in enumerate(downloads.items()):\n        headers = {\n            'dp-token': get_download_token(),\n            'dp-session': session_id,\n            'dp-file-id': str(last_id - file_id),\n            'dp-version': deeppavlov.__version__\n        }\n        if not url.startswith('s3://') and not isinstance(config, dict):\n            url = set_query_parameter(url, 'config', Path(config).stem)\n        download_resource(url, dest_paths, headers)\n\n\ndef main(args: Optional[List[str]] = None) -> None:\n    args = parser.parse_args(args)\n    log.info(\"Downloading...\")\n    download_resources(args)\n    log.info(\"\\nDownload successful!\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "deeppavlov/metrics/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/metrics/accuracy.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport itertools\nimport re\nfrom logging import getLogger\nfrom typing import List\n\nimport numpy as np\n\nfrom deeppavlov.core.common.metrics_registry import register_metric\n\nlog = getLogger(__name__)\n\n\n@register_metric('accuracy')\ndef accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray]) -> float:\n    \"\"\"\n    Calculate accuracy in terms of absolute coincidence\n\n    Args:\n        y_true: array of true values\n        y_predicted: array of predicted values\n\n    Returns:\n        fraction of absolutely coincidental samples\n    \"\"\"\n    examples_len = len(y_true)\n    # if y1 and y2 are both arrays, == can be erroneously interpreted as element-wise equality\n\n    def _are_equal(y1, y2):\n        answer = (y1 == y2)\n        if isinstance(answer, np.ndarray):\n            answer = answer.all()\n        return answer\n\n    equalities = [_are_equal(y1, y2) for y1, y2 in zip(y_true, y_predicted)]\n    correct = sum(equalities)\n    return correct / examples_len if examples_len else 0\n\n\n@register_metric('kbqa_accuracy')\ndef kbqa_accuracy(questions_batch, pred_answer_labels_batch, pred_answer_ids_batch, pred_query_batch,\n                  gold_answer_labels_batch, gold_answer_ids_batch, gold_query_batch) -> float:\n    num_samples = len(pred_answer_ids_batch)\n    correct = 0\n    for question, 
pred_answer_label, pred_answer_ids, pred_query, gold_answer_labels, gold_answer_ids, gold_query in \\\n            zip(questions_batch, pred_answer_labels_batch, pred_answer_ids_batch, pred_query_batch,\n                gold_answer_labels_batch, gold_answer_ids_batch, gold_query_batch):\n        found_date = False\n        if pred_answer_ids and gold_answer_ids and re.findall(r\"[\\d]{3,4}\", pred_answer_ids[0]) and \\\n                re.findall(r\"[\\d]{3,4}\", pred_answer_ids[0]) == re.findall(r\"[\\d]{3,4}\", gold_answer_ids[0]):\n            found_date = True\n        found_label = False\n        if len(gold_answer_labels) == 1 and len(pred_answer_label) > 1 and pred_answer_label == gold_answer_labels[0]:\n            found_label = True\n        no_answer = False\n        if pred_answer_label == \"Not Found\" and not gold_answer_ids:\n            no_answer = True\n        if set(pred_answer_ids) == set(gold_answer_ids) or gold_query in pred_query or found_date or found_label \\\n                or no_answer:\n            correct += 1\n        log.debug(f\"question: {question} -- gold_answer_ids: {gold_answer_ids} -- pred_answer_ids: {pred_answer_ids}\")\n    return correct / num_samples if num_samples else 0\n\n\n@register_metric('multitask_accuracy')\ndef multitask_accuracy(*args) -> float:\n    \"\"\"\n    Accuracy for multiple simultaneous tasks.\n\n    Args:\n        *args: a list of `2n` inputs. 
The first `n` inputs are the correct answers for `n` tasks,\n            and the last `n` are the predicted ones.\n\n    Returns:\n        The percentage of inputs where the answers for all `n` tasks are correct.\n    \"\"\"\n    n = len(args)\n    y_true_by_tasks, y_predicted_by_tasks = args[:n // 2], args[n // 2:]\n    answers = []\n    for true, pred in zip(y_true_by_tasks, y_predicted_by_tasks):\n        answers.append(accuracy(true, pred))\n    final_answer = sum(answers)/len(answers)\n    return final_answer\n\n\n@register_metric('multitask_sequence_accuracy')\ndef multitask_sequence_accuracy(*args) -> float:\n    \"\"\"\n    Accuracy for multiple simultaneous sequence labeling (tagging) tasks.\n    For each sequence the model checks whether all its elements\n    are labeled correctly for all the individual taggers.\n\n    Args:\n        *args: a list of `2n` inputs. The first `n` inputs are the correct answers for `n` tasks,\n            and the last `n` are the predicted ones. For each task an\n\n    Returns:\n        The percentage of sequences where all the items has correct answers for all `n` tasks.\n\n    \"\"\"\n    n = len(args)\n    y_true_by_tasks, y_predicted_by_tasks = args[:n // 2], args[n // 2:]\n    y_true_by_sents = list(zip(*y_true_by_tasks))\n    y_predicted_by_sents = list(zip(*y_predicted_by_tasks))\n    y_true = list(list(zip(*elem)) for elem in y_true_by_sents)\n    y_predicted = list(list(zip(*elem)) for elem in y_predicted_by_sents)\n    return accuracy(y_true, y_predicted)\n\n\n@register_metric('multitask_token_accuracy')\ndef multitask_token_accuracy(*args) -> float:\n    \"\"\"\n        Per-item accuracy for multiple simultaneous sequence labeling (tagging) tasks.\n\n        Args:\n            *args: a list of `2n` inputs. The first `n` inputs are the correct answers for `n` tasks\n                and the last `n` are the predicted ones. 
For each task an\n\n        Returns:\n            The percentage of sequence elements for which the answers for all `n` tasks are correct.\n\n        \"\"\"\n    n = len(args)\n    y_true_by_tasks, y_predicted_by_tasks = args[:n // 2], args[n // 2:]\n    y_true_by_sents = list(zip(*y_true_by_tasks))\n    y_predicted_by_sents = list(zip(*y_predicted_by_tasks))\n    y_true = list(list(zip(*elem)) for elem in y_true_by_sents)\n    y_predicted = list(list(zip(*elem)) for elem in y_predicted_by_sents)\n    return per_token_accuracy(y_true, y_predicted)\n\n\n@register_metric('sets_accuracy')\ndef sets_accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray]) -> float:\n    \"\"\"\n    Calculate accuracy in terms of sets coincidence\n\n    Args:\n        y_true: true values\n        y_predicted: predicted values\n\n    Returns:\n        portion of samples with absolutely coincidental sets of predicted values\n\n    Alias:\n        sets_accuracy\n    \"\"\"\n    examples_len = len(y_true)\n    correct = sum([set(y1) == set(y2) for y1, y2 in zip(y_true, y_predicted)])\n    return correct / examples_len if examples_len else 0\n\n\n@register_metric('slots_accuracy')\ndef slots_accuracy(y_true, y_predicted):\n    y_true = [{tag.split('-')[-1] for tag in s if tag != 'O'} for s in y_true]\n    y_predicted = [set(s.keys()) for s in y_predicted]\n    return accuracy(y_true, y_predicted)\n\n\n@register_metric('per_token_accuracy')\ndef per_token_accuracy(y_true, y_predicted):\n    y_true = list(itertools.chain(*y_true))\n    y_predicted = itertools.chain(*y_predicted)\n    examples_len = len(y_true)\n    correct = sum([y1 == y2 for y1, y2 in zip(y_true, y_predicted)])\n    return correct / examples_len if examples_len else 0\n\n\n# region go-bot metrics\n\n@register_metric('per_item_dialog_accuracy')\ndef per_item_dialog_accuracy(y_true, y_predicted: List[List[str]]):\n    # todo metric classes???\n    y_true = [y['text'] for dialog in y_true for y in dialog]\n    
y_predicted = itertools.chain(*y_predicted)\n    examples_len = len(y_true)\n    correct = sum([y1.strip().lower() == y2.strip().lower() for y1, y2 in zip(y_true, y_predicted)])\n    return correct / examples_len if examples_len else 0\n\n\n@register_metric('acc')\ndef round_accuracy(y_true, y_predicted):\n    \"\"\"\n    Rounds predictions and calculates accuracy in terms of absolute coincidence.\n\n    Args:\n        y_true: list of true values\n        y_predicted: list of predicted values\n\n    Returns:\n        portion of absolutely coincidental samples\n    \"\"\"\n    if isinstance(y_predicted[0], np.ndarray):\n        predictions = [np.round(x) for x in y_predicted]\n    else:\n        predictions = [round(x) for x in y_predicted]\n    examples_len = len(y_true)\n    correct = sum([y1 == y2 for y1, y2 in zip(y_true, predictions)])\n    return correct / examples_len if examples_len else 0\n"
  },
  {
    "path": "deeppavlov/metrics/bleu.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport itertools\nfrom typing import List, Tuple, Any\n\nfrom nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction, brevity_penalty, closest_ref_length\n\nfrom deeppavlov.core.common.metrics_registry import register_metric\nfrom deeppavlov.metrics.google_bleu import compute_bleu\n\nSMOOTH = SmoothingFunction()\n\n\n@register_metric('bleu_advanced')\ndef bleu_advanced(y_true: List[Any], y_predicted: List[Any],\n                  weights: Tuple = (1,), smoothing_function=SMOOTH.method1,\n                  auto_reweigh=False, penalty=True) -> float:\n    \"\"\"Calculate BLEU score\n\n    Parameters:\n        y_true: list of reference tokens\n        y_predicted: list of query tokens\n        weights: n-gram weights\n        smoothing_function: SmoothingFunction\n        auto_reweigh: Option to re-normalize the weights uniformly\n        penalty: either enable brevity penalty or not\n\n    Return:\n        BLEU score\n    \"\"\"\n\n    bleu_measure = sentence_bleu([y_true], y_predicted, weights, smoothing_function, auto_reweigh)\n\n    hyp_len = len(y_predicted)\n    hyp_lengths = hyp_len\n    ref_lengths = closest_ref_length([y_true], hyp_len)\n\n    bpenalty = brevity_penalty(ref_lengths, hyp_lengths)\n\n    if penalty is True or bpenalty == 0:\n        return bleu_measure\n\n    return bleu_measure / 
bpenalty\n\n\n@register_metric('bleu')\ndef bleu(y_true, y_predicted):\n    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],\n                       [y_p.lower().split() for y_p in y_predicted])\n\n\n@register_metric('google_bleu')\ndef google_bleu(y_true, y_predicted):\n    return compute_bleu(([y_t.lower().split()] for y_t in y_true),\n                        (y_p.lower().split() for y_p in y_predicted))[0]\n\n\n@register_metric('per_item_bleu')\ndef per_item_bleu(y_true, y_predicted):\n    y_predicted = itertools.chain(*y_predicted)\n    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],\n                       [y_p.lower().split() for y_p in y_predicted])\n\n\n@register_metric('per_item_dialog_bleu')\ndef per_item_dialog_bleu(y_true, y_predicted):\n    y_true = (y['text'] for dialog in y_true for y in dialog)\n    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],\n                       [y.lower().split() for y_p in y_predicted for y in y_p])\n"
  },
  {
    "path": "deeppavlov/metrics/correlation.py",
    "content": "# Copyright 2020 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom scipy.stats import pearsonr, spearmanr\nfrom sklearn.metrics import matthews_corrcoef\n\nfrom deeppavlov.core.common.metrics_registry import register_metric\n\n\n@register_metric('pearson_correlation')\ndef pearson_correlation(y_true, y_predicted) -> float:\n    return pearsonr(y_predicted, y_true)[0]\n\n\n@register_metric('spearman_correlation')\ndef spearman_correlation(y_true, y_predicted) -> float:\n    return spearmanr(y_predicted, y_true)[0]\n\n\n@register_metric('matthews_correlation')\ndef matthews_correlation(y_true, y_predicted) -> float:\n    return matthews_corrcoef(y_true, y_predicted)\n"
  },
  {
    "path": "deeppavlov/metrics/elmo_metrics.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List\n\nimport numpy as np\n\nfrom deeppavlov.core.common.metrics_registry import register_metric\n\n\n@register_metric('elmo_loss2ppl')\ndef elmo_loss2ppl(losses: List[np.ndarray]) -> float:\n    \"\"\" Calculates perplexity by loss\n\n    Args:\n        losses: list of numpy arrays of model losses\n\n    Returns:\n        perplexity : float\n    \"\"\"\n    avg_loss = np.mean(losses)\n    return float(np.exp(avg_loss))\n"
  },
  {
    "path": "deeppavlov/metrics/fmeasure.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport itertools\nfrom collections import OrderedDict\nfrom itertools import chain\nfrom logging import getLogger\n\nimport numpy as np\nfrom sklearn.metrics import f1_score\n\nfrom deeppavlov.core.common.metrics_registry import register_metric\n\nlog = getLogger(__name__)\n\n\n@register_metric('ner_f1')\ndef ner_f1(y_true, y_predicted):\n    \"\"\"\n    Calculates F1 measure for Named Entity Recognition task.\n\n    Args:\n        y_true: list of true values\n        y_predicted: list of predicted values\n\n    Returns:\n        F1 score\n\n    Alias:\n        ner_f1\n    \"\"\"\n    y_true = list(chain(*y_true))\n    y_predicted = list(chain(*y_predicted))\n    results = precision_recall_f1(y_true,\n                                  y_predicted,\n                                  print_results=True)\n    f1 = results['__total__']['f1']\n    return f1\n\n\n@register_metric('ner_token_f1')\ndef ner_token_f1(y_true, y_predicted, print_results=False):\n    \"\"\"\n    Calculates F1 measure for Named Entity Recognition task without taking into account BIO or BIOES markup.\n\n    Args:\n        y_true: list of true values\n        y_predicted: list of predicted values\n        print_results: if True, then F1 score for each entity type is printed\n\n    Returns:\n        F1 score\n\n    Alias:\n        ner_f1\n    \"\"\"\n    y_true = 
list(chain(*y_true))\n    y_pred = list(chain(*y_predicted))\n\n    # Drop BIO or BIOES markup\n    assert all(len(tag.split('-')) <= 2 for tag in y_true)\n\n    y_true = [tag.split('-')[-1] for tag in y_true]\n    y_pred = [tag.split('-')[-1] for tag in y_pred]\n    tags = set(y_true) | set(y_pred)\n    tags_dict = {tag: n for n, tag in enumerate(tags)}\n\n    y_true_inds = np.array([tags_dict[tag] for tag in y_true])\n    y_pred_inds = np.array([tags_dict[tag] for tag in y_pred])\n\n    results = {}\n    for tag, tag_ind in tags_dict.items():\n        if tag == 'O':\n            continue\n        tp = np.sum((y_true_inds == tag_ind) & (y_pred_inds == tag_ind))\n        fn = np.sum((y_true_inds == tag_ind) & (y_pred_inds != tag_ind))\n        fp = np.sum((y_true_inds != tag_ind) & (y_pred_inds == tag_ind))\n        n_pred = np.sum(y_pred_inds == tag_ind)\n        n_true = np.sum(y_true_inds == tag_ind)\n        if tp + fp > 0:\n            precision = tp / (tp + fp) * 100\n        else:\n            precision = 0\n        if tp + fn > 0:\n            recall = tp / (tp + fn) * 100\n        else:\n            recall = 0\n        if precision + recall > 0:\n            f1 = 2 * precision * recall / (precision + recall)\n        else:\n            f1 = 0\n        results[tag] = {'precision': precision, 'recall': recall,\n                        'f1': f1, 'n_true': n_true, 'n_pred': n_pred,\n                        'tp': tp, 'fp': fp, 'fn': fn}\n\n    results['__total__'], accuracy, total_true_entities, total_predicted_entities, total_correct = _global_stats_f1(\n        results)\n    n_tokens = len(y_true)\n    if print_results:\n        log.debug('TOKEN LEVEL F1')\n        _print_conll_report(results, accuracy, total_true_entities, total_predicted_entities, n_tokens, total_correct)\n    return results['__total__']['f1']\n\n\ndef _print_conll_report(results, accuracy, total_true_entities, total_predicted_entities, n_tokens, total_correct,\n                        
short_report=False, entity_of_interest=None):\n    tags = list(results.keys())\n\n    s = 'processed {len} tokens ' \\\n        'with {tot_true} phrases; ' \\\n        'found: {tot_pred} phrases;' \\\n        ' correct: {tot_cor}.\\n\\n'.format(len=n_tokens,\n                                          tot_true=total_true_entities,\n                                          tot_pred=total_predicted_entities,\n                                          tot_cor=total_correct)\n\n    s += 'precision:  {tot_prec:.2f}%; ' \\\n         'recall:  {tot_recall:.2f}%; ' \\\n         'FB1:  {tot_f1:.2f}\\n\\n'.format(acc=accuracy,\n                                         tot_prec=results['__total__']['precision'],\n                                         tot_recall=results['__total__']['recall'],\n                                         tot_f1=results['__total__']['f1'])\n\n    if not short_report:\n        for tag in tags:\n            if entity_of_interest is not None:\n                if entity_of_interest in tag:\n                    s += '\\t' + tag + ': precision:  {tot_prec:.2f}%; ' \\\n                                      'recall:  {tot_recall:.2f}%; ' \\\n                                      'F1:  {tot_f1:.2f} ' \\\n                                      '{tot_predicted}\\n\\n'.format(tot_prec=results[tag]['precision'],\n                                                                   tot_recall=results[tag]['recall'],\n                                                                   tot_f1=results[tag]['f1'],\n                                                                   tot_predicted=results[tag]['n_pred'])\n            elif tag != '__total__':\n                s += '\\t' + tag + ': precision:  {tot_prec:.2f}%; ' \\\n                                  'recall:  {tot_recall:.2f}%; ' \\\n                                  'F1:  {tot_f1:.2f} ' \\\n                                  '{tot_predicted}\\n\\n'.format(tot_prec=results[tag]['precision'],\n             
                                                  tot_recall=results[tag]['recall'],\n                                                               tot_f1=results[tag]['f1'],\n                                                               tot_predicted=results[tag]['n_pred'])\n    elif entity_of_interest is not None:\n        s += '\\t' + entity_of_interest + ': precision:  {tot_prec:.2f}%; ' \\\n                                         'recall:  {tot_recall:.2f}%; ' \\\n                                         'F1:  {tot_f1:.2f} ' \\\n                                         '{tot_predicted}\\n\\n'.format(tot_prec=results[entity_of_interest]['precision'],\n                                                                      tot_recall=results[entity_of_interest]['recall'],\n                                                                      tot_f1=results[entity_of_interest]['f1'],\n                                                                      tot_predicted=results[entity_of_interest][\n                                                                          'n_pred'])\n    log.debug(s)\n\n\ndef _global_stats_f1(results):\n    total_true_entities = 0\n    total_predicted_entities = 0\n    total_precision = 0\n    total_recall = 0\n    total_f1 = 0\n    total_correct = 0\n    for tag in results:\n        if tag == '__total__':\n            continue\n\n        n_pred = results[tag]['n_pred']\n        n_true = results[tag]['n_true']\n        total_correct += results[tag]['tp']\n        total_true_entities += n_true\n        total_predicted_entities += n_pred\n        total_precision += results[tag]['precision'] * n_pred\n        total_recall += results[tag]['recall'] * n_true\n        total_f1 += results[tag]['f1'] * n_true\n    if total_true_entities > 0:\n        accuracy = total_correct / total_true_entities * 100\n        total_recall = total_recall / total_true_entities\n    else:\n        accuracy = 0\n        total_recall = 0\n    if 
total_predicted_entities > 0:\n        total_precision = total_precision / total_predicted_entities\n    else:\n        total_precision = 0\n\n    if total_precision + total_recall > 0:\n        total_f1 = 2 * total_precision * total_recall / (total_precision + total_recall)\n    else:\n        total_f1 = 0\n\n    total_res = {'n_predicted_entities': total_predicted_entities,\n                 'n_true_entities': total_true_entities,\n                 'precision': total_precision,\n                 'recall': total_recall,\n                 'f1': total_f1}\n    return total_res, accuracy, total_true_entities, total_predicted_entities, total_correct\n\n\n@register_metric('f1')\ndef round_f1(y_true, y_predicted):\n    \"\"\"\n    Calculates F1 (binary) measure.\n\n    Args:\n        y_true: list of true values\n        y_predicted: list of predicted values\n\n    Returns:\n        F1 score\n\n    Alias:\n        f1\n    \"\"\"\n    try:\n        predictions = [np.round(x) for x in y_predicted]\n    except TypeError:\n        if set(y_true) | set(y_predicted) in ({\"True\"}, {\"False\"}, {\"False\", \"True\"}):\n            y_true = [y == \"True\" for y in y_true]\n            predictions = [y == \"True\" for y in y_predicted]\n        else:\n            raise RuntimeError(f\"Unexpectible type for {y_true} and {predictions}\")\n\n    return f1_score(y_true, predictions)\n\n\n@register_metric('f1_macro')\ndef round_f1_macro(y_true, y_predicted):\n    \"\"\"\n    Calculates F1 macro measure.\n\n    Args:\n        y_true: list of true values\n        y_predicted: list of predicted values\n\n    Returns:\n        F1 score\n\n    Alias:\n        f1_macro\n    \"\"\"\n    try:\n        predictions = [np.round(x) for x in y_predicted]\n    except TypeError:\n        predictions = y_predicted\n\n    return f1_score(np.array(y_true), np.array(predictions), average=\"macro\")\n\n\n@register_metric('f1_weighted')\ndef round_f1_weighted(y_true, y_predicted):\n    \"\"\"\n    
Calculates F1 weighted measure.\n\n    Args:\n        y_true: list of true values\n        y_predicted: list of predicted values\n\n    Returns:\n        F1 score\n\n    Alias:\n        f1_weighted\n    \"\"\"\n    try:\n        predictions = [np.round(x) for x in y_predicted]\n    except TypeError:\n        predictions = y_predicted\n\n    return f1_score(np.array(y_true), np.array(predictions), average=\"weighted\")\n\n\ndef chunk_finder(current_token, previous_token, tag):\n    current_tag = current_token.split('-', 1)[-1]\n    previous_tag = previous_token.split('-', 1)[-1]\n    if previous_tag != tag:\n        previous_tag = 'O'\n    if current_tag != tag:\n        current_tag = 'O'\n\n    if current_tag != 'O' and (\n            previous_tag == 'O' or\n            previous_token in ['E-' + tag, 'L-' + tag, 'S-' + tag, 'U-' + tag] or\n            current_token in ['B-' + tag, 'S-' + tag, 'U-' + tag]\n    ):\n        create_chunk = True\n    else:\n        create_chunk = False\n\n    if previous_tag != 'O' and (\n            current_tag == 'O' or\n            previous_token in ['E-' + tag, 'L-' + tag, 'S-' + tag, 'U-' + tag] or\n            current_token in ['B-' + tag, 'S-' + tag, 'U-' + tag]\n    ):\n        pop_out = True\n    else:\n        pop_out = False\n    return create_chunk, pop_out\n\n\ndef precision_recall_f1(y_true, y_pred, print_results=True, short_report=False, entity_of_interest=None):\n    # Find all tags\n    tags = set()\n    for tag in itertools.chain(y_true, y_pred):\n        if tag != 'O':\n            current_tag = tag[2:]\n            tags.add(current_tag)\n    tags = sorted(list(tags))\n\n    results = OrderedDict()\n    for tag in tags:\n        results[tag] = OrderedDict()\n    results['__total__'] = OrderedDict()\n    n_tokens = len(y_true)\n    total_correct = 0\n    # Firstly we find all chunks in the ground truth and prediction\n    # For each chunk we write starting and ending indices\n\n    for tag in tags:\n        count = 0\n 
       true_chunk = []\n        pred_chunk = []\n        y_true = [str(y) for y in y_true]\n        y_pred = [str(y) for y in y_pred]\n        prev_tag_true = 'O'\n        prev_tag_pred = 'O'\n        while count < n_tokens:\n            yt = y_true[count]\n            yp = y_pred[count]\n\n            create_chunk_true, pop_out_true = chunk_finder(yt, prev_tag_true, tag)\n            if pop_out_true:\n                true_chunk[-1] = (true_chunk[-1], count - 1)\n            if create_chunk_true:\n                true_chunk.append(count)\n\n            create_chunk_pred, pop_out_pred = chunk_finder(yp, prev_tag_pred, tag)\n            if pop_out_pred:\n                pred_chunk[-1] = (pred_chunk[-1], count - 1)\n            if create_chunk_pred:\n                pred_chunk.append(count)\n            prev_tag_true = yt\n            prev_tag_pred = yp\n            count += 1\n\n        if len(true_chunk) > 0 and not isinstance(true_chunk[-1], tuple):\n            true_chunk[-1] = (true_chunk[-1], count - 1)\n        if len(pred_chunk) > 0 and not isinstance(pred_chunk[-1], tuple):\n            pred_chunk[-1] = (pred_chunk[-1], count - 1)\n\n        # Then we find all correctly classified intervals\n        # True positive results\n        tp = len(set(pred_chunk).intersection(set(true_chunk)))\n        # And then just calculate errors of the first and second kind\n        # False negative\n        fn = len(true_chunk) - tp\n        # False positive\n        fp = len(pred_chunk) - tp\n        if tp + fp > 0:\n            precision = tp / (tp + fp) * 100\n        else:\n            precision = 0\n        if tp + fn > 0:\n            recall = tp / (tp + fn) * 100\n        else:\n            recall = 0\n        if precision + recall > 0:\n            f1 = 2 * precision * recall / (precision + recall)\n        else:\n            f1 = 0\n        results[tag]['precision'] = precision\n        results[tag]['recall'] = recall\n        results[tag]['f1'] = f1\n        
results[tag]['n_pred'] = len(pred_chunk)\n        results[tag]['n_true'] = len(true_chunk)\n        results[tag]['tp'] = tp\n        results[tag]['fn'] = fn\n        results[tag]['fp'] = fp\n\n    results['__total__'], accuracy, total_true_entities, total_predicted_entities, accuracy = _global_stats_f1(results)\n    results['__total__']['n_pred'] = total_predicted_entities\n    results['__total__']['n_true'] = total_true_entities\n\n    if print_results:\n        s = 'processed {len} tokens ' \\\n            'with {tot_true} phrases; ' \\\n            'found: {tot_pred} phrases;' \\\n            ' correct: {tot_cor}.\\n\\n'.format(len=n_tokens,\n                                              tot_true=total_true_entities,\n                                              tot_pred=total_predicted_entities,\n                                              tot_cor=total_correct)\n\n        s += 'precision:  {tot_prec:.2f}%; ' \\\n             'recall:  {tot_recall:.2f}%; ' \\\n             'FB1:  {tot_f1:.2f}\\n\\n'.format(acc=accuracy,\n                                             tot_prec=results['__total__']['precision'],\n                                             tot_recall=results['__total__']['recall'],\n                                             tot_f1=results['__total__']['f1'])\n\n        if not short_report:\n            for tag in tags:\n                if entity_of_interest is not None:\n                    if entity_of_interest in tag:\n                        s += '\\t' + tag + ': precision:  {tot_prec:.2f}%; ' \\\n                                          'recall:  {tot_recall:.2f}%; ' \\\n                                          'F1:  {tot_f1:.2f} ' \\\n                                          '{tot_predicted}\\n\\n'.format(tot_prec=results[tag]['precision'],\n                                                                       tot_recall=results[tag]['recall'],\n                                                                       
tot_f1=results[tag]['f1'],\n                                                                       tot_predicted=results[tag]['n_pred'])\n                elif tag != '__total__':\n                    s += '\\t' + tag + ': precision:  {tot_prec:.2f}%; ' \\\n                                      'recall:  {tot_recall:.2f}%; ' \\\n                                      'F1:  {tot_f1:.2f} ' \\\n                                      '{tot_predicted}\\n\\n'.format(tot_prec=results[tag]['precision'],\n                                                                   tot_recall=results[tag]['recall'],\n                                                                   tot_f1=results[tag]['f1'],\n                                                                   tot_predicted=results[tag]['n_pred'])\n        elif entity_of_interest is not None:\n            s += '\\t' + entity_of_interest + ': precision:  {tot_prec:.2f}%; ' \\\n                                             'recall:  {tot_recall:.2f}%; ' \\\n                                             'F1:  {tot_f1:.2f} ' \\\n                                             '{tot_predicted}\\n\\n'.format(\n                tot_prec=results[entity_of_interest]['precision'],\n                tot_recall=results[entity_of_interest]['recall'],\n                tot_f1=results[entity_of_interest]['f1'],\n                tot_predicted=results[entity_of_interest]['n_pred'])\n        log.debug(s)\n    return results\n\n\n@register_metric(\"average__ner_f1__f1_macro__f1\")\ndef ner_f1__f1_macro__f1(ner_true, ner_pred, macro_true, macro_pred, f1_true, f1_pred):\n    ner_f1_res = ner_f1(ner_true, ner_pred) / 100\n    f1_macro_res = round_f1_macro(macro_true, macro_pred)\n    f1_res = round_f1(f1_true, f1_pred)\n    return (ner_f1_res + f1_macro_res + f1_res) / 3\n\n\n@register_metric(\"average__roc_auc__roc_auc__ner_f1\")\ndef roc_auc__roc_auc__ner_f1(true_onehot1, pred_probas1, true_onehot2, pred_probas2, ner_true3, ner_pred3):\n    from 
.roc_auc_score import roc_auc_score\n    roc_auc1 = roc_auc_score(true_onehot1, pred_probas1)\n    roc_auc2 = roc_auc_score(true_onehot2, pred_probas2)\n    ner_f1_3 = ner_f1(ner_true3, ner_pred3) / 100\n    return (roc_auc1 + roc_auc2 + ner_f1_3) / 3\n"
  },
  {
    "path": "deeppavlov/metrics/google_bleu.py",
    "content": "# Copyright 2017 Google Inc. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# ==============================================================================\n\n\"\"\"Python implementation of BLEU and smooth-BLEU.\n\nThis module provides a Python implementation of BLEU and smooth-BLEU.\nSmooth BLEU is computed following the method outlined in the paper:\nChin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic\nevaluation metrics for machine translation. 
COLING 2004.\n\"\"\"\n\nimport collections\nimport math\n\n\ndef _get_ngrams(segment, max_order):\n    \"\"\"Extracts all n-grams upto a given maximum order from an input segment.\n\n    Args:\n      segment: text segment from which n-grams will be extracted.\n      max_order: maximum length in tokens of the n-grams returned by this\n          methods.\n\n    Returns:\n      The Counter containing all n-grams upto max_order in segment\n      with a count of how many times each n-gram occurred.\n    \"\"\"\n    ngram_counts = collections.Counter()\n    for order in range(1, max_order + 1):\n        for i in range(0, len(segment) - order + 1):\n            ngram = tuple(segment[i:i + order])\n            ngram_counts[ngram] += 1\n    return ngram_counts\n\n\ndef compute_bleu(reference_corpus, translation_corpus, max_order=4,\n                 smooth=False):\n    \"\"\"Computes BLEU score of translated segments against one or more references.\n\n    Args:\n      reference_corpus: list of lists of references for each translation. Each\n          reference should be tokenized into a list of tokens.\n      translation_corpus: list of translations to score. Each translation\n          should be tokenized into a list of tokens.\n      max_order: Maximum n-gram order to use when computing BLEU score.\n      smooth: Whether or not to apply Lin et al. 
2004 smoothing.\n\n    Returns:\n      6-Tuple with the BLEU score, n-gram precisions, brevity penalty,\n      translation-to-reference length ratio, translation length and\n      reference length.\n    \"\"\"\n    matches_by_order = [0] * max_order\n    possible_matches_by_order = [0] * max_order\n    reference_length = 0\n    translation_length = 0\n    for (references, translation) in zip(reference_corpus,\n                                         translation_corpus):\n        reference_length += min(len(r) for r in references)\n        translation_length += len(translation)\n\n        merged_ref_ngram_counts = collections.Counter()\n        for reference in references:\n            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)\n        translation_ngram_counts = _get_ngrams(translation, max_order)\n        overlap = translation_ngram_counts & merged_ref_ngram_counts\n        for ngram in overlap:\n            matches_by_order[len(ngram) - 1] += overlap[ngram]\n        for order in range(1, max_order + 1):\n            possible_matches = len(translation) - order + 1\n            if possible_matches > 0:\n                possible_matches_by_order[order - 1] += possible_matches\n\n    precisions = [0] * max_order\n    for i in range(0, max_order):\n        if smooth:\n            precisions[i] = ((matches_by_order[i] + 1.) /\n                             (possible_matches_by_order[i] + 1.))\n        else:\n            if possible_matches_by_order[i] > 0:\n                precisions[i] = (float(matches_by_order[i]) /\n                                 possible_matches_by_order[i])\n            else:\n                precisions[i] = 0.0\n\n    if min(precisions) > 0:\n        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)\n        geo_mean = math.exp(p_log_sum)\n    else:\n        geo_mean = 0\n\n    ratio = float(translation_length) / reference_length\n\n    if ratio > 1.0:\n        bp = 1.\n    else:\n        bp = math.exp(1 - 1. 
/ ratio)\n\n    bleu = geo_mean * bp\n\n    return (bleu, precisions, bp, ratio, translation_length, reference_length)\n"
  },
  {
    "path": "deeppavlov/metrics/log_loss.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom typing import List, Union\n\nimport numpy as np\nfrom sklearn.metrics import log_loss\n\nfrom deeppavlov.core.common.metrics_registry import register_metric\n\n\n@register_metric('log_loss')\ndef sk_log_loss(y_true: Union[List[List[float]], List[List[int]], np.ndarray],\n                y_predicted: Union[List[List[float]], List[List[int]], np.ndarray]) -> float:\n    \"\"\"\n    Calculates log loss.\n\n    Args:\n        y_true: list or array of true values\n        y_predicted: list or array of predicted values\n\n    Returns:\n        Log loss\n\n    Alias:\n        log_loss\n    \"\"\"\n    return log_loss(y_true, y_predicted)\n"
  },
  {
    "path": "deeppavlov/metrics/mse.py",
    "content": "# Copyright 2020 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\nfrom sklearn.metrics import mean_squared_error\nfrom typing import Union\n\nfrom deeppavlov.core.common.metrics_registry import register_metric\n\n\n@register_metric('mean_squared_error')\ndef mse(y_true: Union[np.array, list],\n        y_predicted: Union[np.array, list],\n        *args,\n        **kwargs) -> float:\n    \"\"\"\n    Calculates mean squared error.\n    Args:\n        y_true: list of true values\n        y_predicted: list of predicted values\n    Returns:\n        float: Mean squared error\n    \"\"\"\n    for value in [y_true, y_predicted]:\n        assert (np.isfinite(value).all())\n    return mean_squared_error(y_true, y_predicted, *args, **kwargs)\n"
  },
  {
    "path": "deeppavlov/metrics/recall_at_k.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom typing import List\n\nimport numpy as np\n\nfrom deeppavlov.core.common.metrics_registry import register_metric\n\n\ndef recall_at_k(y_true: List[int], y_pred: List[List[np.ndarray]], k: int):\n    \"\"\"\n    Calculates recall at k ranking metric.\n\n    Args:\n        y_true: Labels. Not used in the calculation of the metric.\n        y_predicted: Predictions.\n            Each prediction contains ranking score of all ranking candidates for the particular data sample.\n            It is supposed that the ranking score for the true candidate goes first in the prediction.\n\n    Returns:\n        Recall at k\n    \"\"\"\n    num_examples = float(len(y_pred))\n    predictions = np.array(y_pred)\n    predictions = np.flip(np.argsort(predictions, -1), -1)[:, :k]\n    num_correct = 0\n    for el in predictions:\n        if 0 in el:\n            num_correct += 1\n    return float(num_correct) / num_examples\n\n\n@register_metric('r@1')\ndef r_at_1(y_true, y_pred):\n    return recall_at_k(y_true, y_pred, k=1)\n\n\n@register_metric('r@2')\ndef r_at_2(y_true, y_pred):\n    return recall_at_k(y_true, y_pred, k=2)\n\n\n@register_metric('r@5')\ndef r_at_5(labels, predictions):\n    return recall_at_k(labels, predictions, k=5)\n\n\n@register_metric('r@10')\ndef r_at_10(labels, predictions):\n    return recall_at_k(labels, predictions, 
k=10)\n"
  },
  {
    "path": "deeppavlov/metrics/record_metrics.py",
    "content": "import re\nimport string\nimport collections\nfrom typing import List\n\nimport numpy as np\n\nfrom deeppavlov.models.preprocessors.torch_transformers_preprocessor import RecordNestedExample\nfrom deeppavlov.core.common.metrics_registry import register_metric\n\n\n@register_metric(\"record_f1_score\")\ndef record_f1_score(record_examples: List[RecordNestedExample]):\n    \"\"\"Calculate F1 score for given nested ReCoRD examples\n\n    Args:\n        record_examples: processed ReCoRD examples\n\n    Returns:\n        float: F1 score\n    \"\"\"\n    if not record_examples:\n        return 0.\n    f1_scores = []\n    for example in record_examples:\n        example_f1s = []\n        for answer in example.answers:\n            example_f1s.append(exact_match_score(example.prediction, answer))\n        if example_f1s:\n            f1_scores.append(max(example_f1s))\n    return np.mean(f1_scores)\n\n\n@register_metric(\"record_em_score\")\ndef record_em_score(record_examples: List[RecordNestedExample]):\n    \"\"\"Calculate Exact Match score for given nested ReCoRD examples\n\n    Args:\n        record_examples: processed ReCoRD examples\n\n    Returns:\n        float: Exact Match score\n    \"\"\"\n    if not record_examples:\n        return 0.\n    em_scores = []\n    for example in record_examples:\n        example_ems = []\n        for answer in example.answers:\n            example_ems.append(string_f1_score(example.prediction, answer))\n        if example_ems:\n            em_scores.append(max(example_ems))\n    return np.mean(em_scores) if em_scores else -1\n\n\ndef normalize_answer(s):\n    \"\"\"Lower text and remove punctuation, articles and extra whitespace.\n    From official ReCoRD eval script\n    \"\"\"\n\n    def remove_articles(text):\n        return re.sub(r\"\\b(a|an|the)\\b\", \" \", text)\n\n    def white_space_fix(text):\n        return \" \".join(text.split())\n\n    def remove_punc(text):\n        exclude = 
set(string.punctuation)\n        return \"\".join(ch for ch in text if ch not in exclude)\n\n    def lower(text):\n        return text.lower()\n\n    return white_space_fix(remove_articles(remove_punc(lower(s))))\n\n\ndef string_f1_score(prediction, ground_truth):\n    \"\"\"Compute normalized token level F1\n    From official ReCoRD eval script\n    \"\"\"\n    prediction_tokens = normalize_answer(prediction).split()\n    ground_truth_tokens = normalize_answer(ground_truth).split()\n    common = collections.Counter(prediction_tokens) & collections.Counter(ground_truth_tokens)\n    num_same = sum(common.values())\n    if num_same == 0:\n        return 0\n    precision = 1.0 * num_same / len(prediction_tokens)\n    recall = 1.0 * num_same / len(ground_truth_tokens)\n    f1 = (2 * precision * recall) / (precision + recall)\n    return f1\n\n\ndef exact_match_score(prediction, ground_truth):\n    \"\"\"Compute normalized exact match\n    From official ReCoRD eval script\n    \"\"\"\n    return normalize_answer(prediction) == normalize_answer(ground_truth)\n"
  },
  {
    "path": "deeppavlov/metrics/roc_auc_score.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom typing import List, Union\n\nimport numpy as np\nimport sklearn.metrics\n\nfrom deeppavlov.core.common.metrics_registry import register_metric\n\n\n@register_metric('roc_auc')\ndef roc_auc_score(y_true: Union[List[List[float]], List[List[int]], np.ndarray],\n                  y_pred: Union[List[List[float]], List[List[int]], np.ndarray]) -> float:\n    \"\"\"\n    Compute Area Under the Curve (AUC) from prediction scores.\n\n    Args:\n        y_true: true binary labels\n        y_pred: target scores, can either be probability estimates of the positive class\n\n    Returns:\n        Area Under the Curve (AUC) from prediction scores\n\n    Alias:\n        roc_auc\n    \"\"\"\n    try:\n        return sklearn.metrics.roc_auc_score(np.squeeze(np.array(y_true)),\n                                             np.squeeze(np.array(y_pred)), average=\"macro\")\n    except ValueError:\n        return 0.\n"
  },
  {
    "path": "deeppavlov/metrics/squad_metrics.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nimport string\nfrom collections import Counter\nfrom typing import List\n\nfrom deeppavlov.core.common.metrics_registry import register_metric\n\n\n@register_metric('squad_v2_em')\ndef squad_v2_exact_match(y_true: List[List[str]], y_predicted: List[str]) -> float:\n    \"\"\" Calculates Exact Match score between y_true and y_predicted\n        EM score uses the best matching y_true answer:\n            if y_pred equal at least to one answer in y_true then EM = 1, else EM = 0\n\n    The same as in SQuAD-v2.0\n\n    Args:\n        y_true: list of correct answers (correct answers are represented by list of strings)\n        y_predicted: list of predicted answers\n\n    Returns:\n        exact match score : float\n    \"\"\"\n    EM_total = sum(normalize_answer(prediction) in map(normalize_answer, ground_truth)\n                   for ground_truth, prediction in zip(y_true, y_predicted))\n    return 100 * EM_total / len(y_true) if len(y_true) > 0 else 0\n\n\n@register_metric('squad_v1_em')\ndef squad_v1_exact_match(y_true: List[List[str]], y_predicted: List[str]) -> float:\n    \"\"\" Calculates Exact Match score between y_true and y_predicted\n        EM score uses the best matching y_true answer:\n            if y_pred equal at least to one answer in y_true then EM = 1, else EM = 0\n        Skips examples without an answer.\n  
  Args:\n        y_true: list of correct answers (correct answers are represented by list of strings)\n        y_predicted: list of predicted answers\n    Returns:\n        exact match score : float\n    \"\"\"\n    EM_total = 0\n    count = 0\n    for ground_truth, prediction in zip(y_true, y_predicted):\n        if len(ground_truth[0]) == 0:\n            # skip empty answers\n            continue\n        count += 1\n        EMs = [int(normalize_answer(gt) == normalize_answer(prediction)) for gt in ground_truth]\n        EM_total += max(EMs)\n    return 100 * EM_total / count if count > 0 else 0\n\n\n@register_metric('squad_v2_f1')\ndef squad_v2_f1(y_true: List[List[str]], y_predicted: List[str]) -> float:\n    \"\"\" Calculates F-1 score between y_true and y_predicted\n        F-1 score uses the best matching y_true answer\n\n    The same as in SQuAD-v2.0\n\n    Args:\n        y_true: list of correct answers (correct answers are represented by list of strings)\n        y_predicted: list of predicted answers\n\n    Returns:\n        F-1 score : float\n    \"\"\"\n    f1_total = 0.0\n    for ground_truth, prediction in zip(y_true, y_predicted):\n        prediction_tokens = normalize_answer(prediction).split()\n        f1s = []\n        for gt in ground_truth:\n            gt_tokens = normalize_answer(gt).split()\n            if len(gt_tokens) == 0 or len(prediction_tokens) == 0:\n                f1s.append(float(gt_tokens == prediction_tokens))\n                continue\n            common = Counter(prediction_tokens) & Counter(gt_tokens)\n            num_same = sum(common.values())\n            if num_same == 0:\n                f1s.append(0.0)\n                continue\n            precision = 1.0 * num_same / len(prediction_tokens)\n            recall = 1.0 * num_same / len(gt_tokens)\n            f1 = (2 * precision * recall) / (precision + recall)\n            f1s.append(f1)\n        f1_total += max(f1s)\n    return 100 * f1_total / len(y_true) if len(y_true) 
> 0 else 0\n\n\n@register_metric('squad_v1_f1')\ndef squad_v1_f1(y_true: List[List[str]], y_predicted: List[str]) -> float:\n    \"\"\" Calculates F-1 score between y_true and y_predicted\n        F-1 score uses the best matching y_true answer\n\n        Skips examples without an answer.\n    Args:\n        y_true: list of correct answers (correct answers are represented by list of strings)\n        y_predicted: list of predicted answers\n    Returns:\n        F-1 score : float\n    \"\"\"\n    f1_total = 0.0\n    count = 0\n    for ground_truth, prediction in zip(y_true, y_predicted):\n        if len(ground_truth[0]) == 0:\n            # skip empty answers\n            continue\n        count += 1\n        prediction_tokens = normalize_answer(prediction).split()\n        f1s = []\n        for gt in ground_truth:\n            gt_tokens = normalize_answer(gt).split()\n            common = Counter(prediction_tokens) & Counter(gt_tokens)\n            num_same = sum(common.values())\n            if num_same == 0:\n                f1s.append(0.0)\n                continue\n            precision = 1.0 * num_same / len(prediction_tokens)\n            recall = 1.0 * num_same / len(gt_tokens)\n            f1 = (2 * precision * recall) / (precision + recall)\n            f1s.append(f1)\n        f1_total += max(f1s)\n    return 100 * f1_total / count if count > 0 else 0\n\n\ndef normalize_answer(s: str) -> str:\n    def remove_articles(text):\n        return re.sub(r'\\b(a|an|the)\\b', ' ', text)\n\n    def white_space_fix(text):\n        return ' '.join(text.split())\n\n    def remove_punc(text):\n        exclude = set(string.punctuation)\n        return ''.join(ch for ch in text if ch not in exclude)\n\n    def lower(text):\n        return text.lower()\n\n    return white_space_fix(remove_articles(remove_punc(lower(s))))\n"
  },
  {
    "path": "deeppavlov/models/__init__.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\n\nimport nltk\n\nfrom deeppavlov.core.common.prints import RedirectedPrints\n\nif not os.environ.get('DP_SKIP_NLTK_DOWNLOAD'):\n    with RedirectedPrints():\n        nltk.download('punkt', quiet=True)\n        nltk.download('stopwords', quiet=True)\n        nltk.download('perluniprops', quiet=True)\n        nltk.download('nonbreaking_prefixes', quiet=True)\n"
  },
  {
    "path": "deeppavlov/models/api_requester/__init__.py",
    "content": "from .api_requester import *\n"
  },
  {
    "path": "deeppavlov/models/api_requester/api_requester.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport asyncio\nfrom typing import Any, List, Dict, AsyncIterable\n\nimport requests\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\n\n@register('api_requester')\nclass ApiRequester(Component):\n    \"\"\"Component for forwarding parameters to APIs\n\n    Args:\n        url: url of the API.\n        out: count of expected returned values or their names in a chainer.\n        param_names: list of parameter names for API requests.\n        debatchify: if ``True``, single instances will be sent to the API endpoint instead of batches.\n\n    Attributes:\n        url: url of the API.\n        out_count: count of expected returned values.\n        param_names: list of parameter names for API requests.\n        debatchify: if True, single instances will be sent to the API endpoint instead of batches.\n    \"\"\"\n\n    def __init__(self, url: str, out: [int, list], param_names: [list, tuple] = None, debatchify: bool = False,\n                 *args, **kwargs):\n        self.url = url\n        if param_names is None:\n            param_names = kwargs.get('in', ())\n        self.param_names = param_names\n        self.out_count = out if isinstance(out, int) else len(out)\n        self.debatchify = debatchify\n\n    def __call__(self, *args: List[Any], **kwargs: Dict[str, 
Any]):\n        \"\"\"\n\n        Args:\n            *args: list of parameters sent to the API endpoint. Parameter names are taken from self.param_names.\n            **kwargs: named parameters to send to the API endpoint. If not empty, args are ignored\n\n        Returns:\n            result of the API request(s)\n        \"\"\"\n        data = kwargs or dict(zip(self.param_names, args))\n\n        if self.debatchify:\n            batch_size = 0\n            for v in data.values():\n                batch_size = len(v)\n                break\n\n            assert batch_size > 0\n\n            async def collect():\n                return [j async for j in self.get_async_response(data, batch_size)]\n\n            loop = asyncio.get_event_loop()\n            response = loop.run_until_complete(collect())\n            if self.out_count > 1:\n                response = list(zip(*response))\n        else:\n            response = requests.post(self.url, json=data).json()\n\n        return response\n\n    async def get_async_response(self, data: dict, batch_size: int) -> AsyncIterable:\n        \"\"\"Helper function for sending requests asynchronously if the API endpoint does not support batching\n\n        Args:\n            data: data to be passed to the API endpoint\n            batch_size: requests count\n\n        Yields:\n            requests results parsed as json\n        \"\"\"\n        loop = asyncio.get_event_loop()\n        futures = [\n            loop.run_in_executor(\n                None,\n                requests.post,\n                self.url,\n                None,\n                {k: v[i] for k, v in data.items()}\n            )\n            for i in range(batch_size)\n        ]\n        for r in await asyncio.gather(*futures):\n            yield r.json()\n"
  },
  {
    "path": "deeppavlov/models/api_requester/api_router.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport concurrent\nfrom concurrent.futures import ProcessPoolExecutor\nfrom logging import getLogger\nfrom typing import List\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.models.api_requester import ApiRequester\n\nlogger = getLogger(__name__)\n\n\n@register(\"api_router\")\nclass ApiRouter(Component):\n    \"\"\"A helper class for running multiple API requesters on the same data in parallel\n\n    Args:\n        api_requesters: list of ApiRequester objects\n        n_workers: The maximum number of subprocesses to run\n\n    Attributes:\n        api_requesters: list of ApiRequester objects\n        n_workers: The maximum number of subprocesses to run\n    \"\"\"\n\n    def __init__(self, api_requesters: List[ApiRequester], n_workers: int = 1, *args, **kwargs):\n        self.api_requesters = api_requesters\n        self.n_workers = n_workers\n\n    def __call__(self, *args):\n        \"\"\"\n\n        Args:\n            *args: list of arguments to forward to the API requesters\n\n        Returns:\n            results of the requests\n        \"\"\"\n        with ProcessPoolExecutor(self.n_workers) as executor:\n            futures = [executor.submit(api_requester, *args) for api_requester\n                       in\n                       
self.api_requesters]\n\n            concurrent.futures.wait(futures)\n            results = []\n            for future, api_requester in zip(futures, self.api_requesters):\n                result = future.result()\n                if api_requester.out_count > 1:\n                    results += result\n                else:\n                    results.append(result)\n\n        return results\n"
  },
  {
    "path": "deeppavlov/models/classifiers/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/classifiers/cos_sim_classifier.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, softwaredata\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom logging import getLogger\nfrom typing import List, Tuple, Union\n\nimport numpy as np\nfrom scipy.sparse import vstack, csr_matrix\nfrom scipy.sparse.linalg import norm as sparse_norm\n\nfrom deeppavlov.core.common.file import load_pickle\nfrom deeppavlov.core.common.file import save_pickle\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.estimator import Estimator\nfrom deeppavlov.core.models.serializable import Serializable\n\nlogger = getLogger(__name__)\n\n\n@register(\"cos_sim_classifier\")\nclass CosineSimilarityClassifier(Estimator, Serializable):\n    \"\"\"\n    Classifier based on cosine similarity between vectorized sentences\n\n    Parameters:\n        save_path: path to save the model\n        load_path: path to load the model\n    \"\"\"\n\n    def __init__(self, top_n: int = 1, save_path: str = None, load_path: str = None, **kwargs) -> None:\n        super().__init__(save_path=save_path, load_path=load_path, **kwargs)\n        self.top_n = top_n\n\n        self.x_train_features = self.y_train = None\n\n        if kwargs['mode'] != 'train':\n            self.load()\n\n    def __call__(self, q_vects: Union[csr_matrix, List]) -> Tuple[List[str], List[int]]:\n        \"\"\"Found most similar answer for input vectorized question\n\n        Parameters:\n            q_vects: 
vectorized questions\n\n        Returns:\n            Tuple of Answer and Score\n        \"\"\"\n\n        if isinstance(q_vects[0], csr_matrix):\n            q_norm = sparse_norm(q_vects)\n            if q_norm == 0.0:\n                cos_similarities = np.zeros((q_vects.shape[0], self.x_train_features.shape[0]))\n            else:\n                norm = q_norm * sparse_norm(self.x_train_features, axis=1)\n                cos_similarities = np.array(q_vects.dot(self.x_train_features.T).todense())\n                cos_similarities = cos_similarities / norm\n        elif isinstance(q_vects[0], np.ndarray):\n            q_vects = np.array(q_vects)\n            self.x_train_features = np.array(self.x_train_features)\n            norm = np.linalg.norm(q_vects) * np.linalg.norm(self.x_train_features, axis=1)\n            cos_similarities = q_vects.dot(self.x_train_features.T) / norm\n        elif q_vects[0] is None:\n            cos_similarities = np.zeros(len(self.x_train_features))\n        else:\n            raise NotImplementedError('Not implemented this type of vectors')\n\n        # get cosine similarity for each class\n        y_labels = np.unique(self.y_train)\n        labels_scores = np.zeros((len(cos_similarities), len(y_labels)))\n        for i, label in enumerate(y_labels):\n            labels_scores[:, i] = np.max([cos_similarities[:, i]\n                                          for i, value in enumerate(self.y_train) if value == label], axis=0)\n\n        labels_scores_sum = labels_scores.sum(axis=1, keepdims=True)\n        labels_scores = np.divide(labels_scores, labels_scores_sum,\n                                  out=np.zeros_like(labels_scores), where=(labels_scores_sum != 0))\n\n        answer_ids = np.argsort(labels_scores)[:, -self.top_n:]\n\n        # generate top_n answers and scores\n        answers = []\n        scores = []\n        for i in range(len(answer_ids)):\n            answers.extend([y_labels[id] for id in answer_ids[i, ::-1]])\n   
         scores.extend([np.round(labels_scores[i, id], 2) for id in answer_ids[i, ::-1]])\n\n        return answers, scores\n\n    def fit(self, x_train_vects: Tuple[Union[csr_matrix, List]], y_train: Tuple[str]) -> None:\n        \"\"\"Train classifier\n\n        Parameters:\n            x_train_vects: vectorized question for train dataset\n            y_train: answers for train dataset\n\n        Returns:\n            None\n        \"\"\"\n        if isinstance(x_train_vects, tuple):\n            if len(x_train_vects) != 0:\n                if isinstance(x_train_vects[0], csr_matrix):\n                    self.x_train_features = vstack(list(x_train_vects))\n                elif isinstance(x_train_vects[0], np.ndarray):\n                    self.x_train_features = np.vstack(list(x_train_vects))\n                else:\n                    raise NotImplementedError('Not implemented this type of vectors')\n            else:\n                raise ValueError(\"Train vectors can't be empty\")\n        else:\n            self.x_train_features = x_train_vects\n\n        self.y_train = list(y_train)\n\n    def save(self) -> None:\n        \"\"\"Save classifier parameters\"\"\"\n        logger.info(\"Saving faq_model to {}\".format(self.save_path))\n        save_pickle((self.x_train_features, self.y_train), self.save_path)\n\n    def load(self) -> None:\n        \"\"\"Load classifier parameters\"\"\"\n        logger.debug(\"Loading faq_model from {}\".format(self.load_path))\n        self.x_train_features, self.y_train = load_pickle(self.load_path)\n"
  },
  {
    "path": "deeppavlov/models/classifiers/dnnc_proba2labels.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom typing import List\n\nimport numpy as np\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\nlog = getLogger(__name__)\n\n\n@register('dnnc_proba2labels')\nclass Proba2Labels(Component):\n    \"\"\"\n    Converts pairwise simmilarity scores into class label\n    \n    Args:\n        confidence_threshold: used to determine whether example belongs to one \n                              of the classes in 'y_support' or not\n        pooling: strategy for averaging similarity scores for each label\n        is_binary: determines whether the similarity is a number or a probability vector\n    \"\"\"\n\n    def __init__(self,\n                 confidence_threshold: float = 0.0,\n                 pooling: str = 'max',\n                 is_binary: bool = True,\n                 **kwargs) -> None:\n\n        self.confidence_threshold = confidence_threshold\n        self.pooling = pooling\n        self.is_binary = is_binary\n\n    def __call__(self,\n                 simmilarity_scores: List[float],\n                 x: List[str],\n                 x_populated: List[str],\n                 x_support: List[str],\n                 y_support: List[str]\n                ) -> List[str]:\n\n        y_pred = []\n\n        simmilarity_scores = 
np.array(simmilarity_scores)\n        x_populated = np.array(x_populated)\n        x_support = np.array(x_support)\n        y_support = np.array(y_support)\n        unique_labels = np.unique(y_support)\n\n        # Transform probability vector into a similarity score\n        if not self.is_binary:\n            simmilarity_scores = simmilarity_scores[:, 1]\n\n        for example in x:\n            example_mask = np.where(np.logical_xor(x_populated == example, x_support == example))\n            example_simmilarity_scores = simmilarity_scores[example_mask]\n            example_y_support = y_support[example_mask]\n\n            probability_by_label = []\n            for label in unique_labels:\n                label_mask = np.where(example_y_support == label)\n                label_simmilarity_scores = example_simmilarity_scores[label_mask]\n                if self.pooling == 'avg':\n                    label_probability = np.mean(label_simmilarity_scores)\n                elif self.pooling == 'max':\n                    label_probability = np.max(label_simmilarity_scores)\n                probability_by_label.append(label_probability)\n\n            probability_by_label = np.array(probability_by_label)\n            max_probability = max(probability_by_label)\n            max_probability_label = unique_labels[np.argmax(probability_by_label)]\n            prediction = \"oos\" if max_probability < self.confidence_threshold else max_probability_label\n\n            y_pred.append(prediction)\n\n        return y_pred\n"
  },
  {
    "path": "deeppavlov/models/classifiers/proba2labels.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\n\nimport numpy as np\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\nlog = getLogger(__name__)\n\n\n@register('proba2labels')\nclass Proba2Labels(Component):\n    \"\"\"\n    Class implements probability to labels processing using the following ways: \\\n     choosing one or top_n indices with maximal probability or choosing any number of indices \\\n      which probabilities to belong with are higher than given confident threshold\n\n    Args:\n        max_proba: whether to choose label with maximal probability\n        confidence_threshold: boundary probability value for sample to belong with the class (best use for multi-label)\n        top_n: how many top labels with the highest probabilities to return\n\n    Attributes:\n        max_proba: whether to choose label with maximal probability\n        confidence_threshold: boundary probability value for sample to belong with the class (best use for multi-label)\n        top_n: how many top labels with the highest probabilities to return\n    \"\"\"\n\n    def __init__(self,\n                 max_proba: bool = None,\n                 confidence_threshold: float = None,\n                 top_n: int = None,\n                 is_binary: 
bool = False,\n                 **kwargs) -> None:\n        \"\"\" Initialize class with given parameters\"\"\"\n\n        self.max_proba = max_proba\n        self.confidence_threshold = confidence_threshold\n        self.top_n = top_n\n        self.is_binary = is_binary\n\n    def __call__(self,\n                 *args,\n                 **kwargs):\n        \"\"\"\n        Process probabilities to labels\n        Args:\n            Every argument is a list of vectors with probability distribution\n        Returns:\n            list of labels (only label classification) or list of lists of labels (multi-label classification),\n            or list of the following lists (in multitask setting) for every argument\n        \"\"\"\n        answer = []\n        log.debug(f'input {args}')\n        for data in args:\n            if all([k is None for k in data]):\n                answer.append([])\n            elif self.confidence_threshold:\n                if self.is_binary:\n                    answer.append([int(el > self.confidence_threshold) for el in data])\n                else:\n                    answer.append([list(np.where(np.array(d) > self.confidence_threshold)[0]) for d in data])\n            elif self.max_proba:\n                answer.append([np.argmax(d) for d in data])\n            elif self.top_n:\n                answer.append([np.argsort(d)[::-1][:self.top_n] for d in data])\n            else:\n                raise ConfigError(\"Proba2Labels requires one of three arguments: bool `max_proba` or \"\n                                  \"float `confidence_threshold` for multi-label classification or\"\n                                  \"integer `top_n` for choosing several labels with the highest probabilities\")\n        if len(args) == 1:  # only one argument\n            answer = answer[0]\n        log.debug(f'output {answer}')\n        return answer\n"
  },
  {
    "path": "deeppavlov/models/classifiers/re_bert.py",
    "content": "import logging\nfrom pathlib import Path\nfrom typing import Tuple, Union, Any, List\n\nimport torch\nfrom torch import Tensor\nimport torch.nn as nn\nfrom opt_einsum import contract\nfrom transformers import AutoConfig, BertModel, BertTokenizer\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.models.relation_extraction.losses import ATLoss\n\nlog = logging.getLogger(__name__)\n\n\nclass BertWithAdaThresholdLocContextPooling(nn.Module):\n\n    def __init__(\n            self,\n            n_classes: int = 97,\n            pretrained_bert: str = None,\n            bert_tokenizer_config_file: str = None,\n            bert_config_file: str = None,\n            emb_size: int = 768,\n            block_size: int = 8,       # 64\n            num_ner_tags: int = 6,        # number of ner tags\n            threshold: float = None,\n            device: str = \"gpu\"\n    ):\n        super().__init__()\n        self.n_classes = n_classes\n        self.pretrained_bert = pretrained_bert\n        self.bert_config_file = bert_config_file\n        self.num_ner_tags = num_ner_tags\n        self.emb_size = emb_size\n        self.block_size = block_size\n        self.threshold = threshold\n\n        self.loss_fnt = ATLoss()\n        self.device = torch.device(\"cuda\" if torch.cuda.is_available() and device == \"gpu\" else \"cpu\")\n\n        # initialize parameters that would be filled later\n        self.model, self.config, self.bert_config = None, None, None\n        self.load()\n\n        # initialize tokenizer to call resize_token_embeddings function for model with increased tokenizer size (due to\n        # the additional <ENT> token) and get CLS and SEP token ids\n        if Path(bert_tokenizer_config_file).is_file():\n            vocab_file = str(expand_path(bert_tokenizer_config_file))\n            self.tokenizer = BertTokenizer(vocab_file=vocab_file)\n        else:\n            
tokenizer = BertTokenizer.from_pretrained(pretrained_bert)\n        self.model.resize_token_embeddings(len(tokenizer) + 1)\n        self.cls_token_id = tokenizer.cls_token_id\n        self.sep_token_id = tokenizer.sep_token_id\n\n        self.hidden_size = self.config.hidden_size\n        self.head_extractor = nn.Linear(2 * self.hidden_size + self.num_ner_tags, self.emb_size)\n        self.tail_extractor = nn.Linear(2 * self.hidden_size + self.num_ner_tags, self.emb_size)\n        self.bilinear = nn.Linear(self.emb_size * self.block_size, self.n_classes)\n\n    def forward(\n            self,\n            input_ids: Tensor,\n            attention_mask: Tensor,\n            entity_pos: List,\n            ner_tags: List,\n            labels: List = None\n    ) -> Union[Tuple[Any, Tensor], Tuple[Tensor]]:\n\n        if labels:\n            curr_threshold = None       # for training: no set threshold but adaptive one\n        else:\n            curr_threshold = self.threshold     # for development and test: threshold set in config\n\n        output = self.model(input_ids=input_ids, attention_mask=attention_mask)\n        sequence_output = output[0]  # Tensor (batch_size x input_length x 768)\n        attention = output[-1][-1]  # Tensor (batch_size x 12 x input_length x input_length)\n\n        hs, rs, ts = self.get_hrt(sequence_output, attention, entity_pos)       # Tensors (batch_size x 768)\n\n        # get ner tags of entities\n        hs_ner_tags, ts_ner_tags = torch.Tensor([list(ele) for ele in list(zip(*ner_tags))]).to(self.device)\n        hs_inp = torch.cat([hs, rs, hs_ner_tags], dim=1)\n        ts_inp = torch.cat([ts, rs, ts_ner_tags], dim=1)\n\n        hs = torch.tanh(self.head_extractor(hs_inp))\n        ts = torch.tanh(self.tail_extractor(ts_inp))\n        b1 = hs.view(-1, self.emb_size // self.block_size, self.block_size)\n        b2 = ts.view(-1, self.emb_size // self.block_size, self.block_size)\n        bl = (b1.unsqueeze(3) * b2.unsqueeze(2)).view(-1, 
self.emb_size * self.block_size)\n        logits = self.bilinear(bl)\n\n        output = (self.loss_fnt.get_label(logits, num_labels=self.n_classes, threshold=curr_threshold), logits)\n        if labels is not None:\n            labels_tensors = [torch.tensor(label) for label in labels]\n            labels_tensors = torch.stack(labels_tensors).to(logits)\n            loss = self.loss_fnt(logits.float(), labels_tensors.float())\n            output = (loss.to(sequence_output),) + output\n        return output\n\n    def get_hrt(self, sequence_output: Tensor, attention: Tensor, entity_pos: List) -> Tuple[Tensor, Tensor, Tensor]:\n        _, h, _, max_sequence_length = attention.size()\n        hss, tss, rss = [], [], []\n        for i in range(len(entity_pos)):            # for each training sample (= doc)\n            entity_embs, entity_atts = [], []\n            for e in entity_pos[i]:             # for each entity (= list of entity mentions)\n                if len(e) == 0:\n                    continue\n                if len(e) > 1:\n                    e_emb, e_att = [], []\n                    for start, end in e:        # for start and end position of each mention\n                        # skip the entity pair if the entity mention is truncated due to limited max seq length.\n                        if start + 1 < max_sequence_length:\n                            e_emb.append(sequence_output[i, start + 1])\n                            e_att.append(attention[i, :, start + 1])\n                    if len(e_emb) > 0:\n                        e_emb = torch.logsumexp(torch.stack(e_emb, dim=0), dim=0)\n                        e_att = torch.stack(e_att, dim=0).mean(0)\n                    else:\n                        e_emb = torch.zeros(self.hidden_size).to(sequence_output)\n                        e_att = torch.zeros(h, max_sequence_length).to(attention)\n                else:\n                    start, end = e[0]\n                    if start + 1 < 
max_sequence_length:\n                        e_emb = sequence_output[i, start + 1]\n                        e_att = attention[i, :, start + 1]\n                    else:\n                        e_emb = torch.zeros(self.hidden_size).to(sequence_output)\n                        e_att = torch.zeros(h, max_sequence_length).to(attention)\n                entity_embs.append(e_emb)           # get an embedding of an entity\n                entity_atts.append(e_att)       # get attention of an entity\n\n            entity_embs = torch.stack(entity_embs, dim=0)  # [n_e, d]           # entity embeddings for each document\n            entity_atts = torch.stack(entity_atts, dim=0)  # [n_e, h, seq_len]\n\n            hs = torch.index_select(entity_embs, 0, torch.tensor([0]).to(self.device))  # embeddings of the first entity\n            ts = torch.index_select(entity_embs, 0, torch.tensor([1]).to(self.device)) # embeddings of the second entity\n\n            h_att = torch.index_select(entity_atts, 0, torch.tensor([0]).to(self.device))\n            t_att = torch.index_select(entity_atts, 0, torch.tensor([1]).to(self.device))\n            ht_att = (h_att * t_att).mean(1)\n            ht_att = ht_att / (ht_att.sum(1, keepdim=True) + 1e-5)\n            rs = contract(\"ld,rl->rd\", sequence_output[i], ht_att)  # ht_i.shape[0] x sequence_output.shape[2]\n            hss.append(hs)\n            tss.append(ts)\n            rss.append(rs)\n\n        hss = torch.cat(hss, dim=0)\n        tss = torch.cat(tss, dim=0)\n        rss = torch.cat(rss, dim=0)\n\n        return hss, rss, tss\n\n    def load(self) -> None:\n        if self.pretrained_bert:\n            log.debug(f\"From pretrained {self.pretrained_bert}.\")\n            self.config = AutoConfig.from_pretrained(\n                self.pretrained_bert, num_labels=self.n_classes, output_attentions=True, output_hidden_states=True\n            )\n            self.model = BertModel.from_pretrained(self.pretrained_bert, 
config=self.config)\n\n        elif self.bert_config_file and Path(self.bert_config_file).is_file():\n            self.config = AutoConfig.from_json_file(str(expand_path(self.bert_config_file)))\n            self.model = BertModel.from_config(config=self.bert_config)\n        else:\n            raise ConfigError(\"No pre-trained BERT model is given.\")\n\n        self.model.to(self.device)\n"
  },
  {
    "path": "deeppavlov/models/classifiers/torch_classification_model.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport logging\nfrom typing import List, Union, Optional\n\nimport numpy as np\nimport torch\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.torch_model import TorchModel\nfrom .torch_nets import ShallowAndWideCnn\n\nlog = logging.getLogger(__name__)\n\n\n@register('torch_text_classification_model')\nclass TorchTextClassificationModel(TorchModel):\n    \"\"\"Class implements torch model for classification of texts.\n    Input can either be embedded tokenized texts OR indices of words in the vocabulary.\n    Number of tokens is not fixed while the samples in batch should be padded to the same (e.g. 
longest) lengths.\n\n    Args:\n        n_classes: number of classes\n        kernel_sizes_cnn: list of kernel sizes of convolutions\n        filters_cnn: number of filters for convolutions\n        dense_size: number of units for dense layer\n        dropout_rate: dropout rate, after convolutions and between dense\n        embedding_size: size of vector representation of words\n        multilabel: is multi-label classification (if so, `sigmoid` activation will be used, otherwise, softmax)\n        criterion: criterion name from `torch.nn`\n        embedded_tokens: True, if input contains embedded tokenized texts;\n                         False, if input containes indices of words in the vocabulary\n        vocab_size: vocabulary size in case of `embedded_tokens=False`, and embedding is a layer in the Network\n        return_probas: whether to return probabilities or index of classes (only for `multilabel=False`)\n\n    Attributes:\n        model: torch model itself\n        epochs_done: number of epochs that were done\n        criterion: torch criterion instance\n    \"\"\"\n\n    def __init__(self, n_classes: int,\n                 kernel_sizes_cnn: List[int],\n                 filters_cnn: int,\n                 dense_size: int,\n                 dropout_rate: float = 0.0,\n                 embedding_size: Optional[int] = None,\n                 multilabel: bool = False,\n                 criterion: str = \"CrossEntropyLoss\",\n                 embedded_tokens: bool = True,\n                 vocab_size: Optional[int] = None,\n                 return_probas: bool = True,\n                 **kwargs):\n\n        if n_classes == 0:\n            raise ConfigError(\"Please, provide vocabulary with considered classes or number of classes.\")\n\n        if multilabel and not return_probas:\n            raise RuntimeError('Set return_probas to True for multilabel classification!')\n\n        self.multilabel = multilabel\n        self.return_probas = return_probas\n      
  model = ShallowAndWideCnn(\n            n_classes=n_classes, embedding_size=embedding_size,\n            kernel_sizes_cnn=kernel_sizes_cnn, filters_cnn=filters_cnn,\n            dense_size=dense_size, dropout_rate=dropout_rate,\n            embedded_tokens=embedded_tokens,\n            vocab_size=vocab_size\n        )\n        self.criterion = getattr(torch.nn, criterion)()\n        super().__init__(model, **kwargs)\n\n    def __call__(self, texts: List[np.ndarray], *args) -> Union[List[List[float]], List[int]]:\n        \"\"\"Infer on the given data.\n\n        Args:\n            texts: list of tokenized text samples\n            labels: labels\n            *args: additional arguments\n\n        Returns:\n            for each sentence:\n                vector of probabilities to belong with each class\n                or list of labels sentence belongs with\n        \"\"\"\n        with torch.no_grad():\n            features = np.array(texts)\n            inputs = torch.from_numpy(features)\n            inputs = inputs.to(self.device)\n            outputs = self.model(inputs)\n            if self.multilabel:\n                outputs = torch.nn.functional.sigmoid(outputs)\n            else:\n                outputs = torch.nn.functional.softmax(outputs, dim=-1)\n\n        outputs = outputs.cpu().detach().numpy()\n        if self.return_probas:\n            return outputs.tolist()\n        else:\n            return np.argmax(outputs, axis=-1).tolist()\n\n    def train_on_batch(self, texts: List[List[np.ndarray]], labels: list) -> Union[float, List[float]]:\n        \"\"\"Train the model on the given batch.\n\n        Args:\n            texts: vectorized texts\n            labels: list of labels\n\n        Returns:\n            metrics values on the given batch\n        \"\"\"\n        features, labels = np.array(texts), np.array(labels)\n\n        inputs, labels = torch.from_numpy(features), torch.from_numpy(labels)\n        inputs, labels = 
inputs.to(self.device), labels.to(self.device)\n        # zero the parameter gradients\n        self.optimizer.zero_grad()\n\n        # forward + backward + optimize\n        outputs = self.model(inputs)\n        labels = labels.view(-1).long()\n        loss = self.criterion(outputs, labels)\n        self._make_step(loss)\n        return loss.item()\n"
  },
  {
    "path": "deeppavlov/models/classifiers/torch_nets.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List, Union, Optional\n\nimport torch\nimport torch.nn as nn\n\n\nclass ShallowAndWideCnn(nn.Module):\n    def __init__(self, n_classes: int, embedding_size: int, kernel_sizes_cnn: List[int],\n                 filters_cnn: Union[int, List[int]], dense_size: int, dropout_rate: float = 0.0,\n                 embedded_tokens: bool = True, vocab_size: Optional[int] = None, **kwargs):\n        super().__init__()\n        self.embedded_tokens = embedded_tokens\n        self.kernel_sizes_cnn = kernel_sizes_cnn\n\n        if not embedded_tokens and vocab_size:\n            self.embedding = nn.Embedding(vocab_size, embedding_size)\n        if isinstance(filters_cnn, int):\n            filters_cnn = len(kernel_sizes_cnn) * [filters_cnn]\n\n        for i in range(len(kernel_sizes_cnn)):\n            setattr(self, \"conv_\" + str(i), nn.Conv1d(embedding_size, filters_cnn[i], kernel_sizes_cnn[i],\n                                                      padding=kernel_sizes_cnn[i]))\n            setattr(self, \"bn_\" + str(i), nn.BatchNorm1d(filters_cnn[i]))\n            setattr(self, \"relu_\" + str(i), nn.ReLU())\n            setattr(self, \"pool_\" + str(i), nn.AdaptiveMaxPool1d(1))\n\n        self.dropout = nn.Dropout(dropout_rate)\n        self.dense = nn.Linear(sum(filters_cnn), dense_size)\n        self.relu_dense = 
nn.ReLU()\n        self.final_dense = nn.Linear(dense_size, n_classes)\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        # number of tokens is variable\n        if not self.embedded_tokens:\n            # x of shape [batch_size, number of tokens]\n            input = self.embedding(x)\n            input = input.permute(0, 2, 1)\n        else:\n            # x of shape [batch_size, number of tokens, embedding_size]\n            input = x.permute(0, 2, 1)\n\n        # input of [batch size, embedding size, number of tokens]\n        outputs = []\n        for i in range(len(self.kernel_sizes_cnn)):\n            # convolutional input should be of shape [batch_size, embedding_size, number of tokens]\n            output = getattr(self, \"conv_\" + str(i))(input)\n            output = getattr(self, \"bn_\" + str(i))(output)\n            output = getattr(self, \"relu_\" + str(i))(output)\n            output = getattr(self, \"pool_\" + str(i))(output)\n            output = output.squeeze(-1)\n            # output of shape [batch_size, out]\n            outputs.append(output)\n\n        output = torch.cat(outputs, dim=-1)\n        output = self.dropout(output)\n        output = self.dense(output)\n        output = self.relu_dense(output)\n        output = self.dropout(output)\n        output = self.final_dense(output)\n        return output\n"
  },
  {
    "path": "deeppavlov/models/classifiers/utils.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom logging import getLogger\nfrom typing import List\n\nimport numpy as np\n\nlog = getLogger(__name__)\n\n\ndef labels2onehot(labels: [List[str], List[List[str]], np.ndarray], classes: [list, np.ndarray]) -> np.ndarray:\n    \"\"\"\n    Convert labels to one-hot vectors for multi-class multi-label classification\n\n    Args:\n        labels: list of samples where each sample is a class or a list of classes which sample belongs with\n        classes: array of classes' names\n\n    Returns:\n        2d array with one-hot representation of given samples\n    \"\"\"\n    n_classes = len(classes)\n    y = []\n    for sample in labels:\n        curr = np.zeros(n_classes)\n        if isinstance(sample, list):\n            for intent in sample:\n                if intent not in classes:\n                    log.warning('Unknown label {} detected. 
Assigning no class'.format(intent))\n                else:\n                    curr[np.where(np.array(classes) == intent)[0]] = 1\n        else:\n            curr[np.where(np.array(classes) == sample)[0]] = 1\n        y.append(curr)\n    y = np.asarray(y)\n    return y\n\n\ndef proba2labels(proba: [list, np.ndarray], confidence_threshold: float, classes: [list, np.ndarray]) -> List[List]:\n    \"\"\"\n    Convert vectors of probabilities to labels using a confidence threshold\n    (if the probability of belonging to a class is greater than confidence_threshold, the sample belongs to that class;\n    if no probability exceeds the confidence threshold, the sample belongs to the class with the highest probability)\n\n    Args:\n        proba: list of samples where each sample is a vector of probabilities of belonging to the given classes\n        confidence_threshold (float): boundary of probability for belonging to a class\n        classes: array of classes' names\n\n    Returns:\n        list of lists of labels for each sample\n    \"\"\"\n    y = []\n    for sample in proba:\n        to_add = np.where(sample > confidence_threshold)[0]\n        if len(to_add) > 0:\n            y.append(np.array(classes)[to_add].tolist())\n        else:\n            y.append(np.array([np.array(classes)[np.argmax(sample)]]).tolist())\n\n    return y\n\n\ndef proba2onehot(proba: [list, np.ndarray], confidence_threshold: float, classes: [list, np.ndarray]) -> np.ndarray:\n    \"\"\"\n    Convert vectors of probabilities to one-hot representations using a confidence threshold\n\n    Args:\n        proba: samples where each sample is a vector of probabilities of belonging to the given classes\n        confidence_threshold: boundary of probability for belonging to a class\n        classes: array of classes' names\n\n    Returns:\n        2d array with one-hot representation of given samples\n    \"\"\"\n    return labels2onehot(proba2labels(proba, confidence_threshold, classes), classes)\n"
  },
  {
    "path": "deeppavlov/models/doc_retrieval/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/doc_retrieval/bpr.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Tuple\n\nimport faiss\nimport numpy as np\nimport torch\nfrom tqdm import trange\nfrom transformers import AutoTokenizer, BertModel\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.core.models.serializable import Serializable\n\n\nclass FaissBinaryIndex:\n    def __init__(self, index: faiss.Index):\n        self.index = index\n\n    def search(self, query_embs: np.ndarray, k: int, binary_k=1000, rerank=True) -> Tuple[np.ndarray, np.ndarray]:\n        faiss.omp_set_num_threads(12)\n        num_queries = query_embs.shape[0]\n        bin_query_embs = np.packbits(np.where(query_embs > 0, 1, 0)).reshape(num_queries, -1)\n\n        raw_index = self.index.index\n        _, ids_arr = raw_index.search(bin_query_embs, binary_k)\n        psg_embs = np.vstack([np.unpackbits(raw_index.reconstruct(int(id_))) for id_ in ids_arr.reshape(-1)])\n        psg_embs = psg_embs.reshape(query_embs.shape[0], binary_k, query_embs.shape[1])\n        psg_embs = psg_embs.astype(np.float32)\n\n        psg_embs = psg_embs * 2 - 1\n        scores_arr = np.einsum(\"ijk,ik->ij\", psg_embs, query_embs)\n        sorted_indices = np.argsort(-scores_arr, axis=1)\n\n        ids_arr = ids_arr[np.arange(num_queries)[:, None], sorted_indices]\n        ids_arr = 
np.array([self.index.id_map.at(int(id_)) for id_ in ids_arr.reshape(-1)], dtype=np.int)\n        ids_arr = ids_arr.reshape(num_queries, -1)\n        scores_arr = scores_arr[np.arange(num_queries)[:, None], sorted_indices]\n\n        return scores_arr[:, :k], ids_arr[:, :k]\n\n\n@register('bpr')\nclass BPR(Component, Serializable):\n    def __init__(self, pretrained_model: str,\n                 load_path: str,\n                 bpr_index: str,\n                 query_encoder_file: str,\n                 max_query_length: int = 256,\n                 top_n: int = 100,\n                 device: str = \"gpu\",\n                 *args, **kwargs\n                 ):\n        super().__init__(save_path=None, load_path=load_path)\n        self.device = torch.device(\"cuda\" if torch.cuda.is_available() and device == \"gpu\" else \"cpu\")\n        self.bpr_index = bpr_index\n        self.top_n = top_n\n        self.max_query_length = max_query_length\n        self.query_encoder_file = query_encoder_file\n        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)\n        self.q_encoder = BertModel.from_pretrained(pretrained_model).to(self.device)\n        self.load()\n        self.index = FaissBinaryIndex(self.base_index)\n\n    def load(self):\n        checkpoint = torch.load(str(self.load_path / self.query_encoder_file), map_location=self.device)\n        self.q_encoder.load_state_dict(checkpoint[\"model_state_dict\"], strict=False)\n        self.base_index = faiss.read_index_binary(str(self.load_path / self.bpr_index))\n\n    def save(self) -> None:\n        pass\n\n    def encode_queries(self, queries, batch_size: int = 256) -> np.ndarray:\n        embeddings = []\n        with torch.no_grad():\n            for start in trange(0, len(queries), batch_size):\n                model_inputs = self.tokenizer.batch_encode_plus(\n                    queries[start: start + batch_size],\n                    return_tensors=\"pt\",\n                    
max_length=self.max_query_length,\n                    padding=\"max_length\",\n                )\n                model_inputs = {k: v.to(self.device) for k, v in model_inputs.items()}\n                sequence_output = self.q_encoder(**model_inputs)[0]\n                emb = sequence_output[:, 0, :].contiguous().cpu().numpy()\n                embeddings.append(emb)\n\n        return np.vstack(embeddings)\n\n    def __call__(self, queries):\n        queries = [query.lower() for query in queries]\n        query_embeddings = self.encode_queries(queries)\n        scores_batch, ids_batch = self.index.search(query_embeddings, self.top_n)\n        ids_batch = ids_batch.tolist()\n        return ids_batch\n"
  },
  {
    "path": "deeppavlov/models/doc_retrieval/logit_ranker.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom operator import itemgetter\nfrom typing import List, Union, Tuple, Optional\n\nfrom deeppavlov.core.common.chainer import Chainer\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.estimator import Component\nfrom deeppavlov.models.doc_retrieval.utils import find_answer_sentence\n\nlogger = getLogger(__name__)\n\n\n@register(\"logit_ranker\")\nclass LogitRanker(Component):\n    \"\"\"Select best answer using squad model logits. 
Make several batches for a single batch, send each batch\n     to the squad model separately and get a single best answer for each batch.\n\n     Args:\n        squad_model: a loaded squad model\n        batch_size: batch size to use with squad model\n        sort_noans: whether to downgrade noans tokens in the most possible answers\n        top_n: number of answers to return\n\n     Attributes:\n        squad_model: a loaded squad model\n        batch_size: batch size to use with squad model\n        top_n: number of answers to return\n\n    \"\"\"\n\n    def __init__(self, squad_model: Union[Chainer, Component], batch_size: int = 50,\n                 sort_noans: bool = False, top_n: int = 1, return_answer_sentence: bool = False, **kwargs):\n        self.squad_model = squad_model\n        self.batch_size = batch_size\n        self.sort_noans = sort_noans\n        self.top_n = top_n\n        self.return_answer_sentence = return_answer_sentence\n\n    def __call__(self, contexts_batch: List[List[str]], questions_batch: List[List[str]],\n                 doc_ids_batch: Optional[List[List[str]]] = None) -> \\\n            Union[\n                Tuple[List[str], List[float], List[int], List[str]],\n                Tuple[List[List[str]], List[List[float]], List[List[int]], List[List[str]]],\n                Tuple[List[str], List[float], List[int]],\n                Tuple[List[List[str]], List[List[float]], List[List[int]]]\n            ]:\n\n        \"\"\"\n        Sort obtained results from squad reader by logits and get the answer with a maximum logit.\n\n        Args:\n            contexts_batch: a batch of contexts which should be treated as a single batch in the outer JSON config\n            questions_batch: a batch of questions which should be treated as a single batch in the outer JSON config\n            doc_ids_batch (optional): names of the documents from which the contexts_batch was derived\n        Returns:\n             a batch of best answers, their 
scores, places in contexts\n             and doc_ids for this answers if doc_ids_batch were passed\n        \"\"\"\n        if doc_ids_batch is None:\n            logger.warning(\"you didn't pass tfidf_doc_ids as input in logit_ranker config so \"\n                           \"batch_best_answers_doc_ids can't be compute\")\n\n        batch_best_answers = []\n        batch_best_answers_score = []\n        batch_best_answers_place = []\n        batch_best_answers_doc_ids = []\n        batch_best_answers_sentences = []\n        for quest_ind, [contexts, questions] in enumerate(zip(contexts_batch, questions_batch)):\n            results = []\n            for i in range(0, len(contexts), self.batch_size):\n                c_batch = contexts[i: i + self.batch_size]\n                q_batch = questions[i: i + self.batch_size]\n                batch_predict = list(zip(*self.squad_model(c_batch, q_batch), c_batch))\n                results += batch_predict\n            if self.sort_noans:\n                results_sort = sorted(results, key=lambda x: (x[0] != '', x[2]), reverse=True)\n            else:\n                results_sort = sorted(results, key=itemgetter(2), reverse=True)\n            best_answers = [x[0] for x in results_sort[:self.top_n]]\n            best_answers_place = [x[1] for x in results_sort[:self.top_n]]\n            best_answers_score = [x[2] for x in results_sort[:self.top_n]]\n            best_answers_contexts = [x[3] for x in results_sort[:self.top_n]]\n            batch_best_answers.append(best_answers)\n            batch_best_answers_place.append(best_answers_place)\n            batch_best_answers_score.append(best_answers_score)\n            best_answers_sentences = []\n            for answer, place, context in zip(best_answers, best_answers_place, best_answers_contexts):\n                sentence = find_answer_sentence(place, context)\n                best_answers_sentences.append(sentence)\n            
batch_best_answers_sentences.append(best_answers_sentences)\n\n            if doc_ids_batch is not None:\n                doc_ind = [results.index(x) for x in results_sort]\n                batch_best_answers_doc_ids.append(\n                    [doc_ids_batch[quest_ind][i] for i in doc_ind][:len(batch_best_answers[-1])])\n\n        if self.top_n == 1:\n            batch_best_answers = [x[0] for x in batch_best_answers]\n            batch_best_answers_place = [x[0] for x in batch_best_answers_place]\n            batch_best_answers_score = [x[0] for x in batch_best_answers_score]\n            batch_best_answers_doc_ids = [x[0] for x in batch_best_answers_doc_ids]\n            batch_best_answers_sentences = [x[0] for x in batch_best_answers_sentences]\n\n        if doc_ids_batch is None:\n            if self.return_answer_sentence:\n                return batch_best_answers, batch_best_answers_score, batch_best_answers_place, \\\n                       batch_best_answers_sentences\n            return batch_best_answers, batch_best_answers_score, batch_best_answers_place\n\n        if self.return_answer_sentence:\n            return batch_best_answers, batch_best_answers_score, batch_best_answers_place, batch_best_answers_doc_ids, \\\n                   batch_best_answers_sentences\n        return batch_best_answers, batch_best_answers_score, batch_best_answers_place, batch_best_answers_doc_ids\n"
  },
  {
    "path": "deeppavlov/models/doc_retrieval/pop_ranker.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom operator import itemgetter\nfrom typing import List, Any, Tuple\n\nimport numpy as np\nimport joblib\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.file import read_json\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.estimator import Component\n\nlogger = getLogger(__name__)\n\n\n@register('pop_ranker')\nclass PopRanker(Component):\n    \"\"\"Rank documents according to their tfidf scores and popularities. 
It is not a standalone ranker,\n    it should be used for re-ranking the results of TF-IDF Ranker.\n\n    Based on a Logistic Regression trained on 3 features:\n\n    * tfidf score of the article\n    * popularity of the article obtained via Wikimedia REST API as a mean number of views for the period since 2017/11/05 to 2018/11/05\n    * multiplication of the two features above\n\n    Args:\n        pop_dict_path: a path to json file with article title to article popularity map\n        load_path: a path to saved logistic regression classifier\n        top_n: a number of doc ids to return\n        active: whether to return a number specified by :attr:`top_n` (``True``) or all ids\n         (``False``)\n\n    Attributes:\n        pop_dict: a map of article titles to their popularity\n        mean_pop: mean popularity of all popularities in :attr:`pop_dict`, use it when popularity is not found\n        clf: a loaded logistic regression classifier\n        top_n: a number of doc ids to return\n        active: whether to return a number specified by :attr:`top_n` or all ids\n\n    \"\"\"\n\n    def __init__(self, pop_dict_path: str, load_path: str, top_n: int = 3, active: bool = True,\n                 **kwargs) -> None:\n        pop_dict_path = expand_path(pop_dict_path)\n        logger.debug(f\"Reading popularity dictionary from {pop_dict_path}\")\n        self.pop_dict = read_json(pop_dict_path)\n        self.mean_pop = np.mean(list(self.pop_dict.values()))\n        load_path = expand_path(load_path)\n        logger.debug(f\"Loading popularity ranker from {load_path}\")\n        self.clf = joblib.load(load_path)\n        self.top_n = top_n\n        self.active = active\n\n    def __call__(self, input_doc_ids: List[List[Any]], input_doc_scores: List[List[float]]) -> \\\n            Tuple[List[List], List[List]]:\n        \"\"\"Get tfidf scores and tfidf ids, re-rank them by applying logistic regression classifier,\n        output pop ranker ids and pop ranker 
scores.\n\n         Args:\n            input_doc_ids: top input doc ids of tfidf ranker\n            input_doc_scores: top input doc scores of tfidf ranker corresponding to doc ids\n\n        Returns:\n            top doc ids of pop ranker and their corresponding scores\n\n        \"\"\"\n        batch_ids = []\n        batch_scores = []\n        for instance_ids, instance_scores in zip(input_doc_ids, input_doc_scores):\n            instance_probas = []\n            for idx, score in zip(instance_ids, instance_scores):\n                pop = self.pop_dict.get(idx, self.mean_pop)\n                features = [score, pop, score * pop]\n                prob = self.clf.predict_proba([features])\n                instance_probas.append(prob[0][1])\n\n            sort = sorted(enumerate(instance_probas), key=itemgetter(1), reverse=True)\n            sorted_probas = [item[1] for item in sort]\n            sorted_ids = [instance_ids[item[0]] for item in sort]\n\n            if self.active:\n                sorted_ids = sorted_ids[:self.top_n]\n                sorted_probas = sorted_probas[:self.top_n]\n\n            batch_ids.append(sorted_ids)\n            batch_scores.append(sorted_probas)\n\n        return batch_ids, batch_scores\n"
  },
  {
    "path": "deeppavlov/models/doc_retrieval/tfidf_ranker.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom typing import List, Any, Tuple\n\nimport numpy as np\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.estimator import Component\nfrom deeppavlov.models.vectorizers.hashing_tfidf_vectorizer import HashingTfIdfVectorizer\n\nlogger = getLogger(__name__)\n\n\n@register(\"tfidf_ranker\")\nclass TfidfRanker(Component):\n    \"\"\"Rank documents according to input strings.\n\n    Args:\n        vectorizer: a vectorizer class\n        top_n: a number of doc ids to return\n        active: whether to return a number specified by :attr:`top_n` (``True``) or all ids\n         (``False``)\n\n    Attributes:\n        top_n: a number of doc ids to return\n        vectorizer: an instance of vectorizer class\n        active: whether to return a number specified by :attr:`top_n` or all ids\n        index2doc: inverted :attr:`doc_index`\n        iterator: a dataset iterator used for generating batches while fitting the vectorizer\n\n    \"\"\"\n\n    def __init__(self, vectorizer: HashingTfIdfVectorizer, top_n=5, active: bool = True, **kwargs):\n\n        self.top_n = top_n\n        self.vectorizer = vectorizer\n        self.active = active\n\n    def __call__(self, questions: List[str]) -> Tuple[List[Any], List[float]]:\n        \"\"\"Rank documents and return top n document titles 
with scores.\n\n        Args:\n            questions: list of queries used in ranking\n\n        Returns:\n            a tuple of selected doc ids and their scores\n        \"\"\"\n\n        batch_doc_ids, batch_docs_scores = [], []\n        q_tfidfs = self.vectorizer(questions)\n\n        for q_tfidf in q_tfidfs:\n            scores = q_tfidf * self.vectorizer.tfidf_matrix\n            scores = np.squeeze(\n                scores.toarray() + 0.0001)  # add a small value to eliminate zero scores\n\n            if self.active:\n                thresh = self.top_n\n            else:\n                thresh = len(self.vectorizer.doc_index)\n\n            if thresh >= len(scores):\n                o = np.argpartition(-scores, len(scores) - 1)[0:thresh]\n            else:\n                o = np.argpartition(-scores, thresh)[0:thresh]\n            o_sort = o[np.argsort(-scores[o])]\n\n            doc_scores = scores[o_sort]\n            doc_ids = [self.vectorizer.index2doc.get(i, int(i)) for i in o_sort]\n            batch_doc_ids.append(doc_ids)\n            batch_docs_scores.append(doc_scores)\n\n        return batch_doc_ids, batch_docs_scores\n"
  },
  {
    "path": "deeppavlov/models/doc_retrieval/utils.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Any, List\n\nimport nltk\n\nfrom deeppavlov.core.common.registry import register\n\n\n@register('concat_lists')\ndef concat_lists(list_a: List[List[Any]], list_b: List[List[Any]]):\n    list_u = []\n    for element_a, element_b in zip(list_a, list_b):\n        list_u.append(element_a + element_b)\n    return list_u\n\n\ndef find_answer_sentence(answer_pos: int, context: str) -> str:\n    answer_sentence = \"\"\n    context_sentences = nltk.sent_tokenize(context)\n    start = 0\n    context_sentences_offsets = []\n    for sentence in context_sentences:\n        end = start + len(sentence)\n        context_sentences_offsets.append((start, end))\n        start = end + 1\n\n    for sentence, (start_offset, end_offset) in zip(context_sentences, context_sentences_offsets):\n        if start_offset < answer_pos < end_offset:\n            answer_sentence = sentence\n            break\n\n    return answer_sentence\n"
  },
  {
    "path": "deeppavlov/models/embedders/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/embedders/abstract_embedder.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom abc import ABCMeta, abstractmethod\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import List, Union, Iterator\n\nimport numpy as np\n\nfrom deeppavlov.core.data.utils import zero_pad\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.core.models.serializable import Serializable\n\nlog = getLogger(__name__)\n\n\nclass Embedder(Component, Serializable, metaclass=ABCMeta):\n    \"\"\"\n    Class implements fastText embedding model\n\n    Args:\n        load_path: path where to load pre-trained embedding model from\n        pad_zero: whether to pad samples or not\n\n    Attributes:\n        model: model instance\n        tok2emb: dictionary with already embedded tokens\n        dim: dimension of embeddings\n        pad_zero: whether to pad sequence of tokens with zeros or not\n        mean: whether to return one mean embedding vector per sample\n        load_path: path with pre-trained fastText binary model\n    \"\"\"\n\n    def __init__(self, load_path: Union[str, Path], pad_zero: bool = False, mean: bool = False, **kwargs) -> None:\n        \"\"\"\n        Initialize embedder with given parameters\n        \"\"\"\n        super().__init__(save_path=None, load_path=load_path)\n        self.tok2emb = {}\n        self.pad_zero = pad_zero\n        self.mean = mean\n        self.dim = None\n 
       self.model = None\n        self.load()\n\n    def save(self) -> None:\n        \"\"\"\n        Class does not save loaded model again as it is not trained during usage\n        \"\"\"\n        raise NotImplementedError\n\n    def __call__(self, batch: List[List[str]], mean: bool = None) -> List[Union[list, np.ndarray]]:\n        \"\"\"\n        Embed sentences from batch\n\n        Args:\n            batch: list of tokenized text samples\n            mean: whether to return mean embedding of tokens per sample\n\n        Returns:\n            embedded batch\n        \"\"\"\n        batch = [self._encode(sample, mean) for sample in batch]\n        if self.pad_zero:\n            batch = zero_pad(batch)\n        return batch\n\n    @abstractmethod\n    def __iter__(self) -> Iterator[str]:\n        \"\"\"\n        Iterate over all words from the model vocabulary\n\n        Returns:\n            iterator\n        \"\"\"\n\n    @abstractmethod\n    def _get_word_vector(self, w: str) -> np.ndarray:\n        \"\"\"\n        Embed a word using ``self.model``\n\n        Args:\n            w: a word\n\n        Returns:\n            embedding vector\n        \"\"\"\n\n    def _encode(self, tokens: List[str], mean: bool) -> Union[List[np.ndarray], np.ndarray]:\n        \"\"\"\n        Embed one text sample\n\n        Args:\n            tokens: tokenized text sample\n            mean: whether to return mean embedding of tokens per sample\n\n        Returns:\n            list of embedded tokens or array of mean values\n        \"\"\"\n        embedded_tokens = []\n        for t in tokens:\n            try:\n                emb = self.tok2emb[t]\n            except KeyError:\n                try:\n                    emb = self._get_word_vector(t)\n                except KeyError:\n                    emb = np.zeros(self.dim, dtype=np.float32)\n                self.tok2emb[t] = emb\n            embedded_tokens.append(emb)\n\n        if mean is None:\n            mean = 
self.mean\n\n        if mean:\n            filtered = [et for et in embedded_tokens if np.any(et)]\n            if filtered:\n                return np.mean(filtered, axis=0)\n            return np.zeros(self.dim, dtype=np.float32)\n\n        return embedded_tokens\n"
  },
  {
    "path": "deeppavlov/models/embedders/fasttext_embedder.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom typing import Iterator\n\nimport fasttext\n\nimport numpy as np\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.models.embedders.abstract_embedder import Embedder\n\nlog = getLogger(__name__)\n\n\n@register('fasttext')\nclass FasttextEmbedder(Embedder):\n    \"\"\"\n    Class implements fastText embedding model\n\n    Args:\n        load_path: path where to load pre-trained embedding model from\n        pad_zero: whether to pad samples or not\n\n    Attributes:\n        model: fastText model instance\n        tok2emb: dictionary with already embedded tokens\n        dim: dimension of embeddings\n        pad_zero: whether to pad sequence of tokens with zeros or not\n        load_path: path with pre-trained fastText binary model\n    \"\"\"\n\n    def _get_word_vector(self, w: str) -> np.ndarray:\n        return self.model.get_word_vector(w)\n\n    def load(self) -> None:\n        \"\"\"\n        Load fastText binary model from self.load_path\n        \"\"\"\n        log.debug(f\"[loading fastText embeddings from `{self.load_path}`]\")\n        self.model = fasttext.load_model(str(self.load_path))\n        self.dim = self.model.get_dimension()\n\n    def __iter__(self) -> Iterator[str]:\n        \"\"\"\n        Iterate over all words from fastText model vocabulary\n\n        
Returns:\n            iterator\n        \"\"\"\n        yield from self.model.get_words()\n"
  },
  {
    "path": "deeppavlov/models/embedders/tfidf_weighted_embedder.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom typing import List, Union, Optional, Tuple\n\nimport numpy as np\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.utils import zero_pad\nfrom deeppavlov.core.models.component import Component\n\nlog = getLogger(__name__)\n\n\n@register('tfidf_weighted')\nclass TfidfWeightedEmbedder(Component):\n    \"\"\"\n    The class implements the functionality of embedding the sentence \\\n        as a weighted average by special coefficients of tokens embeddings. \\\n        Coefficients can be taken from the given TFIDF-vectorizer in ``vectorizer`` or \\\n        calculated as TFIDF from counter vocabulary given in ``counter_vocab_path``.\n        Also one can give ``tags_vocab_path`` to the vocabulary with weights of tags. 
\\\n        In this case, batch with tags should be given as a second input in ``__call__`` method.\n\n    Args:\n        embedder: embedder instance\n        tokenizer: tokenizer instance, should be able to detokenize sentence\n        pad_zero: whether to pad samples or not\n        mean: whether to return mean token embedding\n        tags_vocab_path: optional path to vocabulary with tags weights\n        vectorizer: vectorizer instance should be trained with ``analyzer=\"word\"``\n        counter_vocab_path: path to counter vocabulary\n        idf_base_count: minimal idf value (less time occurred are not counted)\n        log_base: logarithm base for TFIDF-coefficient calculation from counter vocabulary\n        min_idf_weight: minimal idf weight\n\n    Attributes:\n        embedder: embedder instance\n        tokenizer: tokenizer instance, should be able to detokenize sentence\n        dim: dimension of embeddings\n        pad_zero: whether to pad samples or not\n        mean: whether to return mean token embedding\n        tags_vocab: vocabulary with weights for tags\n        vectorizer: vectorizer instance\n        counter_vocab_path: path to counter vocabulary\n        counter_vocab: counter vocabulary\n        idf_base_count: minimal idf value (less time occurred are not counted)\n        log_base: logarithm base for TFIDF-coefficient calculation from counter vocabulary\n        min_idf_weight: minimal idf weight\n\n    Examples:\n        >>> from deeppavlov.models.embedders.tfidf_weighted_embedder import TfidfWeightedEmbedder\n        >>> from deeppavlov.models.embedders.fasttext_embedder import FasttextEmbedder\n        >>> fasttext_embedder = FasttextEmbedder('/data/embeddings/wiki.ru.bin')\n        >>> fastTextTfidf = TfidfWeightedEmbedder(embedder=fasttext_embedder,\n                counter_vocab_path='/data/vocabs/counts_wiki_lenta.txt')\n        >>> fastTextTfidf([['большой', 'и', 'розовый', 'бегемот']])\n        [array([ 1.99135890e-01, 
-7.14746421e-02,  8.01428872e-02, -5.32840924e-02,\n                 5.05212297e-02,  2.76053832e-01, -2.53270134e-01, -9.34443950e-02,\n                 ...\n                 1.18385439e-02,  1.05643446e-01, -1.21904516e-03,  7.70555378e-02])]\n    \"\"\"\n\n    def __init__(self,\n                 embedder: Component,\n                 tokenizer: Component = None,\n                 pad_zero: bool = False,\n                 mean: bool = False,\n                 tags_vocab_path: str = None,\n                 vectorizer: Component = None,\n                 counter_vocab_path: str = None,\n                 idf_base_count: int = 100,\n                 log_base: int = 10,\n                 min_idf_weight=0.0, **kwargs) -> None:\n        self.embedder = embedder\n        self.dim = self.embedder.dim\n        self.mean = mean\n        self.pad_zero = pad_zero\n        self.tokenizer = tokenizer or self.space_detokenizer\n        self.vectorizer = vectorizer\n\n        if vectorizer and counter_vocab_path:\n            raise ConfigError(\"TfidfWeightedEmbedder got vectorizer and counter_vocab_path simultaneously.\"\n                              \" Remove one of them, please\")\n        elif vectorizer:\n            self.vectorizer = vectorizer\n            self.vocabulary = np.array(self.vectorizer.model.get_feature_names())\n        elif counter_vocab_path:\n            self.counter_vocab_path = expand_path(counter_vocab_path)\n            self.counter_vocab, self.min_count = self.load_counter_vocab(self.counter_vocab_path)\n            self.idf_base_count = idf_base_count\n            self.log_base = log_base\n            self.min_idf_weight = min_idf_weight\n        else:\n            raise ConfigError(\"TfidfWeightedEmbedder did not get vectorizer or counter_vocab_path.\"\n                              \" Set one of them, please\")\n\n        if tags_vocab_path:\n            self.tags_vocab = self.load_tags_vocab(expand_path(tags_vocab_path))\n        else:\n         
   self.tags_vocab = None\n\n    @staticmethod\n    def load_tags_vocab(load_path: str) -> dict:\n        \"\"\"\n        Load tag vocabulary from the given path, each key of the vocabulary is a tag, \\\n            and the corresponding value of the item is a coefficient of words with such tags to be multiplied for.\n\n        Args:\n            load_path: path to the vocabulary to be load from\n\n        Returns:\n            vocabulary\n        \"\"\"\n        tags_vocab = dict()\n        with open(load_path, 'r') as f:\n            lines = f.readlines()\n            f.close()\n\n        for line in lines:\n            key, val = line[:-1].split(' ')  # \"\\t\"\n            tags_vocab[key] = val\n\n        return tags_vocab\n\n    @staticmethod\n    def load_counter_vocab(load_path: str) -> Tuple[dict, int]:\n        \"\"\"\n        Load counter vocabulary from the given path\n\n        Args:\n            load_path: path to the vocabulary to be load from\n\n        Returns:\n            vocabulary\n        \"\"\"\n        counter_vocab = dict()\n        with open(load_path, 'r') as f:\n            lines = f.readlines()\n            f.close()\n\n        min_val = np.inf\n        for line in lines:\n            key, val = line[:-1].split('\\t')\n            val = int(val)\n            counter_vocab[key] = val\n            if val < min_val:\n                min_val = val\n\n        return counter_vocab, min_val\n\n    @staticmethod\n    def space_detokenizer(batch: List[List[str]]) -> List[str]:\n        \"\"\"\n        Detokenizer by default. 
Linking tokens by space symbol\n\n        Args:\n            batch: batch of tokenized texts\n\n        Returns:\n            batch of detokenized texts\n        \"\"\"\n        return [\" \".join(tokens) for tokens in batch]\n\n    def __call__(self, batch: List[List[str]], tags_batch: Optional[List[List[str]]] = None, mean: bool = None,\n                 *args, **kwargs) -> List[Union[list, np.ndarray]]:\n        \"\"\"\n        Infer on the given data\n\n        Args:\n            batch: tokenized text samples\n            tags_batch: optional batch of corresponding tags\n            mean: whether to return mean token embedding (does not depend on self.mean)\n            *args: additional arguments\n            **kwargs: additional arguments\n\n        Returns:\n\n        \"\"\"\n\n        if self.tags_vocab:\n            if tags_batch is None:\n                raise ConfigError(\"TfidfWeightedEmbedder got 'tags_vocab_path' but __call__ did not get tags_batch.\")\n            batch = [self._tags_encode(sample, tags_sample, mean=mean) for sample, tags_sample in\n                     zip(batch, tags_batch)]\n        else:\n            if tags_batch:\n                raise ConfigError(\"TfidfWeightedEmbedder got tags batch, but 'tags_vocab_path' is empty.\")\n            batch = [self._encode(sample, mean=mean) for sample in batch]\n\n        if self.pad_zero:\n            batch = zero_pad(batch)\n\n        return batch\n\n    def _encode(self, tokens: List[str], mean: bool) -> Union[List[np.ndarray], np.ndarray]:\n        \"\"\"\n        Embed one text sample\n\n        Args:\n            tokens: tokenized text sample\n            mean: whether to return mean token embedding (does not depend on self.mean)\n\n        Returns:\n            list of embedded tokens or array of mean values\n        \"\"\"\n        if self.vectorizer:\n            detokenized_sample = self.tokenizer([tokens])[0]  # str\n            vectorized_sample = 
self.vectorizer([detokenized_sample])  # (voc_size,)\n\n            weights = np.array([vectorized_sample[0, np.where(self.vocabulary == token)[0][0]]\n                                if len(np.where(self.vocabulary == token)[0]) else 0.\n                                for token in tokens])\n        else:\n            weights = np.array([self.get_weight(max(self.counter_vocab.get(token, 0), self.idf_base_count))\n                                for token in tokens])\n\n        if sum(weights) == 0:\n            weights = np.ones(len(tokens))\n\n        embedded_tokens = np.array(self.embedder([tokens]))[0, :, :]\n\n        if mean is None:\n            mean = self.mean\n\n        if mean:\n            embedded_tokens = np.average(embedded_tokens, weights=weights, axis=0)\n        else:\n            embedded_tokens = np.array([weights[i] * embedded_tokens[i] for i in range(len(tokens))])\n\n        return embedded_tokens\n\n    def get_weight(self, count: int) -> float:\n        \"\"\"\n        Calculate the weight corresponding to the given count\n\n        Args:\n            count: the number of occurences of particular token\n\n        Returns:\n            weight\n        \"\"\"\n        log_count = np.log(count) / np.log(self.log_base)\n        log_base_count = np.log(self.idf_base_count) / np.log(self.log_base)\n        weight = max(1.0 / (1.0 + log_count - log_base_count), self.min_idf_weight)\n        return weight\n\n    def _tags_encode(self, tokens: List[str], tags: List[str], mean: bool) -> Union[List[np.ndarray], np.ndarray]:\n        \"\"\"\n        Embed one text sample\n\n        Args:\n            tokens: tokenized text sample\n            tags: tokenized tags sample\n            mean: whether to return mean token embedding (does not depend on self.mean)\n\n        Returns:\n            list of embedded tokens or array of mean values\n        \"\"\"\n\n        embedded_tokens = np.array(self.embedder([tokens]))[0, :, :]\n\n        tags_weights = 
np.array([self.tags_vocab.get(tag, 1.0) for tag in tags])\n\n        detokenized_sample = self.tokenizer([tokens])[0]  # str\n        vectorized_sample = self.vectorizer([detokenized_sample])  # (voc_size,)\n\n        if self.vectorizer:\n            weights = np.array([vectorized_sample[0, np.where(self.vocabulary == token)[0][0]]\n                                if len(np.where(self.vocabulary == token)[0]) else 0.\n                                for token in tokens])\n        else:\n            weights = np.array([self.get_weight(max(self.counter_vocab.get(token, 0), self.idf_base_count))\n                                for token in tokens])\n\n        weights = np.multiply(weights, tags_weights)\n        if sum(weights) == 0:\n            weights = np.ones(len(tokens))\n\n        if mean is None:\n            mean = self.mean\n\n        if mean:\n            embedded_tokens = np.average(embedded_tokens, weights=weights, axis=0)\n        else:\n            embedded_tokens = np.array([weights[i] * embedded_tokens[i] for i in range(len(tokens))])\n\n        return embedded_tokens\n"
  },
  {
    "path": "deeppavlov/models/embedders/transformers_embedder.py",
    "content": "# Copyright 2020 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom pathlib import Path\nfrom typing import Union, Tuple, Collection\n\nimport torch\nimport transformers\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.serializable import Serializable\n\n\n@register('transformers_bert_embedder')\nclass TransformersBertEmbedder(Serializable):\n    \"\"\"Transformers-based BERT model for embeddings tokens, subtokens and sentences\n\n    Args:\n        load_path: path to a pretrained BERT pytorch checkpoint\n        bert_config_path: path to a BERT configuration file\n        truncate: whether to remove zero-paddings from returned data\n\n    \"\"\"\n    model: transformers.BertModel\n    dim: int\n\n    def __init__(self, load_path: Union[str, Path], bert_config_path: Union[str, Path] = None,\n                 truncate: bool = False, **kwargs):\n        super().__init__(save_path=None, load_path=load_path, **kwargs)\n        if bert_config_path is not None:\n            bert_config_path = expand_path(bert_config_path)\n        self.config = bert_config_path\n        self.truncate = truncate\n        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n        self.load()\n\n    def save(self, *args, **kwargs):\n        \"\"\"Saving is not supported: the embedder only loads a pretrained model.\"\"\"\n        raise NotImplementedError\n\n    def load(self):\n        \"\"\"Loads the pretrained model in evaluation mode, moves it to the device and reads its hidden size.\"\"\"\n        self.model = transformers.BertModel.from_pretrained(self.load_path, config=self.config).eval().to(self.device)\n        self.dim = self.model.config.hidden_size\n\n    def __call__(self,\n                 subtoken_ids_batch: Collection[Collection[int]],\n                 startofwords_batch: Collection[Collection[int]],\n                 attention_batch: Collection[Collection[int]]) -> Tuple[Collection[Collection[Collection[float]]],\n                                                                        Collection[Collection[Collection[float]]],\n                                                                        Collection[Collection[float]],\n                                                                        Collection[Collection[float]],\n                                                                        Collection[Collection[float]]]:\n        \"\"\"Predict embeddings values for a given batch\n\n        Args:\n            subtoken_ids_batch: padded indexes for every subtoken\n            startofwords_batch: a mask matrix with ``1`` for every first subtoken init in a token and ``0``\n                for every other subtoken\n            attention_batch: a mask matrix with ``1`` for every significant subtoken and ``0`` for paddings\n\n        Returns:\n            a tuple of word embeddings, subword embeddings, max-pooled embeddings, mean-pooled embeddings\n            and the pooler output of the model; if ``truncate`` is set, the first two elements are lists of\n            per-sample arrays cut to the number of words and significant subtokens respectively\n        \"\"\"\n        ids_tensor = torch.tensor(subtoken_ids_batch, device=self.device, dtype=torch.long)\n        startofwords_tensor = torch.tensor(startofwords_batch, device=self.device).bool()\n        attention_tensor = torch.tensor(attention_batch, device=self.device)\n        with torch.no_grad():\n            output = self.model(ids_tensor, attention_tensor)\n            last_hidden = output.last_hidden_state\n            pooler_output = output.pooler_output\n            # lengths are taken from the tensors, so truncation below works for plain lists as well as arrays\n            subtokens_lengths = attention_tensor.sum(dim=1).long().tolist()\n            attention_tensor = attention_tensor.unsqueeze(-1)\n            max_emb = torch.max(last_hidden - 1e9 * (1 - attention_tensor), dim=1)[0]\n            subword_emb = last_hidden * attention_tensor\n            # the divisor is clamped so that a fully padded sample gives zeros instead of NaN\n            mean_emb = torch.sum(subword_emb, dim=1) / torch.sum(attention_tensor, dim=1).clamp(min=1)\n\n            tokens_lengths = startofwords_tensor.sum(dim=1)\n            word_emb = torch.zeros((subword_emb.shape[0], int(tokens_lengths.max()), subword_emb.shape[2]),\n                                   device=self.device, dtype=subword_emb.dtype)\n            target_indexes = (torch.arange(word_emb.shape[1], device=self.device).expand(word_emb.shape[:-1]) <\n                              tokens_lengths.unsqueeze(-1))\n            word_emb[target_indexes] = subword_emb[startofwords_tensor]\n            words_lengths = tokens_lengths.tolist()\n\n        subword_emb = subword_emb.cpu().numpy()\n        word_emb = word_emb.cpu().numpy()\n        pooler_output = pooler_output.cpu().numpy()\n        max_emb = max_emb.cpu().numpy()\n        mean_emb = mean_emb.cpu().numpy()\n        if self.truncate:\n            subword_emb = [item[:length] for item, length in zip(subword_emb, subtokens_lengths)]\n            word_emb = [item[:length] for item, length in zip(word_emb, words_lengths)]\n        return word_emb, subword_emb, max_emb, mean_emb, pooler_output\n"
  },
  {
    "path": "deeppavlov/models/entity_extraction/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/entity_extraction/entity_detection_parser.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nfrom collections import defaultdict\nfrom logging import getLogger\nfrom string import punctuation\nfrom typing import List, Tuple, Union, Any\n\nimport numpy as np\nfrom nltk.corpus import stopwords\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\nlog = getLogger(__name__)\npunctuation = punctuation.replace('+', '')\n\n\n@register('question_sign_checker')\nclass QuestionSignChecker:\n    \"\"\"Appends a question sign to questions and, optionally, removes text in round brackets.\"\"\"\n\n    def __init__(self, delete_brackets: bool = False, **kwargs):\n        self.delete_brackets = delete_brackets\n        self.replace_tokens = [(\" '\", ' \"'), (\"' \", '\" '), (\" ?\", \"?\"), (\"  \", \" \")]\n\n    def __call__(self, questions: List[str]) -> List[str]:\n        \"\"\"Adds question sign if it is absent or replaces dots in the end with question sign.\"\"\"\n        questions_clean = []\n        for question in questions:\n            question = question if question.endswith('?') else f'{question.rstrip(\".\")}?'\n            if self.delete_brackets:\n                brackets_text = re.findall(r\"(\\(.*?\\))\", question)\n                for elem in brackets_text:\n                    question = question.replace(elem, \" \")\n            for old_tok, new_tok in self.replace_tokens:\n                question = question.replace(old_tok, new_tok)\n            questions_clean.append(question)\n        return questions_clean\n\n\n@register('entity_type_split')\ndef entity_type_split(entities_batch: List[List[str]], tags_batch: List[List[str]]) -> Tuple[\n    List[List[str]], List[List[str]], List[List[str]]]:\n    \"\"\"Separates substrings tagged ``T`` from the other entity substrings.\n\n    Args:\n        entities_batch: batch of lists of substrings\n        tags_batch: batch of lists of tags of these substrings\n    Returns:\n        batch of substrings whose tag is not ``T``\n        batch of lowercased tags of these substrings\n        batch of substrings whose tag is ``T``\n    \"\"\"\n    f_entities_batch, f_types_batch, f_tags_batch = [], [], []\n    for entities_list, tags_list in zip(entities_batch, tags_batch):\n        f_entities_list, f_types_list, f_tags_list = [], [], []\n        for entity, tag in zip(entities_list, tags_list):\n            if tag != \"T\":\n                f_entities_list.append(entity)\n                f_tags_list.append(tag.lower())\n            else:\n                f_types_list.append(entity)\n        f_entities_batch.append(f_entities_list)\n        f_tags_batch.append(f_tags_list)\n        f_types_batch.append(f_types_list)\n    return f_entities_batch, f_tags_batch, f_types_batch\n\n\n@register('entity_detection_parser')\nclass EntityDetectionParser(Component):\n    \"\"\"This class parses probabilities of tokens to be a token from the entity substring.\"\"\"\n\n    def __init__(self, o_tag: str, tags_file: str, entity_tags: List[str] = None, ignore_points: bool = False,\n                 thres_proba: float = 0.8, make_tags_from_probas: bool = False, lang: str = \"en\",\n                 ignored_tags: List[str] = None, **kwargs):\n        \"\"\"\n        Args:\n            o_tag: tag for tokens which are neither entities nor types\n            tags_file: filename with NER tags\n            entity_tags: tags for entities\n            ignore_points: whether to consider points as separate symbols\n            thres_proba: if the probability of the tag is less than thres_proba, we assign the tag as 'O'\n            make_tags_from_probas: whether to define token tags from confidences from sequence tagging model\n            lang: language of texts\n            ignored_tags: not used tags of entities\n        \"\"\"\n        self.entity_tags = entity_tags\n        self.o_tag = o_tag\n        self.ignore_points = ignore_points\n        self.thres_proba = thres_proba\n        self.tag_ind_dict = {}\n        with open(str(expand_path(tags_file))) as fl:\n            # the newline is removed first, otherwise a line without a tab would keep it inside the tag\n            tags = [line.rstrip('\\n').split('\\t')[0] for line in fl.readlines()]\n            self.tags = tags\n            if self.entity_tags is None:\n                self.entity_tags = list(\n                    {tag.split('-')[1] for tag in tags if len(tag.split('-')) > 1}.difference({self.o_tag}))\n\n            self.entity_prob_ind = {entity_tag: [i for i, tag in enumerate(tags) if entity_tag in tag]\n                                    for entity_tag in self.entity_tags}\n            self.tags_ind = {tag: i for i, tag in enumerate(tags)}\n            self.et_prob_ind = [i for tag, ind in self.entity_prob_ind.items() for i in ind]\n            for entity_tag, tag_ind in self.entity_prob_ind.items():\n                for ind in tag_ind:\n                    self.tag_ind_dict[ind] = entity_tag\n            self.tag_ind_dict[0] = self.o_tag\n        self.make_tags_from_probas = make_tags_from_probas\n        if lang == \"en\":\n            self.stopwords = set(stopwords.words(\"english\"))\n        elif lang == \"ru\":\n            self.stopwords = set(stopwords.words(\"russian\"))\n        else:\n            raise ValueError(f'Unsupported lang value: \"{lang}\". Only \"en\" and \"ru\" are allowed.')\n        self.ignored_tags = ignored_tags or []\n\n    def __call__(self, question_tokens_batch: List[List[str]], tokens_info_batch: List[List[List[float]]],\n                 tokens_probas_batch: np.ndarray) -> \\\n            Tuple[List[dict], List[dict], List[dict]]:\n        \"\"\"\n        Args:\n            question_tokens_batch: tokenized questions\n            tokens_info_batch: list of tags of question tokens\n            tokens_probas_batch: list of probabilities of question tokens\n        Returns:\n            Batch of dicts where keys are tags and values are lists of entity substrings\n            Batch of dicts where keys are tags and values are lists of token indices of every entity\n            Batch of dicts where keys are tags and values are lists of entity confidences\n        \"\"\"\n        entities_batch = []\n        positions_batch = []\n        probas_batch = []\n        for tokens, tags, probas in \\\n                zip(question_tokens_batch, tokens_info_batch, tokens_probas_batch):\n            if self.make_tags_from_probas:\n                tags, _ = self.tags_from_probas(tokens, probas)\n            tags = self.correct_quotes(tokens, tags, probas)\n            tags = self.correct_tags(tokens, tags)\n            entities, positions, entities_probas = self.entities_from_tags(tokens, tags, probas)\n            entities_batch.append(entities)\n            positions_batch.append(positions)\n            probas_batch.append(entities_probas)\n        return entities_batch, positions_batch, probas_batch\n\n    def tags_from_probas(self, tokens: List[str], probas: np.array) -> Tuple[List[Union[str, List[str]]], List[Any]]:\n        \"\"\"\n        This method makes a list of tags from a list of probas for tags\n        Args:\n            tokens: text tokens list\n            probas: probabilities for tokens to belong to particular tags\n        Returns:\n            list of tags for tokens\n            list of probabilities of these tags\n        \"\"\"\n        tags = []\n        tag_probas = []\n        for token, proba in zip(tokens, probas):\n            if proba[0] < self.thres_proba:\n                tag_num = np.argmax(proba[1:]) + 1\n            else:\n                tag_num = 0\n            tags.append(self.tags[tag_num])\n            tag_probas.append(proba[tag_num])\n\n        return tags, tag_probas\n\n    def correct_tags(self, tokens: List[str], tags: List[str]) -> List[str]:\n        \"\"\"Applies heuristic corrections to the tags (the list is changed in place) and returns them.\"\"\"\n        for i in range(len(tags) - 2):\n            if len(tags[i]) > 1 and tags[i].startswith(\"B-\"):\n                tag = tags[i].split(\"-\")[1]\n                if tags[i + 2] == f\"I-{tag}\" and tags[i + 1] != f\"I-{tag}\":\n                    tags[i + 1] = f\"I-{tag}\"\n            # equality instead of a substring test: an empty token must not be taken for an opening quote\n            if tokens[i + 1] == '«' and tags[i] != \"O\":\n                tags[i] = \"O\"\n                tags[i + 1] = \"O\"\n            # the hyphen check keeps split()[1] from failing on a tag without a prefix\n            if \"-\" in tags[i] and tags[i].split(\"-\")[1] == \"EVENT\":\n                found_n = -1\n                for j in range(i + 1, i + 3):\n                    if re.findall(r\"[\\d]{3,4}\", tokens[j]):\n                        found_n = j\n                        break\n                if found_n > 0:\n                    for j in range(i + 1, found_n + 1):\n                        tags[j] = \"I-EVENT\"\n            if i < len(tokens) - 3 and len(tokens[i]) == 1 and tokens[i + 1] == \".\" and len(tokens[i + 2]) == 1 \\\n                    and tokens[i + 3] == \".\" and tags[i + 2].startswith(\"B-\"):\n                tag = tags[i + 2].split(\"-\")[1]\n                tags[i] = f\"B-{tag}\"\n                tags[i + 1] = f\"I-{tag}\"\n                tags[i + 2] = f\"I-{tag}\"\n        return tags\n\n    def correct_quotes(self, tokens: List[str], tags: List[str], probas: np.array) -> List[str]:\n        \"\"\"Tags the tokens enclosed in quotes as one entity with the most probable tag other than PERSON.\"\"\"\n        quotes = {\"«\": \"»\", '\"': '\"'}\n        for i in range(len(tokens)):\n            if tokens[i] in {\"«\", '\"'}:\n                quote_start = tokens[i]\n                end_pos = 0\n                for j in range(i + 1, len(tokens)):\n                    if tokens[j] == quotes[quote_start]:\n                        end_pos = j\n                        break\n                if end_pos and end_pos != i + 1:\n                    probas_sum = np.sum(probas[i + 1:end_pos], axis=0)\n                    tags_probas = {}\n                    for tag in self.entity_prob_ind:\n                        for ind in self.entity_prob_ind[tag]:\n                            if tag not in tags_probas:\n                                tags_probas[tag] = probas_sum[ind]\n                            else:\n                                tags_probas[tag] += probas_sum[ind]\n                    tags_probas = list(tags_probas.items())\n                    tags_probas = sorted(tags_probas, key=lambda x: x[1], reverse=True)\n                    found_tag = \"\"\n                    for tag, _ in tags_probas:\n                        if tag != \"PERSON\":\n                            found_tag = tag\n                            break\n                    if found_tag:\n                        tags[i + 1] = f\"B-{found_tag}\"\n                        for j in range(i + 2, end_pos):\n                            tags[j] = f\"I-{found_tag}\"\n        return tags\n\n    def add_entity(self, entity: str, c_tag: str) -> None:\n        \"\"\"Stores the tokens accumulated for the tag as an entity and resets the accumulators of the tag.\n\n        Args:\n            entity: accumulated tokens of the entity\n            c_tag: tag of the entity\n        \"\"\"\n        replace_tokens = [(' - ', '-'), (\"'s\", ''), (' .', '.'), ('{', ''), ('}', ''),\n                          ('  ', ' '), ('\"', \"'\"), ('(', ''), (')', ''), (' +', '+')]\n        if entity and (entity[-1] in punctuation or entity[-1] == \"»\"):\n            entity = entity[:-1]\n            self.ent_pos_dict[c_tag] = self.ent_pos_dict[c_tag][:-1]\n        if entity and (entity[0] in punctuation or entity[0] == \"«\"):\n            entity = entity[1:]\n            self.ent_pos_dict[c_tag] = self.ent_pos_dict[c_tag][1:]\n        entity = ' '.join(entity)\n        for old, new in replace_tokens:\n            entity = entity.replace(old, new)\n        if entity and entity.lower() not in self.stopwords:\n            cur_probas = self.ent_probas_dict[c_tag]\n            self.ents_pos_probas_dict[c_tag].append((entity, self.ent_pos_dict[c_tag],\n                                                     round(sum(cur_probas) / len(cur_probas), 4)))\n        self.ent_dict[c_tag] = []\n        self.ent_pos_dict[c_tag] = []\n        self.ent_probas_dict[c_tag] = []\n\n    def _flush_entities(self) -> None:\n        \"\"\"Passes the tokens accumulated for every tag to add_entity.\"\"\"\n        for c_tag, entity in list(self.ent_dict.items()):\n            self.add_entity(entity, c_tag)\n\n    def entities_from_tags(self, tokens: List[str], tags: List[str],\n                           tag_probas: List[List[float]]) -> Tuple[dict, dict, dict]:\n        \"\"\"\n        This method makes dicts of substrings corresponding to entities, of indices of tokens which\n        correspond to entities and of entity confidences\n        Args:\n            tokens: list of tokens of the text\n            tags: list of tags for tokens\n            tag_probas: list of probabilities of tags\n        Returns:\n            dict of tags (keys) and lists of entity substrings (values)\n            dict of tags (keys) and lists of indices of entity tokens (values)\n            dict of tags (keys) and lists of entity confidences (values)\n        \"\"\"\n        self.ent_dict = defaultdict(list)\n        self.ent_pos_dict = defaultdict(list)\n        self.ent_probas_dict = defaultdict(list)\n        self.ents_pos_probas_dict = defaultdict(list)\n        for n, (tok, tag, probas) in enumerate(zip(tokens, tags, tag_probas)):\n            f_tag = tag.split('-')[-1]\n            if f_tag in self.entity_tags:\n                # a new entity begins, so the tokens collected so far are stored first\n                if tag.startswith(\"B-\") and any(self.ent_dict.values()):\n                    self._flush_entities()\n                self.ent_dict[f_tag].append(tok)\n                self.ent_pos_dict[f_tag].append(n)\n                self.ent_probas_dict[f_tag].append(probas[self.tags_ind[tag]])\n            elif any(self.ent_dict.values()):\n                self._flush_entities()\n        if any(self.ent_dict.values()):\n            self._flush_entities()\n\n        self.ents_pos_probas_dict = {tag: elements for tag, elements in self.ents_pos_probas_dict.items()\n                                     if tag not in self.ignored_tags}\n\n        entities_dict = {tag: [ent[0] for ent in ents] for tag, ents in self.ents_pos_probas_dict.items()}\n        entities_positions_dict = {tag: [ent[1] for ent in ents] for tag, ents in self.ents_pos_probas_dict.items()}\n        entities_probas_dict = {tag: [ent[2] for ent in ents] for tag, ents in self.ents_pos_probas_dict.items()}\n        log.debug(f\"entities_dict {entities_dict}\")\n\n        return entities_dict, entities_positions_dict, entities_probas_dict\n"
  },
  {
    "path": "deeppavlov/models/entity_extraction/entity_linking.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nimport sqlite3\nfrom logging import getLogger\nfrom typing import List, Dict, Tuple, Any, Union\nfrom collections import defaultdict\n\nimport nltk\nimport spacy\nfrom hdt import HDTDocument\nfrom nltk.corpus import stopwords\nfrom rapidfuzz import fuzz\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.core.models.serializable import Serializable\nfrom deeppavlov.models.entity_extraction.find_word import WordSearcher\n\nlog = getLogger(__name__)\nnltk.download(\"stopwords\")\n\n\n@register(\"entity_linker\")\nclass EntityLinker(Component, Serializable):\n    \"\"\"\n    Class for linking of entity substrings in the document to entities in Wikidata\n    \"\"\"\n\n    def __init__(\n            self,\n            load_path: str,\n            entity_ranker=None,\n            entities_database_filename: str = None,\n            words_dict_filename: str = None,\n            ngrams_matrix_filename: str = None,\n            num_entities_for_bert_ranking: int = 50,\n            num_entities_for_conn_ranking: int = 5,\n            num_entities_to_return: int = 10,\n            max_text_len: int = 300,\n            max_paragraph_len: int = 150,\n            lang: str = \"ru\",\n            
use_descriptions: bool = True,\n            alias_coef: float = 1.1,\n            use_tags: bool = False,\n            lemmatize: bool = False,\n            full_paragraph: bool = False,\n            use_connections: bool = False,\n            kb_filename: str = None,\n            prefixes: Dict[str, Any] = None,\n            **kwargs,\n    ) -> None:\n        \"\"\"\n\n        Args:\n            load_path: path to folder with inverted index files\n            entity_ranker: component deeppavlov.models.kbqa.rel_ranking_bert\n            entities_database_filename: filename with database with entities index\n            words_dict_filename: filename with words and corresponding tags\n            ngrams_matrix_filename: filename with char tfidf matrix\n            num_entities_for_bert_ranking: number of candidate entities for BERT ranking using description and context\n            num_entities_for_conn_ranking: number of candidate entities for ranking using connections in the knowledge\n                graph\n            num_entities_to_return: number of candidate entities for the substring which are returned\n            max_text_len: maximal length of entity context\n            max_paragraph_len: maximal length of context paragraphs\n            lang: russian or english\n            use_description: whether to perform entity ranking by context and description\n            alias_coef: coefficient which is multiplied by the substring matching confidence if the substring is the\n                title of the entity\n            use_tags: whether to filter candidate entities by tags\n            lemmatize: whether to lemmatize tokens\n            full_paragraph: whether to use full paragraph for entity context\n            use_connections: whether to rank entities by connections in the knowledge graph\n            kb_filename: filename with the knowledge base in HDT format\n            prefixes: entity and title prefixes\n            **kwargs:\n        \"\"\"\n        
super().__init__(save_path=None, load_path=load_path)\n        self.lemmatize = lemmatize\n        self.num_entities_for_bert_ranking = num_entities_for_bert_ranking\n        self.num_entities_for_conn_ranking = num_entities_for_conn_ranking\n        self.entity_ranker = entity_ranker\n        self.entities_database_filename = entities_database_filename\n        self.num_entities_to_return = num_entities_to_return\n        self.max_text_len = max_text_len\n        self.max_paragraph_len = max_paragraph_len\n        self.lang = f\"@{lang}\"\n        if self.lang == \"@en\":\n            self.stopwords = set(stopwords.words(\"english\"))\n            self.nlp = spacy.load(\"en_core_web_sm\")\n        elif self.lang == \"@ru\":\n            self.stopwords = set(stopwords.words(\"russian\"))\n            self.nlp = spacy.load(\"ru_core_news_sm\")\n        self.alias_coef = alias_coef\n        self.use_descriptions = use_descriptions\n        self.use_connections = use_connections\n        self.use_tags = use_tags\n        self.full_paragraph = full_paragraph\n        self.re_tokenizer = re.compile(r\"[\\w']+|[^\\w ]\")\n        self.related_tags = {\n            \"loc\": [\"gpe\", \"country\", \"city\", \"us_state\", \"river\"],\n            \"gpe\": [\"loc\", \"country\", \"city\", \"us_state\"],\n            \"work_of_art\": [\"product\", \"law\"],\n            \"product\": [\"work_of_art\"],\n            \"law\": [\"work_of_art\"],\n            \"org\": [\"fac\", \"business\"],\n            \"business\": [\"org\"]\n        }\n        self.word_searcher = None\n        if words_dict_filename:\n            self.word_searcher = WordSearcher(words_dict_filename, ngrams_matrix_filename, self.lang)\n        self.kb_filename = kb_filename\n        self.prefixes = prefixes\n        self.load()\n\n    def load(self) -> None:\n        self.conn = sqlite3.connect(str(self.load_path / self.entities_database_filename))\n        self.cur = self.conn.cursor()\n        self.kb = 
None\n        if self.kb_filename:\n            self.kb = HDTDocument(str(expand_path(self.kb_filename)))\n\n    def save(self) -> None:\n        pass\n\n    def __call__(\n            self,\n            substr_batch: List[List[str]],\n            tags_batch: List[List[str]] = None,\n            probas_batch: List[List[float]] = None,\n            sentences_batch: List[List[str]] = None,\n            offsets_batch: List[List[List[int]]] = None,\n            sentences_offsets_batch: List[List[Tuple[int, int]]] = None,\n            entities_to_link_batch: List[List[int]] = None\n    ):\n        if (not sentences_offsets_batch or sentences_offsets_batch[0] is None) and sentences_batch is not None:\n            sentences_offsets_batch = []\n            for sentences_list in sentences_batch:\n                sentences_offsets_list = []\n                start = 0\n                for sentence in sentences_list:\n                    end = start + len(sentence)\n                    sentences_offsets_list.append([start, end])\n                    start = end + 1\n                sentences_offsets_batch.append(sentences_offsets_list)\n\n        if sentences_batch is None:\n            sentences_batch = [[] for _ in substr_batch]\n            sentences_offsets_batch = [[] for _ in substr_batch]\n\n        if not entities_to_link_batch or entities_to_link_batch[0] is None:\n            entities_to_link_batch = [[1 for _ in substr_list] for substr_list in substr_batch]\n\n        log.debug(f\"substr: {substr_batch} --- sentences_batch: {sentences_batch} --- offsets: {offsets_batch}\")\n        if (not offsets_batch or offsets_batch[0] is None) and sentences_batch:\n            offsets_batch = []\n            for substr_list, sentences_list in zip(substr_batch, sentences_batch):\n                text = \" \".join(sentences_list).lower()\n                log.debug(f\"text {text}\")\n                offsets_list = []\n                for substr in substr_list:\n                    
st_offset = text.find(substr.lower())\n                    end_offset = st_offset + len(substr)\n                    offsets_list.append([st_offset, end_offset])\n                offsets_batch.append(offsets_list)\n        ids_batch, conf_batch, pages_batch, labels_batch = [], [], [], []\n        for substr_list, offsets_list, tags_list, probas_list, sentences_list, sentences_offsets_list, \\\n            entities_to_link in zip(substr_batch, offsets_batch, tags_batch, probas_batch, sentences_batch,\n                                    sentences_offsets_batch, entities_to_link_batch):\n            ids_list, conf_list, pages_list, labels_list = \\\n                self.link_entities(substr_list, offsets_list, tags_list, probas_list, sentences_list,\n                                   sentences_offsets_list, entities_to_link)\n            log.debug(f\"ids_list {ids_list} conf_list {conf_list}\")\n            if self.num_entities_to_return == 1:\n                pages_list = [pages[0] for pages in pages_list]\n            else:\n                pages_list = [pages[: len(ids)] for pages, ids in zip(pages_list, ids_list)]\n            ids_batch.append(ids_list)\n            conf_batch.append(conf_list)\n            pages_batch.append(pages_list)\n            labels_batch.append(labels_list)\n        return ids_batch, conf_batch, pages_batch, labels_batch\n\n    def link_entities(\n            self,\n            substr_list: List[str],\n            offsets_list: List[List[int]],\n            tags_list: List[str],\n            probas_list: List[float],\n            sentences_list: List[str],\n            sentences_offsets_list: List[List[int]],\n            entities_to_link: List[int]\n    ) -> Tuple[List[Any], List[Any], List[List[Union[str, Any]]], List[List[Union[str, Any]]]]:\n        log.debug(f\"substr_list {substr_list} tags_list {tags_list} probas {probas_list} offsets_list {offsets_list}\")\n        ids_list, conf_list, pages_list, label_list, descr_list = [], 
[], [], [], []\n        if substr_list:\n            entities_scores_list = []\n            cand_ent_scores_list = []\n            for substr, tags, proba in zip(substr_list, tags_list, probas_list):\n                for old_symb, new_symb in [(\"'s\", \"\"), (\"@\", \"\"), (\"  \", \" \"), (\".\", \"\"), (\",\", \"\"), (\"-\", \" \"),\n                                           (\"'\", \" \"), (\"!\", \"\"), (\":\", \"\"), (\"&\", \"\"), (\"/\", \" \"), ('\"', \"\"),\n                                           (\"  \", \" \")]:\n                    substr = substr.replace(old_symb, new_symb)\n                substr = substr.strip()\n                cand_ent_init = defaultdict(set)\n                if len(substr) > 1:\n                    if isinstance(tags, str):\n                        tags = [tags]\n                    tags = [tag.lower() for tag in tags]\n                    if tags and not isinstance(tags[0], (list, tuple)):\n                        tags = [(tag, 1.0) for tag in tags]\n                    if tags and tags[0][0] == \"e\":\n                        use_tags_flag = False\n                    else:\n                        use_tags_flag = True\n                    cand_ent_init = self.find_exact_match(substr, tags, use_tags=use_tags_flag)\n                    new_substr = re.sub(r\"\\b([a-z]{1}) ([a-z]{1})\\b\", r\"\\1\\2\", substr)\n                    if substr != new_substr:\n                        new_cand_ent_init = self.find_exact_match(new_substr, tags, use_tags=use_tags_flag)\n                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)\n\n                    init_substr_split = substr.lower().split(\" \")\n                    if tags[0][0] in {\"person\", \"work_of_art\"}:\n                        substr_split = [word for word in substr.lower().split(\" \") if len(word) > 0]\n                    else:\n                        substr_split = [word for word in substr.lower().split(\" \")\n                      
                  if word not in self.stopwords and len(word) > 0]\n\n                    substr_split_lemm = [self.nlp(tok)[0].lemma_ for tok in substr_split]\n                    substr_lemm = \" \".join(substr_split_lemm)\n                    if substr_split != substr_split_lemm \\\n                            or (tags[0][0] == \"work_of_art\"\n                                and len(substr_split) != len(init_substr_split)):\n                        new_cand_ent_init = self.find_fuzzy_match(substr_split, tags, use_tags=use_tags_flag)\n                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)\n                    if substr_split != substr_split_lemm:\n                        new_cand_ent_init = self.find_exact_match(substr_lemm, tags, use_tags=use_tags_flag)\n                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)\n                        new_cand_ent_init = self.find_fuzzy_match(substr_split_lemm, tags, use_tags=use_tags_flag)\n                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)\n\n                    all_low_conf = self.define_all_low_conf(cand_ent_init, 1.0)\n                    clean_tags, corr_tags, corr_clean_tags = self.correct_tags(tags)\n                    log.debug(f\"substr: {substr} --- lemm: {substr_split_lemm} --- tags: {tags} --- corr_tags: \"\n                              f\"{corr_tags} --- all_low_conf: {all_low_conf} --- cand_ent_init: {len(cand_ent_init)}\")\n\n                    if (not cand_ent_init or all_low_conf) and corr_tags:\n                        corr_cand_ent_init = self.find_exact_match(substr, corr_tags, use_tags=use_tags_flag)\n                        cand_ent_init = self.unite_dicts(cand_ent_init, corr_cand_ent_init)\n                        if substr_split != substr_split_lemm:\n                            new_cand_ent_init = self.find_exact_match(substr_lemm, corr_tags, use_tags=use_tags_flag)\n                         
   cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)\n                            new_cand_ent_init = self.find_fuzzy_match(substr_split_lemm, corr_tags,\n                                                                      use_tags=use_tags_flag)\n                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)\n\n                    if not cand_ent_init and len(substr_split) == 1 and self.word_searcher:\n                        corr_words = self.word_searcher(substr_split[0], set(clean_tags + corr_clean_tags))\n                        if corr_words:\n                            cand_ent_init = self.find_exact_match(corr_words[0], tags + corr_tags,\n                                                                  use_tags=use_tags_flag)\n\n                    if not cand_ent_init and len(substr_split) > 1:\n                        cand_ent_init = self.find_fuzzy_match(substr_split, tags)\n\n                    all_low_conf = self.define_all_low_conf(cand_ent_init, 0.85)\n                    if (not cand_ent_init or all_low_conf) and tags[0][0] != \"t\":\n                        use_tags_flag = False\n                        new_cand_ent_init = self.find_exact_match(substr, tags, use_tags=use_tags_flag)\n                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)\n                        if substr_split != substr_split_lemm and (tags[0][0] == \"e\" or not cand_ent_init):\n                            new_cand_ent_init = self.find_fuzzy_match(substr_split, tags, use_tags=use_tags_flag)\n                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)\n                            new_cand_ent_init = self.find_fuzzy_match(substr_split_lemm, tags, use_tags=use_tags_flag)\n                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)\n\n                cand_ent_scores = []\n                for entity in cand_ent_init:\n            
        entities_scores = list(cand_ent_init[entity])\n                    entities_scores = sorted(entities_scores, key=lambda x: (x[0], x[2], x[1]), reverse=True)\n                    cand_ent_scores.append(([entity] + list(entities_scores[0])))\n\n                cand_ent_scores = sorted(cand_ent_scores, key=lambda x: (x[1], x[3], x[2]), reverse=True)\n                cand_ent_scores = cand_ent_scores[: self.num_entities_for_bert_ranking]\n                cand_ent_scores_list.append(cand_ent_scores)\n                entity_ids = [elem[0] for elem in cand_ent_scores]\n                scores = [elem[1:4] for elem in cand_ent_scores]\n                conf_list.append(scores)\n                entities_scores_list.append(\n                    {entity_id: entity_scores for entity_id, entity_scores in zip(entity_ids, scores)}\n                )\n                ids_list.append(entity_ids)\n                pages = [elem[4] for elem in cand_ent_scores]\n                entity_labels = [elem[5] for elem in cand_ent_scores]\n                pages_list.append({entity_id: page for entity_id, page in zip(entity_ids, pages)})\n                label_list.append(\n                    {entity_id: entity_label for entity_id, entity_label in zip(entity_ids, entity_labels)})\n                descr_list.append([elem[6] for elem in cand_ent_scores])\n\n            scores_dict = {}\n            if self.use_connections and self.kb:\n                scores_dict = self.rank_by_connections(ids_list)\n\n            substr_lens = [len(entity_substr.split()) for entity_substr in substr_list]\n            ids_list, conf_list = self.rank_by_description(substr_list, tags_list, offsets_list, ids_list,\n                                                           descr_list, entities_scores_list, sentences_list,\n                                                           sentences_offsets_list, substr_lens, scores_dict)\n        label_list = [[label_dict.get(entity_id, \"\") for entity_id in 
entity_ids]\n                      for entity_ids, label_dict in zip(ids_list, label_list)]\n        pages_list = [[pages_dict.get(entity_id, \"\") for entity_id in entity_ids]\n                      for entity_ids, pages_dict in zip(ids_list, pages_list)]\n\n        f_ids_list, f_conf_list, f_pages_list, f_label_list = [], [], [], []\n        for ids, confs, pages, labels, add_flag in \\\n                zip(ids_list, conf_list, pages_list, label_list, entities_to_link):\n            if add_flag:\n                f_ids_list.append(ids)\n                f_conf_list.append(confs)\n                f_pages_list.append(pages)\n                f_label_list.append(labels)\n        return f_ids_list, f_conf_list, f_pages_list, f_label_list\n\n    def define_all_low_conf(self, cand_ent_init, thres):\n        all_low_conf = True\n        for entity_id in cand_ent_init:\n            entity_info_set = cand_ent_init[entity_id]\n            for entity_info in entity_info_set:\n                if entity_info[0] >= thres:\n                    all_low_conf = False\n                    break\n            if not all_low_conf:\n                break\n        return all_low_conf\n\n    def correct_tags(self, tags):\n        clean_tags = [tag for tag, conf in tags]\n        corr_tags, corr_clean_tags = [], []\n        for tag, conf in tags:\n            if tag in self.related_tags:\n                corr_tag_list = self.related_tags[tag]\n                for corr_tag in corr_tag_list:\n                    if corr_tag not in clean_tags and corr_tag not in corr_clean_tags:\n                        corr_tags.append([corr_tag, conf])\n                        corr_clean_tags.append(corr_tag)\n        return clean_tags, corr_tags, corr_clean_tags\n\n    def unite_dicts(self, cand_ent_init, new_cand_ent_init):\n        for entity_id in new_cand_ent_init:\n            if entity_id in cand_ent_init:\n                for entity_info in new_cand_ent_init[entity_id]:\n                    
cand_ent_init[entity_id].add(entity_info)\n            else:\n                cand_ent_init[entity_id] = new_cand_ent_init[entity_id]\n        return cand_ent_init\n\n    def process_cand_ent(self, cand_ent_init, entities_and_ids, substr_split, tag, tag_conf, use_tags):\n        for title, entity_id, rels, ent_tag, page, label, descr in entities_and_ids:\n            if (ent_tag == tag and use_tags) or not use_tags:\n                substr_score = self.calc_substr_score(title, substr_split, tag, ent_tag, label)\n                cand_ent_init[entity_id].add((substr_score, rels, tag_conf, page, label, descr))\n        return cand_ent_init\n\n    def sanitize_substr(self, entity_substr, tag):\n        if tag == \"person\":\n            entity_substr_split = entity_substr.split()\n            if len(entity_substr_split) > 1 and len(entity_substr_split[-1]) > 1 and len(entity_substr_split[-2]) == 1:\n                entity_substr = entity_substr_split[-1]\n        return entity_substr\n\n    def find_exact_match(self, entity_substr, tags, use_tags=True):\n        entity_substr = entity_substr.lower()\n        entity_substr_split = entity_substr.split()\n        cand_ent_init = defaultdict(set)\n        for tag, tag_conf in tags:\n            entity_substr = self.sanitize_substr(entity_substr, tag)\n            query = \"SELECT * FROM inverted_index WHERE title MATCH ?;\"\n            entities_and_ids = []\n            try:\n                res = self.cur.execute(query, (entity_substr,))\n                entities_and_ids = res.fetchall()\n            except:\n                log.info(f\"error in query execute {query}\")\n            if entities_and_ids:\n                cand_ent_init = self.process_cand_ent(\n                    cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf, use_tags)\n        return cand_ent_init\n\n    def find_fuzzy_match(self, entity_substr_split, tags, use_tags=True):\n        cand_ent_init = defaultdict(set)\n        for tag, 
tag_conf in tags:\n            if len(entity_substr_split) > 3:\n                entity_substr_split = [\" \".join(entity_substr_split[i:i + 2])\n                                       for i in range(len(entity_substr_split) - 1)]\n            for word in entity_substr_split:\n                if len(word) > 1 and word not in self.stopwords:\n                    query = \"SELECT * FROM inverted_index WHERE title MATCH ?;\"\n                    part_entities_and_ids = []\n                    try:\n                        res = self.cur.execute(query, (word,))\n                        part_entities_and_ids = res.fetchall()\n                    except:\n                        log.info(f\"error in query execute {query}\")\n                    if part_entities_and_ids:\n                        cand_ent_init = self.process_cand_ent(\n                            cand_ent_init, part_entities_and_ids, entity_substr_split, tag, tag_conf, use_tags)\n        return cand_ent_init\n\n    def match_tokens(self, entity_substr_split, label_tokens):\n        cnt = 0.0\n        if not (len(entity_substr_split) > 1 and len(label_tokens) > 1\n                and set(entity_substr_split) != set(label_tokens) and label_tokens[0] != label_tokens[-1]\n                and ((entity_substr_split[0] == label_tokens[-1]) or (entity_substr_split[-1] == label_tokens[0]))):\n            for ent_tok in entity_substr_split:\n                found = False\n                for label_tok in label_tokens:\n                    if label_tok == ent_tok:\n                        found = True\n                        break\n                if found:\n                    cnt += 1.0\n                else:\n                    for label_tok in label_tokens:\n                        if label_tok[:2] == ent_tok[:2]:\n                            fuzz_score = fuzz.ratio(label_tok, ent_tok)\n                            c_long_toks = len(label_tok) >= 8 and label_tok[:6] == ent_tok[:6] and fuzz_score > 70.0\n         
                   c_shrt_toks = len(label_tokens) > 2 and len(label_tok) > 3 and label_tok[:4] == ent_tok[:4]\n                            if (fuzz_score >= 75.0 or c_long_toks or c_shrt_toks) and not found:\n                                cnt += fuzz_score * 0.01\n                                break\n        substr_score = round(cnt / max(len(label_tokens), len(entity_substr_split)), 3)\n        if len(label_tokens) == 2 and len(entity_substr_split) == 1:\n            if entity_substr_split[0] == label_tokens[1]:\n                substr_score = 0.5\n            elif entity_substr_split[0] == label_tokens[0]:\n                substr_score = 0.3\n        return substr_score\n\n    def correct_substr_score(self, entity_substr_split, label_tokens, substr_score):\n        if sum([len(tok) == 1 for tok in entity_substr_split]) == 2 and len(label_tokens) >= 2 \\\n                and any([(len(tok) == 2 and re.findall(r\"[a-z]{2}\", tok)) for tok in label_tokens]):\n            new_label_tokens = []\n            for tok in label_tokens:\n                if len(tok) == 2 and re.findall(r\"[a-z]{2}\", tok):\n                    new_label_tokens.append(tok[0])\n                    new_label_tokens.append(tok[1])\n                else:\n                    new_label_tokens.append(tok)\n            label_tokens = new_label_tokens\n        if any([re.findall(r\"[\\d]{4}\", tok) for tok in entity_substr_split]) \\\n                and any([re.findall(r\"[\\d]{4}–[\\d]{2}\", tok) for tok in label_tokens]):\n            new_label_tokens = []\n            for tok in label_tokens:\n                if re.findall(r\"[\\d]{4}–[\\d]{2}\", tok):\n                    new_label_tokens.append(tok[:4])\n                    new_label_tokens.append(tok[5:])\n                else:\n                    new_label_tokens.append(tok)\n            label_tokens = new_label_tokens\n        new_substr_score = self.match_tokens(entity_substr_split, label_tokens)\n        substr_score = 
max(substr_score, new_substr_score)\n        return substr_score\n\n    def calc_substr_score(self, entity_title, entity_substr_split, tag, ent_tag, entity_label):\n        if self.lang == \"@ru\":\n            entity_title = entity_title.replace(\"ё\", \"е\")\n        label_tokens = entity_title.split()\n        substr_score = self.match_tokens(entity_substr_split, label_tokens)\n        substr_score = self.correct_substr_score(entity_substr_split, label_tokens, substr_score)\n        if re.findall(r\" \\(.*\\)\", entity_label):\n            entity_label_split = entity_label.replace(\"(\", \"\").replace(\")\", \"\").lower().split()\n            lbl_substr_score = self.match_tokens(entity_substr_split, entity_label_split)\n            substr_score = max(substr_score, lbl_substr_score)\n        if tag == ent_tag and tag.lower() == \"person\" and len(entity_substr_split) > 1 \\\n                and len(entity_substr_split[-1]) > 1 and len(entity_substr_split[-2]) == 1 \\\n                and len(label_tokens) == len(entity_substr_split):\n            cnt = 0.0\n            for j in range(len(label_tokens) - 1):\n                if label_tokens[j][0] == entity_substr_split[j][0]:\n                    cnt += 1.0\n            if label_tokens[-1] == entity_substr_split[-1]:\n                cnt += 1.0\n            new_substr_score = cnt / len(label_tokens)\n            substr_score = max(substr_score, new_substr_score)\n\n        if entity_title.lower() == entity_label.lower() and substr_score == 1.0:\n            substr_score = substr_score * self.alias_coef\n        return substr_score\n\n    def rank_by_description(\n            self,\n            entity_substr_list: List[str],\n            tags_list: List[str],\n            entity_offsets_list: List[List[int]],\n            cand_ent_list: List[List[str]],\n            cand_ent_descr_list: List[List[str]],\n            entities_scores_list: List[Dict[str, Tuple[int, float]]],\n            sentences_list: List[str],\n  
          sentences_offsets_list: List[Tuple[int, int]],\n            substr_lens: List[int],\n            scores_dict: Dict[str, int] = None\n    ) -> Tuple[List[Union[Union[float, List[Any], List[Union[float, Any]]], Any]], List[\n        Union[Union[tuple, List[tuple], List[Any], List[Tuple[Union[float, Any], ...]]], Any]]]:\n        entity_ids_list = []\n        conf_list = []\n        contexts = []\n        for entity_offset in entity_offsets_list:\n            context, sentence = \"\", \"\"\n            if len(entity_offset) == 2:\n                entity_start_offset, entity_end_offset = entity_offset\n                rel_start_offset = 0\n                rel_end_offset = 0\n                found_sentence_num = 0\n                for num, (sent, (sent_start_offset, sent_end_offset)) in enumerate(\n                        zip(sentences_list, sentences_offsets_list)\n                ):\n                    if entity_start_offset >= sent_start_offset and entity_end_offset <= sent_end_offset:\n                        sentence = sent\n                        found_sentence_num = num\n                        rel_start_offset = entity_start_offset - sent_start_offset\n                        rel_end_offset = entity_end_offset - sent_start_offset\n                        break\n            if sentence:\n                start_of_sentence = 0\n                end_of_sentence = len(sentence)\n                if len(sentence) > self.max_text_len:\n                    start_of_sentence = max(rel_start_offset - self.max_text_len // 2, 0)\n                    end_of_sentence = min(rel_end_offset + self.max_text_len // 2, len(sentence))\n                text_before = sentence[start_of_sentence:rel_start_offset]\n                text_after = sentence[rel_end_offset:end_of_sentence]\n                context = text_before + \"[ENT]\" + text_after\n                if self.full_paragraph:\n                    cur_sent_len = len(re.findall(self.re_tokenizer, context))\n            
        first_sentence_num = found_sentence_num\n                    last_sentence_num = found_sentence_num\n                    context = [context]\n                    while True:\n                        added = False\n                        if last_sentence_num < len(sentences_list) - 1:\n                            sentence_tokens = re.findall(self.re_tokenizer, sentences_list[last_sentence_num + 1])\n                            last_sentence_len = len(sentence_tokens)\n                            if cur_sent_len + last_sentence_len < self.max_paragraph_len:\n                                context.append(sentences_list[last_sentence_num + 1])\n                                cur_sent_len += last_sentence_len\n                                last_sentence_num += 1\n                                added = True\n                        if first_sentence_num > 0:\n                            sentence_tokens = re.findall(self.re_tokenizer, sentences_list[first_sentence_num - 1])\n                            first_sentence_len = len(sentence_tokens)\n                            if cur_sent_len + first_sentence_len < self.max_paragraph_len:\n                                context = [sentences_list[first_sentence_num - 1]] + context\n                                cur_sent_len += first_sentence_len\n                                first_sentence_num -= 1\n                                added = True\n                        if not added:\n                            break\n                    context = \" \".join(context)\n\n            log.debug(f\"rank, context: {context}\")\n            contexts.append(context)\n\n        if self.use_descriptions:\n            scores_list = self.entity_ranker(contexts, cand_ent_list, cand_ent_descr_list)\n        else:\n            scores_list = [[(entity_id, 1.0) for entity_id in cand_ent] for cand_ent in cand_ent_list]\n\n        for entity_substr, tag, context, candidate_entities, substr_len, entities_scores, scores in 
zip(\n                entity_substr_list, tags_list, contexts, cand_ent_list, substr_lens, entities_scores_list, scores_list\n        ):\n            entities_with_scores = []\n            max_conn_score = 0\n            if scores_dict and scores:\n                max_conn_score = max([scores_dict.get(entity, 0) for entity, _ in scores])\n            for entity, score in scores:\n                substr_score = round(entities_scores.get(entity, (0.0, 0))[0], 2)\n                num_rels = entities_scores.get(entity, (0.0, 0))[1]\n                if len(context.split()) < 4:\n                    score = 0.95\n                elif scores_dict and 0 < max_conn_score == scores_dict.get(entity, 0):\n                    score = 1.0\n                    num_rels = 200\n                entities_with_scores.append((entity, substr_score, num_rels, float(score)))\n\n            if tag == \"t\":\n                entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[2], x[3]), reverse=True)\n            else:\n                entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[3], x[2]), reverse=True)\n            log.debug(f\"{entity_substr} --- tag: {tag} --- entities_with_scores: {entities_with_scores}\")\n\n            if not entities_with_scores:\n                top_entities = []\n                top_conf = []\n            elif entities_with_scores and substr_len == 1 and entities_with_scores[0][1] < 1.0:\n                top_entities = []\n                top_conf = []\n            elif entities_with_scores and (\n                    entities_with_scores[0][1] < 0.3\n                    or (entities_with_scores[0][3] < 0.13 and entities_with_scores[0][2] < 20)\n                    or (entities_with_scores[0][3] < 0.3 and entities_with_scores[0][2] < 4)\n                    or entities_with_scores[0][1] < 0.6\n            ):\n                top_entities = []\n                top_conf = []\n            else:\n                
top_entities = [score[0] for score in entities_with_scores]\n                top_conf = [score[1:] for score in entities_with_scores]\n\n            high_conf_entities = []\n            high_conf_nums = []\n            for elem_num, (entity, conf) in enumerate(zip(top_entities, top_conf)):\n                if len(conf) == 3 and conf[0] >= 1.0 and conf[1] > 50 and conf[2] > 0.3:\n                    new_conf = list(conf)\n                    if new_conf[1] > 55:\n                        new_conf[2] = 1.0\n                    new_conf = tuple(new_conf)\n                    high_conf_entities.append((entity,) + new_conf)\n                    high_conf_nums.append(elem_num)\n\n            high_conf_entities = sorted(high_conf_entities, key=lambda x: (x[1], x[3], x[2]), reverse=True)\n            log.debug(f\"high_conf_entities: {high_conf_entities}\")\n            for n, elem_num in enumerate(high_conf_nums):\n                if 0 <= elem_num - n < len(top_entities):\n                    del top_entities[elem_num - n]\n                    del top_conf[elem_num - n]\n\n            top_entities = [elem[0] for elem in high_conf_entities] + top_entities\n            top_conf = [elem[1:] for elem in high_conf_entities] + top_conf\n\n            if not top_entities:\n                entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[2], x[3]), reverse=True)\n                top_entities = [score[0] for score in entities_with_scores]\n                top_conf = [score[1:] for score in entities_with_scores]\n\n            if self.num_entities_to_return == 1 and top_entities:\n                entity_ids_list.append(top_entities[0])\n                conf_list.append([round(cnf, 2) for cnf in top_conf[0]])\n            elif self.num_entities_to_return == \"max\":\n                if top_conf:\n                    max_conf = top_conf[0][0]\n                    max_rank_conf = top_conf[0][2]\n                    entity_ids, confs = [], []\n                   
 for entity_id, conf in zip(top_entities, top_conf):\n                        if (conf[0] >= max_conf * 0.9 and max_rank_conf <= 1.0) \\\n                                or (max_rank_conf == 1.0 and conf[2] == 1.0):\n                            entity_ids.append(entity_id)\n                            confs.append([round(cnf, 2) for cnf in conf])\n                    entity_ids_list.append(entity_ids)\n                    conf_list.append(confs)\n                else:\n                    entity_ids_list.append([])\n                    conf_list.append([])\n            else:\n                entity_ids_list.append(top_entities[: self.num_entities_to_return])\n                conf_list.append([[round(cnf, 2) for cnf in conf] for conf in top_conf[: self.num_entities_to_return]])\n            log.debug(f\"{entity_substr} --- top entities {entity_ids_list[-1]} --- top_conf {conf_list[-1]}\")\n        return entity_ids_list, conf_list\n\n    def sort_out_low_conf(self, entity_substr, top_entities, top_conf):\n        if len(entity_substr.split()) > 1 and top_conf:\n            f_top_entities, f_top_conf = [], []\n            for top_conf_thres, conf_thres in [(1.0, 0.9), (0.9, 0.8)]:\n                if top_conf[0][0] >= top_conf_thres:\n                    for ent, conf in zip(top_entities, top_conf):\n                        if conf[0] > conf_thres:\n                            f_top_entities.append(ent)\n                            f_top_conf.append(conf)\n            return f_top_entities, f_top_conf\n        return top_entities, top_conf\n\n    def rank_by_connections(self, ids_list):\n        objects_sets_dict, scores_dict, conn_dict = {}, {}, {}\n        for ids in ids_list:\n            for entity_id in ids:\n                scores_dict[entity_id] = 0\n                conn_dict[entity_id] = set()\n        for ids in ids_list:\n            for entity_id in ids[:self.num_entities_for_conn_ranking]:\n                objects = set()\n                for prefix in 
self.prefixes[\"entity\"]:\n                    tr, _ = self.kb.search_triples(f\"{prefix}/{entity_id}\", \"\", \"\")\n                    for subj, rel, obj in tr:\n                        if rel.split(\"/\")[-1] not in {\"P31\", \"P279\"}:\n                            if any([obj.startswith(pr) for pr in self.prefixes[\"entity\"]]):\n                                objects.add(obj.split(\"/\")[-1])\n                            if rel.startswith(self.prefixes[\"rels\"][\"no_type\"]):\n                                tr2, _ = self.kb.search_triples(obj, \"\", \"\")\n                                for _, rel2, obj2 in tr2:\n                                    if rel2.startswith(self.prefixes[\"rels\"][\"statement\"]) \\\n                                            or rel2.startswith(self.prefixes[\"rels\"][\"qualifier\"]):\n                                        if any([obj2.startswith(pr) for pr in self.prefixes[\"entity\"]]):\n                                            objects.add(obj2.split(\"/\")[-1])\n                objects_sets_dict[entity_id] = objects\n                for obj in objects:\n                    if obj not in objects_sets_dict:\n                        objects_sets_dict[obj] = set()\n                    objects_sets_dict[obj].add(entity_id)\n\n        for i in range(len(ids_list)):\n            for j in range(len(ids_list)):\n                if i != j:\n                    for entity_id1 in ids_list[i][:self.num_entities_for_conn_ranking]:\n                        for entity_id2 in ids_list[j][:self.num_entities_for_conn_ranking]:\n                            if entity_id1 in objects_sets_dict[entity_id2]:\n                                conn_dict[entity_id1].add(entity_id2)\n                                conn_dict[entity_id2].add(entity_id1)\n        for entity_id in conn_dict:\n            scores_dict[entity_id] = len(conn_dict[entity_id])\n        return scores_dict\n"
  },
  {
    "path": "deeppavlov/models/entity_extraction/find_word.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport itertools\nimport pickle\nfrom collections import Counter\n\nimport numpy as np\nimport scipy as sp\n\nfrom deeppavlov.core.commands.utils import expand_path\n\nSparse = sp.sparse.csr_matrix\n\n\nclass WordSearcher:\n    def __init__(self, words_dict_filename: str, ngrams_matrix_filename: str, lang: str = \"@en\", thresh: int = 1000):\n        self.words_dict_filename = words_dict_filename\n        self.ngrams_matrix_filename = ngrams_matrix_filename\n        if lang == \"@en\":\n            self.letters = \"abcdefghijklmnopqrstuvwxyz\"\n        elif lang == \"@ru\":\n            self.letters = \"абвгдеёжзийклмнопрстуфхцчшщъыьэюя\"\n        else:\n            raise ValueError(f'Unexpected lang value: \"{lang}\"')\n        self.thresh = thresh\n        self.load()\n        self.make_ngrams_dicts()\n\n    def load(self):\n        with open(str(expand_path(self.words_dict_filename)), \"rb\") as fl:\n            self.words_dict = pickle.load(fl)\n        words_list = list(self.words_dict.keys())\n        self.words_list = sorted(words_list)\n\n        loader = np.load(str(expand_path(self.ngrams_matrix_filename)), allow_pickle=True)\n        self.count_matrix = Sparse((loader[\"data\"], loader[\"indices\"], loader[\"indptr\"]), shape=loader[\"shape\"])\n\n    def make_ngrams_dicts(self):\n        self.bigrams_dict, 
self.trigrams_dict = {}, {}\n        bigram_combs = list(itertools.product(self.letters, self.letters))\n        bigram_combs = [\"\".join(comb) for comb in bigram_combs]\n        trigram_combs = list(itertools.product(self.letters, self.letters, self.letters))\n        trigram_combs = [\"\".join(comb) for comb in trigram_combs]\n        for cnt, bigram in enumerate(bigram_combs):\n            self.bigrams_dict[bigram] = cnt\n        for cnt, trigram in enumerate(trigram_combs):\n            self.trigrams_dict[trigram] = cnt + len(bigram_combs)\n\n    def __call__(self, query, tags):\n        ngrams_list = []\n        for i in range(len(query) - 1):\n            ngram = query[i : i + 2].lower()\n            if ngram in self.bigrams_dict:\n                ngram_id = self.bigrams_dict[ngram]\n                ngrams_list.append(ngram_id)\n        for i in range(len(query) - 2):\n            ngram = query[i : i + 3].lower()\n            if ngram in self.trigrams_dict:\n                ngram_id = self.trigrams_dict[ngram]\n                ngrams_list.append(ngram_id)\n        ngrams_with_cnts = Counter(ngrams_list).most_common()\n        ngram_ids = [elem[0] for elem in ngrams_with_cnts]\n        ngram_cnts = [1 for _ in ngrams_with_cnts]\n\n        indptr = np.array([0, len(ngram_cnts)])\n        query_matrix = Sparse(\n            (ngram_cnts, ngram_ids, indptr), shape=(1, len(self.bigrams_dict) + len(self.trigrams_dict))\n        )\n\n        scores = query_matrix * self.count_matrix\n        scores = np.squeeze(scores.toarray())\n\n        if self.thresh >= len(scores):\n            o = np.argpartition(-scores, len(scores) - 1)[0:self.thresh]\n        else:\n            o = np.argpartition(-scores, self.thresh)[0:self.thresh]\n        o_sort = o[np.argsort(-scores[o])]\n        o_sort = o_sort.tolist()\n\n        found_words = [self.words_list[n] for n in o_sort]\n        found_words = [\n            word\n            for word in found_words\n            if (\n      
          word.startswith(query[0])\n                and abs(len(word) - len(query)) < 3\n                and self.words_dict[word].intersection(tags)\n            )\n        ]\n        return found_words\n"
  },
  {
    "path": "deeppavlov/models/entity_extraction/ner_chunker.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nfrom logging import getLogger\nfrom string import punctuation\nfrom typing import List, Tuple, Union, Any\n\nfrom nltk import sent_tokenize\nfrom transformers import AutoTokenizer\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.core.common.chainer import Chainer\nfrom deeppavlov.models.entity_extraction.entity_detection_parser import EntityDetectionParser\n\nlog = getLogger(__name__)\n\n\n@register('ner_chunker')\nclass NerChunker(Component):\n    \"\"\"\n        Class to split documents into chunks of max_seq_len symbols so that the length will not exceed\n        maximal sequence length to feed into BERT\n    \"\"\"\n\n    def __init__(self, vocab_file: str, max_seq_len: int = 400, lowercase: bool = False, batch_size: int = 2, **kwargs):\n        \"\"\"\n        Args:\n            vocab_file: vocab file of pretrained transformer model\n            max_seq_len: maximal length of chunks into which the document is split\n            lowercase: whether to lowercase text\n            batch_size: how many chunks are in batch\n        \"\"\"\n        self.max_seq_len = max_seq_len\n        self.batch_size = batch_size\n        self.re_tokenizer = re.compile(r\"[\\w']+|[^\\w ]\")\n        self.tokenizer = AutoTokenizer.from_pretrained(vocab_file,\n    
                                                   do_lower_case=True)\n        self.punct_ext = punctuation + \" \" + \"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\"\n        self.russian_letters = \"абвгдеёжзийклмнопрстуфхцчшщъыьэюя\"\n        self.lowercase = lowercase\n\n    def __call__(self, docs_batch: List[str]) -> Tuple[List[List[str]], List[List[int]], List[List[Union[\n        List[Union[Tuple[int, int], Tuple[Union[int, Any], Union[int, Any]]]], List[\n            Tuple[Union[int, Any], Union[int, Any]]], List[Tuple[int, int]]]]], List[List[Union[List[Any], List[str]]]],\n                                                                 List[List[str]]]:\n        \"\"\"\n        This method splits each document in the batch into chunks with the maximal length of max_seq_len\n \n        Args:\n            docs_batch: batch of documents\n        Returns:\n            batch of lists of document chunks for each document\n            batch of lists of numbers of documents which correspond to chunks\n            batch of lists of sentence offsets (start, end) within each chunk\n            batch of lists of sentences which make up each chunk\n        \"\"\"\n        text_batch_list, nums_batch_list, sentences_offsets_batch_list, sentences_batch_list = [], [], [], []\n        text_batch, nums_batch, sentences_offsets_batch, sentences_batch = [], [], [], []\n        for n, doc in enumerate(docs_batch):\n            if self.lowercase:\n                doc = doc.lower()\n            start = 0\n            text = \"\"\n            sentences_list = []\n            sentences_offsets_list = []\n            cur_len = 0\n            doc_pieces = doc.split(\"\\n\")\n            doc_pieces = [self.sanitize(doc_piece) for doc_piece in doc_pieces]\n            doc_pieces = [doc_piece for doc_piece in doc_pieces if len(doc_piece) > 1]\n            if doc_pieces:\n                sentences = []\n                for doc_piece in doc_pieces:\n                    sentences += sent_tokenize(doc_piece)\n                for sentence in sentences:\n                    sentence_tokens = 
re.findall(self.re_tokenizer, sentence)\n                    sentence_len = sum([len(self.tokenizer.encode_plus(token, add_special_tokens=False)[\"input_ids\"])\n                                        for token in sentence_tokens])\n                    if cur_len + sentence_len < self.max_seq_len:\n                        text += f\"{sentence} \"\n                        cur_len += sentence_len\n                        end = start + len(sentence)\n                        sentences_offsets_list.append((start, end))\n                        sentences_list.append(sentence)\n                        start = end + 1\n                    else:\n                        text = text.strip()\n                        if text:\n                            text_batch.append(text)\n                            sentences_offsets_batch.append(sentences_offsets_list)\n                            sentences_batch.append(sentences_list)\n                            nums_batch.append(n)\n\n                        if sentence_len < self.max_seq_len:\n                            text = f\"{sentence} \"\n                            cur_len = sentence_len\n                            start = 0\n                            end = start + len(sentence)\n                            sentences_offsets_list = [(start, end)]\n                            sentences_list = [sentence]\n                            start = end + 1\n                        else:\n                            text = \"\"\n                            sentence_chunks = sentence.split(\" \")\n                            for chunk in sentence_chunks:\n                                chunk_tokens = re.findall(self.re_tokenizer, chunk)\n                                chunk_len = sum([len(self.tokenizer.encode_plus(token,\n                                                                                add_special_tokens=False)[\"input_ids\"])\n                                                 for token in chunk_tokens])\n          
                      if cur_len + chunk_len < self.max_seq_len:\n                                    text += f\"{chunk} \"\n                                    cur_len += chunk_len + 1\n                                    end = start + len(chunk)\n                                    sentences_offsets_list.append((start, end))\n                                    sentences_list.append(chunk)\n                                    start = end + 1\n                                else:\n                                    text = text.strip()\n                                    if text:\n                                        text_batch.append(text)\n                                        sentences_offsets_batch.append(sentences_offsets_list)\n                                        sentences_batch.append(sentences_list)\n                                        nums_batch.append(n)\n\n                                    text = f\"{chunk} \"\n                                    cur_len = chunk_len\n                                    start = 0\n                                    end = start + len(chunk)\n                                    sentences_offsets_list = [(start, end)]\n                                    sentences_list = [chunk]\n                                    start = end + 1\n\n                text = text.strip().strip(\",\")\n                if text:\n                    text_batch.append(text)\n                    nums_batch.append(n)\n                    sentences_offsets_batch.append(sentences_offsets_list)\n                    sentences_batch.append(sentences_list)\n            else:\n                text_batch.append(\"а\")\n                nums_batch.append(n)\n                sentences_offsets_batch.append([(0, len(doc))])\n                sentences_batch.append([doc])\n\n        num_batches = len(text_batch) // self.batch_size + int(len(text_batch) % self.batch_size > 0)\n        for jj in range(num_batches):\n            
text_batch_list.append(text_batch[jj * self.batch_size:(jj + 1) * self.batch_size])\n            nums_batch_list.append(nums_batch[jj * self.batch_size:(jj + 1) * self.batch_size])\n            sentences_offsets_batch_list.append(\n                sentences_offsets_batch[jj * self.batch_size:(jj + 1) * self.batch_size])\n            sentences_batch_list.append(sentences_batch[jj * self.batch_size:(jj + 1) * self.batch_size])\n\n        return text_batch_list, nums_batch_list, sentences_offsets_batch_list, sentences_batch_list\n\n    def sanitize(self, text):\n        text_len = len(text)\n\n        if text_len > 0 and text[text_len - 1] not in {'.', '!', '?'}:\n            i = text_len - 1\n            while text[i] in self.punct_ext and i > 0:\n                i -= 1\n                if (text[i] in {'.', '!', '?'} and text[i - 1].lower() in self.russian_letters) or \\\n                        (i > 1 and text[i] in {'.', '!', '?'} and text[i - 1] in '\"' and text[\n                            i - 2].lower() in self.russian_letters):\n                    break\n\n            text = text[:i + 1]\n        text = re.sub(r'\\s+', ' ', text)\n        return text\n\n\n@register('ner_chunk_model')\nclass NerChunkModel(Component):\n    \"\"\"\n        Class for detecting entity substrings in document chunks and merging the results for each document\n    \"\"\"\n\n    def __init__(self, ner: Chainer,\n                 ner_parser: EntityDetectionParser,\n                 ner2: Chainer = None,\n                 ner_parser2: EntityDetectionParser = None,\n                 **kwargs) -> None:\n        \"\"\"\n        Args:\n            ner: config for entity detection\n            ner_parser: component deeppavlov.models.entity_extraction.entity_detection_parser\n            ner2: config of additional entity detection model (ensemble of ner and ner2 models gives better\n                entity detection quality than single ner model)\n            ner_parser2: component 
deeppavlov.models.entity_extraction.entity_detection_parser\n            **kwargs:\n        \"\"\"\n        self.ner = ner\n        self.ner_parser = ner_parser\n        self.ner2 = ner2\n        self.ner_parser2 = ner_parser2\n\n    def __call__(self, text_batch_list: List[List[str]],\n                 nums_batch_list: List[List[int]],\n                 sentences_offsets_batch_list: List[List[List[Tuple[int, int]]]],\n                 sentences_batch_list: List[List[List[str]]]\n                 ):\n        \"\"\"\n        Args:\n            text_batch_list: list of document chunks\n            nums_batch_list: nums of documents\n            sentences_offsets_batch_list: indices of start and end symbols of sentences in text\n            sentences_batch_list: list of sentences from texts\n        Returns:\n            doc_entity_substr_batch: entity substrings\n            doc_entity_offsets_batch: indices of start and end symbols of entities in text\n            doc_tags_batch: entity tags (PER, LOC, ORG)\n            doc_sentences_offsets_batch: indices of start and end symbols of sentences in text\n            doc_sentences_batch: list of sentences from texts\n        \"\"\"\n        entity_substr_batch_list, entity_offsets_batch_list, entity_positions_batch_list, tags_batch_list, \\\n        entity_probas_batch_list, text_len_batch_list, text_tokens_len_batch_list = [], [], [], [], [], [], []\n        for text_batch, sentences_offsets_batch, sentences_batch in \\\n                zip(text_batch_list, sentences_offsets_batch_list, sentences_batch_list):\n            text_batch = [text.replace(\"\\xad\", \" \") for text in text_batch]\n\n            ner_tokens_batch, ner_tokens_offsets_batch, ner_probas_batch, probas_batch = self.ner(text_batch)\n            entity_substr_batch, entity_positions_batch, entity_probas_batch = \\\n                self.ner_parser(ner_tokens_batch, ner_probas_batch, probas_batch)\n            if self.ner2:\n                
ner_tokens_batch2, ner_tokens_offsets_batch2, ner_probas_batch2, probas_batch2 = self.ner2(text_batch)\n                entity_substr_batch2, entity_positions_batch2, entity_probas_batch2 = \\\n                    self.ner_parser2(ner_tokens_batch2, ner_probas_batch2, probas_batch2)\n                entity_substr_batch, entity_positions_batch, entity_probas_batch = \\\n                    self.merge_annotations(entity_substr_batch, entity_positions_batch, entity_probas_batch,\n                                           entity_substr_batch2, entity_positions_batch2, entity_probas_batch2)\n\n            entity_pos_tags_probas_batch = [[(entity_substr.lower(), entity_substr_positions, tag, entity_proba)\n                                             for tag, entity_substr_list in entity_substr_dict.items()\n                                             for entity_substr, entity_substr_positions, entity_proba in\n                                             zip(entity_substr_list, entity_positions_dict[tag],\n                                                 entity_probas_dict[tag])]\n                                            for entity_substr_dict, entity_positions_dict, entity_probas_dict in\n                                            zip(entity_substr_batch, entity_positions_batch, entity_probas_batch)]\n\n            entity_substr_batch, entity_offsets_batch, entity_positions_batch, tags_batch, \\\n            probas_batch = [], [], [], [], []\n            for entity_pos_tags_probas, ner_tokens_offsets_list in \\\n                    zip(entity_pos_tags_probas_batch, ner_tokens_offsets_batch):\n                if entity_pos_tags_probas:\n                    entity_offsets_list = []\n                    entity_substr_list, entity_positions_list, tags_list, probas_list = zip(*entity_pos_tags_probas)\n                    for entity_positions in entity_positions_list:\n                        start_offset = ner_tokens_offsets_list[entity_positions[0]][0]\n              
          end_offset = ner_tokens_offsets_list[entity_positions[-1]][1]\n                        entity_offsets_list.append((start_offset, end_offset))\n                else:\n                    entity_substr_list, entity_offsets_list, entity_positions_list = [], [], []\n                    tags_list, probas_list = [], []\n                entity_substr_batch.append(list(entity_substr_list))\n                entity_offsets_batch.append(list(entity_offsets_list))\n                entity_positions_batch.append(list(entity_positions_list))\n                tags_batch.append(list(tags_list))\n                probas_batch.append(list(probas_list))\n\n            entity_substr_batch_list.append(entity_substr_batch)\n            tags_batch_list.append(tags_batch)\n            entity_offsets_batch_list.append(entity_offsets_batch)\n            entity_positions_batch_list.append(entity_positions_batch)\n            entity_probas_batch_list.append(probas_batch)\n            text_len_batch_list.append([len(text) for text in text_batch])\n            text_tokens_len_batch_list.append([len(ner_tokens) for ner_tokens in ner_tokens_batch])\n\n        doc_entity_substr_batch, doc_tags_batch, doc_entity_offsets_batch, doc_probas_batch = [], [], [], []\n        doc_entity_positions_batch, doc_sentences_offsets_batch, doc_sentences_batch = [], [], []\n        doc_entity_substr, doc_tags, doc_probas, doc_entity_offsets, doc_entity_positions = [], [], [], [], []\n        doc_sentences_offsets, doc_sentences = [], []\n        cur_doc_num = 0\n        text_len_sum = 0\n        text_tokens_len_sum = 0\n        for entity_substr_batch, tags_batch, probas_batch, entity_offsets_batch, entity_positions_batch, \\\n            sentences_offsets_batch, sentences_batch, text_len_batch, text_tokens_len_batch, nums_batch in \\\n                zip(entity_substr_batch_list, tags_batch_list, entity_probas_batch_list, entity_offsets_batch_list,\n                    entity_positions_batch_list, 
sentences_offsets_batch_list, sentences_batch_list,\n                    text_len_batch_list, text_tokens_len_batch_list, nums_batch_list):\n            for entity_substr_list, tag_list, probas_list, entity_offsets_list, entity_positions_list, \\\n                sentences_offsets_list, sentences_list, text_len, text_tokens_len, doc_num in \\\n                    zip(entity_substr_batch, tags_batch, probas_batch, entity_offsets_batch, entity_positions_batch,\n                        sentences_offsets_batch, sentences_batch, text_len_batch, text_tokens_len_batch, nums_batch):\n                if doc_num == cur_doc_num:\n                    doc_entity_substr += entity_substr_list\n                    doc_tags += tag_list\n                    doc_probas += probas_list\n                    doc_entity_offsets += [(start_offset + text_len_sum, end_offset + text_len_sum)\n                                           for start_offset, end_offset in entity_offsets_list]\n                    doc_sentences_offsets += [(start_offset + text_len_sum, end_offset + text_len_sum)\n                                              for start_offset, end_offset in sentences_offsets_list]\n                    doc_entity_positions += [[pos + text_tokens_len_sum for pos in positions]\n                                             for positions in entity_positions_list]\n                    doc_sentences += sentences_list\n                    text_len_sum += text_len + 1\n                    text_tokens_len_sum += text_tokens_len\n                else:\n                    doc_entity_substr_batch.append(doc_entity_substr)\n                    doc_tags_batch.append(doc_tags)\n                    doc_probas_batch.append(doc_probas)\n                    doc_entity_offsets_batch.append(doc_entity_offsets)\n                    doc_entity_positions_batch.append(doc_entity_positions)\n                    doc_sentences_offsets_batch.append(doc_sentences_offsets)\n                    
doc_sentences_batch.append(doc_sentences)\n                    doc_entity_substr = entity_substr_list\n                    doc_tags = tag_list\n                    doc_probas = probas_list\n                    doc_entity_offsets = entity_offsets_list\n                    doc_sentences_offsets = sentences_offsets_list\n                    doc_sentences = sentences_list\n                    cur_doc_num = doc_num\n                    text_len_sum = text_len + 1\n                    text_tokens_len_sum = text_tokens_len\n\n        doc_entity_substr_batch.append(doc_entity_substr)\n        doc_tags_batch.append(doc_tags)\n        doc_probas_batch.append(doc_probas)\n        doc_entity_offsets_batch.append(doc_entity_offsets)\n        doc_entity_positions_batch.append(doc_entity_positions)\n        doc_sentences_offsets_batch.append(doc_sentences_offsets)\n        doc_sentences_batch.append(doc_sentences)\n\n        return doc_entity_substr_batch, doc_entity_offsets_batch, doc_entity_positions_batch, doc_tags_batch, \\\n               doc_sentences_offsets_batch, doc_sentences_batch, doc_probas_batch\n\n    def merge_annotations(self, substr_batch, pos_batch, probas_batch, substr_batch2, pos_batch2, probas_batch2):\n        log.debug(f\"ner_chunker, substr2: {substr_batch2} --- pos2: {pos_batch2} --- probas2: {probas_batch2} --- \"\n                  f\"substr: {substr_batch} --- pos: {pos_batch} --- probas: {probas_batch}\")\n        for i in range(len(substr_batch)):\n            for key2 in substr_batch2[i]:\n                substr_list2 = substr_batch2[i][key2]\n                pos_list2 = pos_batch2[i][key2]\n                probas_list2 = probas_batch2[i][key2]\n                for substr2, pos2, probas2 in zip(substr_list2, pos_list2, probas_list2):\n                    found = False\n                    for key in substr_batch[i]:\n                        pos_list = pos_batch[i][key]\n                        for pos in pos_list:\n                            if 
pos[0] <= pos2[0] <= pos[-1] or pos[0] <= pos2[-1] <= pos[-1]:\n                                found = True\n                    if not found:\n                        if key2 not in substr_batch[i]:\n                            substr_batch[i][key2] = []\n                            pos_batch[i][key2] = []\n                            probas_batch[i][key2] = []\n                        substr_batch[i][key2].append(substr2)\n                        pos_batch[i][key2].append(pos2)\n                        probas_batch[i][key2].append(probas2)\n        for i in range(len(substr_batch)):\n            for key2 in substr_batch2[i]:\n                substr_list2 = substr_batch2[i][key2]\n                pos_list2 = pos_batch2[i][key2]\n                probas_list2 = probas_batch2[i][key2]\n                for substr2, pos2, probas2 in zip(substr_list2, pos_list2, probas_list2):\n                    for key in substr_batch[i]:\n                        inds = []\n                        substr_list = substr_batch[i][key]\n                        pos_list = pos_batch[i][key]\n                        probas_list = probas_batch[i][key]\n                        for n, (substr, pos, probas) in enumerate(zip(substr_list, pos_list, probas_list)):\n                            if (pos[0] == pos2[0] and pos[-1] < pos2[-1]) or (pos[0] > pos2[0] and pos[-1] == pos2[-1]):\n                                inds.append(n)\n                            elif key == \"EVENT\" and ((pos[0] >= pos2[0] and pos[-1] <= pos2[-1])\n                                                     or (len(substr.split()) == 1 and pos2[0] <= pos[0])):\n                                inds.append(n)\n\n                        if (len(inds) > 1 or (len(inds) == 1 and key in {\"WORK_OF_ART\", \"EVENT\"})) \\\n                                and not (key == \"PERSON\" and \" и \" in substr2):\n                            inds = sorted(inds, reverse=True)\n                            for ind in inds:\n                 
               del substr_batch[i][key][ind]\n                                del pos_batch[i][key][ind]\n                                del probas_batch[i][key][ind]\n                            substr_batch[i][key].append(substr2)\n                            pos_batch[i][key].append(pos2)\n                            probas_batch[i][key].append(probas2)\n        return substr_batch, pos_batch, probas_batch\n"
  },
  {
    "path": "deeppavlov/models/kbqa/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/kbqa/query_generator.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport copy\nimport itertools\nimport re\nfrom collections import defaultdict\nfrom logging import getLogger\nfrom typing import Tuple, List, Optional, Union, Dict, Any, Set\n\nimport nltk\nimport numpy as np\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.models.kbqa.query_generator_base import QueryGeneratorBase\nfrom deeppavlov.models.kbqa.rel_ranking_infer import RelRankerInfer\nfrom deeppavlov.models.kbqa.utils import extract_year, extract_number, make_combs, fill_query, find_query_features, \\\n    make_sparql_query, merge_sparql_query\nfrom deeppavlov.models.kbqa.wiki_parser import WikiParser\n\nlog = getLogger(__name__)\n\n\n@register('query_generator')\nclass QueryGenerator(QueryGeneratorBase):\n    \"\"\"\n        Class for query generation using Wikidata hdt file\n    \"\"\"\n\n    def __init__(self, wiki_parser: WikiParser,\n                 rel_ranker: RelRankerInfer,\n                 entities_to_leave: int = 5,\n                 types_to_leave: int = 2,\n                 rels_to_leave: int = 7,\n                 max_comb_num: int = 10000,\n                 gold_query_info: Dict[str, str] = None,\n                 map_query_str_to_kb: List[Tuple[str, str]] = None,\n                 return_answers: bool = True, *args, **kwargs) -> None:\n      
  \"\"\"\n\n        Args:\n            wiki_parser: component deeppavlov.models.kbqa.wiki_parser\n            rel_ranker: component deeppavlov.models.kbqa.rel_ranking_infer\n            entities_to_leave: how many entities to leave after entity linking\n            types_to_leave: how many types to leave after entity linking\n            rels_to_leave: how many relations to leave after relation ranking\n            max_comb_num: the maximum number of combinations of candidate entities and relations\n            gold_query_info: dict of variable names used for formatting output sparql queries\n            map_query_str_to_kb: mapping of knowledge base prefixes to full https\n            return_answers: whether to return answers or candidate relations and answers for further ranking\n            **kwargs:\n        \"\"\"\n        self.wiki_parser = wiki_parser\n        self.rel_ranker = rel_ranker\n        self.entities_to_leave = entities_to_leave\n        self.types_to_leave = types_to_leave\n        self.rels_to_leave = rels_to_leave\n        self.max_comb_num = max_comb_num\n        self.gold_query_info = gold_query_info\n        self.map_query_str_to_kb = map_query_str_to_kb\n        self.return_answers = return_answers\n        self.replace_tokens = [(\"wdt:p\", \"wdt:P\"), (\"pq:p\", \"pq:P\")]\n        super().__init__(wiki_parser=self.wiki_parser, rel_ranker=self.rel_ranker,\n                         entities_to_leave=self.entities_to_leave, rels_to_leave=self.rels_to_leave,\n                         *args, **kwargs)\n\n    def __call__(self, question_batch: List[str],\n                 question_san_batch: List[str],\n                 template_type_batch: Union[List[List[str]], List[str]],\n                 entities_from_ner_batch: List[List[str]],\n                 types_from_ner_batch: List[List[str]],\n                 entity_tags_batch: List[List[str]],\n                 probas_batch: List[List[float]],\n                 answer_types_batch: 
List[Set[str]] = None,\n                 entities_to_link_batch: List[List[int]] = None) -> Tuple[List[Any], List[Any]]:\n\n        candidate_outputs_batch, template_answers_batch = [], []\n        if not answer_types_batch or answer_types_batch[0] is None:\n            answer_types_batch = [[] for _ in question_batch]\n        if not entities_to_link_batch or entities_to_link_batch[0] is None:\n            entities_to_link_batch = [[1 for _ in substr_list] for substr_list in entities_from_ner_batch]\n        log.debug(f\"kbqa inputs {question_batch} {question_san_batch} template_type_batch: {template_type_batch} --- \"\n                  f\"entities_from_ner: {entities_from_ner_batch} --- types_from_ner: {types_from_ner_batch} --- \"\n                  f\"entity_tags_batch: {entity_tags_batch} --- answer_types_batch: \"\n                  f\"{[list(elem)[:3] for elem in answer_types_batch]}\")\n        for question, question_sanitized, template_type, entities_from_ner, types_from_ner, entity_tags_list, \\\n            probas, entities_to_link, answer_types in zip(question_batch, question_san_batch, template_type_batch,\n                                                          entities_from_ner_batch, types_from_ner_batch,\n                                                          entity_tags_batch, probas_batch, entities_to_link_batch,\n                                                          answer_types_batch):\n            if template_type == \"-1\":\n                template_type = \"7\"\n            candidate_outputs, template_answer = \\\n                self.find_candidate_answers(question, question_sanitized, template_type, entities_from_ner,\n                                            types_from_ner, entity_tags_list, probas, entities_to_link, answer_types)\n            candidate_outputs_batch.append(candidate_outputs)\n            template_answers_batch.append(template_answer)\n\n        if self.return_answers:\n            answers = 
self.rel_ranker(question_batch, template_type_batch, candidate_outputs_batch,\n                                      entities_from_ner_batch, template_answers_batch)\n            log.debug(f\"(__call__)answers: {answers}\")\n            if not answers:\n                answers = [\"Not Found\" for _ in question_batch]\n            return answers\n        else:\n            return candidate_outputs_batch, template_answers_batch\n\n    def parse_queries_info(self, question, queries_info, entity_ids, type_ids, rels_from_template):\n        parsed_queries_info = []\n        question_tokens = nltk.word_tokenize(question)\n        rels_scores_dict = {}\n        for query_info in queries_info:\n            query = query_info[\"query_template\"].lower()\n            for old_tok, new_tok in self.replace_tokens:\n                query = query.replace(old_tok, new_tok)\n            log.debug(f\"\\n_______________________________\\nquery: {query}\\n_______________________________\\n\")\n            entities_and_types_select = query_info[\"entities_and_types_select\"]\n            rels_for_search = query_info[\"rank_rels\"]\n            rel_types = query_info[\"rel_types\"]\n            n_hops = query_info[\"n_hops\"]\n            unk_rels = query_info.get(\"unk_rels\", [])\n            query_seq_num = query_info[\"query_sequence\"]\n            return_if_found = query_info[\"return_if_found\"]\n            log.debug(f\"(query_parser)query: {query}, rels_for_search {rels_for_search}, rel_types {rel_types} \"\n                      f\"n_hops {n_hops}, {query_seq_num}, {return_if_found}\")\n            query_triplets = re.findall(\"{[ ]?(.*?)[ ]?}\", query)[0].split(' . 
')\n            log.debug(f\"(query_parser)query_triplets: {query_triplets}\")\n            query_triplets_split = [triplet.split(' ')[:3] for triplet in query_triplets]\n            property_types = {}\n            for rel_type, query_triplet in zip(rel_types, query_triplets_split):\n                if query_triplet[1].startswith(\"?\") and rel_type == \"qualifier\":\n                    property_types[query_triplet[1]] = rel_type\n            query_sequence_dict = {num + 1: triplet for num, triplet in enumerate(query_triplets_split)}\n            query_sequence = []\n            for i in query_seq_num:\n                query_sequence.append(query_sequence_dict[i])\n            triplet_info_list = [(\"forw\" if triplet[2].startswith('?') else \"backw\", search_source, rel_type, n_hop)\n                                 for search_source, triplet, rel_type, n_hop in \\\n                                 zip(rels_for_search, query_sequence, rel_types, n_hops)\n                                 if search_source != \"do_not_rank\"]\n            log.debug(f\"(query_parser)query_sequence_dict: {query_sequence_dict} --- rel_directions: \"\n                      f\"{triplet_info_list} --- query_sequence: {query_sequence}\")\n            entity_ids = [entity[:self.entities_to_leave] for entity in entity_ids]\n            rels, entities_rel_conn = [], set()\n            if rels_from_template is not None:\n                rels = [[(rel, 1.0) for rel in rel_list] for rel_list in rels_from_template]\n            elif not rels:\n                for triplet_info in triplet_info_list:\n                    ex_rels, cur_rels_scores_dict, entity_rel_conn = self.find_top_rels(question, entity_ids,\n                                                                                        triplet_info)\n                    rels.append(ex_rels)\n                    rels_scores_dict = {**rels_scores_dict, **cur_rels_scores_dict}\n                    entities_rel_conn = 
entities_rel_conn.union(entity_rel_conn)\n            log.debug(f\"(query_parser)rels: {rels}\")\n            rels_from_query = [triplet[1] for triplet in query_triplets_split if triplet[1].startswith('?')]\n            qualifier_rels = [triplet[1] for triplet in query_triplets_split if triplet[1].startswith(\"pq:P\")]\n\n            answer_ent, order_info, filter_from_query = find_query_features(query, qualifier_rels, question)\n            log.debug(f\"(query_parser) filter_from_query: {filter_from_query} --- order_info: {order_info}\")\n\n            year = extract_year(question_tokens, question)\n            number = extract_number(question_tokens, question)\n            log.debug(f\"year {year}, number {number}\")\n            if year:\n                filter_info = [(elem[0], elem[1].replace(\"n\", year)) for elem in filter_from_query]\n            elif number:\n                filter_info = [(elem[0], elem[1].replace(\"n\", number)) for elem in filter_from_query]\n            else:\n                filter_info = [elem for elem in filter_from_query if elem[1] != \"n\"]\n            for unk_prop, prop_type in property_types.items():\n                filter_info.append((unk_prop, prop_type))\n            log.debug(f\"(query_parser)filter_from_query: {filter_from_query}\")\n            rel_combs = make_combs(rels, permut=False)\n\n            entity_positions, type_positions = [elem.split('_') for elem in entities_and_types_select.split(' ')]\n            log.debug(f\"entity_positions {entity_positions}, type_positions {type_positions}\")\n            selected_entity_ids, selected_type_ids = [], []\n            if len(entity_ids) > 1 and len(entity_positions) == 1:\n                selected_entity_ids = []\n                for j in range(max([len(elem) for elem in entity_ids])):\n                    for elem in entity_ids:\n                        if j < len(elem):\n                            selected_entity_ids.append(elem[j])\n                
selected_entity_ids = [selected_entity_ids]\n            elif entity_ids:\n                selected_entity_ids = [entity_ids[int(pos) - 1] for pos in entity_positions if int(pos) > 0]\n            if type_ids:\n                selected_type_ids = [type_ids[int(pos) - 1][:self.types_to_leave]\n                                     for pos in type_positions if int(pos) > 0]\n            entity_combs = make_combs(selected_entity_ids, permut=True)\n            type_combs = make_combs(selected_type_ids, permut=False)\n            log.debug(f\"(query_parser)entity_combs: {entity_combs[:3]}, type_combs: {type_combs[:3]},\"\n                      f\" rel_combs: {rel_combs[:3]}\")\n\n            all_combs_list = list(itertools.product(entity_combs, type_combs, rel_combs))\n            all_combs_list = sorted(all_combs_list, key=lambda x: (sum([elem[-1] for elem in x]), x[0][-1]))\n            parsed_queries_info.append({\"query_triplets\": query_triplets,\n                                        \"query_sequence\": query_sequence,\n                                        \"rels_from_query\": rels_from_query,\n                                        \"answer_ent\": answer_ent,\n                                        \"filter_info\": filter_info,\n                                        \"order_info\": order_info,\n                                        \"rel_types\": rel_types,\n                                        \"unk_rels\": unk_rels,\n                                        \"return_if_found\": return_if_found,\n                                        \"selected_entity_ids\": selected_entity_ids,\n                                        \"selected_type_ids\": selected_type_ids,\n                                        \"rels\": rels,\n                                        \"entities_rel_conn\": entities_rel_conn,\n                                        \"entity_combs\": entity_combs,\n                                        \"type_combs\": type_combs,\n           
                             \"rel_combs\": rel_combs,\n                                        \"all_combs_list\": all_combs_list})\n        return parsed_queries_info, rels_scores_dict\n\n    def check_valid_query(self, entities_rel_conn, query_hdt_seq):\n        entity_rel_valid = True\n        if entities_rel_conn:\n            for query_hdt_elem in query_hdt_seq:\n                entity, rel = \"\", \"\"\n                if len(query_hdt_elem) == 3 and any([query_hdt_elem[i].startswith(\"?\") for i in [0, 2]]):\n                    if \"statement\" in self.kb_prefixes and query_hdt_elem[1].startswith(self.kb_prefixes[\"statement\"]):\n                        continue\n                    else:\n                        if not query_hdt_elem[0].startswith(\"?\"):\n                            entity = query_hdt_elem[0].split(\"/\")[-1]\n                        elif not query_hdt_elem[2].startswith(\"?\"):\n                            entity = query_hdt_elem[2].split(\"/\")[-1]\n                        if not query_hdt_elem[1].startswith(\"?\"):\n                            rel = query_hdt_elem[1].split(\"/\")[-1]\n                        if entity and rel and rel not in self.kb_prefixes[\"type_rels\"] \\\n                                and (entity, rel) not in entities_rel_conn:\n                            entity_rel_valid = False\n        return entity_rel_valid\n\n    def query_parser(self, question: str,\n                     queries_info: Dict[str, str],\n                     entity_ids: List[List[str]],\n                     type_ids: List[List[str]],\n                     answer_types: Set[str],\n                     rels_from_template: Optional[List[Tuple[str]]] = None) -> Union[List[Dict[str, Any]], list]:\n        parsed_queries_info, rels_scores_dict = self.parse_queries_info(question, queries_info, entity_ids, type_ids,\n                                                                        rels_from_template)\n        queries_list, 
parser_info_list, entity_conf_list = [], [], []\n        new_combs_list, query_info_list = [], []\n        combs_num_list = [len(parsed_query_info[\"all_combs_list\"]) for parsed_query_info in parsed_queries_info]\n        if combs_num_list:\n            max_comb_nums = max(combs_num_list)\n        else:\n            max_comb_nums = 0\n        for comb_num in range(max_comb_nums):\n            for parsed_query_info in parsed_queries_info:\n                if comb_num < min(len(parsed_query_info[\"all_combs_list\"]), self.max_comb_num):\n                    query_triplets = parsed_query_info[\"query_triplets\"]\n                    query_sequence = parsed_query_info[\"query_sequence\"]\n                    rels_from_query = parsed_query_info[\"rels_from_query\"]\n                    answer_ent = parsed_query_info[\"answer_ent\"]\n                    filter_info = parsed_query_info[\"filter_info\"]\n                    order_info = parsed_query_info[\"order_info\"]\n                    rel_types = parsed_query_info[\"rel_types\"]\n                    unk_rels = parsed_query_info[\"unk_rels\"]\n                    return_if_found = parsed_query_info[\"return_if_found\"]\n                    entities_rel_conn = parsed_query_info[\"entities_rel_conn\"]\n                    combs = parsed_query_info[\"all_combs_list\"][comb_num]\n                    if combs[0][-1] == 0:\n                        entity_conf_list.append(1.0)\n                    else:\n                        entity_conf_list.append(0.9)\n                    query_hdt_seq = [fill_query(query_hdt_elem, combs[0], combs[1], combs[2],\n                                                self.map_query_str_to_kb)\n                                     for query_hdt_elem in query_sequence]\n                    if comb_num == 0:\n                        log.debug(f\"\\n______________________\\nfilled query: {query_hdt_seq}\\n______________________\\n\")\n\n                    entity_rel_valid = 
self.check_valid_query(entities_rel_conn, query_hdt_seq)\n                    if entity_rel_valid:\n                        new_combs_list.append(combs)\n                        queries_list.append((answer_ent, rels_from_query, query_hdt_seq, filter_info, order_info,\n                                             answer_types, rel_types, return_if_found))\n                        query_info_list.append((query_triplets, query_hdt_seq, answer_ent, filter_info, order_info))\n                        parser_info_list.append(\"query_execute\")\n                    if comb_num < 3 and unk_rels:\n                        unk_query_sequence = copy.deepcopy(query_sequence)\n                        unk_rels_from_query = copy.deepcopy(rels_from_query)\n                        for unk_rel, rel_var in zip(unk_rels, [\"?p\", \"?p2\"]):\n                            unk_query_sequence[int(unk_rel) - 1][1] = rel_var\n                            combs[-1][int(unk_rel) - 1] = (rel_var, 1.0)\n                            if rel_var not in rels_from_query:\n                                unk_rels_from_query.append(rel_var)\n                        query_hdt_seq = [\n                            fill_query(query_hdt_elem, combs[0], combs[1], combs[2], self.map_query_str_to_kb)\n                            for query_hdt_elem in unk_query_sequence]\n                        new_combs_list.append(combs)\n                        queries_list.append((answer_ent, unk_rels_from_query, query_hdt_seq, filter_info, order_info,\n                                             answer_types, rel_types, return_if_found))\n                        query_info_list.append((query_triplets, query_hdt_seq, answer_ent, filter_info, order_info))\n                        parser_info_list.append(\"query_execute\")\n\n        outputs_list = self.wiki_parser(parser_info_list, queries_list)\n        outputs = self.parse_outputs(outputs_list, new_combs_list, query_info_list, entity_conf_list, rels_scores_dict)\n        
return outputs\n\n    def parse_outputs(self, outputs_list, combs_list, query_info_list, entity_conf_list, rels_scores_dict):\n        outputs = []\n        if isinstance(outputs_list, list) and outputs_list:\n            outputs_len = len(outputs_list)\n            combs_list = combs_list[:outputs_len]\n            entity_conf_list = entity_conf_list[:outputs_len]\n            for combs, query_info, entity_conf, (answers_list, found_rels_list, found_combs_list) in \\\n                    zip(combs_list, query_info_list, entity_conf_list, outputs_list):\n                for answers, found_rels, found_comb in zip(answers_list, found_rels_list, found_combs_list):\n                    found_rels = [found_rel.split(\"/\")[-1] for found_rel in found_rels]\n                    new_combs = list(copy.deepcopy(combs))\n                    found_unk_rel = False\n                    for j, rel_var in enumerate([\"?p\", \"?p2\"]):\n                        if isinstance(new_combs[2][j], tuple) and new_combs[2][j][0] == rel_var:\n                            if found_rels:\n                                new_combs[2][j] = (found_rels[j], rels_scores_dict.get(found_rels[j], 1.0))\n                            else:\n                                new_combs[2][j] = (new_combs[2][j][0], 0.0)\n                            found_unk_rel = True\n                    if found_rels and not found_unk_rel:\n                        new_combs[2] = new_combs[2][:-1] + [(found_rels[0], 1.0), new_combs[2][-1]]\n                    confidence = np.prod([score for rel, score in new_combs[2][:-1]])\n                    if answers:\n                        outputs.append([new_combs[0], new_combs[1]] + [rel for rel, score in new_combs[2][:-1]] +\n                                       answers + [(confidence, entity_conf), found_comb, query_info, new_combs[2]])\n            outputs_dict = defaultdict(list)\n            types_dict = defaultdict(list)\n            for output in outputs:\n                
key = (tuple(output[0]), tuple([rel.split(\"/\")[-1] for rel in output[2:-5]]))\n                if key not in outputs_dict or output[-5:] not in outputs_dict[key]:\n                    outputs_dict[key].append(output[-5:])\n                    types_dict[key].append(tuple(output[1]))\n            outputs = []\n            for (entity_comb, rel_comb), output in outputs_dict.items():\n                type_comb = types_dict[(entity_comb, rel_comb)]\n                output_conf = [elem[1] for elem in output]\n                output_conf = sorted(output_conf, key=lambda x: x[0] * x[1], reverse=True)\n                found_combs = [elem[2] for elem in output]\n                queries = [elem[3] for elem in output]\n                rel_combs = [elem[4] for elem in output]\n                cur_rel_comb = rel_combs[0]\n                cur_rel_comb = [rel for rel, score in cur_rel_comb[:-1]]\n                sparql_query = make_sparql_query(queries[0], entity_comb, rel_combs[0], type_comb[0],\n                                                 self.gold_query_info)\n                parser_info_list = [\"fill_triplets\"]\n                parser_query_list = [(queries[0][1], queries[0][2], found_combs[0])]\n                filled_triplets = self.wiki_parser(parser_info_list, parser_query_list)\n                outputs.append({\"entities\": entity_comb, \"types\": type_comb, \"relations\": list(cur_rel_comb),\n                                \"answers\": tuple([ans for ans, *_ in output]), \"output_conf\": output_conf[0],\n                                \"sparql_query\": sparql_query, \"triplets\": filled_triplets[0]})\n        return outputs\n\n\n@register('query_formatter')\nclass QueryFormatter(Component):\n    def __init__(self, query_info: Dict[str, str], replace_prefixes: Dict[str, str] = None, **kwargs):\n        self.query_info = query_info\n        self.replace_prefixes = replace_prefixes\n\n    def __call__(self, queries_batch):\n        parsed_queries_batch = []\n    
    for query in queries_batch:\n            query_split = re.findall(\"{[ ]?(.*?)[ ]?}\", query)\n            init_query_triplets, query_triplets = [], []\n            if query_split:\n                init_query_triplets = query_split[0].split('. ')\n            for triplet in init_query_triplets:\n                triplet = \" \".join([elem.strip(\"<>\") for elem in triplet.strip().split()])\n                if self.replace_prefixes:\n                    for old_prefix, new_prefix in self.replace_prefixes.items():\n                        triplet = triplet.replace(old_prefix, new_prefix)\n                query_triplets.append(triplet)\n            answer_ent, order_info, filter_from_query = find_query_features(query, order_from_query=True)\n            query_info = (query_triplets, answer_ent, filter_from_query, order_info)\n            query = merge_sparql_query(query_info, self.query_info)\n            parsed_queries_batch.append(query)\n        return parsed_queries_batch\n"
  },
  {
    "path": "deeppavlov/models/kbqa/query_generator_base.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport itertools\nimport json\nfrom logging import getLogger\nfrom typing import Tuple, List, Dict, Optional, Union, Any, Set\n\nfrom bs4 import BeautifulSoup\nfrom whapi import search, get_html\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.file import read_json\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.core.models.serializable import Serializable\nfrom deeppavlov.models.entity_extraction.entity_linking import EntityLinker\nfrom deeppavlov.models.kbqa.rel_ranking_infer import RelRankerInfer\nfrom deeppavlov.models.kbqa.template_matcher import TemplateMatcher\nfrom deeppavlov.models.kbqa.utils import preprocess_template_queries\n\nlog = getLogger(__name__)\n\n\nclass QueryGeneratorBase(Component, Serializable):\n    \"\"\"\n        This class takes as input entity substrings, defines the template of the query and\n        fills the slots of the template with candidate entities and relations.\n    \"\"\"\n\n    def __init__(self, template_matcher: TemplateMatcher,\n                 rel_ranker: RelRankerInfer,\n                 load_path: str,\n                 sparql_queries_filename: str,\n                 entity_linker: EntityLinker,\n                 rels_in_ranking_queries_fname: str = None,\n                 wiki_parser=None,\n                 entities_to_leave: 
int = 5,\n                 rels_to_leave: int = 7,\n                 syntax_structure_known: bool = False,\n                 use_wp_api_requester: bool = False,\n                 use_el_api_requester: bool = False,\n                 use_alt_templates: bool = True,\n                 delete_rel_prefix: bool = True,\n                 kb_prefixes: Dict[str, str] = None, *args, **kwargs) -> None:\n        \"\"\"\n\n        Args:\n            template_matcher: component deeppavlov.models.kbqa.template_matcher\n            rel_ranker: component deeppavlov.models.kbqa.rel_ranking_infer\n            load_path: path to folder with wikidata files\n            sparql_queries_filename: file with sparql query templates\n            entity_linker: component deeppavlov.models.entity_extraction.entity_linking for linking of entities\n            rels_in_ranking_queries_fname: file with list of rels in queries for questions with ranking\n            wiki_parser: component deeppavlov.models.kbqa.wiki_parser\n            entities_to_leave: how many entities to leave after entity linking\n            rels_to_leave: how many relations to leave after relation ranking\n            syntax_structure_known: if syntax tree parser was used to define query template type\n            use_wp_api_requester: whether deeppavlov.models.api_requester.api_requester component will be used for\n                Wiki Parser\n            use_el_api_requester: whether deeppavlov.models.api_requester.api_requester component will be used for\n                Entity Linking\n            use_alt_templates: whether to use alternative templates if no answer was found for default query template\n            delete_rel_prefix: whether to delete prefix in relations\n            kb_prefixes: prefixes for entities, relations and types in the knowledge base\n        \"\"\"\n        super().__init__(save_path=None, load_path=load_path)\n        self.template_matcher = template_matcher\n        self.entity_linker = 
entity_linker\n        self.wiki_parser = wiki_parser\n        self.rel_ranker = rel_ranker\n        self.rels_in_ranking_queries_fname = rels_in_ranking_queries_fname\n        self.rels_in_ranking_queries = {}\n        self.entities_to_leave = entities_to_leave\n        self.rels_to_leave = rels_to_leave\n        self.syntax_structure_known = syntax_structure_known\n        self.use_wp_api_requester = use_wp_api_requester\n        self.use_el_api_requester = use_el_api_requester\n        self.use_alt_templates = use_alt_templates\n        self.sparql_queries_filename = sparql_queries_filename\n        self.delete_rel_prefix = delete_rel_prefix\n        self.kb_prefixes = kb_prefixes\n\n        self.load()\n\n    def load(self) -> None:\n        if self.rels_in_ranking_queries_fname is not None:\n            self.rels_in_ranking_queries = read_json(self.load_path / self.rels_in_ranking_queries_fname)\n\n        template_queries = read_json(str(expand_path(self.sparql_queries_filename)))\n        self.template_queries = preprocess_template_queries(template_queries, self.kb_prefixes)\n\n    def save(self) -> None:\n        pass\n\n    def find_candidate_answers(self, question: str,\n                               question_sanitized: str,\n                               template_types: Union[List[str], str],\n                               entities_from_ner: List[str],\n                               types_from_ner: List[str],\n                               entity_tags: List[str],\n                               probas: List[float],\n                               entities_to_link: List[int],\n                               answer_types: Set[str]) -> Tuple[Union[List[Dict[str, Any]], list], str]:\n        candidate_outputs = []\n        self.template_nums = [template_types]\n\n        replace_tokens = [(' - ', '-'), (' .', ''), ('{', ''), ('}', ''), ('  ', ' '), ('\"', \"'\"), ('(', ''),\n                          (')', ''), ('–', '-')]\n        for old, new in 
replace_tokens:\n            question = question.replace(old, new)\n\n        entities_from_template, types_from_template, rels_from_template, rel_dirs_from_template, query_type_template, \\\n        entity_types, template_answer, template_answer_types, template_found = self.template_matcher(\n            question_sanitized, entities_from_ner)\n        if query_type_template:\n            self.template_nums = [query_type_template]\n\n        log.debug(\n            f\"question: {question} entities_from_template {entities_from_template} template_type {self.template_nums} \"\n            f\"types from template {types_from_template} rels_from_template {rels_from_template} entities_from_ner \"\n            f\"{entities_from_ner} types_from_ner {types_from_ner} answer_types {list(answer_types)[:3]}\")\n\n        if entities_from_template or types_from_template:\n            if rels_from_template[0][0] == \"PHOW\":\n                how_to_content = self.find_answer_wikihow(entities_from_template[0])\n                candidate_outputs = [[\"PHOW\", how_to_content, 1.0]]\n            else:\n                entity_ids = self.get_entity_ids(entities_from_template, entity_tags, probas, question,\n                                                 entities_to_link)\n                type_ids = self.get_entity_ids(types_from_template, [\"t\" for _ in types_from_template],\n                                               [1.0 for _ in types_from_template], question)\n                log.debug(f\"entities_from_template: {entities_from_template} --- entity_types: {entity_types} --- \"\n                          f\"types_from_template: {types_from_template} --- rels_from_template: {rels_from_template} \"\n                          f\"--- answer_types: {template_answer_types} --- entity_ids: {entity_ids}\")\n                candidate_outputs = self.sparql_template_parser(question_sanitized, entity_ids, type_ids,\n                                                                
template_answer_types, rels_from_template,\n                                                                rel_dirs_from_template)\n        if not candidate_outputs and (entities_from_ner or types_from_ner):\n            log.debug(f\"(__call__)entities_from_ner: {entities_from_ner}\")\n            entity_ids = self.get_entity_ids(entities_from_ner, entity_tags, probas, question)\n            type_ids = self.get_entity_ids(types_from_ner, [\"t\" for _ in types_from_ner],\n                                           [1.0 for _ in types_from_ner], question)\n            log.debug(f\"(__call__)entity_ids: {entity_ids} type_ids {type_ids}\")\n            self.template_nums = template_types\n            log.debug(f\"(__call__)self.template_nums: {self.template_nums}\")\n            if not self.syntax_structure_known:\n                entity_ids = entity_ids[:3]\n            candidate_outputs = self.sparql_template_parser(question_sanitized, entity_ids, type_ids, answer_types)\n        return candidate_outputs, template_answer\n\n    def get_entity_ids(self, entities: List[str], tags: List[str], probas: List[float], question: str,\n                       entities_to_link: List[int] = None) -> List[List[str]]:\n        entity_ids, el_output = [], []\n        try:\n            el_output = self.entity_linker([entities], [tags], [probas], [[question]], [None], [None],\n                                           [entities_to_link])\n        except json.decoder.JSONDecodeError:\n            log.warning(\"not received output from entity linking\")\n        if el_output:\n            if self.use_el_api_requester:\n                el_output = el_output[0]\n            if el_output:\n                if isinstance(el_output[0], dict):\n                    entity_ids = [entity_info.get(\"entity_ids\", []) for entity_info in el_output]\n                if isinstance(el_output[0], list):\n                    entity_ids, *_ = el_output\n            if not self.use_el_api_requester and 
entity_ids:\n                entity_ids = entity_ids[0]\n\n        return entity_ids\n\n    def sparql_template_parser(self, question: str,\n                               entity_ids: List[List[str]],\n                               type_ids: List[List[str]],\n                               answer_types: Set[str],\n                               rels_from_template: Optional[List[Tuple[str]]] = None,\n                               rel_dirs_from_template: Optional[List[str]] = None) -> Union[List[Dict[str, Any]], list]:\n        candidate_outputs = []\n        if isinstance(self.template_nums, str):\n            self.template_nums = [self.template_nums]\n        template_log_list = [str([elem[\"query_template\"], elem[\"template_num\"]])\n                             for elem in self.template_queries.values() if elem[\"template_num\"] in self.template_nums]\n        log.debug(f\"(find_candidate_answers)self.template_nums: {' --- '.join(template_log_list)}\")\n        init_templates = []\n        for template_num in self.template_nums:\n            for num, template in self.template_queries.items():\n                if (num == template_num and self.syntax_structure_known) or \\\n                        (template[\"template_num\"] == template_num and not self.syntax_structure_known):\n                    init_templates.append(template)\n        templates = [template for template in init_templates if\n                     (not self.syntax_structure_known and [len(entity_ids), len(type_ids)] == template[\n                         \"entities_and_types_num\"])\n                     or self.syntax_structure_known]\n        if not templates:\n            templates = [template for template in init_templates if\n                         (not self.syntax_structure_known and [len(entity_ids), 0] == template[\n                             \"entities_and_types_num\"])\n                         or self.syntax_structure_known]\n        if not templates:\n            return 
candidate_outputs\n        if rels_from_template is not None:\n            query_template = {}\n            for template in templates:\n                if template[\"rel_dirs\"] == rel_dirs_from_template:\n                    query_template = template\n            if query_template:\n                candidate_outputs = self.query_parser(question, [query_template], entity_ids, type_ids, answer_types,\n                                                      rels_from_template)\n        else:\n            candidate_outputs = []\n            for priority in range(1, 3):\n                pr_templates = [template for template in templates if template[\"priority\"] == priority]\n                candidate_outputs = self.query_parser(question, pr_templates, entity_ids, type_ids, answer_types,\n                                                      rels_from_template)\n                if candidate_outputs:\n                    return candidate_outputs\n\n            if not candidate_outputs:\n                alt_template_nums = templates[0].get(\"alternative_templates\", [])\n                log.debug(f\"Using alternative templates {alt_template_nums}\")\n                alt_templates = [self.template_queries[num] for num in alt_template_nums]\n                candidate_outputs = self.query_parser(question, alt_templates, entity_ids, type_ids, answer_types,\n                                                      rels_from_template)\n                if candidate_outputs:\n                    return candidate_outputs\n\n        log.debug(\"candidate_rels_and_answers:\\n\" + '\\n'.join([str(output) for output in candidate_outputs[:5]]))\n        return candidate_outputs\n\n    def find_top_rels(self, question: str, entity_ids: List[List[str]], triplet_info: Tuple) -> \\\n            Tuple[List[Tuple[str, float]], Dict[str, float], Set[Tuple[str, str]]]:\n        ex_rels, entity_rel_conn = [], set()\n        direction, source, rel_type, n_hop = triplet_info\n        if source == 
\"wiki\":\n            queries_list = list({(entity, direction, rel_type) for entity_id in entity_ids\n                                 for entity in entity_id[:self.entities_to_leave]})\n            entity_ids_list = [elem[0] for elem in queries_list]\n            parser_info_list = [\"find_rels\" for i in range(len(queries_list))]\n            ex_rels = self.wiki_parser(parser_info_list, queries_list)\n            for ex_rels_elem, entity_id in zip(ex_rels, entity_ids_list):\n                for rel in ex_rels_elem:\n                    entity_rel_conn.add((entity_id, rel.split(\"/\")[-1]))\n            if self.use_wp_api_requester and ex_rels:\n                ex_rels = [rel[0] for rel in ex_rels]\n            ex_rels = list(set(itertools.chain.from_iterable(ex_rels)))\n            if n_hop in {\"1-of-2-hop\", \"2-hop\"}:\n                queries_list = list({(entity, \"backw\", rel_type) for entity_id in entity_ids\n                                     for entity in entity_id[:self.entities_to_leave]})\n                entity_ids_list = [elem[0] for elem in queries_list]\n                parser_info_list = [\"find_rels\" for i in range(len(queries_list))]\n                ex_rels_backw = self.wiki_parser(parser_info_list, queries_list)\n                for ex_rels_elem, entity_id in zip(ex_rels_backw, entity_ids_list):\n                    for rel in ex_rels_elem:\n                        entity_rel_conn.add((entity_id, rel.split(\"/\")[-1]))\n                ex_rels_backw = list(set(itertools.chain.from_iterable(ex_rels_backw)))\n                ex_rels += ex_rels_backw\n            if self.delete_rel_prefix:\n                ex_rels = [rel.split('/')[-1] for rel in ex_rels]\n        elif source in {\"rank_list_1\", \"rel_list_1\"}:\n            ex_rels = self.rels_in_ranking_queries.get(\"one_rel_in_query\", [])\n        elif source in {\"rank_list_2\", \"rel_list_2\"}:\n            ex_rels = self.rels_in_ranking_queries.get(\"two_rels_in_query\", [])\n\n     
   ex_rels = [rel for rel in ex_rels if not any([rel.endswith(t_rel) for t_rel in self.kb_prefixes[\"type_rels\"]])]\n        rels_with_scores = self.rel_ranker.rank_rels(question, ex_rels)\n        if n_hop == \"2-hop\" and rels_with_scores and entity_ids and entity_ids[0]:\n            rels_1hop = [rel for rel, score in rels_with_scores]\n            queries_list = [(entity_ids[0], rels_1hop[:5])]\n            parser_info_list = [\"find_rels_2hop\"]\n            ex_rels_2hop = self.wiki_parser(parser_info_list, queries_list)\n            if self.delete_rel_prefix:\n                ex_rels_2hop = [rel.split('/')[-1] for rel in ex_rels_2hop]\n            rels_with_scores = self.rel_ranker.rank_rels(question, ex_rels_2hop)\n\n        rels_with_scores = list(set(rels_with_scores))\n        rels_with_scores = sorted(rels_with_scores, key=lambda x: x[1], reverse=True)\n        rels_scores_dict = {rel: score for rel, score in rels_with_scores}\n\n        return rels_with_scores[:self.rels_to_leave], rels_scores_dict, entity_rel_conn\n\n    def find_answer_wikihow(self, howto_sentence: str) -> str:\n        tags = []\n        search_results = search(howto_sentence, 5)\n        if search_results:\n            article_id = search_results[0][\"article_id\"]\n            html = get_html(article_id)\n            page = BeautifulSoup(html, 'lxml')\n            tags = list(page.find_all(['p']))\n        if tags:\n            howto_content = f\"{tags[0].text.strip()}@en\"\n        else:\n            howto_content = \"Not Found\"\n        return howto_content\n\n    def query_parser(self, question, query_templates, entity_ids, type_ids, answer_types, rels_from_template):\n        raise NotImplementedError\n"
  },
  {
    "path": "deeppavlov/models/kbqa/rel_ranking_infer.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom typing import Tuple, List, Any, Optional\n\nfrom scipy.special import softmax\n\nfrom deeppavlov.core.common.chainer import Chainer\nfrom deeppavlov.core.common.file import load_pickle, read_json\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.core.models.serializable import Serializable\nfrom deeppavlov.models.kbqa.sentence_answer import sentence_answer\nfrom deeppavlov.models.kbqa.wiki_parser import WikiParser\n\nlog = getLogger(__name__)\n\n\n@register('rel_ranking_infer')\nclass RelRankerInfer(Component, Serializable):\n    \"\"\"Class for ranking of paths in subgraph\"\"\"\n\n    def __init__(self, load_path: str,\n                 rel_q2name_filename: str,\n                 return_elements: List[str] = None,\n                 ranker: Chainer = None,\n                 wiki_parser: Optional[WikiParser] = None,\n                 batch_size: int = 32,\n                 softmax: bool = False,\n                 use_api_requester: bool = False,\n                 rank: bool = True,\n                 nll_rel_ranking: bool = False,\n                 nll_path_ranking: bool = False,\n                 top_possible_answers: int = -1,\n                 top_n: int = 1,\n                 pos_class_num: int = 1,\n                 
rel_thres: float = 0.0,\n                 type_rels: List[str] = None, **kwargs):\n        \"\"\"\n\n        Args:\n            load_path: path to folder with wikidata files\n            rel_q2name_filename: name of file which maps relation id to name\n            return_elements: which elements to return in the output\n            ranker: component deeppavlov.models.ranking.rel_ranker\n            wiki_parser: component deeppavlov.models.wiki_parser\n            batch_size: inference batch size\n            softmax: whether to process relation scores with softmax function\n            use_api_requester: whether wiki parser will be used as external api\n            rank: whether to rank relations or simply copy the input\n            nll_rel_ranking: whether to use components trained with nll loss for relation ranking\n            nll_path_ranking: whether to use components trained with nll loss for relation path ranking\n            top_possible_answers: number of answers returned for a question in each list of candidate answers\n            top_n: number of lists of candidate answers returned for a question\n            pos_class_num: index of positive class in the output of relation ranking model\n            rel_thres: threshold of relation confidence\n            type_rels: list of relations in the knowledge base which connect an entity and its type \n            **kwargs:\n        \"\"\"\n        super().__init__(save_path=None, load_path=load_path)\n        self.rel_q2name_filename = rel_q2name_filename\n        self.ranker = ranker\n        self.wiki_parser = wiki_parser\n        self.batch_size = batch_size\n        self.softmax = softmax\n        self.return_elements = return_elements or list()\n        self.use_api_requester = use_api_requester\n        self.rank = rank\n        self.nll_rel_ranking = nll_rel_ranking\n        self.nll_path_ranking = nll_path_ranking\n        self.top_possible_answers = top_possible_answers\n        self.top_n = top_n\n        
self.pos_class_num = pos_class_num\n        self.rel_thres = rel_thres\n        self.type_rels = type_rels or set()\n        self.load()\n\n    def load(self) -> None:\n        if self.rel_q2name_filename.endswith(\"pickle\"):\n            self.rel_q2name = load_pickle(self.load_path / self.rel_q2name_filename)\n        elif self.rel_q2name_filename.endswith(\"json\"):\n            self.rel_q2name = read_json(self.load_path / self.rel_q2name_filename)\n\n    def save(self) -> None:\n        pass\n\n    def __call__(self, questions_batch: List[str],\n                 template_type_batch: List[str],\n                 raw_answers_batch: List[List[Tuple[str]]],\n                 entity_substr_batch: List[List[str]],\n                 template_answers_batch: List[str]) -> List[str]:\n        answers_batch, outp_confidences_batch, answer_ids_batch = [], [], []\n        entities_and_rels_batch, queries_batch, triplets_batch = [], [], []\n        for question, template_type, raw_answers, entities, template_answer in \\\n                zip(questions_batch, template_type_batch, raw_answers_batch, entity_substr_batch,\n                    template_answers_batch):\n            answers_with_scores = []\n            l_questions, l_rels, l_rels_labels, l_cur_answers, l_entities, l_types, l_sparql_queries, l_triplets, \\\n            l_confs = self.preprocess_ranking_input(question, raw_answers)\n\n            n_batches = len(l_questions) // self.batch_size + int(len(l_questions) % self.batch_size > 0)\n            for i in range(n_batches):\n                if self.rank:\n                    if self.nll_path_ranking:\n                        probas = self.ranker([l_questions[0]],\n                                             [l_rels_labels[self.batch_size * i:self.batch_size * (i + 1)]])\n                        probas = probas[0]\n                    else:\n                        probas = self.ranker(l_questions[self.batch_size * i:self.batch_size * (i + 1)],\n                 
                            l_rels_labels[self.batch_size * i:self.batch_size * (i + 1)])\n                        probas = [proba[0] for proba in probas]\n                else:\n                    probas = [rel_conf for rel_conf, entity_conf in\n                              l_confs[self.batch_size * i:self.batch_size * (i + 1)]]\n                for j in range(self.batch_size * i, self.batch_size * (i + 1)):\n                    if j < len(l_cur_answers) and (probas[j - self.batch_size * i] > self.rel_thres or\n                                                   (len(l_rels[j]) > 1 and not set(l_rels[j]).intersection(\n                                                       self.type_rels))):\n                        answers_with_scores.append((l_cur_answers[j], l_sparql_queries[j], l_triplets[j],\n                                                    l_entities[j], l_types[j], l_rels_labels[j], l_rels[j],\n                                                    round(probas[j - self.batch_size * i], 3),\n                                                    round(l_confs[j][0], 3), l_confs[j][1]))\n            answers_with_scores = sorted(answers_with_scores, key=lambda x: x[-1] * x[-3], reverse=True)\n            if template_type == \"simple_boolean\" and not answers_with_scores:\n                answers_with_scores = [([\"No\"], \"\", [], [], [], [], [], 1.0, 1.0, 1.0)]\n            res_answers_list, res_answer_ids_list, res_confidences_list, res_entities_and_rels_list = [], [], [], []\n            res_queries_list, res_triplets_list = [], []\n            for n, ans_sc_elem in enumerate(answers_with_scores):\n                init_answer_ids, query, triplets, q_entities, q_types, _, q_rels, p_conf, r_conf, e_conf = ans_sc_elem\n                answer_ids = []\n                for answer_id in init_answer_ids:\n                    answer_id = str(answer_id).replace(\"@en\", \"\").strip('\"')\n                    if answer_id not in answer_ids:\n                        
answer_ids.append(answer_id)\n\n                if self.top_possible_answers > 0:\n                    answer_ids = answer_ids[:self.top_possible_answers]\n                answer_ids_input = [(answer_id, question) for answer_id in answer_ids]\n                answer_ids = [str(answer_id).split(\"/\")[-1] for answer_id in answer_ids]\n                parser_info_list = [\"find_label\" for _ in answer_ids_input]\n                init_answer_labels = self.wiki_parser(parser_info_list, answer_ids_input)\n                if n < 7:\n                    log.debug(f\"answers: {init_answer_ids[:3]} --- query {query} --- entities {q_entities} --- \"\n                              f\"types {q_types[:3]} --- q_rels {q_rels} --- {ans_sc_elem[5:]} --- \"\n                              f\"answer_labels {init_answer_labels[:3]}\")\n                answer_labels = []\n                for label in init_answer_labels:\n                    if label not in answer_labels:\n                        answer_labels.append(label)\n                answer_labels = [label for label in answer_labels if (label and label != \"Not Found\")][:5]\n                answer_labels = [str(label) for label in answer_labels]\n                if len(answer_labels) > 2:\n                    answer = f\"{', '.join(answer_labels[:-1])} and {answer_labels[-1]}\"\n                else:\n                    answer = ', '.join(answer_labels)\n\n                if \"sentence_answer\" in self.return_elements:\n                    try:\n                        answer = sentence_answer(question, answer, entities, template_answer)\n                    except ValueError as e:\n                        log.warning(f\"Error in sentence answer, {e}\")\n\n                res_answers_list.append(answer)\n                res_answer_ids_list.append(answer_ids)\n                if \"several_confidences\" in self.return_elements:\n                    res_confidences_list.append((p_conf, r_conf, e_conf))\n                else:\n     
               res_confidences_list.append(p_conf)\n                res_entities_and_rels_list.append([q_entities[:-1], q_rels])\n                res_queries_list.append(query)\n                res_triplets_list.append(triplets)\n\n            if self.top_n == 1:\n                if answers_with_scores:\n                    answers_batch.append(res_answers_list[0])\n                    outp_confidences_batch.append(res_confidences_list[0])\n                    answer_ids_batch.append(res_answer_ids_list[0])\n                    entities_and_rels_batch.append(res_entities_and_rels_list[0])\n                    queries_batch.append(res_queries_list[0])\n                    triplets_batch.append(res_triplets_list[0])\n                else:\n                    answers_batch.append(\"Not Found\")\n                    outp_confidences_batch.append(0.0)\n                    answer_ids_batch.append([])\n                    entities_and_rels_batch.append([])\n                    queries_batch.append([])\n                    triplets_batch.append([])\n            else:\n                answers_batch.append(res_answers_list[:self.top_n])\n                outp_confidences_batch.append(res_confidences_list[:self.top_n])\n                answer_ids_batch.append(res_answer_ids_list[:self.top_n])\n                entities_and_rels_batch.append(res_entities_and_rels_list[:self.top_n])\n                queries_batch.append(res_queries_list[:self.top_n])\n                triplets_batch.append(res_triplets_list[:self.top_n])\n\n        answer_tuple = (answers_batch,)\n        if \"confidences\" in self.return_elements:\n            answer_tuple += (outp_confidences_batch,)\n        if \"answer_ids\" in self.return_elements:\n            answer_tuple += (answer_ids_batch,)\n        if \"entities_and_rels\" in self.return_elements:\n            answer_tuple += (entities_and_rels_batch,)\n        if \"queries\" in self.return_elements:\n            answer_tuple += (queries_batch,)\n     
   if \"triplets\" in self.return_elements:\n            answer_tuple += (triplets_batch,)\n\n        return answer_tuple\n\n    def preprocess_ranking_input(self, question, answers):\n        l_questions, l_rels, l_rels_labels, l_cur_answers = [], [], [], []\n        l_entities, l_types, l_sparql_queries, l_triplets, l_confs = [], [], [], [], []\n        for ans_and_rels in answers:\n            answer, sparql_query, confidence = \"\", \"\", []\n            entities, types, rels, rels_labels, triplets = [], [], [], [], []\n            if ans_and_rels:\n                rels = [rel.split('/')[-1] for rel in ans_and_rels[\"relations\"]]\n                answer = ans_and_rels[\"answers\"]\n                entities = ans_and_rels[\"entities\"]\n                types = ans_and_rels[\"types\"]\n                sparql_query = ans_and_rels[\"sparql_query\"]\n                triplets = ans_and_rels[\"triplets\"]\n                confidence = ans_and_rels[\"output_conf\"]\n                rels_labels = []\n                for rel in rels:\n                    if rel in self.rel_q2name:\n                        label = self.rel_q2name[rel]\n                        if isinstance(label, list):\n                            label = label[0]\n                        rels_labels.append(label.lower())\n            if rels_labels:\n                l_questions.append(question)\n                l_rels.append(rels)\n                l_rels_labels.append(rels_labels)\n                l_cur_answers.append(answer)\n                l_entities.append(entities)\n                l_types.append(types)\n                l_sparql_queries.append(sparql_query)\n                l_triplets.append(triplets)\n                l_confs.append(confidence)\n        return l_questions, l_rels, l_rels_labels, l_cur_answers, l_entities, l_types, l_sparql_queries, l_triplets, \\\n               l_confs\n\n    def rank_rels(self, question: str, candidate_rels: List[str]) -> List[Tuple[str, Any]]:\n        
rels_with_scores = []\n        if question is not None:\n            questions, rels_labels, rels = [], [], []\n            for candidate_rel in candidate_rels:\n                if candidate_rel in self.rel_q2name:\n                    cur_rels_labels = self.rel_q2name[candidate_rel]\n                    if isinstance(cur_rels_labels, str):\n                        cur_rels_labels = [cur_rels_labels]\n                    for cur_rel in cur_rels_labels:\n                        questions.append(question)\n                        rels.append(candidate_rel)\n                        rels_labels.append(cur_rel)\n            if questions:\n                n_batches = len(rels) // self.batch_size + int(len(rels) % self.batch_size > 0)\n                for i in range(n_batches):\n                    if self.nll_rel_ranking:\n                        probas = self.ranker([questions[0]],\n                                             [rels_labels[i * self.batch_size:(i + 1) * self.batch_size]])\n                        probas = probas[0]\n                    else:\n                        probas = self.ranker(questions[i * self.batch_size:(i + 1) * self.batch_size],\n                                             rels_labels[i * self.batch_size:(i + 1) * self.batch_size])\n                        probas = [proba[self.pos_class_num] for proba in probas]\n                    for j, rel in enumerate(rels[i * self.batch_size:(i + 1) * self.batch_size]):\n                        rels_with_scores.append((rel, probas[j]))\n            if self.softmax:\n                scores = [score for rel, score in rels_with_scores]\n                softmax_scores = softmax(scores)\n                rels_with_scores = [(rel, softmax_score) for (rel, score), softmax_score in\n                                    zip(rels_with_scores, softmax_scores)]\n            rels_with_scores_dict = {}\n            for rel, score in rels_with_scores:\n                if rel not in rels_with_scores_dict:\n           
         rels_with_scores_dict[rel] = []\n                rels_with_scores_dict[rel].append(score)\n            rels_with_scores = [(rel, max(scores)) for rel, scores in rels_with_scores_dict.items()]\n            rels_with_scores = sorted(rels_with_scores, key=lambda x: x[1], reverse=True)\n        return rels_with_scores\n"
  },
  {
    "path": "deeppavlov/models/kbqa/ru_adj_to_noun.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nfrom collections import defaultdict\nfrom logging import getLogger\nfrom typing import List\n\nimport numpy as np\nimport spacy\nfrom scipy.sparse import csr_matrix\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\n\nlog = getLogger(__name__)\n\n\n@register('ru_adj_to_noun')\nclass RuAdjToNoun:\n    \"\"\"\n        Class for converting an adjective in Russian to the corresponding noun, for example:\n        \"московский\" -> \"Москва\", \"африканский\" -> \"Африка\"\n    \"\"\"\n\n    def __init__(self, freq_dict_filename: str, candidate_nouns: int = 10, freq_thres: float = 4.5,\n                 score_thres: float = 2.8, **kwargs):\n        \"\"\"\n\n        Args:\n            freq_dict_filename: file with the dictionary of Russian words with the corresponding frequencies\n            candidate_nouns: how many candidate nouns to leave after search\n            **kwargs:\n        \"\"\"\n        self.candidate_nouns = candidate_nouns\n        self.freq_thres = freq_thres\n        self.score_thres = score_thres\n        alphabet = \"абвгдеёжзийклмнопрстуфхцчшщъыьэюя-\"\n        self.alphabet_length = len(alphabet)\n        self.max_word_length = 24\n        self.letter_nums = {letter: num for num, letter in enumerate(alphabet)}\n        with 
open(str(expand_path(freq_dict_filename)), 'r') as fl:\n            lines = fl.readlines()\n        pos_freq_dict = defaultdict(list)\n        for line in lines:\n            line_split = line.strip('\\n').split('\\t')\n            if re.match(\"[\\d]+\\.[\\d]+\", line_split[2]):\n                pos_freq_dict[line_split[1]].append((line_split[0], float(line_split[2])))\n        self.nouns_with_freq = pos_freq_dict[\"s.PROP\"]\n        self.adj_set = set([word for word, freq in pos_freq_dict[\"a\"]])\n        self.nouns = [noun[0] for noun in self.nouns_with_freq]\n        self.matrix = self.make_sparse_matrix(self.nouns).transpose()\n        self.nlp = spacy.load(\"ru_core_news_sm\")\n\n    def search(self, word: str):\n        word = self.nlp(word)[0].lemma_\n        if word in self.adj_set:\n            q_matrix = self.make_sparse_matrix([word])\n            scores = q_matrix * self.matrix\n            scores = np.squeeze(scores.toarray())\n            indices = np.argsort(-scores)[:self.candidate_nouns]\n            scores = list(scores[indices])\n            candidates = [self.nouns_with_freq[indices[i]] + (scores[i],) for i in range(len(indices))]\n            candidates = [cand for cand in candidates if cand[0][:3].lower() == word[:3].lower()]\n            candidates = sorted(candidates, key=lambda x: (x[2], x[1]), reverse=True)\n            log.debug(f\"AdjToNoun, found nouns: {candidates}\")\n            if candidates and candidates[0][1] > self.freq_thres and candidates[0][2] > self.score_thres:\n                return candidates[0][0]\n        return \"\"\n\n    def make_sparse_matrix(self, words: List[str]):\n        indptr = []\n        indices = []\n        data = []\n\n        total_length = 0\n\n        for n, word in enumerate(words):\n            indptr.append(total_length)\n            for cnt, letter in enumerate(word.lower()):\n                col = self.alphabet_length * cnt + self.letter_nums[letter]\n                indices.append(col)\n     
           init_value = 1.0 - cnt * 0.05\n                if init_value < 0:\n                    init_value = 0\n                data.append(init_value)\n            total_length += len(word)\n\n        indptr.append(total_length)\n\n        data = np.array(data)\n        indptr = np.array(indptr)\n        indices = np.array(indices)\n\n        matrix = csr_matrix((data, indices, indptr), shape=(len(words), self.max_word_length * self.alphabet_length))\n\n        return matrix\n"
  },
  {
    "path": "deeppavlov/models/kbqa/sentence_answer.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport importlib\nimport re\nfrom logging import getLogger\n\nimport pkg_resources\nimport spacy\n\nlog = getLogger(__name__)\n\n# en_core_web_sm is installed and used by test_inferring_pretrained_model in the same interpreter session during tests.\n# Spacy checks en_core_web_sm package presence with pkg_resources, but pkg_resources is initialized with interpreter,\n# so it doesn't see en_core_web_sm installed after interpreter initialization, so we use importlib.reload below.\n\nif 'en-core-web-sm' not in pkg_resources.working_set.by_key.keys():\n    importlib.reload(pkg_resources)\n\n# TODO: move nlp to sentence_answer, sentence_answer to rel_ranking_infer and revise en_core_web_sm requirement,\n# TODO: make proper downloading with spacy.cli.download\nnlp = spacy.load('en_core_web_sm')\n\npronouns = [\"who\", \"what\", \"when\", \"where\", \"how\"]\n\n\ndef find_tokens(tokens, node, not_inc_node):\n    if node != not_inc_node:\n        tokens.append(node.text)\n        for elem in node.children:\n            tokens = find_tokens(tokens, elem, not_inc_node)\n    return tokens\n\n\ndef find_inflect_dict(sent_nodes):\n    inflect_dict = {}\n    for node in sent_nodes:\n        if node.dep_ == \"aux\" and node.tag_ == \"VBD\" and (node.head.tag_ == \"VBP\" or node.head.tag_ == \"VB\"):\n            new_verb = 
node.head._.inflect(\"VBD\")\n            inflect_dict[node.head.text] = new_verb\n            inflect_dict[node.text] = \"\"\n        if node.dep_ == \"aux\" and node.tag_ == \"VBZ\" and node.head.tag_ == \"VB\":\n            new_verb = node.head._.inflect(\"VBZ\")\n            inflect_dict[node.head.text] = new_verb\n            inflect_dict[node.text] = \"\"\n    return inflect_dict\n\n\ndef find_wh_node(sent_nodes):\n    wh_node = \"\"\n    main_head = \"\"\n    wh_node_head = \"\"\n    for node in sent_nodes:\n        if node.text.lower() in pronouns:\n            wh_node = node\n            break\n\n    if wh_node:\n        wh_node_head = wh_node.head\n        if wh_node_head.dep_ == \"ccomp\":\n            main_head = wh_node_head.head\n\n    return wh_node, wh_node_head, main_head\n\n\ndef find_tokens_to_replace(wh_node_head, main_head, question_tokens, question):\n    redundant_tokens_to_replace = []\n    question_tokens_to_replace = []\n\n    if main_head:\n        redundant_tokens_to_replace = find_tokens([], main_head, wh_node_head)\n    what_tokens_fnd = re.findall(\"what (.*) (is|was|does|did) (.*)\", question, re.IGNORECASE)\n    if what_tokens_fnd:\n        what_tokens = what_tokens_fnd[0][0].split()\n        if len(what_tokens) <= 2:\n            redundant_tokens_to_replace += what_tokens\n\n    wh_node_head_desc = [node for node in wh_node_head.children if node.text != \"?\"]\n    wh_node_head_dep = [node.dep_ for node in wh_node_head.children if\n                        (node.text != \"?\" and node.dep_ not in [\"aux\", \"prep\"] and node.text.lower() not in pronouns)]\n    for node in wh_node_head_desc:\n        if node.dep_ == \"nsubj\" and len(wh_node_head_dep) > 1 or node.text.lower() in pronouns or node.dep_ == \"aux\":\n            question_tokens_to_replace.append(node.text)\n            for elem in node.subtree:\n                question_tokens_to_replace.append(elem.text)\n\n    question_tokens_to_replace = 
list(set(question_tokens_to_replace))\n\n    redundant_replace_substr = []\n    for token in question_tokens:\n        if token in redundant_tokens_to_replace:\n            redundant_replace_substr.append(token)\n        else:\n            if redundant_replace_substr:\n                break\n\n    redundant_replace_substr = ' '.join(redundant_replace_substr)\n\n    question_replace_substr = []\n\n    for token in question_tokens:\n        if token in question_tokens_to_replace:\n            question_replace_substr.append(token)\n        else:\n            if question_replace_substr:\n                break\n\n    question_replace_substr = ' '.join(question_replace_substr)\n\n    return redundant_replace_substr, question_replace_substr\n\n\ndef sentence_answer(question, entity_title, entities=None, template_answer=None):\n    log.debug(f\"question {question} entity_title {entity_title} entities {entities} template_answer {template_answer}\")\n    sent_nodes = nlp(question)\n    reverse = False\n    if sent_nodes[-2].tag_ == \"IN\":\n        reverse = True\n    question_tokens = [elem.text for elem in sent_nodes]\n    log.debug(f\"spacy tags: {[(elem.text, elem.tag_, elem.dep_, elem.head.text) for elem in sent_nodes]}\")\n\n    inflect_dict = find_inflect_dict(sent_nodes)\n    wh_node, wh_node_head, main_head = find_wh_node(sent_nodes)\n    redundant_replace_substr, question_replace_substr = find_tokens_to_replace(wh_node_head, main_head,\n                                                                               question_tokens, question)\n    log.debug(f\"redundant_replace_substr {redundant_replace_substr} question_replace_substr {question_replace_substr}\")\n    if redundant_replace_substr:\n        answer = question.replace(redundant_replace_substr, '')\n    else:\n        answer = question\n\n    if answer.endswith('?'):\n        answer = answer.replace('?', '').strip()\n\n    if question_replace_substr:\n        if template_answer and entities:\n            
answer = template_answer.replace(\"[ent]\", entities[0]).replace(\"[ans]\", entity_title)\n        elif wh_node.text.lower() in [\"what\", \"who\", \"how\"]:\n            fnd_date = re.findall(f\"what (day|year) (.*)\\?\", question, re.IGNORECASE)\n            fnd_wh = re.findall(\"what (is|was) the name of (.*) (which|that) (.*)\\?\", question, re.IGNORECASE)\n            fnd_name = re.findall(\"what (is|was) the name (.*)\\?\", question, re.IGNORECASE)\n            if fnd_date:\n                fnd_date_aux = re.findall(f\"what (day|year) (is|was) ({entities[0]}) (.*)\\?\", question, re.IGNORECASE)\n                if fnd_date_aux:\n                    answer = f\"{entities[0]} {fnd_date_aux[0][1]} {fnd_date_aux[0][3]} on {entity_title}\"\n                else:\n                    answer = f\"{fnd_date[0][1]} on {entity_title}\"\n            elif fnd_wh:\n                answer = f\"{entity_title} {fnd_wh[0][3]}\"\n            elif fnd_name:\n                aux_verb, sent_cut = fnd_name[0]\n                if sent_cut.startswith(\"of \"):\n                    sent_cut = sent_cut[3:]\n                answer = f\"{entity_title} {aux_verb} {sent_cut}\"\n            else:\n                if reverse:\n                    answer = answer.replace(question_replace_substr, '')\n                    answer = f\"{answer} {entity_title}\"\n                else:\n                    answer = answer.replace(question_replace_substr, entity_title)\n        elif wh_node.text.lower() in [\"when\", \"where\"] and entities:\n            sent_cut = re.findall(f\"(when|where) (was|is) {entities[0]} (.*)\\?\", question, re.IGNORECASE)\n            if sent_cut:\n                if sent_cut[0][0].lower() == \"when\":\n                    answer = f\"{entities[0]} {sent_cut[0][1]} {sent_cut[0][2]} on {entity_title}\"\n                else:\n                    answer = f\"{entities[0]} {sent_cut[0][1]} {sent_cut[0][2]} in {entity_title}\"\n            else:\n                answer = 
answer.replace(question_replace_substr, '')\n                answer = f\"{answer} in {entity_title}\"\n\n    for old_tok, new_tok in inflect_dict.items():\n        answer = answer.replace(old_tok, new_tok)\n    answer = re.sub(\"\\s+\", \" \", answer).strip()\n\n    answer = answer + '.'\n\n    return answer\n"
  },
  {
    "path": "deeppavlov/models/kbqa/template_matcher.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport functools\nimport json\nimport multiprocessing as mp\nimport re\nfrom logging import getLogger\nfrom typing import Any, Tuple, List, Union\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.serializable import Serializable\n\nlog = getLogger(__name__)\n\n\nclass RegexpMatcher:\n    def __init__(self, question):\n        self.question = question\n\n    def __call__(self, template):\n        res = re.findall(template[\"template_regexp\"], self.question)\n        found_template = []\n        if res:\n            found_template.append((res[0], template))\n        return found_template\n\n\n@register('template_matcher')\nclass TemplateMatcher(Serializable):\n    \"\"\"\n        This class matches the question with one of the templates\n        to extract entity substrings and define which relations\n        corresponds to the question\n    \"\"\"\n\n    def __init__(self, load_path: str, templates_filename: str,\n                 num_processors: int = None, **kwargs) -> None:\n        \"\"\"\n\n        Args:\n            load_path: path to folder with file with templates\n            templates_filename: file with templates\n            **kwargs:\n        \"\"\"\n        super().__init__(save_path=None, load_path=load_path)\n        self.templates_filename = templates_filename\n        
self.num_processors = mp.cpu_count() if num_processors == None else num_processors\n        self.pool = mp.Pool(self.num_processors)\n        self.load()\n\n    def load(self) -> None:\n        log.debug(f\"(load)self.load_path / self.templates_filename: {self.load_path / self.templates_filename}\")\n        with open(self.load_path / self.templates_filename) as fl:\n            self.templates = json.load(fl)\n\n    def save(self) -> None:\n        raise NotImplementedError\n\n    def __call__(self, question: str, entities_from_ner: List[str]) -> \\\n            Tuple[Union[List[str], list], list, Union[list, Any], Union[list, Any], Union[str, Any], Union[list, Any],\n                  Union[str, Any], Union[list, Any], Union[str, Any]]:\n        question = question.lower()\n        question = self.sanitize(question)\n        question_length = len(question)\n        entities, types, relations, relation_dirs = [], [], [], []\n        query_type = \"\"\n        template_found = \"\"\n        entity_types = []\n        template_answer = \"\"\n        answer_types = []\n        results = self.pool.map(RegexpMatcher(question), self.templates)\n        results = functools.reduce(lambda x, y: x + y, results)\n        replace_tokens = [(\"the uk\", \"united kingdom\"), (\"the us\", \"united states\")]\n        if results:\n            min_length = 100\n            for result in results:\n                found_ent, template = result\n                positions_entity_tokens = template[\"positions_entity_tokens\"]\n                positions_type_tokens = template[\"positions_type_tokens\"]\n                positions_unuseful_tokens = template[\"positions_unuseful_tokens\"]\n                template_len = template[\"template_len\"]\n                template_found = template[\"template\"]\n                entities_cand = [found_ent[pos].replace('?', '') for pos in positions_entity_tokens]\n                types_cand = [found_ent[pos].replace('?', '').split(',')[0] for pos in 
positions_type_tokens]\n                unuseful_tokens = [found_ent[pos].replace('?', '') for pos in positions_unuseful_tokens]\n                entity_lengths = [len(entity) for entity in entities_cand]\n                entity_num_tokens = all([len(entity.split(' ')) < 6 for entity in entities_cand])\n                type_lengths = [len(entity_type) for entity_type in types_cand]\n                unuseful_tokens_len = sum([len(unuseful_tok) for unuseful_tok in unuseful_tokens])\n                log.debug(f\"found template: {template}, {found_ent}\")\n                match, entities_cand = self.match_template_and_ner(entities_cand, entities_from_ner, template_found)\n                if match and (0 not in entity_lengths or 0 not in type_lengths and entity_num_tokens):\n                    cur_len = sum(entity_lengths) + sum(type_lengths)\n                    log.debug(f\"lengths: entity+type {cur_len}, question {question_length}, \"\n                              f\"template {template_len}, unuseful tokens {unuseful_tokens_len}\")\n                    if cur_len < min_length and unuseful_tokens_len + template_len + cur_len == question_length:\n                        entities = entities_cand\n                        for old_token, new_token in replace_tokens:\n                            entities = [entity.replace(old_token, new_token) for entity in entities]\n                        types = types_cand\n                        relations = template[\"relations\"]\n                        relation_dirs = template[\"rel_dirs\"]\n                        query_type = template[\"template_type\"]\n                        entity_types = template.get(\"entity_types\", [])\n                        template_answer = template.get(\"template_answer\", \"\")\n                        answer_types = template.get(\"answer_types\", [])\n                        min_length = cur_len\n\n        return entities, types, relations, relation_dirs, query_type, entity_types, 
template_answer, answer_types, \\\n            template_found\n\n    def sanitize(self, question: str) -> str:\n        question = re.sub(r\"^(a |the )\", '', question)\n        date_interval = re.findall(\"([\\d]{4}-[\\d]{4})\", question)\n        if date_interval:\n            question = question.replace(date_interval[0], '')\n        question = question.replace('  ', ' ')\n        return question\n\n    def match_template_and_ner(self, entities_cand: List[str], entities_from_ner: List[str], template: str):\n        entities_from_ner = [entity.lower() for entity in entities_from_ner]\n        entities_from_ner = [re.sub(r\"^(a |the )\", '', entity) for entity in entities_from_ner]\n        entities_cand = [re.sub(r\"^(a |the )\", '', entity) for entity in entities_cand]\n        entities_cand = [entity.strip() for entity in entities_cand]\n        log.debug(f\"entities_cand {entities_cand} entities_from_ner {entities_from_ner}\")\n        match = set(entities_cand) == set(entities_from_ner) or not entities_from_ner or template == \"how to xxx?\"\n        return match, entities_cand\n"
  },
  {
    "path": "deeppavlov/models/kbqa/tree_to_sparql.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport copy\nimport re\nfrom collections import defaultdict\nfrom io import StringIO\nfrom logging import getLogger\nfrom typing import Any, List, Tuple, Dict, Union\n\nimport spacy\nfrom navec import Navec\nfrom razdel import tokenize\nfrom slovnet import Syntax\nfrom udapi.block.read.conllu import Conllu\nfrom udapi.core.node import Node\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.file import read_json\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.core.models.serializable import Serializable\nfrom deeppavlov.models.kbqa.ru_adj_to_noun import RuAdjToNoun\nfrom deeppavlov.models.kbqa.utils import preprocess_template_queries\n\nlog = getLogger(__name__)\n\n\n@register('slovnet_syntax_parser')\nclass SlovnetSyntaxParser(Component, Serializable):\n    \"\"\"Class for syntax parsing using Slovnet library\"\"\"\n\n    def __init__(self, load_path: str, navec_filename: str, syntax_parser_filename: str, tree_patterns_filename: str,\n                 **kwargs):\n        super().__init__(save_path=None, load_path=load_path)\n        self.navec_filename = expand_path(navec_filename)\n        self.syntax_parser_filename = expand_path(syntax_parser_filename)\n        self.tree_patterns = 
read_json(expand_path(tree_patterns_filename))\n        self.re_tokenizer = re.compile(r\"[\\w']+|[^\\w ]\")\n        self.pronouns = {\"q_pronouns\": {\"какой\", \"какая\", \"какое\", \"каком\", \"каким\", \"какую\", \"кто\", \"что\", \"как\",\n                                        \"когда\", \"где\", \"чем\", \"сколько\"},\n                         \"how_many\": {\"сколько\"}}\n        self.first_tokens = {\"первый\", \"первая\", \"первое\"}\n        self.nlp = spacy.load(\"ru_core_news_sm\")\n        self.load()\n\n    def load(self) -> None:\n        navec = Navec.load(self.navec_filename)\n        self.syntax = Syntax.load(self.syntax_parser_filename)\n        self.syntax.navec(navec)\n\n    def save(self) -> None:\n        pass\n\n    def preprocess_sentences(self, sentences, entity_offsets_batch):\n        sentences_tokens_batch, replace_dict_batch = [], []\n        for sentence, entity_offsets in zip(sentences, entity_offsets_batch):\n            if sentence.islower():\n                for start, end in entity_offsets:\n                    entity_old = sentence[start:end]\n                    if entity_old:\n                        entity_new = f\"{entity_old[0].upper()}{entity_old[1:]}\"\n                        sentence = sentence.replace(entity_old, entity_new)\n                sentence = f\"{sentence[0].upper()}{sentence[1:]}\"\n            names3 = re.findall(r\"([\\w]{1}\\.)([ ]?)([\\w]{1}\\.)([ ])([\\w]{3,})\", sentence)\n            replace_dict = {}\n            for name in names3:\n                names_str = \"\".join(name)\n                replace_dict[name[-1]] = (names_str, \"name\")\n                sentence = sentence.replace(names_str, name[-1])\n            names2 = re.findall(r\"([\\w]{1}\\.)([ ])([\\w]{3,})\", sentence)\n            for name in names2:\n                names_str = \"\".join(name)\n                replace_dict[name[-1]] = (names_str, \"name\")\n                sentence = sentence.replace(names_str, name[-1])\n           
 works_of_art = re.findall(r'([\"«])(.*?)([\"»])', sentence)\n            for symb_start, work_of_art, symb_end in works_of_art:\n                work_of_art_tokens = re.findall(self.re_tokenizer, work_of_art)\n                if len(work_of_art.split()) > 1:\n                    short_substr = \"\"\n                    for tok in work_of_art_tokens:\n                        if self.nlp(tok)[0].pos_ == \"NOUN\":\n                            short_substr = tok\n                            break\n                    if not short_substr:\n                        short_substr = work_of_art_tokens[0]\n                    replace_dict[short_substr] = (work_of_art, \"name\")\n                    sentence = sentence.replace(work_of_art, short_substr)\n            while True:\n                tokens = sentence.split()\n                found_substr = False\n                for i in range(len(tokens) - 2):\n                    found = True\n                    for j in range(i, i + 3):\n                        if len(tokens[j]) < 2 or tokens[j][0] in '(\"' or tokens[j][-1] in '\"),.?':\n                            found = False\n                    if found and i > 0:\n                        token_tags = [self.nlp(tokens[j])[0].pos_ for j in range(i, i + 3)]\n                        lemm_tokens = {self.nlp(tok)[0].lemma_ for tok in tokens[i:i + 3]}\n                        if token_tags == [\"DET\", \"DET\", \"NOUN\"] and not lemm_tokens & self.first_tokens:\n                            long_substr = \" \".join(tokens[i:i + 3])\n                            replace_dict[tokens[i + 2]] = (long_substr, \"adj\")\n                            sentence = sentence.replace(long_substr, tokens[i + 2])\n                            found_substr = True\n                    if found_substr:\n                        break\n                if not found_substr:\n                    break\n            sentence_tokens = [tok.text for tok in tokenize(sentence)]\n            
sentences_tokens_batch.append(sentence_tokens)\n            log.debug(f\"replace_dict: {replace_dict} --- sentence: {sentence_tokens}\")\n            replace_dict_batch.append(replace_dict)\n        return sentences_tokens_batch, replace_dict_batch\n\n    def get_markup(self, proc_syntax_batch, replace_dict_batch):\n        markup_batch = []\n        for proc_syntax, replace_dict in zip(proc_syntax_batch, replace_dict_batch):\n            markup_list = []\n            for elem in proc_syntax.tokens:\n                markup_list.append({\"id\": elem.id, \"text\": elem.text, \"head_id\": int(elem.head_id), \"rel\": elem.rel})\n            ids, words, head_ids, rels = self.get_elements(markup_list)\n            head_ids, markup_list = self.correct_cycle(ids, head_ids, rels, markup_list)\n            for substr in replace_dict:\n                substr_full, substr_type = replace_dict[substr]\n                found_n = -1\n                for n, markup_elem in enumerate(markup_list):\n                    if markup_elem[\"text\"] == substr:\n                        found_n = n\n                if found_n > -1:\n                    before_markup_list = copy.deepcopy(markup_list[:found_n])\n                    after_markup_list = copy.deepcopy(markup_list[found_n + 1:])\n                    substr_tokens = [tok.text for tok in tokenize(substr_full)]\n                    new_markup_list = []\n                    if substr_type == \"name\":\n                        for j in range(len(substr_tokens)):\n                            new_markup_elem = {\"id\": str(found_n + j + 1), \"text\": substr_tokens[j]}\n                            if j == 0:\n                                new_markup_elem[\"rel\"] = markup_list[found_n][\"rel\"]\n                                if int(markup_list[found_n][\"head_id\"]) < found_n + 1:\n                                    new_markup_elem[\"head_id\"] = markup_list[found_n][\"head_id\"]\n                                else:\n                
                    new_markup_elem[\"head_id\"] = str(int(markup_list[found_n][\"head_id\"]) + len(\n                                        substr_tokens) - 1)\n                            else:\n                                new_markup_elem[\"rel\"] = \"flat:name\"\n                                new_markup_elem[\"head_id\"] = str(found_n + 1)\n                            new_markup_list.append(new_markup_elem)\n                    elif substr_type == \"adj\":\n                        for j in range(len(substr_tokens)):\n                            new_elem = {\"id\": str(found_n + j + 1), \"text\": substr_tokens[j]}\n                            if j == len(substr_tokens) - 1:\n                                new_elem[\"rel\"] = markup_list[found_n][\"rel\"]\n                                if markup_list[found_n][\"head_id\"] < found_n + 1:\n                                    new_elem[\"head_id\"] = markup_list[found_n][\"head_id\"]\n                                else:\n                                    new_elem[\"head_id\"] = markup_list[found_n][\"head_id\"] + len(substr_tokens) - 1\n                            else:\n                                new_elem[\"rel\"] = \"amod\"\n                                new_elem[\"head_id\"] = str(found_n + len(substr_tokens))\n                            new_markup_list.append(new_elem)\n\n                    for j in range(len(before_markup_list)):\n                        if int(before_markup_list[j][\"head_id\"]) > found_n + 1:\n                            before_markup_list[j][\"head_id\"] = int(before_markup_list[j][\"head_id\"]) + \\\n                                                               len(substr_tokens) - 1\n                        if before_markup_list[j][\"head_id\"] == found_n + 1 and substr_type == \"adj\":\n                            before_markup_list[j][\"head_id\"] = found_n + len(substr_tokens)\n                    for j in range(len(after_markup_list)):\n                        
after_markup_list[j][\"id\"] = str(int(after_markup_list[j][\"id\"]) + len(substr_tokens) - 1)\n                        if int(after_markup_list[j][\"head_id\"]) > found_n + 1:\n                            after_markup_list[j][\"head_id\"] = int(after_markup_list[j][\"head_id\"]) + \\\n                                                              len(substr_tokens) - 1\n                        if after_markup_list[j][\"head_id\"] == found_n + 1 and substr_type == \"adj\":\n                            after_markup_list[j][\"head_id\"] = found_n + len(substr_tokens)\n\n                    markup_list = before_markup_list + new_markup_list + after_markup_list\n            for j in range(len(markup_list)):\n                markup_list[j][\"head_id\"] = str(markup_list[j][\"head_id\"])\n            markup_batch.append(markup_list)\n        return markup_batch\n\n    def find_cycle(self, ids, head_ids):\n        for i in range(len(ids)):\n            for j in range(len(ids)):\n                if i < j and head_ids[j] == str(i + 1) and head_ids[i] == str(j + 1):\n                    return i + 1\n        return -1\n\n    def correct_markup(self, words, head_ids, rels, root_n):\n        if len(words) > 3:\n            pos = [self.nlp(words[i])[0].pos_ for i in range(len(words))]\n            for tree_pattern in self.tree_patterns:\n                first_word = tree_pattern.get(\"first_word\", \"\")\n                (r_start, r_end), rel_info = tree_pattern.get(\"rels\", [[0, 0], \"\"])\n                (p_start, p_end), pos_info = tree_pattern.get(\"pos\", [[0, 0], \"\"])\n                if (not first_word or words[0].lower() in self.pronouns[first_word]) \\\n                        and (not rel_info or rels[r_start:r_end] == rel_info) \\\n                        and (not pos_info or pos[p_start:p_end] == pos_info):\n                    for ind, deprel in tree_pattern.get(\"rel_ids\", {}).items():\n                        rels[int(ind)] = deprel\n                    for 
ind, head_id in tree_pattern.get(\"head_ids\", {}).items():\n                        head_ids[int(ind)] = head_id\n                    root_n = tree_pattern[\"root_n\"]\n                    break\n            if words[0].lower() in {\"какой\", \"какая\", \"какое\"} and rels[:3] == [\"det\", \"obj\", \"root\"] \\\n                    and pos[1:3] == [\"NOUN\", \"VERB\"] and \"nsubj\" not in rels:\n                rels[1] = \"nsubj\"\n        return head_ids, rels, root_n\n\n    def find_root(self, rels):\n        root_n = -1\n        for n in range(len(rels)):\n            if rels[n] == \"root\":\n                root_n = n + 1\n                break\n        return root_n\n\n    def get_elements(self, markup_elem):\n        ids, words, head_ids, rels = [], [], [], []\n        for elem in markup_elem:\n            ids.append(elem[\"id\"])\n            words.append(elem[\"text\"])\n            head_ids.append(elem[\"head_id\"])\n            rels.append(elem[\"rel\"])\n        return ids, words, head_ids, rels\n\n    def correct_cycle(self, ids, head_ids, rels, markup_elem):\n        cycle_num = -1\n        for n, (elem_id, head_id) in enumerate(zip(ids, head_ids)):\n            if str(head_id) == str(elem_id):\n                cycle_num = n\n        root_n = self.find_root(rels)\n        if cycle_num > 0 and root_n > -1:\n            head_ids[cycle_num] = root_n\n        markup_elem[cycle_num][\"head_id\"] = root_n\n        return head_ids, markup_elem\n\n    def process_markup(self, markup_batch):\n        processed_markup_batch = []\n        for markup_elem in markup_batch:\n            processed_markup = []\n            ids, words, head_ids, rels = self.get_elements(markup_elem)\n            if \"root\" not in {rel.lower() for rel in rels}:\n                found_root = False\n                for n, (elem_id, head_id) in enumerate(zip(ids, head_ids)):\n                    if elem_id == head_id:\n                        rels[n] = \"root\"\n                        
head_ids[n] = 0\n                        found_root = True\n                if not found_root:\n                    for n in range(len(ids)):\n                        if rels[n] == \"nsubj\":\n                            rels[n] = \"root\"\n                            head_ids[n] = 0\n                            found_root = True\n                if not found_root:\n                    for n in range(len(ids)):\n                        if self.nlp(words[n])[0].pos_ == \"VERB\":\n                            rels[n] = \"root\"\n                            head_ids[n] = 0\n\n            root_n = self.find_root(rels)\n            head_ids, rels, root_n = self.correct_markup(words, head_ids, rels, root_n)\n            if words[-1] == \"?\" and -1 < root_n != head_ids[-1]:\n                head_ids[-1] = root_n\n\n            head_ids, markup_elem = self.correct_cycle(ids, head_ids, rels, markup_elem)\n            i = self.find_cycle(ids, head_ids)\n            if i == 1 and root_n > -1:\n                head_ids[i - 1] = root_n\n            for elem_id, word, head_id, rel in zip(ids, words, head_ids, rels):\n                processed_markup.append(f\"{elem_id}\\t{word}\\t_\\t_\\t_\\t_\\t{head_id}\\t{rel}\\t_\\t_\")\n            processed_markup_batch.append(\"\\n\".join(processed_markup))\n        return processed_markup_batch\n\n    def __call__(self, sentences, entity_offsets_batch):\n        sentences_tokens_batch, substr_dict_batch = self.preprocess_sentences(sentences, entity_offsets_batch)\n        proc_syntax_batch = list(self.syntax.map(sentences_tokens_batch))\n        markup_batch = self.get_markup(proc_syntax_batch, substr_dict_batch)\n        processed_markup_batch = self.process_markup(markup_batch)\n        return processed_markup_batch\n\n\n@register('tree_to_sparql')\nclass TreeToSparql(Component):\n    \"\"\"\n        Class for building of sparql query template using syntax parser\n    \"\"\"\n\n    def __init__(self, sparql_queries_filename: str, 
syntax_parser: Component, kb_prefixes: Dict[str, str],\n                 adj_to_noun: RuAdjToNoun = None, **kwargs):\n        \"\"\"\n\n        Args:\n            sparql_queries_filename: file with sparql query templates\n            syntax_parser: component for syntactic parsing of the input question\n            kb_prefixes: prefixes for entities, relations and types in the knowledge base\n            adj_to_noun: component deeppavlov.models.kbqa.ru_adj_to_noun:RuAdjToNoun\n            **kwargs:\n        \"\"\"\n        self.q_pronouns = {\"какой\", \"какая\", \"какое\", \"каком\", \"каким\", \"какую\", \"кто\", \"что\", \"как\", \"когда\",\n                           \"где\", \"чем\", \"сколько\"}\n        self.how_many = \"сколько\"\n        self.change_root_tokens = {\"каким был\", \"какой была\"}\n        self.first_tokens = {\"первый\", \"первая\", \"первое\"}\n        self.last_tokens = {\"последний\"}\n        self.begin_tokens = {\"начинать\", \"начать\"}\n        self.end_tokens = {\"завершить\", \"завершать\", \"закончить\"}\n        self.ranking_tokens = {\"самый\"}\n        self.date_tokens = {\"год\", \"месяц\"}\n        self.nlp = spacy.load(\"ru_core_news_sm\")\n        self.re_tokenizer = re.compile(r\"[\\w']+|[^\\w ]\")\n        self.sparql_queries_filename = expand_path(sparql_queries_filename)\n        template_queries = read_json(self.sparql_queries_filename)\n        self.template_queries = preprocess_template_queries(template_queries, kb_prefixes)\n        self.syntax_parser = syntax_parser\n        self.adj_to_noun = adj_to_noun\n\n    def __call__(self, questions_batch: List[str], substr_batch: List[List[str]], tags_batch: List[List[str]],\n                 offsets_batch: List[List[List[int]]], positions_batch: List[List[List[int]]],\n                 probas_batch: List[List[float]]) -> Tuple[\n        List[Union[str, Any]], List[Union[List[str], List[Union[str, Any]]]], List[Union[List[str], Any]], List[\n            Union[List[Union[str, 
Any]], Any]], List[Union[List[Union[float, Any]], Any]], List[List[int]], List[\n            Union[List[str], List[Any]]]]:\n        substr_batch, tags_batch, offsets_batch, positions_batch, probas_batch = \\\n            self.sort_substr(substr_batch, tags_batch, offsets_batch, positions_batch, probas_batch)\n        log.debug(f\"substr: {substr_batch} tags: {tags_batch} positions: {positions_batch}\")\n        query_nums_batch, s_substr_batch, s_tags_batch, s_probas_batch, types_batch = [], [], [], [], []\n        entities_to_link_batch = []\n        clean_questions_batch = []\n        count = False\n        for question, substr_list, tags_list, offsets_list, probas_list, positions in \\\n                zip(questions_batch, substr_batch, tags_batch, offsets_batch, probas_batch, positions_batch):\n            entities_dict, probas_dict = {}, {}\n            for substr, tag, proba in zip(substr_list, tags_list, probas_list):\n                entities_dict[substr.lower()] = tag\n                probas_dict[substr.lower()] = proba\n            for i in range(len(substr_list)):\n                substr = substr_list[i]\n                if len(substr) > 2 and (\"-\" in substr or f\"{substr}-\" in question) and \" - \" not in substr:\n                    if \"-\" in substr:\n                        length = len(re.findall(self.re_tokenizer, substr))\n                    else:\n                        length = 3\n                    substr_tokens = list(tokenize(substr))\n                    positions[i] = [positions[i][j] for j in range(len(substr_tokens))]\n                    if i < len(substr_list) - 1:\n                        for j in range(i + 1, len(substr_list)):\n                            pos_inds = positions[j]\n                            pos_inds = [ind - length + 1 for ind in pos_inds]\n                            positions[j] = pos_inds\n\n            root, tree, tree_desc, unknown_node, unknown_branch = self.syntax_parse(question, offsets_list)\n        
    query_nums = [\"7\"]\n            s_substr_list = substr_list\n            s_tags_list = tags_list\n            s_probas_list = probas_list\n            types_list = []\n            if unknown_node:\n                log.debug(f\"syntax tree info 1, unknown node: {unknown_node.form}, unkn branch: {unknown_branch.form}\")\n                log.debug(f\"wh_leaf: {self.wh_leaf}\")\n                clause_node, clause_branch = self.find_clause_node(root, unknown_branch)\n                log.debug(f\"clause node: {clause_node}\")\n                tok_and_ord = {node.ord: node for node in tree.descendants}\n                appos_token_nums = sorted(self.find_appos_tokens(root, tok_and_ord, []))\n                appos_tokens = [elem.form for elem in tree_desc if elem.ord in appos_token_nums]\n                clause_token_nums = sorted(self.find_clause_tokens(root, tok_and_ord, clause_node))\n                clause_tokens = [elem.form for elem in tree_desc if elem.ord in clause_token_nums]\n                log.debug(f\"appos tokens: {appos_tokens}\")\n                log.debug(f\"clause_tokens: {clause_tokens}\")\n                question, ranking_tokens = self.sanitize_question(tree, root, appos_token_nums, clause_token_nums)\n                if appos_token_nums or clause_token_nums:\n                    root, tree, tree_desc, unknown_node, unknown_branch = self.syntax_parse(question, offsets_list)\n                    log.debug(f\"syntax tree info 2, unknown node: {unknown_node}, unkn branch: {unknown_branch}\")\n\n                if unknown_node:\n                    modifiers, clause_modifiers = self.find_modifiers_of_unknown(unknown_node)\n                    log.debug(f\"modifiers: {modifiers} --- clause modifiers: {[nd.form for nd in clause_modifiers]}\")\n                    if f\"{tree_desc[0].form.lower()} {tree_desc[1].form.lower()}\" in self.change_root_tokens:\n                        new_root = root.children[0]\n                    else:\n                   
     new_root = root\n                    root_desc = defaultdict(list)\n                    for node in new_root.children:\n                        if node.deprel not in [\"punct\", \"advmod\", \"cop\", \"mark\"]:\n                            if node == unknown_branch:\n                                root_desc[node.deprel].append(node)\n                            else:\n                                if self.find_entities(node, positions) or \\\n                                        (self.find_year_or_number(node) and node.deprel in [\"obl\", \"nummod\"]):\n                                    root_desc[node.deprel].append(node)\n\n                    if root.form.lower() == self.how_many or (\"nsubj\" in root_desc.keys() and\n                                                              self.how_many in [nd.form.lower() for nd in\n                                                                                root_desc[\"nsubj\"]]):\n                        count = True\n                    log.debug(f\"root_desc {root_desc.keys()}\")\n                    self.root_entity = False\n                    if root.ord - 1 in positions:\n                        self.root_entity = True\n\n                    temporal_order = self.find_first_last(new_root)\n                    new_root_nf = self.nlp(new_root.form)[0].lemma_\n                    if new_root_nf in self.begin_tokens or new_root_nf in self.end_tokens:\n                        temporal_order = new_root_nf\n                    query_nums, s_substr_list, types_list = self.build_query(new_root, unknown_branch, root_desc,\n                                                                             unknown_node, modifiers, clause_modifiers,\n                                                                             clause_node, positions, entities_dict,\n                                                                             count, temporal_order, ranking_tokens)\n                    s_tags_list, 
s_probas_list = [], []\n                    for substr in s_substr_list:\n                        substr = substr.replace(\" - \", \"-\")\n                        s_tags_list.append(entities_dict.get(substr.lower(), \"E\"))\n                        s_probas_list.append(probas_dict.get(substr.lower(), 1.0))\n            clean_questions_batch.append(question)\n            if query_nums and s_substr_list:\n                entities_to_link = [1 for _ in s_substr_list]\n                s_substr_list_lower = [s.lower() for s in s_substr_list]\n                for substr, tag, proba in zip(substr_list, tags_list, probas_list):\n                    if substr.lower() not in s_substr_list_lower:\n                        s_substr_list.append(substr)\n                        s_tags_list.append(tag)\n                        s_probas_list.append(proba)\n                        entities_to_link.append(0)\n                s_substr_batch.append(s_substr_list)\n                s_tags_batch.append(s_tags_list)\n                s_probas_batch.append(s_probas_list)\n                entities_to_link_batch.append(entities_to_link)\n            else:\n                mod_len = 0\n                gr_len = 1\n                if all([tags_list[i] == tags_list[0] for i in range(len(tags_list))]):\n                    gr_len = len(substr_list)\n                elif len(substr_list) > 1:\n                    mod_len = 1\n                for num, template in self.template_queries.items():\n                    syntax_info = [gr_len, 0, mod_len, 0, False, False, False]\n                    if syntax_info == list(template[\"syntax_structure\"].values()):\n                        query_nums.append(num)\n                entities_to_link = [1 for _ in s_substr_list]\n                s_substr_batch.append(substr_list)\n                s_tags_batch.append(tags_list)\n                s_probas_batch.append(probas_list)\n                entities_to_link_batch.append(entities_to_link)\n            
query_nums_batch.append(query_nums)\n            types_batch.append(types_list)\n        log.debug(f\"clean_questions: {clean_questions_batch} --- substr: {s_substr_batch} --- tags: {s_tags_batch} \"\n                  f\"--- entities_to_link {entities_to_link_batch} --- types: {types_batch}\")\n        return clean_questions_batch, query_nums_batch, s_substr_batch, s_tags_batch, s_probas_batch, \\\n               entities_to_link_batch, types_batch\n\n    def sort_substr(self, substr_batch: List[List[str]], tags_batch: List[List[str]],\n                    offsets_batch: List[List[List[int]]], positions_batch: List[List[List[int]]],\n                    probas_batch: List[List[float]]) -> Tuple[\n        List[List[str]], List[List[str]], List[List[List[int]]], List[List[List[int]]], List[List[float]]]:\n        s_substr_batch, s_tags_batch, s_offsets_batch, s_positions_batch, s_probas_batch = [], [], [], [], []\n        for substr_list, tags_list, offsets_list, positions_list, probas_list \\\n                in zip(substr_batch, tags_batch, offsets_batch, positions_batch, probas_batch):\n            substr_info = [(substr, tag, offsets, positions, proba) for substr, tag, offsets, positions, proba\n                           in zip(substr_list, tags_list, offsets_list, positions_list, probas_list)]\n            substr_info = sorted(substr_info, key=lambda x: x[3][0])\n            s_substr_batch.append([elem[0] for elem in substr_info])\n            s_tags_batch.append([elem[1] for elem in substr_info])\n            s_offsets_batch.append([elem[2] for elem in substr_info])\n            s_positions_batch.append([elem[3] for elem in substr_info])\n            s_probas_batch.append([elem[4] for elem in substr_info])\n        return s_substr_batch, s_tags_batch, s_offsets_batch, s_positions_batch, s_probas_batch\n\n    def syntax_parse(self, question: str, entity_offsets_list: List[List[int]]) -> Tuple[\n        Union[str, Any], Union[str, Any], Union[str, Any], str, 
str]:\n        syntax_tree = self.syntax_parser([question], [entity_offsets_list])[0]\n        log.debug(f\"syntax tree: \\n{syntax_tree}\")\n        root, tree, tree_desc, unknown_node, unknown_branch = \"\", \"\", \"\", \"\", \"\"\n        try:\n            tree = Conllu(filehandle=StringIO(syntax_tree)).read_tree()\n            root = self.find_root(tree)\n            tree_desc = tree.descendants\n        except ValueError as e:\n            log.warning(f\"error in parsing syntax tree, {e}\")\n        if root:\n            unknown_node, unknown_branch = self.find_branch_with_unknown(root)\n            log.debug(f\"syntax tree info, root: {root.form} unk_node: {unknown_node} unk_branch: {unknown_branch}\")\n        return root, tree, tree_desc, unknown_node, unknown_branch\n\n    def sanitize_question(self, tree: Node, root: Node, appos_token_nums: List[int], clause_token_nums: List[int]) -> \\\n            Tuple[str, list]:\n        ranking_tokens = self.find_ranking_tokens(root, appos_token_nums, clause_token_nums)\n        question_tokens = []\n        for node in tree.descendants:\n            if node.ord not in appos_token_nums + clause_token_nums:\n                if ranking_tokens and (node.ord in ranking_tokens or node.form.lower() in self.q_pronouns):\n                    question_tokens.append(self.nlp(node.form)[0].lemma_)\n                else:\n                    question_tokens.append(node.form)\n        question = \" \".join(question_tokens)\n        log.debug(f\"sanitized question: {question}\")\n        return question, ranking_tokens\n\n    def find_root(self, tree: Node) -> Node:\n        for node in tree.descendants:\n            if node.deprel == \"root\" and node.children:\n                return node\n\n    def find_branch_with_unknown(self, root: Node) -> Tuple[str, str]:\n        self.wh_leaf = False\n        self.one_chain = False\n        if root.form.lower() in self.q_pronouns:\n            if \"nsubj\" in [node.deprel for node in 
root.children] or root.form.lower() in self.how_many:\n                self.one_chain = True\n            else:\n                for node in root.children:\n                    if node.deprel == \"nsubj\":\n                        return node, node\n        if not self.one_chain:\n            for node in root.children:\n                if node.form.lower() in self.q_pronouns:\n                    if node.children:\n                        for child in node.children:\n                            if child.deprel in [\"nmod\", \"obl\"]:\n                                return child, node\n                    else:\n                        self.wh_leaf = True\n                else:\n                    for child in node.descendants:\n                        if child.form.lower() in self.q_pronouns:\n                            return child.parent, node\n        if self.wh_leaf or self.one_chain:\n            for node in root.children:\n                if node.deprel in [\"nsubj\", \"obl\", \"obj\", \"nmod\", \"xcomp\"] and node.form.lower() not in self.q_pronouns:\n                    return node, node\n\n        return \"\", \"\"\n\n    def find_modifiers_of_unknown(self, node: Node) -> Tuple[List[Union[str, Any]], list]:\n        modifiers = []\n        clause_modifiers = []\n        for mod in node.children:\n            if mod.deprel in [\"amod\", \"nmod\"] or (mod.deprel == \"appos\" and mod.children):\n                noun_mod = \"\"\n                if self.adj_to_noun:\n                    noun_mod = self.adj_to_noun.search(mod.form)\n                if noun_mod:\n                    modifiers.append(noun_mod)\n                else:\n                    modifiers.append(mod)\n            if mod.deprel == \"acl\":\n                clause_modifiers.append(mod)\n        return modifiers, clause_modifiers\n\n    def find_clause_node(self, root: Node, unknown_branch: Node) -> Tuple[str, str]:\n        for node in root.children:\n            if node.deprel == \"obl\" 
and node != unknown_branch:\n                for elem in node.children:\n                    if elem.deprel == \"acl\":\n                        return elem, node\n        return \"\", \"\"\n\n    def find_entities(self, node: Node, positions: List[List[int]]) -> List[str]:\n        node_desc = [(node.form, node.ord, node.parent)] + \\\n                    [(elem.form, elem.ord, elem.parent) for elem in node.descendants]\n        node_desc = sorted(node_desc, key=lambda x: x[1])\n        entities_list, heads_list = [], []\n        for pos_elem in positions:\n            entity, parents = [], []\n            for ind in pos_elem:\n                for node_elem in node_desc:\n                    if ind + 1 == node_elem[1]:\n                        entity.append(node_elem[0])\n                        parents.append(node_elem[2])\n                        break\n            if len(entity) == len(pos_elem):\n                entity = \" \".join(entity).replace(\" .\", \".\")\n                entities_list.append(entity)\n                heads_list.append(parents[0])\n        log.debug(f\"node: {node.form} --- found_entities: {entities_list} --- node_desc: {node_desc} --- \"\n                  f\"positions: {positions}\")\n        return entities_list\n\n    def find_year_or_number(self, node: Node) -> bool:\n        found = False\n        for elem in node.descendants:\n            if elem.deprel == \"nummod\" or re.findall(r\"[\\d]{4}\", elem.form):\n                return True\n        return found\n\n    def find_year_constraint(self, node: Node) -> list:\n        node_desc = [(node.form, node.ord)] + [(elem.form, elem.ord) for elem in node.descendants]\n        node_desc = sorted(node_desc, key=lambda x: x[1])\n        desc_text = \" \".join([elem[0] for elem in node_desc])\n        for symb in \".,:;)\":\n            desc_text = desc_text.replace(f\" {symb}\", symb)\n        for pattern in [r\"в ([\\d]{3,4}) году\", r\"с ([\\d]{3,4}) по ([\\d]{3,4})\"]:\n            
fnd = re.findall(pattern, desc_text)\n            if fnd:\n                return fnd\n        return []\n\n    def find_appos_tokens(self, node: Node, tok_and_ord: List[Tuple[Node, int]],\n                          appos_token_nums: List[int]) -> List[int]:\n        for elem in node.children:\n            e_desc = elem.descendants\n            if elem.deprel == \"appos\" and elem.ord > 1 and tok_and_ord[elem.ord - 1].deprel == \"punct\" \\\n                    and not all([nd.deprel in {\"appos\", \"flat:name\"} for nd in e_desc]) \\\n                    and not ({\"«\", '\"', '``', '('} & {nd.form for nd in e_desc}):\n                appos_token_nums.append(elem.ord)\n                for desc in elem.descendants:\n                    appos_token_nums.append(desc.ord)\n            else:\n                appos_token_nums = self.find_appos_tokens(elem, tok_and_ord, appos_token_nums)\n        return appos_token_nums\n\n    def find_clause_tokens(self, node: Node, tok_and_ord: Dict[int, Node], clause_node: Node) -> List[int]:\n        clause_token_nums = []\n        for elem in node.children:\n            if elem != clause_node and elem.deprel == \"acl\":\n                clause_token_nums.append(elem.ord)\n                for desc in elem.descendants:\n                    clause_token_nums.append(desc.ord)\n            else:\n                clause_token_nums = self.find_appos_tokens(elem, tok_and_ord, clause_token_nums)\n        return clause_token_nums\n\n    def find_first_last(self, node: Node) -> str:\n        first_or_last = \"\"\n        nodes = [node]\n        while nodes:\n            for node in nodes:\n                node_desc = defaultdict(set)\n                for elem in node.children:\n                    normal_form = self.nlp(elem.form.lower())[0].lemma_\n                    node_desc[elem.deprel].add(normal_form)\n                log.debug(f\"find_first_last {node_desc}\")\n                if \"amod\" in node_desc.keys() and \"nmod\" in 
node_desc.keys() and \\\n                        node_desc[\"amod\"].intersection(self.first_tokens | self.last_tokens):\n                    first_or_last = ' '.join(node_desc[\"amod\"].intersection(self.first_tokens | self.last_tokens))\n                    return first_or_last\n            nodes = [elem for node in nodes for elem in node.children]\n        return first_or_last\n\n    def find_ranking_tokens(self, node: Node, appos_token_nums: List[int], clause_token_nums: List[int]) -> list:\n        ranking_tokens = []\n        for elem in node.descendants:\n            if self.nlp(elem.form)[0].lemma_ in self.ranking_tokens \\\n                    and elem.ord not in appos_token_nums + clause_token_nums:\n                ranking_tokens.append(elem.ord)\n                ranking_tokens.append(elem.parent.ord)\n                return ranking_tokens\n        return ranking_tokens\n\n    @staticmethod\n    def choose_grounded_entity(grounded_entities: List[str], entities_dict: Dict[str, str]):\n        tags = [entities_dict.get(entity.lower(), \"\") for entity in grounded_entities]\n        if len(grounded_entities) > 1:\n            if not all([tags[i] == tags[0] for i in range(1, len(tags))]):\n                for f_tag in [\"WORK_OF_ART\", \"FAC\", \"PERSON\", \"GPE\"]:\n                    for entity, tag in zip(grounded_entities, tags):\n                        if tag == f_tag:\n                            return [entity]\n            elif not all([entity[0].islower() for entity in grounded_entities]):\n                for entity in grounded_entities:\n                    if entity[0].isupper():\n                        return [entity]\n        return grounded_entities\n\n    def build_query(self, root: Node, unknown_branch: Node, root_desc: Dict[str, List[Node]], unknown_node: Node,\n                    unknown_modifiers: List[Node], clause_modifiers: List[Node], clause_node: Node,\n                    positions: List[List[int]], entities_dict: Dict[str, 
str], count: bool = False,\n                    temporal_order: str = \"\", ranking_tokens: List[str] = None) -> Tuple[\n        List[str], List[str], List[str]]:\n        query_nums = []\n        grounded_entities_list, types_list, modifiers_list, qualifier_entities_list = [], [], [], []\n        found_year_or_number = False\n        order = False\n        root_desc_deprels = []\n        for key in root_desc.keys():\n            for i in range(len(root_desc[key])):\n                if key in {\"nsubj\", \"obj\", \"obl\", \"iobj\", \"acl\", \"nmod\", \"xcomp\", \"cop\"}:\n                    root_desc_deprels.append(key)\n        root_desc_deprels = sorted(root_desc_deprels)\n        log.debug(f\"build_query: root_desc.keys, {root_desc_deprels}, positions {positions}, wh_leaf {self.wh_leaf}, \"\n                  f\"one_chain {self.one_chain}, temporal order {temporal_order}, ranking tokens {ranking_tokens}\")\n        if root_desc_deprels in [[\"nsubj\", \"obl\"],\n                                 [\"nsubj\", \"obj\"],\n                                 [\"nsubj\", \"xcomp\"],\n                                 [\"obj\", \"xcomp\"],\n                                 [\"nmod\", \"nsubj\"],\n                                 [\"obj\", \"obl\"],\n                                 [\"iobj\", \"nsubj\"],\n                                 [\"acl\", \"nsubj\"],\n                                 [\"cop\", \"nsubj\", \"obl\"],\n                                 [\"obj\"],\n                                 [\"obl\"],\n                                 [\"nmod\"],\n                                 [\"xcomp\"],\n                                 [\"nsubj\"]]:\n            if self.wh_leaf or self.one_chain:\n                if root_desc_deprels == [\"nsubj\", \"obl\"]:\n                    grounded_entities_list = self.find_entities(root_desc[\"nsubj\"][0], positions)\n                    if not grounded_entities_list:\n                        grounded_entities_list = 
self.find_entities(root_desc[\"obl\"][0], positions)\n                else:\n                    for nodes in root_desc.values():\n                        if nodes[0].form not in self.q_pronouns:\n                            grounded_entities_list = self.find_entities(nodes[0], positions)\n                            if grounded_entities_list:\n                                break\n            else:\n                if self.root_entity:\n                    grounded_entities_list = [root.form]\n                for nodes in root_desc.values():\n                    if nodes[0] != unknown_branch:\n                        grounded_entities_list = self.find_entities(nodes[0], positions)\n                        if grounded_entities_list:\n                            type_entity = unknown_node.form\n                            types_list.append(type_entity)\n                            break\n\n                if unknown_modifiers:\n                    for n, modifier in enumerate(unknown_modifiers):\n                        if isinstance(modifier, str):\n                            modifiers_list.append(modifier)\n                        else:\n                            modifier_entities = self.find_entities(modifier, positions)\n                            if modifier_entities:\n                                modifiers_list += modifier_entities\n                if clause_modifiers:\n                    found_year_or_number = self.find_year_or_number(clause_modifiers[0])\n                    if found_year_or_number:\n                        query_nums.append(\"0\")\n                    qualifier_entities_list = self.find_entities(clause_modifiers[0], positions)\n\n        if root_desc_deprels == [\"nsubj\", \"obl\", \"obl\"]:\n            grounded_entities_list = self.find_entities(root_desc[\"nsubj\"][0], positions)\n            for node in root_desc[\"obl\"]:\n                if node == unknown_branch:\n                    types_list.append(node.form)\n            
    else:\n                    grounded_entities_list += self.find_entities(node, positions)\n\n        if root_desc_deprels == [\"nsubj\", \"obj\", \"obj\"]:\n            obj_desc = root_desc[\"obj\"]\n            qualifier_entities_list = self.find_entities(obj_desc[0], positions)\n            grounded_entities_list = self.find_entities(obj_desc[1], positions)\n\n        year_constraint = self.find_year_constraint(root)\n        if root_desc_deprels == [\"nmod\", \"nsubj\"] and year_constraint:\n            if len(year_constraint[0]) == 2:\n                query_nums.append(\"24\")\n            elif len(year_constraint[0]) == 1:\n                query_nums.append(\"0\")\n\n        if root_desc_deprels == [\"obj\", \"xcomp\"]:\n            grounded_entities_list = self.find_entities(root_desc[\"xcomp\"][0], positions)\n\n        if (self.wh_leaf and root_desc_deprels in [[\"nsubj\", \"obj\", \"obl\"], [\"obj\", \"obl\"]]) \\\n                or (root_desc_deprels in [[\"nsubj\", \"obj\", \"obl\"], [\"obl\", \"xcomp\"]]\n                    and self.find_year_or_number(root_desc[\"obl\"][0])):\n            found_year_or_number = self.find_year_or_number(root_desc[\"obl\"][0])\n            nsubj_ent_list, obj_ent_list = [], []\n            if \"nsubj\" in root_desc_deprels:\n                nsubj_ent_list = self.find_entities(root_desc[\"nsubj\"][0], positions)\n            if \"obj\" in root_desc:\n                obj_ent_list = self.find_entities(root_desc[\"obj\"][0], positions)\n            obl_ent_list = self.find_entities(root_desc[\"obl\"][0], positions)\n            log.debug(f\"nsubj_ent: {nsubj_ent_list} --- obj_ent: {obj_ent_list} obl_ent: {obl_ent_list}\")\n            if self.wh_leaf:\n                grounded_entities_list = obl_ent_list\n                qualifier_entities_list = obj_ent_list\n            elif not found_year_or_number and nsubj_ent_list and obl_ent_list:\n                grounded_entities_list = nsubj_ent_list\n                
modifiers_list = obl_ent_list\n            else:\n                grounded_entities_list = obj_ent_list\n            if found_year_or_number:\n                query_nums.append(\"0\")\n            if not grounded_entities_list:\n                grounded_entities_list = self.find_entities(root, positions)\n                grounded_entities_list = self.choose_grounded_entity(grounded_entities_list, entities_dict)\n\n        if clause_node:\n            for node in clause_node.children:\n                if node.deprel == \"obj\":\n                    grounded_entities_list = self.find_entities(node, positions)\n                if self.find_year_or_number(node):\n                    query_nums.append(\"0\")\n\n            if not self.wh_leaf:\n                type_entity = unknown_node.form\n                types_list.append(type_entity)\n\n        if root_desc_deprels == [\"nmod\", \"nmod\"]:\n            grounded_entities_list = self.find_entities(root_desc[\"nmod\"][0], positions)\n            modifiers_list = self.find_entities(root_desc[\"nmod\"][1], positions)\n\n        if root_desc_deprels == [\"nmod\", \"nsubj\", \"nummod\"]:\n            if not self.wh_leaf:\n                grounded_entities_list = self.find_entities(root_desc[\"nmod\"][0], positions)\n                found_year_or_number = self.find_year_or_number(root_desc[\"nummod\"][0])\n\n        if temporal_order and not query_nums:\n            for deprel in root_desc:\n                for node in root_desc[deprel]:\n                    entities = self.find_entities(node, positions)\n                    if entities:\n                        grounded_entities_list = entities\n                        break\n                if grounded_entities_list:\n                    break\n            if temporal_order in self.first_tokens | self.begin_tokens:\n                query_nums += [\"22\"]\n            if temporal_order in self.last_tokens | self.end_tokens:\n                query_nums += [\"23\"]\n        
log.debug(f\"query_nums: {query_nums} --- year_constraint: {year_constraint}\")\n\n        if count:\n            grounded_entities_list = self.find_entities(root, positions)\n\n        grounded_entities_list = self.choose_grounded_entity(grounded_entities_list, entities_dict)\n        entities_list = grounded_entities_list + qualifier_entities_list + modifiers_list\n        types_list = [tp for tp in types_list\n                      if not (len(tp.split()) == 1 and self.nlp(tp)[0].lemma_ in self.date_tokens)]\n\n        gr_len = len(grounded_entities_list)\n        types_len = len(types_list)\n        mod_len = len(modifiers_list)\n        qua_len = len(qualifier_entities_list)\n        if qua_len or count:\n            types_len = 0\n\n        if not temporal_order and not query_nums:\n            for num, template in self.template_queries.items():\n                syntax_info = [gr_len, types_len, mod_len, qua_len, found_year_or_number, count, order]\n                if syntax_info == list(template[\"syntax_structure\"].values()):\n                    query_nums.append(num)\n                if mod_len:\n                    syntax_info[1] = 0\n                    if syntax_info == list(template[\"syntax_structure\"].values()):\n                        query_nums.append(num)\n\n        log.debug(f\"tree_to_sparql, grounded entities: {grounded_entities_list} --- types: {types_list} --- \"\n                  f\"modifier entities: {modifiers_list} --- qualifier entities: {qualifier_entities_list} --- \"\n                  f\"year_or_number {found_year_or_number} --- count: {count} --- order: {order} --- \"\n                  f\"query nums: {query_nums}\")\n\n        return query_nums, entities_list, types_list\n"
  },
  {
    "path": "deeppavlov/models/kbqa/type_define.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport pickle\nfrom typing import List\n\nimport spacy\nfrom nltk.corpus import stopwords\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\n\n\n@register('answer_types_extractor')\nclass AnswerTypesExtractor:\n    \"\"\"Class which defines answer types for the question\"\"\"\n\n    def __init__(self, lang: str, types_filename: str, types_sets_filename: str,\n                 num_types_to_return: int = 15, **kwargs):\n        \"\"\"\n\n        Args:\n            lang: Russian or English\n            types_filename: filename with dictionary where keys are type ids and values are type labels\n            types_sets_filename: filename with dictionary where keys are NER tags and values are Wikidata types\n                corresponding to tags\n            num_types_to_return: how many answer types to return for each question\n            **kwargs:\n        \"\"\"\n        self.lang = lang\n        self.types_filename = str(expand_path(types_filename))\n        self.types_sets_filename = str(expand_path(types_sets_filename))\n        self.num_types_to_return = num_types_to_return\n        if self.lang == \"@en\":\n            self.stopwords = set(stopwords.words(\"english\"))\n            self.nlp = spacy.load(\"en_core_web_sm\")\n            self.pronouns = [\"what\"]\n        
elif self.lang == \"@ru\":\n            self.stopwords = set(stopwords.words(\"russian\"))\n            self.nlp = spacy.load(\"ru_core_news_sm\")\n            self.pronouns = [\"какой\", \"каком\"]\n        with open(self.types_filename, 'rb') as fl:\n            self.types_dict = pickle.load(fl)\n        with open(self.types_sets_filename, 'rb') as fl:\n            self.types_sets = pickle.load(fl)\n\n    def __call__(self, questions_batch: List[str], entity_substr_batch: List[List[str]],\n                 tags_batch: List[List[str]], types_substr_batch: List[List[str]] = None):\n        if types_substr_batch is None:\n            types_substr_batch = []\n            for question, entity_substr_list in zip(questions_batch, entity_substr_batch):\n                types_substr = []\n                type_noun = \"\"\n                doc = self.nlp(question)\n                token_pos_dict = {}\n                for n, token in enumerate(doc):\n                    token_pos_dict[token.text] = n\n                for token in doc:\n                    if token.text.lower() in self.pronouns and token.head.dep_ in [\"attr\", \"nsubj\"]:\n                        type_noun = token.head.text\n                        if not any([type_noun in entity_substr.lower() for entity_substr in entity_substr_list]):\n                            types_substr.append(type_noun)\n                        break\n                if type_noun:\n                    for token in doc:\n                        if token.head.text == type_noun and token.dep_ in [\"amod\", \"compound\"]:\n                            type_adj = token.text\n                            if not any([type_adj.lower() in entity_substr.lower() for entity_substr in\n                                        entity_substr_list]):\n                                types_substr.append(type_adj)\n                            break\n                        elif token.head.text == type_noun and token.dep_ == \"prep\":\n                   
         if len(list(token.children)) == 1 \\\n                                    and not any([list(token.children)[0].text in entity_substr.lower()\n                                                 for entity_substr in entity_substr_list]):\n                                types_substr += [token.text, list(token.children)[0].text]\n                elif any([word in question for word in self.pronouns]):\n                    for token in doc:\n                        if token.dep_ == \"nsubj\" and not any([token.text in entity_substr.lower()\n                                                              for entity_substr in entity_substr_list]):\n                            types_substr.append(token.text)\n                types_substr = [(token, token_pos_dict[token]) for token in types_substr]\n                types_substr = sorted(types_substr, key=lambda x: x[1])\n                types_substr = \" \".join([elem[0] for elem in types_substr])\n                types_substr_batch.append(types_substr)\n        types_sets_batch = [set() for _ in questions_batch]\n        for n, (question, types_sets) in enumerate(zip(questions_batch, types_sets_batch)):\n            question = question.lower()\n            if not types_sets:\n                if self.lang == \"@ru\":\n                    if question.startswith(\"кто\"):\n                        types_sets_batch[n] = self.types_sets[\"PER\"]\n                    elif question.startswith(\"где\"):\n                        types_sets_batch[n] = self.types_sets[\"LOC\"]\n                    elif any([question.startswith(elem) for elem in [\"когда\", \"в каком году\", \"в каком месяце\"]]):\n                        types_sets_batch[n] = {\"date\"}\n                    elif len(question.split()) > 1 and (any([question.startswith(elem) for elem in [\"кем \", \"как\"]]) \\\n                                                        or question.split()[1].startswith(\"как\")):\n                        types_sets_batch[n] = 
{\"not_date\"}\n                elif self.lang == \"@en\":\n                    if question.startswith(\"who\"):\n                        types_sets_batch[n] = self.types_sets[\"PER\"]\n                    elif question.startswith(\"where\"):\n                        types_sets_batch[n] = self.types_sets[\"LOC\"]\n                    elif any([question.startswith(elem) for elem in [\"when\", \"what year\", \"what month\"]]):\n                        types_sets_batch[n] = {\"date\"}\n\n        new_entity_substr_batch, new_entity_offsets_batch, new_tags_batch = [], [], []\n        for question, entity_substr_list, tags_list in zip(questions_batch, entity_substr_batch, tags_batch):\n            new_entity_substr, new_tags = [], []\n            if not entity_substr_list:\n                doc = self.nlp(question)\n                for token in doc:\n                    if token.dep_ == \"nsubj\":\n                        new_entity_substr.append(token.text)\n                        new_tags.append(\"MISC\")\n                        break\n                new_entity_substr_batch.append(new_entity_substr)\n                new_tags_batch.append(new_tags)\n            else:\n                new_entity_substr_batch.append(entity_substr_list)\n                new_tags_batch.append(tags_list)\n\n        return types_sets_batch, new_entity_substr_batch, new_tags_batch\n"
  },
  {
    "path": "deeppavlov/models/kbqa/utils.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport itertools\nimport re\nfrom collections import namedtuple\nfrom typing import List, Tuple, Dict, Any\n\n\ndef find_query_features(query, qualifier_rels=None, question=None, order_from_query=None):\n    query = query.lower().replace(\"select distinct\", \"select\")\n    answer_ent = re.findall(r\"select [\\(]?([\\S]+) \", query)\n    order_info_nt = namedtuple(\"order_info\", [\"variable\", \"sorting_order\"])\n    order_variable = re.findall(\"order by (asc|desc)\\((.*)\\)\", query)\n    if order_variable:\n        if (qualifier_rels and len(qualifier_rels[0][4:]) > 1) or order_from_query:\n            answers_sorting_order = order_variable[0][0]\n        else:\n            answers_sorting_order = order_of_answers_sorting(question)\n        order_info = order_info_nt(order_variable[0][1], answers_sorting_order)\n    else:\n        order_info = order_info_nt(None, None)\n    filter_from_query = re.findall(\"contains\\((\\?\\w), (.+?)\\)\", query)\n    return answer_ent, order_info, filter_from_query\n\n\ndef extract_year(question_tokens: List[str], question: str) -> str:\n    question_patterns = [r'.*\\d{1,2}/\\d{1,2}/(\\d{4}).*', r'.*\\d{1,2}-\\d{1,2}-(\\d{4}).*', r'.*(\\d{4})-\\d{1,2}-\\d{1,2}.*']\n    from_to_patterns = [r\"from ([\\d]{3,4}) to [\\d]{3,4}\", r\"с ([\\d]{3,4}) по [\\d]{3,4}\"]\n    token_patterns = [r'(\\d{4})', 
r'^(\\d{4})-.*', r'.*-(\\d{4})$']\n    year = \"\"\n    for pattern in question_patterns:\n        fnd = re.search(pattern, question)\n        if fnd is not None:\n            year = fnd.group(1)\n            break\n    else:\n        for pattern in from_to_patterns:\n            fnd = re.findall(pattern, question)\n            if fnd:\n                return fnd[0]\n        for token in question_tokens:\n            for pattern in token_patterns:\n                fnd = re.search(pattern, token)\n                if fnd is not None:\n                    return fnd.group(1)\n    return year\n\n\ndef extract_number(question_tokens: List[str], question: str) -> str:\n    number = \"\"\n    fnd = re.search(r'.*(\\d\\.\\d+e\\+\\d+)\\D*', question)\n    if fnd is not None:\n        number = fnd.group(1)\n    else:\n        for tok in question_tokens:\n            if tok[0].isdigit():\n                number = tok\n                break\n\n    number = number.replace('1st', '1').replace('2nd', '2').replace('3rd', '3')\n    number = number.strip(\".0\")\n\n    return number\n\n\ndef order_of_answers_sorting(question: str) -> str:\n    question_lower = question.lower()\n    max_words = [\"maximum\", \"highest\", \"max \", \"greatest\", \"most\", \"longest\", \"biggest\", \"deepest\", \"завершил\",\n                 \"закончил\", \"завершает\"]\n    for word in max_words:\n        if word in question_lower:\n            return \"desc\"\n    return \"asc\"\n\n\ndef make_combs(entity_ids: List[List[str]], permut: bool) -> List[List[str]]:\n    entity_ids = [[(entity, n) for n, entity in enumerate(entities_list)] for entities_list in entity_ids]\n    entity_ids = list(itertools.product(*entity_ids))\n    entity_ids = [comb for comb in entity_ids if not\n    (all([comb[i][0][0].split(\"/\")[-1] == comb[0][0][0].split(\"/\")[-1] for i in range(len(comb))])\n     and not all([comb[i][0][0] == comb[0][0][0] for i in range(len(comb))]))]\n    entity_ids_permut = []\n    if permut:\n  
      for comb in entity_ids:\n            entity_ids_permut += itertools.permutations(comb)\n    else:\n        entity_ids_permut = entity_ids\n    entity_ids = sorted(entity_ids_permut, key=lambda x: sum([elem[1] for elem in x]))\n    ent_combs = [[elem[0] for elem in comb] + [sum([elem[1] for elem in comb])] for comb in entity_ids]\n    return ent_combs\n\n\ndef fill_slots(query: str, entity_comb: List[str], type_comb: List[str], rel_comb: List[Tuple[str, float]],\n               delete_rel_prefix: bool = False) -> str:\n    for n, entity in enumerate(entity_comb[:-1]):\n        query = query.replace(f\"e{n + 1}\", entity)\n    for n, entity_type in enumerate(type_comb[:-1]):  # type_entity\n        query = query.replace(f\"t{n + 1}\", entity_type)\n    for n, (rel, score) in enumerate(rel_comb[:-1]):\n        if not rel.startswith(\"?\"):\n            if delete_rel_prefix:\n                rel = rel.split(\"/\")[-1]\n            query = query.replace(f\"r{n + 1}\", rel)\n    return query\n\n\ndef correct_variables(query_triplets: List[str], answer_ent: List[str], query_info: Dict[str, str]):\n    for i in range(len(query_triplets)):\n        for ent_var in answer_ent:\n            triplet_elements = query_triplets[i].split()\n            for j in range(len(triplet_elements)):\n                if triplet_elements[j] not in ent_var and triplet_elements[j].startswith(\"?\"):\n                    triplet_elements[j] = query_info[\"mid_var\"]\n                    break\n                if triplet_elements[j].startswith(\"?\") \\\n                        and triplet_elements[j] not in [query_info[\"mid_var\"], query_info[\"unk_var\"]]:\n                    triplet_elements[j] = query_info[\"unk_var\"]\n                    break\n            query_triplets[i] = \" \".join(triplet_elements)\n            query_triplets[i] = query_triplets[i].replace(ent_var, query_info[\"unk_var\"])\n    return query_triplets\n\n\ndef query_from_triplets(query_triplets: List[str], 
answer_ent: List[str], query_info: Dict[str, str]) -> str:\n    filled_query = \" . \".join(query_triplets)\n    if answer_ent and answer_ent[0].lower().startswith(\"count\"):\n        filled_query = f\"SELECT COUNT({query_info['unk_var']}) \" + \\\n                       f\"WHERE {{ {filled_query}. }}\"\n    else:\n        filled_query = f\"SELECT {query_info['unk_var']} WHERE {{ {filled_query}. }}\"\n    filled_query = filled_query.replace(\" ..\", \".\")\n    return filled_query\n\n\ndef fill_query(query: List[str], entity_comb: List[str], type_comb: List[str], rel_comb: List[Tuple[str, float]],\n               map_query_str_to_kb) -> List[str]:\n    ''' example of query: [\"wd:E1\", \"p:R1\", \"?s\"]\n                   entity_comb: [\"Q159\"]\n                   type_comb: []\n                   rel_comb: [\"P17\"]\n        map_query_str_to_kb = [(\"P0\", \"http://wd\"),\n                               (\"P00\", \"http://wl\"),\n                               (\"wd:\", \"http://we/\"),\n                               (\"wdt:\", \"http://wpd/\"),\n                               (\" p:\", \" http://wp/\"),\n                               (\"ps:\", \"http://wps/\"),\n                               (\"pq:\", \"http://wpq/\")]\n    '''\n    query = \" \".join(query)\n\n    for query_str, wikidata_str in map_query_str_to_kb:\n        query = query.replace(query_str, wikidata_str)\n    query = fill_slots(query, entity_comb, type_comb, rel_comb)\n    query = query.replace(\"http://wpd/P0\", \"http://wd\")\n    query = query.replace(\"http://wpd/P00\", \"http://wl\")\n    query = query.split(' ')\n    return query\n\n\ndef make_sparql_query(query_info: Tuple[List[str], List[str], List[str], Dict[str, Any], Dict[str, Any]],\n                      entities: List[str], rels: List[Tuple[str, float]], types: List[str],\n                      query_info_dict: Dict[str, str]) -> List[str]:\n    query_triplets, filled_triplets, answer_ent, filter_info, order_info = 
query_info\n    query_triplets = [fill_slots(elem, entities, types, rels, delete_rel_prefix=True) for elem in query_triplets]\n    query_triplets = correct_variables(query_triplets, answer_ent, query_info_dict)\n    filled_queries = []\n    if any([\"qualifier\" in filter_info_element for filter_info_element in filter_info]):\n        filled_queries.append(query_from_triplets(query_triplets, answer_ent, query_info_dict))\n    else:\n        for triplets_p in list(itertools.permutations(query_triplets)):\n            filled_queries.append(query_from_triplets(triplets_p, answer_ent, query_info_dict))\n    return filled_queries\n\n\ndef merge_sparql_query(query_info: Tuple[List[str], List[str], Dict[str, Any], Dict[str, Any]],\n                       query_info_dict: Dict[str, str]) -> str:\n    query_triplets, answer_ent, filter_info, order_info = query_info\n    query = query_from_triplets(query_triplets, answer_ent, query_info_dict)\n    return query\n\n\ndef preprocess_template_queries(template_queries: Dict[str, Any], kb_prefixes: Dict[str, str]) -> Dict[str, Any]:\n    for template_num in template_queries:\n        template = template_queries[template_num]\n        query = template[\"query_template\"]\n        q_triplets = re.findall(\"{[ ]?(.*?)[ ]?}\", query)[0].split(' . 
')\n        q_triplets = [triplet.split(' ')[:3] for triplet in q_triplets]\n        if not \"rel_types\" in template:\n            template[\"rel_types\"] = [\"direct\" for _ in q_triplets]\n        rel_types = template[\"rel_types\"]\n        rel_dirs, n_hops, entities, types, gr_ent, mod_ent, q_ent = [], [], set(), set(), set(), set(), set()\n\n        for n, (triplet, rel_type) in enumerate(zip(q_triplets, rel_types)):\n            if not triplet[1].startswith(kb_prefixes[\"type_rel\"]):\n                if triplet[2].startswith(\"?\"):\n                    rel_dirs.append(\"forw\")\n                else:\n                    rel_dirs.append(\"backw\")\n            for ind in [0, 2]:\n                if triplet[ind].startswith(kb_prefixes[\"entity\"]):\n                    entities.add(triplet[ind])\n                elif triplet[ind].startswith(kb_prefixes[\"type\"]):\n                    types.add(triplet[ind])\n            if rel_type in {\"qualifier\", \"statement\"}:\n                if triplet[2].startswith(kb_prefixes[\"entity\"]):\n                    q_ent.add(triplet[2])\n            else:\n                if triplet[0].startswith(kb_prefixes[\"entity\"]):\n                    gr_ent.add(triplet[0])\n                elif triplet[2].startswith(kb_prefixes[\"entity\"]):\n                    mod_ent.add(triplet[2])\n            if triplet[1].startswith(kb_prefixes[\"rel\"]) and triplet[0].startswith(\"?\") and triplet[2].startswith(\"?\"):\n                n_hops.append(\"2-hop\")\n            elif n == 0 and len(q_triplets) == 2 and q_triplets[1][1].startswith(kb_prefixes[\"rel\"]) \\\n                    and q_triplets[1][0].startswith(\"?\") and q_triplets[1][2].startswith(\"?\"):\n                n_hops.append(\"1-of-2-hop\")\n            else:\n                n_hops.append(\"1-hop\")\n        syntax_structure = {\"gr_ent\": len(gr_ent), \"types\": len(types), \"mod_ent\": len(mod_ent),\n                            \"q_ent\": len(q_ent), 
\"year_or_number\": False, \"count\": False, \"order\": False}\n        if \"filter\" in query.lower():\n            syntax_structure[\"year_or_number\"] = True\n        if \"order\" in query.lower():\n            syntax_structure[\"order\"] = True\n        if \"count\" in query.lower():\n            syntax_structure[\"count\"] = True\n        if not \"query_sequence\" in template:\n            template[\"query_sequence\"] = list(range(1, len(q_triplets) + 1))\n        template[\"rel_dirs\"] = rel_dirs\n        template[\"n_hops\"] = n_hops\n        template[\"entities_and_types_num\"] = [len(entities), len(types)]\n        if entities:\n            entities_str = '_'.join([str(num) for num in list(range(1, len(entities) + 1))])\n        else:\n            entities_str = \"0\"\n        if types:\n            types_str = '_'.join([str(num) for num in list(range(1, len(types) + 1))])\n        else:\n            types_str = \"0\"\n        template[\"entities_and_types_select\"] = f\"{entities_str} {types_str}\"\n        template[\"syntax_structure\"] = syntax_structure\n        if \"return_if_found\" not in template:\n            template[\"return_if_found\"] = False\n        if \"priority\" not in template:\n            template[\"priority\"] = 1\n        template_queries[template_num] = template\n    return template_queries\n"
  },
  {
    "path": "deeppavlov/models/kbqa/wiki_parser.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport datetime\nimport re\nfrom collections import namedtuple\nfrom logging import getLogger\nfrom typing import List, Tuple, Dict, Any, Union\n\nfrom hdt import HDTDocument\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.file import load_pickle, read_json\nfrom deeppavlov.core.common.registry import register\n\nlog = getLogger(__name__)\n\n\n@register('wiki_parser')\nclass WikiParser:\n    \"\"\"This class extract relations, objects or triplets from Wikidata HDT file.\"\"\"\n\n    def __init__(self, wiki_filename: str,\n                 file_format: str = \"hdt\",\n                 prefixes: Dict[str, Union[str, Dict[str, str]]] = None,\n                 rel_q2name_filename: str = None,\n                 max_comb_num: int = 1e6,\n                 lang: str = \"@en\", **kwargs) -> None:\n        \"\"\"\n\n        Args:\n            wiki_filename: file with Wikidata\n            file_format: format of Wikidata file\n            lang: Russian or English language\n            **kwargs:\n        \"\"\"\n\n        if prefixes is None:\n            prefixes = {\n                \"entity\": \"http://we\",\n                \"label\": \"http://wl\",\n                \"alias\": \"http://wal\",\n                \"description\": \"http://wd\",\n                \"rels\": {\n                    \"direct\": 
\"http://wpd\",\n                    \"no_type\": \"http://wp\",\n                    \"statement\": \"http://wps\",\n                    \"qualifier\": \"http://wpq\",\n                    \"type\": \"http://wpd/P31\"\n                },\n                \"statement\": \"http://ws\"\n            }\n        self.prefixes = prefixes\n        self.file_format = file_format\n        self.wiki_filename = str(expand_path(wiki_filename))\n        if self.file_format == \"hdt\":\n            self.document = HDTDocument(self.wiki_filename)\n        elif self.file_format == \"pickle\":\n            self.document = load_pickle(self.wiki_filename)\n            self.parsed_document = {}\n        else:\n            raise ValueError(\"Unsupported file format\")\n        self.used_rels = set()\n        self.rel_q2name = dict()\n        if rel_q2name_filename:\n            if rel_q2name_filename.endswith(\"json\"):\n                self.rel_q2name = read_json(str(expand_path(rel_q2name_filename)))\n            elif rel_q2name_filename.endswith(\"pickle\"):\n                self.rel_q2name = load_pickle(str(expand_path(rel_q2name_filename)))\n            else:\n                raise ValueError(f\"Unsupported file format: {rel_q2name_filename}\")\n\n        self.max_comb_num = max_comb_num\n        self.lang = lang\n        self.replace_tokens = [('\"', ''), (self.lang, \" \"), ('$', ' '), ('  ', ' ')]\n\n    def __call__(self, parser_info_list: List[str], queries_list: List[Any]) -> List[Any]:\n        wiki_parser_output = self.execute_queries_list(parser_info_list, queries_list)\n        return wiki_parser_output\n\n    def execute_queries_list(self, parser_info_list: List[str], queries_list: List[Any]):\n        wiki_parser_output = []\n        query_answer_types = []\n        for parser_info, query in zip(parser_info_list, queries_list):\n            if parser_info == \"query_execute\":\n                answers, found_rels, found_combs = [], [], []\n                try:\n        
            what_return, rels_from_query, query_seq, filter_info, order_info, answer_types, rel_types, \\\n                    return_if_found = query\n                    if answer_types:\n                        query_answer_types = answer_types\n                    answers, found_rels, found_combs = \\\n                        self.execute(what_return, rels_from_query, query_seq, filter_info, order_info,\n                                     query_answer_types, rel_types)\n                except ValueError:\n                    log.warning(\"Wrong arguments are passed to wiki_parser\")\n                wiki_parser_output.append([answers, found_rels, found_combs])\n            elif parser_info == \"find_rels\":\n                rels = []\n                try:\n                    rels = self.find_rels(*query)\n                except:\n                    log.warning(\"Wrong arguments are passed to wiki_parser\")\n                wiki_parser_output.append(rels)\n            elif parser_info == \"find_rels_2hop\":\n                rels = []\n                try:\n                    rels = self.find_rels_2hop(*query)\n                except ValueError:\n                    log.warning(\"Wrong arguments are passed to wiki_parser\")\n                wiki_parser_output += rels\n            elif parser_info == \"find_object\":\n                objects = []\n                try:\n                    objects = self.find_object(*query)\n                except:\n                    log.warning(\"Wrong arguments are passed to wiki_parser\")\n                wiki_parser_output.append(objects)\n            elif parser_info == \"check_triplet\":\n                check_res = False\n                try:\n                    check_res = self.check_triplet(*query)\n                except:\n                    log.warning(\"Wrong arguments are passed to wiki_parser\")\n                wiki_parser_output.append(check_res)\n            elif parser_info == \"find_label\":\n            
    label = \"\"\n                try:\n                    label = self.find_label(*query)\n                except:\n                    log.warning(\"Wrong arguments are passed to wiki_parser\")\n                wiki_parser_output.append(label)\n            elif parser_info == \"find_types\":\n                types = []\n                try:\n                    types = self.find_types(query)\n                except:\n                    log.warning(\"Wrong arguments are passed to wiki_parser\")\n                wiki_parser_output.append(types)\n            elif parser_info == \"fill_triplets\":\n                filled_triplets = []\n                try:\n                    filled_triplets = self.fill_triplets(*query)\n                except ValueError:\n                    log.warning(\"Wrong arguments are passed to wiki_parser\")\n                wiki_parser_output.append(filled_triplets)\n            elif parser_info == \"find_triplets\":\n                if self.file_format == \"hdt\":\n                    triplets = []\n                    try:\n                        triplets_forw, c = self.document.search_triples(f\"{self.prefixes['entity']}/{query}\", \"\", \"\")\n                        triplets.extend([triplet for triplet in triplets_forw\n                                         if not triplet[2].startswith(self.prefixes[\"statement\"])])\n                        triplets_backw, c = self.document.search_triples(\"\", \"\", f\"{self.prefixes['entity']}/{query}\")\n                        triplets.extend([triplet for triplet in triplets_backw\n                                         if not triplet[0].startswith(self.prefixes[\"statement\"])])\n                    except:\n                        log.warning(\"Wrong arguments are passed to wiki_parser\")\n                    wiki_parser_output.append(list(triplets))\n                else:\n                    triplets = {}\n                    try:\n                        triplets = 
self.document.get(query, {})\n                    except:\n                        log.warning(\"Wrong arguments are passed to wiki_parser\")\n                    uncompressed_triplets = {}\n                    if triplets:\n                        if \"forw\" in triplets:\n                            uncompressed_triplets[\"forw\"] = self.uncompress(triplets[\"forw\"])\n                        if \"backw\" in triplets:\n                            uncompressed_triplets[\"backw\"] = self.uncompress(triplets[\"backw\"])\n                    wiki_parser_output.append(uncompressed_triplets)\n            elif parser_info == \"find_triplets_for_rel\":\n                found_triplets = []\n                try:\n                    found_triplets, c = \\\n                        self.document.search_triples(\"\", f\"{self.prefixes['rels']['direct']}/{query}\", \"\")\n                except:\n                    log.warning(\"Wrong arguments are passed to wiki_parser\")\n                wiki_parser_output.append(list(found_triplets))\n            elif parser_info == \"parse_triplets\" and self.file_format == \"pickle\":\n                for entity in query:\n                    self.parse_triplets(entity)\n                wiki_parser_output.append(\"ok\")\n            else:\n                raise ValueError(\"Unsupported query type\")\n\n        return wiki_parser_output\n\n    def execute(self, what_return: List[str],\n                rels_from_query: List[str],\n                query_seq: List[List[str]],\n                filter_info: List[Tuple[str]] = None,\n                order_info: namedtuple = None,\n                answer_types: List[str] = None,\n                rel_types: List[str] = None):\n        \"\"\"\n            Let us consider an example of the question \"What is the deepest lake in Russia?\"\n            with the corresponding SPARQL query            \n            \"SELECT ?ent WHERE { ?ent wdt:P31 wd:T1 . ?ent wdt:R1 ?obj . 
?ent wdt:R2 wd:E1 } ORDER BY ASC(?obj) LIMIT 5\"\n\n            arguments:\n                what_return: [\"?obj\"]\n                query_seq: [[\"?ent\", \"http://www.wikidata.org/prop/direct/P17\", \"http://www.wikidata.org/entity/Q159\"]\n                            [\"?ent\", \"http://www.wikidata.org/prop/direct/P31\", \"http://www.wikidata.org/entity/Q23397\"],\n                            [\"?ent\", \"http://www.wikidata.org/prop/direct/P4511\", \"?obj\"]]\n                filter_info: []\n                order_info: order_info(variable='?obj', sorting_order='asc')\n        \"\"\"\n        extended_combs = []\n        answers, found_rels, found_combs = [], [], []\n\n        for n, (query, rel_type) in enumerate(zip(query_seq, rel_types)):\n            unknown_elem_positions = [(pos, elem) for pos, elem in enumerate(query) if elem.startswith('?')]\n            \"\"\"\n                n = 0, query = [\"?ent\", \"http://www.wikidata.org/prop/direct/P17\",\n                                \"http://www.wikidata.org/entity/Q159\"]\n                       unknown_elem_positions = [\"?ent\"]\n                n = 1, query = [\"?ent\", \"http://www.wikidata.org/prop/direct/P31\",\n                                \"http://www.wikidata.org/entity/Q23397\"]\n                       unknown_elem_positions = [(0, \"?ent\")]\n                n = 2, query = [\"?ent\", \"http://www.wikidata.org/prop/direct/P4511\", \"?obj\"]\n                       unknown_elem_positions = [(0, \"?ent\"), (2, \"?obj\")]\n            \"\"\"\n            if n == 0:\n                combs, triplets = self.search(query, unknown_elem_positions, rel_type)\n                # combs = [{\"?ent\": \"http://www.wikidata.org/entity/Q5513\"}, ...]\n            else:\n                if combs:\n                    known_elements = []\n                    extended_combs = []\n                    if query[0].startswith(\"?\"):\n                        for elem in query:\n                            if elem 
in combs[0].keys():\n                                known_elements.append(elem)\n                        for comb in combs:\n                            \"\"\"\n                                n = 1\n                                query = [\"?ent\", \"http://www.wikidata.org/prop/direct/P31\",\n                                                                            \"http://www.wikidata.org/entity/Q23397\"]\n                                comb = {\"?ent\": \"http://www.wikidata.org/entity/Q5513\"}\n                                known_elements = [\"?ent\"], known_values = [\"http://www.wikidata.org/entity/Q5513\"]\n                                filled_query = [\"http://www.wikidata.org/entity/Q5513\", \n                                                \"http://www.wikidata.org/prop/direct/P31\", \n                                                \"http://www.wikidata.org/entity/Q23397\"]\n                                new_combs = [[\"http://www.wikidata.org/entity/Q5513\", \n                                              \"http://www.wikidata.org/prop/direct/P31\", \n                                              \"http://www.wikidata.org/entity/Q23397\"], ...]\n                                extended_combs = [{\"?ent\": \"http://www.wikidata.org/entity/Q5513\"}, ...]\n                            \"\"\"\n                            if comb:\n                                known_values = [comb[known_elem] for known_elem in known_elements]\n                                for known_elem, known_value in zip(known_elements, known_values):\n                                    filled_query = [elem.replace(known_elem, known_value) for elem in query]\n                                    new_combs, triplets = self.search(filled_query, unknown_elem_positions, rel_type)\n                                    for new_comb in new_combs:\n                                        extended_combs.append(self.merge_combs(comb, new_comb))\n                    else:\n           
             new_combs, triplets = self.search(query, unknown_elem_positions, rel_type)\n                        for comb in combs:\n                            for new_comb in new_combs:\n                                extended_combs.append(self.merge_combs(comb, new_comb))\n                combs = extended_combs\n\n        is_boolean = self.define_is_boolean(query_seq)\n        if combs or is_boolean:\n            if filter_info:\n                for filter_elem, filter_value in filter_info:\n                    if filter_value == \"qualifier\":\n                        filter_value = \"wpq/\"\n                    combs = [comb for comb in combs if filter_value in comb[filter_elem]]\n\n            if order_info and not isinstance(order_info, list) and order_info.variable is not None:\n                reverse = True if order_info.sorting_order == \"desc\" else False\n                sort_elem = order_info.variable\n                if combs and \"?p\" in combs[0]:\n                    rel_combs = {}\n                    for comb in combs:\n                        if comb[\"?p\"] not in rel_combs:\n                            rel_combs[comb[\"?p\"]] = []\n                        rel_combs[comb[\"?p\"]].append(comb)\n                    rel_combs_list = rel_combs.values()\n                else:\n                    rel_combs_list = [combs]\n                new_rel_combs_list = []\n                for rel_combs in rel_combs_list:\n                    new_rel_combs = []\n                    for rel_comb in rel_combs:\n                        value_str = rel_comb[sort_elem].split('^^')[0].strip('\"+')\n                        fnd_date = re.findall(r\"[\\d]{3,4}-[\\d]{1,2}-[\\d]{1,2}\", value_str)\n                        fnd_num = re.findall(r\"([\\d]+)\\.([\\d]+)\", value_str)\n                        if fnd_date:\n                            rel_comb[sort_elem] = fnd_date[0]\n                        elif fnd_num or value_str.isdigit():\n                            
rel_comb[sort_elem] = float(value_str)\n                        new_rel_combs.append(rel_comb)\n                    new_rel_combs = [(elem, n) for n, elem in enumerate(new_rel_combs)]\n                    new_rel_combs = sorted(new_rel_combs, key=lambda x: (x[0][sort_elem], x[1]), reverse=reverse)\n                    new_rel_combs = [elem[0] for elem in new_rel_combs]\n                    new_rel_combs_list.append(new_rel_combs)\n                combs = [new_rel_combs[0] for new_rel_combs in new_rel_combs_list]\n\n            if what_return and what_return[-1].startswith(\"count\"):\n                answers = [[len(combs)]]\n            else:\n                answers = [[elem[key] for key in what_return if key in elem] for elem in combs]\n\n            if answer_types:\n                if list(answer_types) == [\"date\"]:\n                    answers = [[entity for entity in answer\n                                if re.findall(r\"[\\d]{3,4}-[\\d]{1,2}-[\\d]{1,2}\", entity)] for answer in answers]\n                elif list(answer_types) == [\"not_date\"]:\n                    answers = [[entity for entity in answer\n                                if not re.findall(r\"[\\d]{3,4}-[\\d]{1,2}-[\\d]{1,2}\", entity)] for answer in answers]\n                else:\n                    answer_types = set(answer_types)\n                    answers = [[entity for entity in answer\n                                if answer_types.intersection(self.find_types(entity))] for answer in answers]\n            if is_boolean:\n                answers = [[\"Yes\" if len(triplets) > 0 else \"No\"]]\n            found_rels = [[elem[key] for key in rels_from_query if key in elem] for elem in combs]\n            ans_rels_combs = [(answer, rel, comb) for answer, rel, comb in zip(answers, found_rels, combs)\n                              if any([entity for entity in answer])]\n            answers = [elem[0] for elem in ans_rels_combs]\n            found_rels = [elem[1] for elem in 
ans_rels_combs]\n            found_combs = [elem[2] for elem in ans_rels_combs]\n\n        return answers, found_rels, found_combs\n\n    @staticmethod\n    def define_is_boolean(query_hdt_seq):\n        return len(query_hdt_seq) == 1 and all([not query_hdt_seq[0][i].startswith(\"?\") for i in [0, 2]])\n\n    @staticmethod\n    def merge_combs(comb1, comb2):\n        new_comb = {}\n        for key in comb1:\n            if (key in comb2 and comb1[key] == comb2[key]) or key not in comb2:\n                new_comb[key] = comb1[key]\n        for key in comb2:\n            if (key in comb1 and comb2[key] == comb1[key]) or key not in comb1:\n                new_comb[key] = comb2[key]\n        return new_comb\n\n    def search(self, query: List[str], unknown_elem_positions: List[Tuple[int, str]], rel_type):\n        query = list(map(lambda elem: \"\" if elem.startswith('?') else elem, query))\n        subj, rel, obj = query\n        if self.file_format == \"hdt\":\n            combs = []\n            triplets, cnt = self.document.search_triples(subj, rel, obj)\n            if cnt < self.max_comb_num:\n                triplets = list(triplets)\n                if rel == self.prefixes[\"description\"] or rel == self.prefixes[\"label\"]:\n                    triplets = [triplet for triplet in triplets if triplet[2].endswith(self.lang)]\n                    combs = [{elem: triplet[pos] for pos, elem in unknown_elem_positions} for triplet in triplets]\n                else:\n                    if isinstance(self.prefixes[\"rels\"][rel_type], str):\n                        combs = [{elem: triplet[pos] for pos, elem in unknown_elem_positions} for triplet in triplets\n                                 if (triplet[1].startswith(self.prefixes[\"rels\"][rel_type])\n                                     or triplet[1].startswith(self.prefixes[\"rels\"][\"type\"]))]\n                    else:\n                        combs = [{elem: triplet[pos] for pos, elem in unknown_elem_positions} 
for triplet in triplets\n                                 if (any(triplet[1].startswith(tp) for tp in self.prefixes[\"rels\"][rel_type])\n                                     or triplet[1].startswith(self.prefixes[\"rels\"][\"type\"]))]\n            else:\n                log.debug(\"max comb num exceeds\")\n        else:\n            triplets = []\n            if subj:\n                subj, triplets = self.find_triplets(subj, \"forw\")\n                triplets = [[subj, triplet[0], obj] for triplet in triplets for obj in triplet[1:]]\n            if obj:\n                obj, triplets = self.find_triplets(obj, \"backw\")\n                triplets = [[subj, triplet[0], obj] for triplet in triplets for subj in triplet[1:]]\n            if rel:\n                if rel == self.prefixes[\"description\"]:\n                    triplets = [triplet for triplet in triplets if triplet[1] == \"descr_en\"]\n                else:\n                    rel = rel.split('/')[-1]\n                    triplets = [triplet for triplet in triplets if triplet[1] == rel]\n            combs = [{elem: triplet[pos] for pos, elem in unknown_elem_positions} for triplet in triplets]\n\n        return combs, triplets\n\n    def find_label(self, entity: str, question: str = \"\") -> str:\n        entity = str(entity).replace('\"', '')\n        if self.file_format == \"hdt\":\n            if entity.startswith(\"Q\") or entity.startswith(\"P\"):\n                # example: \"Q5513\"\n                entity = f\"{self.prefixes['entity']}/{entity}\"\n                # \"http://www.wikidata.org/entity/Q5513\"\n\n            if entity.startswith(self.prefixes[\"entity\"]):\n                labels, c = self.document.search_triples(entity, self.prefixes[\"label\"], \"\")\n                # labels = [[\"http://www.wikidata.org/entity/Q5513\", \"http://www.w3.org/2000/01/rdf-schema#label\",\n                #                                                    '\"Lake Baikal\"@en'], ...]\n                
for label in labels:\n                    if label[2].endswith(self.lang):\n                        found_label = label[2].strip(self.lang)\n                        for old_tok, new_tok in self.replace_tokens:\n                            found_label = found_label.replace(old_tok, new_tok)\n                        found_label = found_label.strip()\n                        return found_label\n\n            elif entity.endswith(self.lang):\n                # entity: '\"Lake Baikal\"@en'\n                entity = entity[:-3].replace('$', ' ').replace('  ', ' ')\n                return entity\n\n            elif \"^^\" in entity:\n                \"\"\"\n                    examples:\n                        '\"1799-06-06T00:00:00Z\"^^<http://www.w3.org/2001/XMLSchema#dateTime>' (date)\n                        '\"+1642\"^^<http://www.w3.org/2001/XMLSchema#decimal>' (number)\n                \"\"\"\n                entity = entity.split(\"^^\")[0]\n                for token in [\"T00:00:00Z\", \"+\"]:\n                    entity = entity.replace(token, '')\n                entity = self.format_date(entity, question).replace('$', '')\n                return entity\n\n            elif re.findall(r\"[\\d]{3,4}-[\\d]{2}-[\\d]{2}\", entity):\n                entity = self.format_date(entity, question).replace('$', '')\n                return entity\n\n            elif entity in [\"Yes\", \"No\"]:\n                return entity\n\n            elif entity.isdigit():\n                entity = entity.replace('.', ',')\n                return entity\n\n        if self.file_format == \"pickle\":\n            if entity:\n                if entity.startswith(\"Q\") or entity.startswith(\"P\"):\n                    triplets = self.document.get(entity, {}).get(\"forw\", [])\n                    triplets = self.uncompress(triplets)\n                    for triplet in triplets:\n                        if triplet[0] == \"name_en\":\n                            return triplet[1]\n        
        else:\n                    entity = self.format_date(entity, question)\n                    return entity\n\n        return \"Not Found\"\n\n    def format_date(self, entity, question):\n        dates_dict = {\"January\": \"января\", \"February\": \"февраля\", \"March\": \"марта\", \"April\": \"апреля\", \"May\": \"мая\",\n                      \"June\": \"июня\", \"July\": \"июля\", \"August\": \"августа\", \"September\": \"сентября\",\n                      \"October\": \"октября\",\n                      \"November\": \"ноября\", \"December\": \"декабря\"}\n        date_info = re.findall(\"([\\d]{3,4})-([\\d]{1,2})-([\\d]{1,2})\", entity)\n        if date_info:\n            year, month, day = date_info[0]\n            if \"how old\" in question.lower() or \"сколько лет\" in question.lower():\n                entity = datetime.datetime.now().year - int(year)\n            elif \"в каком году\" in question.lower():\n                entity = year\n            elif \"в каком месяце\" in question.lower():\n                entity = month\n            elif day not in {\"00\", \"0\"}:\n                date = datetime.datetime.strptime(f\"{year}-{month}-{day}\", \"%Y-%m-%d\")\n                entity = date.strftime(\"%d %B %Y\")\n            else:\n                entity = year\n            if self.lang == \"@ru\":\n                for mnth, mnth_replace in dates_dict.items():\n                    entity = entity.replace(mnth, mnth_replace)\n            return str(entity)\n        entity = entity.lstrip('+-')\n        return entity\n\n    def find_alias(self, entity: str) -> List[str]:\n        aliases = []\n        if entity.startswith(self.prefixes[\"entity\"]):\n            labels, cardinality = self.document.search_triples(entity, self.prefixes[\"alias\"], \"\")\n            aliases = [label[2].strip(self.lang).strip('\"') for label in labels if label[2].endswith(self.lang)]\n        return aliases\n\n    def find_rels(self, entity: str, direction: str, 
rel_type: str = \"no_type\") -> List[str]:\n        rels = []\n        if self.file_format == \"hdt\":\n            if not rel_type:\n                rel_type = \"direct\"\n            if direction == \"forw\":\n                query = [f\"{self.prefixes['entity']}/{entity}\", \"\", \"\"]\n            else:\n                query = [\"\", \"\", f\"{self.prefixes['entity']}/{entity}\"]\n            triplets, c = self.document.search_triples(*query)\n            triplets = list(triplets)\n            if isinstance(self.prefixes['rels'][rel_type], str):\n                start_str = f\"{self.prefixes['rels'][rel_type]}/P\"\n                rels = {triplet[1] for triplet in triplets if triplet[1].startswith(start_str)}\n            else:\n                rels = {triplet[1] for triplet in triplets\n                        if any([triplet[1].startswith(tp) for tp in self.prefixes['rels'][rel_type]])}\n            rels = list(rels)\n            if self.used_rels:\n                rels = [rel for rel in rels if rel.split(\"/\")[-1] in self.used_rels]\n        return rels\n\n    def find_rels_2hop(self, entity_ids, rels_1hop):\n        rels = []\n        for entity_id in entity_ids:\n            for rel_1hop in rels_1hop:\n                triplets, cnt = self.document.search_triples(f\"{self.prefixes['entity']}/{entity_id}\", rel_1hop, \"\")\n                triplets = [triplet for triplet in triplets if triplet[2].startswith(self.prefixes['entity'])]\n                objects_1hop = [triplet[2].split(\"/\")[-1] for triplet in triplets]\n                triplets, cnt = self.document.search_triples(\"\", rel_1hop, f\"{self.prefixes['entity']}/{entity_id}\")\n                triplets = [triplet for triplet in triplets if triplet[0].startswith(self.prefixes['entity'])]\n                objects_1hop += [triplet[0].split(\"/\")[-1] for triplet in triplets]\n                for object_1hop in objects_1hop[:5]:\n                    tr_2hop, cnt = 
self.document.search_triples(f\"{self.prefixes['entity']}/{object_1hop}\", \"\", \"\")\n                    rels_2hop = [elem[1] for elem in tr_2hop if elem[1] != rel_1hop]\n                    if self.used_rels:\n                        rels_2hop = [elem for elem in rels_2hop if elem.split(\"/\")[-1] in self.used_rels]\n                    rels += rels_2hop\n                    tr_2hop, cnt = self.document.search_triples(\"\", \"\", f\"{self.prefixes['entity']}/{object_1hop}\")\n                    rels_2hop = [elem[1] for elem in tr_2hop if elem[1] != rel_1hop]\n                    if self.used_rels:\n                        rels_2hop = [elem for elem in rels_2hop if elem.split(\"/\")[-1] in self.used_rels]\n                    rels += rels_2hop\n        rels = list(set(rels))\n        return rels\n\n    def find_object(self, entity: str, rel: str, direction: str) -> List[str]:\n        objects = []\n        if not direction:\n            direction = \"forw\"\n        if self.file_format == \"hdt\":\n            entity = f\"{self.prefixes['entity']}/{entity.split('/')[-1]}\"\n            rel = f\"{self.prefixes['rels']['direct']}/{rel}\"\n            if direction == \"forw\":\n                triplets, cnt = self.document.search_triples(entity, rel, \"\")\n                if cnt < self.max_comb_num:\n                    objects.extend([triplet[2].split('/')[-1] for triplet in triplets])\n            else:\n                triplets, cnt = self.document.search_triples(\"\", rel, entity)\n                objects.extend([triplet[0].split('/')[-1] for triplet in triplets])\n        else:\n            entity = entity.split('/')[-1]\n            rel = rel.split('/')[-1]\n            triplets = self.document.get(entity, {}).get(direction, [])\n            triplets = self.uncompress(triplets)\n            for found_rel, *objects in triplets:\n                if rel == found_rel:\n                    objects.extend(objects)\n        return objects\n\n    def 
check_triplet(self, subj: str, rel: str, obj: str) -> bool:\n        if self.file_format == \"hdt\":\n            subj = f\"{self.prefixes['entity']}/{subj}\"\n            rel = f\"{self.prefixes['rels']['direct']}/{rel}\"\n            obj = f\"{self.prefixes['entity']}/{obj}\"\n            triplets, cnt = self.document.search_triples(subj, rel, obj)\n            if cnt > 0:\n                return True\n            else:\n                return False\n        else:\n            subj = subj.split('/')[-1]\n            rel = rel.split('/')[-1]\n            obj = obj.split('/')[-1]\n            triplets = self.document.get(subj, {}).get(\"forw\", [])\n            triplets = self.uncompress(triplets)\n            for found_rel, *objects in triplets:\n                if found_rel == rel:\n                    for found_obj in objects:\n                        if found_obj == obj:\n                            return True\n            return False\n\n    def find_types(self, entity: str):\n        types = []\n        if self.file_format == \"hdt\":\n            if not entity.startswith(\"http\"):\n                entity = f\"{self.prefixes['entity']}/{entity}\"\n            tr, c = self.document.search_triples(entity, f\"{self.prefixes['rels']['direct']}/P31\", \"\")\n            types = [triplet[2].split('/')[-1] for triplet in tr]\n            for rel in [\"P106\", \"P21\"]:\n                tr, c = self.document.search_triples(entity, f\"{self.prefixes['rels']['direct']}/{rel}\", \"\")\n                types += [triplet[2].split('/')[-1] for triplet in tr]\n\n        if self.file_format == \"pickle\":\n            entity = entity.split('/')[-1]\n            triplets = self.document.get(entity, {}).get(\"forw\", [])\n            triplets = self.uncompress(triplets)\n            for triplet in triplets:\n                if triplet[0] == \"P31\":\n                    types = triplet[1:]\n        types = set(types)\n        return types\n\n    def find_subclasses(self, 
entity: str):\n        types = []\n        if self.file_format == \"hdt\":\n            if not entity.startswith(\"http\"):\n                entity = f\"{self.prefixes['entity']}/{entity}\"\n            tr, c = self.document.search_triples(entity, f\"{self.prefixes['rels']['direct']}/P279\", \"\")\n            types = [triplet[2].split('/')[-1] for triplet in tr]\n        if self.file_format == \"pickle\":\n            entity = entity.split('/')[-1]\n            triplets = self.document.get(entity, {}).get(\"forw\", [])\n            triplets = self.uncompress(triplets)\n            for triplet in triplets:\n                if triplet[0] == \"P279\":\n                    types = triplet[1:]\n        types = set(types)\n        return types\n\n    def uncompress(self, triplets: Union[str, List[List[str]]]) -> List[List[str]]:\n        if isinstance(triplets, str):\n            triplets = triplets.split('\\t')\n            triplets = [triplet.strip().split(\"  \") for triplet in triplets]\n        return triplets\n\n    def parse_triplets(self, entity):\n        triplets = self.document.get(entity, {})\n        for direction in [\"forw\", \"backw\"]:\n            if direction in triplets:\n                dir_triplets = triplets[direction]\n                dir_triplets = self.uncompress(dir_triplets)\n                if entity in self.parsed_document:\n                    self.parsed_document[entity][direction] = dir_triplets\n                else:\n                    self.parsed_document[entity] = {direction: dir_triplets}\n\n    def find_triplets(self, subj: str, direction: str) -> Tuple[str, List[List[str]]]:\n        subj = subj.split('/')[-1]\n        if subj in self.parsed_document:\n            triplets = self.parsed_document.get(subj, {}).get(direction, [])\n        else:\n            triplets = self.document.get(subj, {}).get(direction, [])\n            triplets = self.uncompress(triplets)\n        return subj, triplets\n\n    def fill_triplets(self, 
init_triplets, what_to_return, comb):\n        filled_triplets = []\n        for n, (subj, rel, obj) in enumerate(init_triplets):\n            if \"statement\" in self.prefixes and subj.startswith(\"?\") \\\n                    and comb.get(subj, \"\").startswith(self.prefixes[\"statement\"]) and not rel.startswith(\"?\") \\\n                    and (obj == what_to_return[0] or re.findall(r\"[\\d]{3,4}\", comb.get(what_to_return[0], \"\"))):\n                continue\n            else:\n                if \"statement\" in self.prefixes and subj.startswith(\"?\") \\\n                        and str(comb.get(subj, \"\")).startswith(self.prefixes[\"statement\"]):\n                    if not comb.get(what_to_return[0], \"\").startswith(\"http\") \\\n                            and re.findall(r\"[\\d]{3,4}\", comb.get(what_to_return[0], \"\")):\n                        subj = init_triplets[1][2]\n                    else:\n                        subj = what_to_return[0]\n                if \"statement\" in self.prefixes and obj.startswith(\"?\") \\\n                        and str(comb.get(obj, \"\")).startswith(self.prefixes[\"statement\"]):\n                    if not str(comb.get(what_to_return[0], \"\")).startswith(\"http\") \\\n                            and re.findall(r\"[\\d]{3,4}\", str(comb.get(what_to_return[0], \"\"))):\n                        obj = init_triplets[1][2]\n                    else:\n                        obj = what_to_return[0]\n                subj, obj = str(subj), str(obj)\n                if subj.startswith(\"?\"):\n                    subj = comb.get(subj, \"\")\n                if obj.startswith(\"?\"):\n                    obj = comb.get(obj, \"\")\n                if rel.startswith(\"?\"):\n                    rel = comb.get(rel, \"\")\n                subj_label = self.find_label(subj)\n                obj_label = self.find_label(obj)\n                if rel in self.rel_q2name:\n                    rel_label = 
self.rel_q2name[rel]\n                elif rel.split(\"/\")[-1] in self.rel_q2name:\n                    rel_label = self.rel_q2name[rel.split(\"/\")[-1]]\n                else:\n                    rel_label = self.find_label(rel)\n                if isinstance(rel_label, list) and rel_label:\n                    rel_label = rel_label[0]\n                filled_triplets.append([subj_label, rel_label, obj_label])\n        return filled_triplets\n"
  },
  {
    "path": "deeppavlov/models/morpho_syntax_parser/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/morpho_syntax_parser/dependency_decoding.py",
    "content": "# Copyright 2019 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List\n\nimport numpy as np\nfrom ufal.chu_liu_edmonds import chu_liu_edmonds\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\n\n@register('chu_liu_edmonds_transformer')\nclass ChuLiuEdmonds(Component):\n    \"\"\"\n    A wrapper for Chu-Liu-Edmonds algorithm for maximum spanning tree\n    \"\"\"\n\n    def __init__(self, min_edge_prob=1e-6, **kwargs):\n        self.min_edge_prob = min_edge_prob\n\n    def __call__(self, probs: List[np.ndarray]) -> List[List[int]]:\n        \"\"\"Applies Chu-Liu-Edmonds algorithm to the matrix of head probabilities.\n        probs: a 3D-array of probabilities of shape B*L*(L+1)\n        \"\"\"\n        answer = []\n        for elem in probs:\n            m, n = elem.shape\n            if n == m + 1:\n                elem = np.log10(np.maximum(self.min_edge_prob, elem)) - np.log10(self.min_edge_prob)\n                elem = np.concatenate([np.zeros_like(elem[:1, :]), elem], axis=0)\n                # it makes impossible to create multiple edges 0->i\n                elem[1:, 0] += np.log10(self.min_edge_prob) * len(elem)\n                heads, _ = chu_liu_edmonds(elem.astype(\"float64\"))\n                answer.append(heads[1:])\n            else:\n                raise ValueError(\"First and second axis lengths m, n 
of probs should satisfy the condition n == m + 1\")\n        return answer\n"
  },
  {
    "path": "deeppavlov/models/morpho_syntax_parser/joint.py",
    "content": "# Copyright 2019 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Union, List\n\nfrom deeppavlov.core.common.chainer import Chainer\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\nUD_COLUMN_FEAT_MAPPING = {\"id\": 0, \"word\": 1, \"lemma\": 2, \"upos\": 3, \"feats\": 5, \"head\": 6, \"deprel\": 7}\n\n\n@register(\"joint_tagger_parser\")\nclass JointTaggerParser(Component):\n    \"\"\"\n    A class to perform joint morphological and syntactic parsing.\n    It is just a wrapper that calls the models for tagging and parsing\n    and combines their results into a single output.\n    Args:\n        tagger: the morphological tagger model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)\n        parser: the syntactic parser model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)\n        output_format: the output format, it may be either `ud` (alias: `conllu`) or `json` (alias: `dict`).\n    Attributes:\n        tagger: a morphological tagger model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)\n        parser: a syntactic parser model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)\n    \"\"\"\n\n    def __init__(self, tagger: Chainer, parser: Chainer,\n                 output_format: str = \"ud\", *args, **kwargs):\n        if output_format not in [\"ud\", \"conllu\", \"json\", 
\"dict\"]:\n            UserWarning(\"JointTaggerParser output_format can be only `ud`, `conllu` or `json`. \" \\\n                        \"Unknown format: {}, setting the output_format to `ud`.\".format(output_format))\n            output_format = \"ud\"\n        self.output_format = output_format\n        self.tagger = tagger\n        self.parser = parser\n\n    def __call__(self, data: Union[List[str], List[List[str]]]) \\\n            -> Union[List[List[dict]], List[str], List[List[str]]]:\n        tagger_output = self.tagger(data)\n        parser_output = self.parser(data)\n        answer = []\n        for i, (tagger_sent, parser_sent) in enumerate(zip(tagger_output, parser_output)):\n            curr_sent_answer = []\n            for j, curr_word_tagger_output in enumerate(tagger_sent):\n                curr_word_tagger_output = curr_word_tagger_output.split(\"\\t\")\n                curr_word_parser_output = parser_sent[j].split(\"\\t\")\n                curr_word_answer = curr_word_tagger_output[:]\n                # setting parser output\n                curr_word_answer[6:8] = curr_word_parser_output[6:8]\n                if self.output_format in [\"json\", \"dict\"]:\n                    curr_word_answer = {key: curr_word_answer[index]\n                                        for key, index in UD_COLUMN_FEAT_MAPPING.items()}\n                    curr_word_answer = str(curr_word_answer)\n                curr_word_answer = \"\\t\".join(curr_word_answer)\n                curr_sent_answer.append(curr_word_answer)\n            curr_sent_answer = \"\\n\".join(str(x) for x in curr_sent_answer)\n            answer.append(curr_sent_answer)\n        return answer\n"
  },
  {
    "path": "deeppavlov/models/morpho_syntax_parser/spacy_lemmatizer.py",
    "content": "# Copyright 2019 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List\n\nimport spacy\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\n\n@register('spacy_lemmatizer')\nclass SpacyLemmatizer(Component):\n    def __init__(self, model: str, **kwargs):\n        self.nlp = spacy.load(model)\n\n    def __call__(self, words_batch: List[List[str]]):\n        return [[self.nlp(word)[0].lemma_ for word in words_list] for words_list in words_batch]\n"
  },
  {
    "path": "deeppavlov/models/morpho_syntax_parser/syntax_parsing.py",
    "content": "# Copyright 2019 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List, Optional, Tuple, Union\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\n\ndef make_pos_and_tag(tag: str, sep: str = \",\",\n                     return_mode: Optional[str] = None) -> Tuple[str, Union[str, list, dict, tuple]]:\n    \"\"\"\n    Args:\n        tag: the part-of-speech tag\n        sep: the separator between part-of-speech tag and grammatical features\n        return_mode: the type of return value, can be None, list, dict or sorted_items\n    Returns:\n        the part-of-speech label and grammatical features in required format\n    \"\"\"\n    if tag.endswith(\" _\"):\n        tag = tag[:-2]\n    if sep in tag:\n        pos, tag = tag.split(sep, maxsplit=1)\n    else:\n        pos, tag = tag, (\"_\" if return_mode is None else \"\")\n    if return_mode in [\"dict\", \"list\", \"sorted_items\"]:\n        tag = tag.split(\"|\") if tag != \"\" else []\n        if return_mode in [\"dict\", \"sorted_items\"]:\n            tag = dict(tuple(elem.split(\"=\")) for elem in tag)\n            if return_mode == \"sorted_items\":\n                tag = tuple(sorted(tag.items()))\n    return pos, tag\n\n\nclass OutputPrettifier(Component):\n    \"\"\"Base class for formatting the output of dependency parser and morphotagger\"\"\"\n\n    def 
__init__(self, return_string: bool = True, begin: str = \"\", end: str = \"\\n\", sep: str = \"\\n\",\n                 **kwargs) -> None:\n        self.return_string = return_string\n        self.begin = begin\n        self.end = end\n        self.sep = sep\n\n    def prettify(self, tokens: List[str], heads: List[int], deps: List[str]) -> Union[List[str], str]:\n        raise NotImplementedError\n\n    def __call__(self, X: List[List[str]], Y: List[List[int]], Z: List[List[str]]) -> List[Union[List[str], str]]:\n        \"\"\"Calls the :meth:`~prettify` function for each input sentence.\n        Args:\n            X: a list of input sentences\n            Y: a list of lists of head positions for sentence words\n            Z: a list of lists of dependency labels for sentence words\n        Returns:\n            a list of prettified UD outputs\n        \"\"\"\n        return [self.prettify(x, y, z) for x, y, z in zip(X, Y, Z)]\n\n\n@register('dependency_output_prettifier')\nclass DependencyOutputPrettifier(OutputPrettifier):\n    \"\"\"Class which prettifies dependency parser output\n    to 10-column (Universal Dependencies) format.\n    Args:\n        begin: a string to append in the beginning\n        end: a string to append in the end\n        sep: separator between word analyses\n    \"\"\"\n\n    def __init__(self, return_string: bool = True, begin: str = \"\", end: str = \"\\n\", sep: str = \"\\n\",\n                 **kwargs) -> None:\n        super().__init__(return_string, begin, end, sep, **kwargs)\n        self.format_string = \"{}\\t{}\\t_\\t_\\t_\\t_\\t{}\\t{}\\t_\\t_\"\n\n    def prettify(self, tokens: List[str], heads: List[int], deps: List[str]) -> Union[List[str], str]:\n        \"\"\"Prettifies output of dependency parser.\n        Args:\n            tokens: tokenized source sentence\n            heads: list of head positions, the output of the parser\n            deps: list of dependency labels, the output of the parser\n        Returns:\n           
 the prettified output of the parser\n        \"\"\"\n        answer = []\n        for i, (word, head, dep) in enumerate(zip(tokens, heads, deps)):\n            answer.append(self.format_string.format(i + 1, word, head, dep))\n        if self.return_string:\n            answer = self.begin + self.sep.join(answer) + self.end\n        return answer\n\n\n@register('lemmatized_output_prettifier')\nclass LemmatizedOutputPrettifier(OutputPrettifier):\n    \"\"\"Class which prettifies morphological tagger output to 4-column\n    or 10-column (Universal Dependencies) format.\n    Args:\n        format_mode: output format,\n            in `basic` mode output data contains 4 columns (id, word, pos, features),\n            in `conllu` or `ud` mode it contains 10 columns:\n            id, word, lemma, pos, xpos, feats, head, deprel, deps, misc\n            (see http://universaldependencies.org/format.html for details)\n            Only id, word, lemma, tag and pos columns are predicted in current version,\n            other columns are filled by `_` value.\n        begin: a string to append in the beginning\n        end: a string to append in the end\n        sep: separator between word analyses\n    \"\"\"\n\n    def __init__(self, return_string: bool = True, begin: str = \"\", end: str = \"\\n\", sep: str = \"\\n\",\n                 **kwargs) -> None:\n        super().__init__(return_string, begin, end, sep, **kwargs)\n        self.format_string = \"{}\\t{}\\t{}\\t{}\\t_\\t{}\\t_\\t_\\t_\\t_\"\n\n    def prettify(self, tokens: List[str], tags: List[str], lemmas: List[str]) -> Union[List[str], str]:\n        \"\"\"Prettifies output of morphological tagger.\n        Args:\n            tokens: tokenized source sentence\n            tags: list of tags, the output of a tagger\n            lemmas: list of lemmas, the output of a lemmatizer\n        Returns:\n            the prettified output of the tagger.\n        Examples:\n            >>> sent = \"John really likes pizza 
.\".split()\n            >>> tags = [\"PROPN,Number=Sing\", \"ADV\",\n            >>>         \"VERB,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\",\n            >>>         \"NOUN,Number=Sing\", \"PUNCT\"]\n            >>> lemmas = \"John really like pizza .\".split()\n            >>> prettifier = LemmatizedOutputPrettifier()\n            >>> print(prettifier.prettify(sent, tags, lemmas))\n                1\tJohn\tJohn\tPROPN\t_\tNumber=Sing\t_\t_\t_\t_\n                2\treally\treally\tADV\t_\t_\t_\t_\t_\t_\n                3\tlikes\tlike\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t_\t_\t_\t_\n                4\tpizza\tpizza\tNOUN\t_\tNumber=Sing\t_\t_\t_\t_\n                5\t.\t.\tPUNCT\t_\t_\t_\t_\t_\t_\n        \"\"\"\n        answer = []\n        for i, (word, tag, lemma) in enumerate(zip(tokens, tags, lemmas)):\n            pos, tag = make_pos_and_tag(tag, sep=\",\")\n            answer.append(self.format_string.format(i + 1, word, lemma, pos, tag))\n        if self.return_string:\n            answer = self.begin + self.sep.join(answer) + self.end\n        return answer\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/preprocessors/dirty_comments_preprocessor.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nimport string\nfrom typing import List\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\n\n@register('dirty_comments_preprocessor')\nclass DirtyCommentsPreprocessor(Component):\n    \"\"\"\n    Class implements preprocessing of english texts with low level of literacy such as comments\n    \"\"\"\n\n    def __init__(self, remove_punctuation: bool = True, *args, **kwargs):\n        self.remove_punctuation = remove_punctuation\n\n    def __call__(self, batch: List[str], **kwargs) -> List[str]:\n        \"\"\"\n        Preprocess given batch\n\n        Args:\n            batch: list of text samples\n            **kwargs: additional arguments\n\n        Returns:\n            list of preprocessed text samples\n        \"\"\"\n        f = [x.lower() for x in batch]\n        f = [re.sub(\"<\\S*>\", \" \", x) for x in f]\n        f = [re.sub('\\s+', ' ', x) for x in f]\n\n        f = [x.replace(\"won't\", \"will not\") for x in f]\n        f = [x.replace(\"can't\", \"cannot\") for x in f]\n        f = [x.replace(\"i'm\", \"i am\") for x in f]\n        f = [x.replace(\" im \", \" i am \") for x in f]\n        f = [x.replace(\"'re\", \" are\") for x in f]\n        f = [x.replace(\"ain't\", \"is not\") for x in f]\n        f = [x.replace(\"'ll\", \" will\") for x in f]\n  
      f = [x.replace(\"n't\", \" not\") for x in f]\n        f = [x.replace(\"'ve\", \" have\") for x in f]\n        f = [x.replace(\"'s\", \" is\") for x in f]\n        f = [x.replace(\"'d\", \" would\") for x in f]\n\n        f = [re.sub(\"ies( |$)\", \"y \", x) for x in f]\n        f = [re.sub(\"s( |$)\", \" \", x) for x in f]\n        f = [re.sub(\"ing( |$)\", \" \", x) for x in f]\n\n        f = [x.replace(\" u \", \" you \") for x in f]\n        f = [x.replace(\" em \", \" them \") for x in f]\n        f = [x.replace(\" da \", \" the \") for x in f]\n        f = [x.replace(\" yo \", \" you \") for x in f]\n        f = [x.replace(\" ur \", \" your \") for x in f]\n        f = [x.replace(\" u r \", \" you are \") for x in f]\n        f = [x.replace(\" urs \", \" yours \") for x in f]\n        f = [x.replace(\"y'all\", \"you all\") for x in f]\n\n        f = [x.replace(\" r u \", \" are you \") for x in f]\n        f = [x.replace(\" r you\", \" are you\") for x in f]\n        f = [x.replace(\" are u \", \" are you \") for x in f]\n\n        f = [x.replace(\"\\\\n\", \" \") for x in f]\n        f = [x.replace(\"\\\\t\", \" \") for x in f]\n        f = [x.replace(\"\\\\xa0\", \" \") for x in f]\n        f = [x.replace(\"\\\\xc2\", \" \") for x in f]\n        f = [re.sub(\"[0-9]+\", \" 0 \", x) for x in f]\n\n        f = [re.sub(r'([' + string.printable + r'])\\1{3,}', r'\\1\\1', x).strip() for x in f]\n\n        if self.remove_punctuation:\n            f = [re.sub(r'([' + string.punctuation + '])', ' ', x) for x in f]\n\n        f = [re.sub(' +', ' ', x) for x in f]\n        return f\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/dnnc_preprocessor.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom typing import List, Tuple\n\nimport numpy as np\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\nlog = getLogger(__name__)\n\n\n@register('dnnc_pair_generator')\nclass PairGenerator(Component):\n    \"\"\"\n    Generates all possible ordered pairs from 'texts_batch' and 'support_dataset'\n    \n    Args:\n        bidirectional: adds pairs in reverse order\n    \"\"\"\n\n    def __init__(self, bidirectional: bool = False, **kwargs) -> None:\n        self.bidirectional = bidirectional\n\n    def __call__(self,\n                 texts: List[str],\n                 dataset: List[List[str]],\n                ) -> Tuple[List[str], List[str], List[str], List[str]]:\n        hypotesis_batch = []\n        premise_batch = []\n        hypotesis_labels_batch = []\n        for [premise, [hypotesis, hypotesis_labels]] in zip(texts * len(dataset),\n                                                            np.repeat(dataset, len(texts), axis=0)):\n            premise_batch.append(premise)\n            hypotesis_batch.append(hypotesis)\n            hypotesis_labels_batch.append(hypotesis_labels)\n\n            if self.bidirectional:\n                premise_batch.append(hypotesis)\n                hypotesis_batch.append(premise)\n                
hypotesis_labels_batch.append(hypotesis_labels)\n        return texts, hypotesis_batch, premise_batch, hypotesis_labels_batch\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/mask.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\n\n@register('mask')\nclass Mask(Component):\n    \"\"\"Takes a batch of tokens and returns the masks of corresponding length\"\"\"\n    def __init__(self, *args, **kwargs):\n        pass\n\n    @staticmethod\n    def __call__(tokens_batch, **kwargs):\n        batch_size = len(tokens_batch)\n        max_len = max(len(utt) for utt in tokens_batch)\n        mask = np.zeros([batch_size, max_len], dtype=np.float32)\n        for n, utterance in enumerate(tokens_batch):\n            mask[n, :len(utterance)] = 1\n\n        return mask\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/multitask_preprocessor.py",
    "content": "from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Iterable\nfrom logging import getLogger\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.models.preprocessors.torch_transformers_preprocessor import *\n\nlog = getLogger(__name__)\n\n\n@register('multitask_pipeline_preprocessor')\nclass MultiTaskPipelinePreprocessor(Component):\n    \"\"\"\n    Extracts out the task_id from the first index of each example for each task.\n    Then splits the input and performs tokenization\n    Params:\n    \n    vocab_file(str): vocabulary file for tokenization\n    do_lower_case(bool): if True, tokenization is lower-cased. Default: True\n    preprocessor(str): name of DeepPavlov class that is used for tokenization. \n    Default: TorchTransformersPreprocessor\n    preprocessors(List[str]): list of names of DeepPavlov classes that are used for tokenization.\n    Overrides preprocessor . The length of list must be equal to the number of tasks\n    max_seq_length(int): Maximum sequence length for tokenizer. Default: 512\n    strict(bool): if True, we always try to split data assuming predefined modes as in multitask_example.json  \n    If False, we go without splitting if we are not sure how to split the data. Default: False\n    print_first_example(bool): if True, we print the first input example after initialization. 
Default: False \n    \"\"\"\n\n    def __init__(self,\n                 vocab_file,\n                 do_lower_case: bool = True,\n                 preprocessor: str = 'TorchTransformersPreprocessor',\n                 preprocessors: List[str] = None,\n                 max_seq_length: int = 512,\n                 strict=False,\n                 print_first_example=False,\n                 *args, **kwargs):\n        self.strict = strict\n        self.printed = False\n        self.print_first_example = print_first_example\n        self.prefix = ''\n        if preprocessors is None:\n            log.info(\n                f'Assuming the same preprocessor name for all : {preprocessor}')\n            self.preprocessor = eval(preprocessor)(vocab_file, do_lower_case,\n                                                   max_seq_length, *args, **kwargs)\n            self.preprocessors = None\n        else:\n            for i in range(len(preprocessors)):\n                preprocessors[i] = eval(preprocessors[i])\n            self.n_task = len(preprocessors)\n            self.preprocessors = [preprocessors[i](vocab_file=vocab_file, do_lower_case=do_lower_case,\n                                                   max_seq_length=max_seq_length,\n                                                   *args, **kwargs) for i in range(len(preprocessors))]\n\n    def split(self, features):\n        if all([isinstance(k, str) for k in features]) or all([k is None for k in features]):\n            # single sentence classification\n            log.debug('Assuming single sentence classification')\n            texts_a, texts_b = features, None\n        elif all([isinstance(k, tuple) and len(k) == 2 for k in features]):\n            log.debug(\n                'Assuming sentence pair classification or classification for multichoice')\n            texts_a, texts_b = [], []\n            for feature in features:\n                text_a, text_b = feature\n                texts_a.append(text_a)\n   
             texts_b.append(text_b)\n        elif all([isinstance(k, list) for k in features]):\n            log.debug('Assuming ner classification')\n            texts_a, texts_b = list(features), None\n        else:\n            if self.strict:\n                raise Exception(f'Unsupported task data {features}')\n            else:\n                log.warning('Data not split.Going without splitting')\n                texts_a, texts_b = features, None\n        return texts_a, texts_b\n\n    def __call__(self, *args):\n        \"\"\"\n        Returns batches of values from ``inp``. Every batch contains values that have same key from\n        ``keys_to_extract`` attribute. The order of elements of ``keys_to_extract`` is preserved.\n\n        Args:\n            inp: A sequence of dictionaries with identical keys\n\n        Returns:\n            A list of lists of values of dictionaries from ``inp``\n        \"\"\"\n        self.n_task = len(args)\n        if self.preprocessors is None:\n            # Defining preprocessor list while we call the function, as only he\n            self.preprocessors = [self.preprocessor\n                                  for _ in range(self.n_task)]\n        answer = []\n        for i in range(len(args)):\n            if all([j is None for j in args[i]]):\n                log.debug('All nones received')\n                answer.append([])\n            else:\n                texts_a, texts_b = self.split(args[i])\n                #log.debug(f'Preprocessor {self.preprocessors[i]}')\n                if all([j is None for j in texts_a]):\n                    log.debug('All nones')\n                    answer.append([])\n                else:\n                    if 'choice' in str(self.preprocessors[i]):\n                        if isinstance(texts_a[0], str) and isinstance(texts_b[0],list):\n                            for j in range(len(texts_b)):\n                                texts_a[j] = [texts_a[j] for _ in range(len(texts_b[j]))]\n 
                       if self.prefix:\n                            for j in range(len(texts_a)):\n                                 texts_a[j] = [' '.join([self.prefix, text]) for text in texts_a[j]]\n                    else:\n                        if self.prefix:\n                            texts_a = [' '.join([self.prefix, text]) for text in texts_a]\n                    answer.append(self.preprocessors[i](texts_a, texts_b))\n                    if not self.printed and self.print_first_example:\n                        print((texts_a, texts_b))\n                        print(answer[-1])\n                        self.printed = True\n        if answer == [[]]:\n            raise Exception('Empty answer')\n        return answer\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/ner_preprocessor.py",
    "content": "import errno\nimport os\nfrom logging import getLogger\nfrom typing import List\n\nimport numpy as np\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.estimator import Estimator\n\nlog = getLogger(__name__)\n\n\n@register(\"ner_vocab\")\nclass NerVocab(Estimator):\n    \"\"\" Implementation of the NER vocabulary\n\n    Params:\n        word_file_path: the path to the pre-trained word embedding model\n        save_path: the folder path to save dictionary files\n        load_path: the folder path from which the dictionary files are loaded\n        char_level: the flag arg indicating the character vocabulary\n    \"\"\"\n\n    def __init__(self,\n                 word_file_path=None,\n                 save_path=None,\n                 load_path=None,\n                 char_level=False,\n                 **kwargs):\n\n        super().__init__(save_path=save_path, load_path=load_path, **kwargs)\n\n        self.word_file_path = word_file_path\n        self.char_level = char_level\n\n        if word_file_path is not None:\n            self.load_from_file(word_file_path)\n            if self.save_path is not None:\n                self.save_to_file(self.save_path)\n        elif self.load_path is not None:\n            self.load_from_file(self.load_path)\n\n    def load_from_file(self, filename):\n        if filename is None or not os.path.exists(filename):\n            return\n\n        self._t2i, self._i2t = {}, {}\n        for i, line in enumerate(open(file=filename, mode=\"r\", encoding=\"utf-8\").readlines()):\n            word = line.strip()\n            self._t2i[word] = i\n            self._i2t[i] = word\n\n    def save_to_file(self, filename):\n        if filename is None:\n            return\n\n        dir_name = os.path.dirname(filename)\n        if not os.path.exists(dir_name):\n            os.makedirs(dir_name)\n        with open(file=filename, mode=\"w\", encoding=\"utf-8\") as fo:\n            for word in 
self._t2i.keys():\n                fo.write(\"{}\\n\".format(word))\n\n    def fit(self, sents: [List[List[str]]], *args):\n        if self.word_file_path is not None:\n            return\n\n        if self.char_level:\n            items = set([char for sent in sents for word in sent for char in word])\n        else:\n            items = set([word for sent in sents for word in sent])\n        items = [\"<UNK>\", \"<PAD>\"] + list(items)\n\n        self._t2i = {k: v for v, k in enumerate(items)}\n        self._i2t = {k: v for k, v in enumerate(items)}\n\n        self.save_to_file(self.save_path)\n\n    def pad_batch(self, tokens: List[List[int]]):\n        \"\"\" Create padded batch of words, tags, chunk pos, even batch of characters\n\n        Params:\n            tokens: list of raw words, pos, chunk, or tags.\n\n        Returns:\n            the padded batch\n        \"\"\"\n\n        batch_size = len(tokens)\n\n        if not self.char_level:\n            max_len = max([len(seq) for seq in tokens])\n            padded_batch = np.full((batch_size, max_len), self._t2i[\"<PAD>\"])\n            for i, seq in enumerate(tokens):\n                padded_batch[i, :len(seq)] = seq\n        else:\n            max_len_seq = max([len(seq) for seq in tokens])\n            if max_len_seq == 0:\n                max_len_sub_seq = 0\n            else:\n                max_len_sub_seq = max([len(sub_seq) for seq in tokens for sub_seq in seq])\n            padded_batch = np.full((batch_size, max_len_seq, max_len_sub_seq), self._t2i[\"<PAD>\"])\n            for i, seq in enumerate(tokens):\n                for j, sub_seq in enumerate(seq):\n                    padded_batch[i, j, :len(sub_seq)] = sub_seq\n        return padded_batch\n\n    def __call__(self, sents, **kwargs):\n        if not self.char_level:\n            sents_ind = [[self._t2i[word] if word in self._t2i else 0 for word in sent] for sent in sents]\n        else:\n            sents_ind = [[[self._t2i[char] if char in 
self._t2i else 0 for char in word] for word in sent] for sent in\n                         sents]\n        padded_sents = self.pad_batch(sents_ind)\n\n        return padded_sents\n\n    def load(self, *args, **kwargs):\n        log.debug(\"[loading vocabulary from {}]\".format(self.load_path))\n        if self.load_path is not None:\n            self.load_from_file(self.load_path)\n\n    def save(self, *args, **kwargs):\n        log.info(\"[saving vocabulary to {}]\".format(self.save_path))\n        if not os.path.exists(os.path.dirname(self.save_path)):\n            try:\n                os.makedirs(os.path.dirname(self.save_path))\n            except OSError as exc:\n                if exc.errno != errno.EEXIST:\n                    raise\n        self.save_to_file(self.save_path)\n\n    @property\n    def len(self):\n        return len(self._t2i)\n\n    @property\n    def t2i(self):\n        return self._t2i\n\n    @property\n    def i2t(self):\n        return self._i2t\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/odqa_preprocessors.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom itertools import chain\nfrom logging import getLogger\nfrom typing import List, Callable, Union, Tuple, Optional\n\nfrom nltk import sent_tokenize\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\nlogger = getLogger(__name__)\n\n\n@register('document_chunker')\nclass DocumentChunker(Component):\n    \"\"\"Make chunks from a document or a list of documents. 
Sentences are not torn apart between chunks when keep_sentences is set.\n\n    Args:\n        sentencize_fn: a function for sentence segmentation\n        keep_sentences: whether to keep sentences intact instead of tearing them up between chunks\n        tokens_limit: a number of tokens in a single chunk (usually this number corresponds to the squad model limit)\n        flatten_result: whether to flatten the resulting list of lists of chunks\n        paragraphs: whether to split document by paragraphs; if set to True, tokens_limit is ignored\n\n    Attributes:\n        keep_sentences: whether to keep sentences intact instead of tearing them up between chunks\n        tokens_limit: a number of tokens in a single chunk\n        flatten_result: whether to flatten the resulting list of lists of chunks\n        paragraphs: whether to split document by paragraphs; if set to True, tokens_limit is ignored\n\n    \"\"\"\n\n    def __init__(self, sentencize_fn: Callable = sent_tokenize, keep_sentences: bool = True,\n                 tokens_limit: int = 400, flatten_result: bool = False,\n                 paragraphs: bool = False, number_of_paragraphs: int = -1, *args, **kwargs) -> None:\n        self._sentencize_fn = sentencize_fn\n        self.keep_sentences = keep_sentences\n        self.tokens_limit = tokens_limit\n        self.flatten_result = flatten_result\n        self.paragraphs = paragraphs\n        self.number_of_paragraphs = number_of_paragraphs\n\n    def __call__(self, batch_docs: List[Union[str, List[str]]],\n                 batch_docs_ids: Optional[List[Union[str, List[str]]]] = None) -> \\\n            Union[Tuple[Union[List[str], List[List[str]]], Union[List[str], List[List[str]]]],\n                  Union[List[str], List[List[str]]]]:\n        \"\"\"Make chunks from a batch of documents. 
There can be several documents in each batch.\n        Args:\n            batch_docs: a batch of documents / a batch of lists of documents\n            batch_docs_ids (optional) : a batch of documents ids / a batch of lists of documents ids\n        Returns:\n            chunks of docs, flattened or not and\n            chunks of docs ids, flattened or not if batch_docs_ids were passed\n        \"\"\"\n\n        result = []\n        result_ids = []\n\n        empty_docs_ids_flag = False\n\n        if not batch_docs_ids:\n            empty_docs_ids_flag = True\n\n        if empty_docs_ids_flag:\n            batch_docs_ids = [[[] for j in i] for i in batch_docs]\n\n        for ids, docs in zip(batch_docs_ids, batch_docs):\n            batch_chunks = []\n            batch_chunks_ids = []\n            if isinstance(docs, str):\n                docs = [docs]\n                ids = [ids]\n\n            for id, doc in zip(ids, docs):\n                if self.paragraphs:\n                    split_doc = doc.split('\\n\\n')\n                    split_doc = [sd.strip() for sd in split_doc]\n                    split_doc = list(filter(lambda x: len(x) > 40, split_doc))\n                    if self.number_of_paragraphs != -1:\n                        split_doc = split_doc[:self.number_of_paragraphs]\n                    batch_chunks.append(split_doc)\n                    batch_chunks_ids.append([id] * len(split_doc))\n                else:\n                    doc_chunks = []\n                    if self.keep_sentences:\n                        sentences = sent_tokenize(doc)\n                        n_tokens = 0\n                        keep = []\n                        for s in sentences:\n                            n_tokens += len(s.split())\n                            if n_tokens > self.tokens_limit:\n                                if keep:\n                                    doc_chunks.append(' '.join(keep))\n                                    n_tokens = 0\n          
                          keep.clear()\n                            keep.append(s)\n                        if keep:\n                            doc_chunks.append(' '.join(keep))\n                        batch_chunks.append(doc_chunks)\n                        batch_chunks_ids.append([id] * len(doc_chunks))\n                    else:\n                        split_doc = doc.split()\n                        doc_chunks = [split_doc[i:i + self.tokens_limit] for i in\n                                      range(0, len(split_doc), self.tokens_limit)]\n                        batch_chunks.append(doc_chunks)\n                        batch_chunks_ids.append([id] * len(doc_chunks))\n            result.append(batch_chunks)\n            result_ids.append(batch_chunks_ids)\n\n        if self.flatten_result:\n            if isinstance(result[0][0], list):\n                for i in range(len(result)):\n                    flattened = list(chain.from_iterable(result[i]))\n                    flattened_ids = list(chain.from_iterable(result_ids[i]))\n                    result[i] = flattened\n                    result_ids[i] = flattened_ids\n\n        if empty_docs_ids_flag:\n            return result\n\n        return result, result_ids\n\n\n@register('string_multiplier')\nclass StringMultiplier(Component):\n    \"\"\"Make a list of strings from a provided string. 
A length of the resulting list equals a length\n    of a provided reference argument.\n\n    \"\"\"\n\n    def __init__(self, **kwargs):\n        pass\n\n    def __call__(self, batch_s: List[str], ref: List[str]) -> List[List[str]]:\n        \"\"\" Multiply each string in a provided batch of strings.\n\n        Args:\n            batch_s: a batch of strings to be multiplied\n            ref: a reference to obtain a length of the resulting list\n\n        Returns:\n            a multiplied s as list\n\n        \"\"\"\n        res = []\n        for s, r in zip(batch_s, ref):\n            res.append([s] * len(r))\n\n        return res\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/one_hotter.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List, Union, Iterable\n\nimport numpy as np\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.utils import zero_pad\nfrom deeppavlov.core.models.component import Component\n\n\n@register('one_hotter')\nclass OneHotter(Component):\n    \"\"\"\n    One-hot featurizer with zero-padding.\n    If ``single_vector``, return the only vector per sample which can have several elements equal to ``1``.\n\n    Parameters:\n        depth: the depth for one-hotting\n        pad_zeros: whether to pad elements of batch with zeros\n        single_vector: whether to return one vector for the sample (sum of each one-hotted vectors)\n    \"\"\"\n\n    def __init__(self, depth: int, pad_zeros: bool = False,\n                 single_vector=False, *args, **kwargs):\n        self._depth = depth\n        self._pad_zeros = pad_zeros\n        self.single_vector = single_vector\n        if self._pad_zeros and self.single_vector:\n            raise ConfigError(\"Cannot perform ``single_vector`` with zero padding for OneHotter\")\n\n    def __call__(self, batch: List[List[int]], **kwargs) -> Union[List[List[np.ndarray]], List[np.ndarray]]:\n        \"\"\"\n        Convert given batch of list of labels to one-hot representation of the batch.\n\n        Args:\n   
         batch: list of samples, where each sample is a list of integer labels.\n            **kwargs: additional arguments\n\n        Returns:\n            if ``single_vector``, list of one-hot representations of each sample,\n            otherwise, list of lists of one-hot representations of each label in a sample\n        \"\"\"\n        one_hotted_batch = []\n\n        for utt in batch:\n            if isinstance(utt, Iterable):\n                one_hotted_utt = self._to_one_hot(utt, self._depth)\n            elif isinstance(utt, int):\n                if self._pad_zeros or self.single_vector:\n                    one_hotted_utt = self._to_one_hot([utt], self._depth)\n                else:\n                    one_hotted_utt = self._to_one_hot([utt], self._depth).reshape(-1)\n\n            if self.single_vector:\n                one_hotted_utt = np.sum(one_hotted_utt, axis=0)\n\n            one_hotted_batch.append(one_hotted_utt)\n\n        if self._pad_zeros:\n            one_hotted_batch = zero_pad(one_hotted_batch)\n        return one_hotted_batch\n\n    @staticmethod\n    def _to_one_hot(x, n):\n        b = np.zeros([len(x), n], dtype=np.float32)\n        for q, tok in enumerate(x):\n            b[q, int(tok)] = 1\n        return b\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/re_preprocessor.py",
    "content": "# Copyright 2021 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Tuple, List, Union\n\nimport numpy as np\nfrom transformers import BertTokenizer\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.file import read_json\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\nlog = getLogger(__name__)\n\n\n@register('re_preprocessor')\nclass REPreprocessor(Component):\n    def __init__(\n            self,\n            vocab_file: str,\n            special_token: str = '<ENT>',\n            ner_tags=None,\n            max_seq_length: int = 512,\n            do_lower_case: bool = False,\n            default_tag: str = None,\n            **kwargs\n    ):\n        \"\"\"\n        Args:\n            vocab_file: path to vocabulary / name of vocabulary for tokenizer initialization\n            special_token: an additional token that will be used for marking the entities in the document\n            do_lower_case: set True if lowercasing is needed\n            default_tag: used for test purposes to create a valid input\n        Return:\n            list of feature batches with input_ids, attention_mask, entity_pos, ner_tags\n        \"\"\"\n\n        self.special_token = special_token\n        self.special_tokens_dict = 
{'additional_special_tokens': [self.special_token]}\n        self.default_tag = default_tag\n\n        if ner_tags is None:\n            ner_tags = ['ORG', 'TIME', 'MISC', 'LOC', 'PER', 'NUM']\n        self.ner2id = {tag: tag_id for tag_id, tag in enumerate(ner_tags)}\n        self.max_seq_length = max_seq_length\n\n        if Path(vocab_file).is_file():\n            vocab_file = str(expand_path(vocab_file))\n            self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)\n        else:\n            self.tokenizer = BertTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)\n\n    def __call__(\n            self, tokens: Union[Tuple, List[List[str]]], entity_pos: Union[Tuple, List[List[Tuple]]],\n            entity_tags: Union[Tuple, List[List[str]]],\n    ) -> Tuple[List, List, List, List, List]:\n        \"\"\"\n        Tokenize and create masks; recalculate the entity positions with respect to the document borders.\n        Args:\n            tokens: List of tokens of each document: List[List[tokens in doc]]\n            entity_pos: start and end positions of the entities' mentions\n            entity_tags: NER tag of the entities\n        Return:\n            input_ids: List[List[int]],\n            attention_mask: List[List[int]],\n            entity_poss: List[\n                            List[\n                                List[(entity1_mention1_start_id, entity1_mention1_end_id), ...],\n                                List[(entity2_mention1_start_id, entity2_mention1_end_id), ...]\n                            ]\n                        ]\n            entity_tags: List[List[int]]\n            nf_samples: List[int] - contains the information about whether the corresponding sample is real sample or\n                fake (for testing): 0 means the sample is real, 1 - it is fake.\n        \"\"\"\n\n        _ = self.tokenizer.add_special_tokens(self.special_tokens_dict)\n\n        input_ids, attention_mask, 
upd_entity_pos, upd_entity_tags, nf_samples = [], [], [], [], []\n\n        # this workaround is for proper testing: for an unknown reason during test in test_quick_start.py\n        # each input list is transformed into a tuple, e.g., tokens -> tuple(tokens, ).\n        # todo: refactoring\n        if type(tokens) == tuple and type(entity_pos) == tuple and type(entity_tags) == tuple:\n            tokens = tokens[0]\n            entity_pos = entity_pos[0]\n            entity_tags = entity_tags[0]\n\n        for n_sample, (doc, ent_pos, ent_tags) in enumerate(zip(tokens, entity_pos, entity_tags)):\n\n            # valid scenario\n            if isinstance(ent_pos, list) and len(ent_pos) == 2:\n                count = 0\n                doc_wordpiece_tokens = []\n\n                entity1_pos_start = list(zip(*ent_pos[0]))[0]  # first entity mentions' start positions\n                entity1_pos_end = list(zip(*ent_pos[0]))[1]  # first entity mentions' end positions\n                entity2_pos_start = list(zip(*ent_pos[1]))[0]  # second entity mentions' start positions\n                entity2_pos_end = list(zip(*ent_pos[1]))[1]  # second entity mentions' end positions\n\n                upd_entity1_pos_start, upd_entity2_pos_start, upd_entity1_pos_end, upd_entity2_pos_end = [], [], [], []\n                for n, token in enumerate(doc):\n                    if n in entity1_pos_start:\n                        doc_wordpiece_tokens.append(self.special_token)\n                        upd_entity1_pos_start.append(count)\n                        count += 1\n\n                    if n in entity1_pos_end:\n                        doc_wordpiece_tokens.append(self.special_token)\n                        count += 1\n                        upd_entity1_pos_end.append(count)\n\n                    if n in entity2_pos_start:\n                        doc_wordpiece_tokens.append(self.special_token)\n                        upd_entity2_pos_start.append(count)\n                      
  count += 1\n\n                    if n in entity2_pos_end:\n                        doc_wordpiece_tokens.append(self.special_token)\n                        count += 1\n                        upd_entity2_pos_end.append(count)\n\n                    word_tokens = self.tokenizer.tokenize(token)\n                    doc_wordpiece_tokens += word_tokens\n                    count += len(word_tokens)\n\n                # special case when the entity is the last in the doc\n                if len(doc) in entity1_pos_end:\n                    doc_wordpiece_tokens.append(self.special_token)\n                    count += 1\n                    upd_entity1_pos_end.append(count)\n                if len(doc) in entity2_pos_end:\n                    doc_wordpiece_tokens.append(self.special_token)\n                    count += 1\n                    upd_entity2_pos_end.append(count)\n                    word_tokens = self.tokenizer.tokenize(token)\n                    doc_wordpiece_tokens += word_tokens\n                    count += len(word_tokens)\n\n                upd_entity_1_pos = list(zip(upd_entity1_pos_start, upd_entity1_pos_end))\n                upd_entity_2_pos = list(zip(upd_entity2_pos_start, upd_entity2_pos_end))\n\n                # text entities for self check\n                upd_entity1_text = [doc_wordpiece_tokens[ent_m[0]:ent_m[1]] for ent_m in upd_entity_1_pos]\n                upd_entity2_text = [doc_wordpiece_tokens[ent_m[0]:ent_m[1]] for ent_m in upd_entity_2_pos]\n\n                enc_entity_tags = self.encode_ner_tag(ent_tags)\n\n                encoding = self.tokenizer.encode_plus(\n                    doc_wordpiece_tokens[:self.max_seq_length],     # truncate tokens\n                    add_special_tokens=True,\n                    truncation=True,\n                    max_length=self.max_seq_length,\n                    pad_to_max_length=True,\n                    return_attention_mask=True\n                )\n                
upd_entity_pos.append([upd_entity_1_pos, upd_entity_2_pos])\n                nf_samples.append(0)\n\n            # api test scenario\n            else:\n                # for api test: dump values of entity tags and entity pos\n                encoding = self.tokenizer.encode_plus(\n                    doc,\n                    add_special_tokens=True,\n                    truncation=True,\n                    max_length=self.max_seq_length,\n                    pad_to_max_length=True,\n                    return_attention_mask=True\n                )\n                upd_entity_pos.append([[(0, 1)], [(0, 1)]])\n                enc_entity_tags = self.encode_ner_tag([self.default_tag] * 2)\n                nf_samples.append(1)\n\n            input_ids.append(encoding['input_ids'])\n            attention_mask.append(encoding['attention_mask'])\n            upd_entity_tags.append(enc_entity_tags)\n\n        return input_ids, attention_mask, upd_entity_pos, upd_entity_tags, nf_samples\n\n    def encode_ner_tag(self, ner_tags: List) -> List:\n        \"\"\" Encode NER tags with one hot encodings \"\"\"\n        enc_ner_tags = []\n        for ner_tag in ner_tags:\n            ner_tag_one_hot = [0] * len(self.ner2id)\n            ner_tag_one_hot[self.ner2id[ner_tag]] = 1\n            enc_ner_tags.append(ner_tag_one_hot)\n        return enc_ner_tags\n\n\n@register('re_postprocessor')\nclass REPostprocessor:\n\n    def __init__(self, rel2id_path: str, rel2label_path: str, **kwargs):\n        self.rel2id_path = rel2id_path\n        self.rel2label_path = rel2label_path\n        self.rel2id = read_json(str(expand_path(self.rel2id_path)))\n        self.id2rel = {rel_id: rel for rel, rel_id in self.rel2id.items()}\n        self.rel2label = read_json(str(expand_path(self.rel2label_path)))\n\n    def __call__(self, model_output: List, nf_samples: List) -> Tuple[List[str], List[str]]:\n        \"\"\"\n        The model output is transformed to the relation id and relation name\n    
    Args:\n            model_output: List of probability vectors\n            nf_samples: contains the information about true and fake samples (0 - true sample and should be included\n                to the output, 1 - fake sample)\n        Return:\n            wikidata_relation_id: List of wiki ids of found relations\n            relation_name: List of names of found relations\n        \"\"\"\n\n        wikidata_relation_id, relation_name = [], []\n\n        for predictions, nf_sample in zip(model_output, nf_samples):\n            if nf_sample:\n                wikidata_relation_id.append(\"-\")\n                relation_name.append(\"-\")\n            else:\n                rel_indices = np.nonzero(predictions)[0]\n\n                for index in rel_indices:\n                    if index == 0:\n                        wikidata_relation_id.append(\"-\")\n                        relation_name.append(\"no relation\")\n                        continue\n\n                    rel_p = self.id2rel[index]\n                    wikidata_relation_id.append(rel_p)\n\n                    if rel_p in self.rel2label:\n                        relation_name.append(self.rel2label[rel_p])\n                    else:\n                        relation_name.append(\"-\")\n        return wikidata_relation_id, relation_name\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/response_base_loader.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nfrom logging import getLogger\n\nimport numpy as np\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.serializable import Serializable\n\nlogger = getLogger(__name__)\n\n\n@register('response_base_loader')\nclass ResponseBaseLoader(Serializable):\n    \"\"\"Class for loading a base with text responses (and contexts) and their vector representations.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        self.resps = None\n        self.resp_vecs = None\n        self.conts = None\n        self.cont_vecs = None\n        self.load()\n\n    def load(self):\n        if self.load_path is not None:\n            resp_file = self.load_path / \"responses.csv\"\n            if resp_file.exists():\n                with open(resp_file) as f:\n                    responses = f.readlines()\n                    self.resps = [el.strip('#\\n') for el in responses]\n            else:\n                logger.error(\"Please provide responses.csv file to the {} directory\".format(self.load_path))\n                sys.exit(1)\n            resp_vec_file = self.load_path / \"resp_vecs.npy\"\n            if resp_vec_file.exists():\n                self.resp_vecs = np.load(resp_vec_file)\n            cont_file = self.load_path / \"contexts.csv\"\n            if 
cont_file.exists():\n                with open(cont_file) as f:\n                    contexts = f.readlines()\n                    self.conts = [el.strip('#\\n') for el in contexts]\n            else:\n                logger.error(\"Please add contexts.csv file to the {} directory\".format(self.load_path))\n                sys.exit(1)\n            cont_vec_file = self.load_path / \"cont_vecs.npy\"\n            if cont_vec_file.exists():\n                self.cont_vecs = np.load(cont_vec_file)\n\n    def save(self):\n        logger.error(\"The method save of the {} class is not used.\".format(self.__class__))\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/sanitizer.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nimport sys\nimport unicodedata\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\n\n@register('sanitizer')\nclass Sanitizer(Component):\n    \"\"\"Remove all combining characters like diacritical marks from tokens\n\n    Args:\n        diacritical: whether to remove diacritical signs or not\n            diacritical signs are something like hats and stress marks\n        nums: whether to replace all digits with 1 or not\n    \"\"\"\n\n    def __init__(self,\n                 diacritical: bool = True,\n                 nums: bool = False,\n                 *args, **kwargs) -> None:\n        self.diacritical = diacritical\n        self.nums = nums\n        self.combining_characters = dict.fromkeys([c for c in range(sys.maxunicode)\n                                                   if unicodedata.combining(chr(c))])\n\n    def filter_diacritical(self, tokens_batch):\n        \"\"\"Takes batch of tokens and returns the batch with sanitized tokens\"\"\"\n        sanitized_batch = []\n        for utterance in tokens_batch:\n            sanitized_utterance = []\n            for token in utterance:\n                token = unicodedata.normalize('NFD', token)\n                sanitized_utterance.append(token.translate(self.combining_characters))\n            
sanitized_batch.append(sanitized_utterance)\n        return sanitized_batch\n\n    def replace_nums(self, tokens_batch):\n        sanitized_batch = []\n        for utterance in tokens_batch:\n            sanitized_batch.append([re.sub('[0-9]', '1', token) for token in utterance])\n        return sanitized_batch\n\n    def __call__(self, tokens_batch, **kwargs):\n        if self.diacritical:\n            tokens_batch = self.filter_diacritical(tokens_batch)\n        if self.nums:\n            tokens_batch = self.replace_nums(tokens_batch)\n        return tokens_batch\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/sentseg_preprocessor.py",
    "content": "from typing import List\n\nfrom deeppavlov.core.common.registry import register\n\n\n@register(\"sentseg_restore_sent\")\ndef SentSegRestoreSent(batch_words: List[List[str]], batch_tags: List[List[str]]) -> List[str]:\n    ret = []\n    for words, tags in zip(batch_words, batch_tags):\n        if len(tags) == 0:\n            ret.append(\"\")\n            continue\n        sent = words[0]\n        punct = \"\" if tags[0] == \"O\" else tags[0][-1]\n        for word, tag in zip(words[1:], tags[1:]):\n            if tag != \"O\":\n                sent += punct\n                punct = tag[-1]\n            sent += \" \" + word\n        sent += punct\n        ret.append(sent)\n\n    return ret\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/squad_preprocessor.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport bisect\nfrom logging import getLogger\nfrom typing import List, Dict\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\nlogger = getLogger(__name__)\n\n\n@register('squad_bert_mapping')\nclass SquadBertMappingPreprocessor(Component):\n    \"\"\"Create mapping from BERT subtokens to their characters positions and vice versa.\n        Args:\n            do_lower_case: set True if lowercasing is needed\n    \"\"\"\n\n    def __init__(self, do_lower_case: bool = True, *args, **kwargs):\n        self.do_lower_case = do_lower_case\n\n    def __call__(self, contexts_batch, bert_features_batch, subtokens_batch, **kwargs):\n        subtok2chars_batch: List[List[Dict[int, int]]] = []\n        char2subtoks_batch: List[List[Dict[int, int]]] = []\n\n        for batch_counter, (context_list, features_list, subtokens_list) in \\\n                enumerate(zip(contexts_batch, bert_features_batch, subtokens_batch)):\n            subtok2chars_list, char2subtoks_list = [], []\n            for context, features, subtokens in zip(context_list, features_list, subtokens_list):\n                if self.do_lower_case:\n                    context = context.lower()\n                context_start = subtokens.index('[SEP]') + 1\n                idx = 0\n                subtok2char: Dict[int, 
int] = {}\n                char2subtok: Dict[int, int] = {}\n                for i, subtok in list(enumerate(subtokens))[context_start:-1]:\n                    subtok = subtok[2:] if subtok.startswith('##') else subtok\n                    subtok_pos = context[idx:].find(subtok)\n                    if subtok_pos == -1:\n                        # it could be UNK\n                        idx += 1  # len was at least one\n                    else:\n                        # print(k, '\\t', t, p + idx)\n                        idx += subtok_pos\n                        subtok2char[i] = idx\n                        for j in range(len(subtok)):\n                            char2subtok[idx + j] = i\n                        idx += len(subtok)\n                subtok2chars_list.append(subtok2char)\n                char2subtoks_list.append(char2subtok)\n            subtok2chars_batch.append(subtok2chars_list)\n            char2subtoks_batch.append(char2subtoks_list)\n        return subtok2chars_batch, char2subtoks_batch\n\n\n@register('squad_bert_ans_preprocessor')\nclass SquadBertAnsPreprocessor(Component):\n    \"\"\"Create answer start and end positions in subtokens.\n        Args:\n            do_lower_case: set True if lowercasing is needed\n    \"\"\"\n\n    def __init__(self, do_lower_case: bool = True, *args, **kwargs):\n        self.do_lower_case = do_lower_case\n\n    def __call__(self, answers_raw, answers_start, char2subtoks, **kwargs):\n        answers, starts, ends = [], [], []\n        for answers_raw, answers_start, c2sub in zip(answers_raw, answers_start, char2subtoks):\n            answers.append([])\n            starts.append([])\n            ends.append([])\n            for ans, ans_st in zip(answers_raw, answers_start):\n                if self.do_lower_case:\n                    ans = ans.lower()\n                try:\n                    indices = {c2sub[0][i] for i in range(ans_st, ans_st + len(ans)) if i in c2sub[0]}\n                    st = 
min(indices)\n                    end = max(indices)\n                except ValueError:\n                    # 0 - CLS token\n                    st, end = 0, 0\n                    ans = ''\n                starts[-1] += [st]\n                ends[-1] += [end]\n                answers[-1] += [ans]\n        return answers, starts, ends\n\n\n@register('squad_bert_ans_postprocessor')\nclass SquadBertAnsPostprocessor(Component):\n    \"\"\"Extract answer and create answer start and end positions in characters from subtoken positions.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        pass\n\n    def __call__(self, answers_start_batch, answers_end_batch, contexts_batch,\n                 subtok2chars_batch, subtokens_batch, ind_batch, *args, **kwargs):\n        answers = []\n        starts = []\n        ends = []\n        for answer_st, answer_end, context_list, sub2c_list, subtokens_list, ind in \\\n                zip(answers_start_batch, answers_end_batch, contexts_batch, subtok2chars_batch, subtokens_batch,\n                    ind_batch):\n            sub2c = sub2c_list[ind]\n            subtok = subtokens_list[ind][answer_end]\n            context = context_list[ind]\n            # CLS token is no_answer token\n            if answer_st == 0 or answer_end == 0:\n                answers += ['']\n                starts += [-1]\n                ends += [-1]\n            else:\n                st = self.get_char_position(sub2c, answer_st)\n                end = self.get_char_position(sub2c, answer_end)\n\n                subtok = subtok[2:] if subtok.startswith('##') else subtok\n                answer = context[st:end + len(subtok)]\n                answers += [answer]\n                starts += [st]\n                ends += [end]\n        return answers, starts, ends\n\n    @staticmethod\n    def get_char_position(sub2c, sub_pos):\n        keys = list(sub2c.keys())\n        found_idx = bisect.bisect(keys, sub_pos)\n        if found_idx == 0:\n            
return sub2c[keys[0]]\n\n        return sub2c[keys[found_idx - 1]]\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/str_lower.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Union\n\nfrom deeppavlov.core.common.registry import register\n\n\n@register('str_lower')\ndef str_lower(batch: Union[str, list, tuple]):\n    \"\"\"Recursively search for strings in a list and convert them to lowercase\n\n    Args:\n        batch: a string or a list containing strings at some level of nesting\n\n    Returns:\n        the same structure where all strings are converted to lowercase\n    \"\"\"\n    if isinstance(batch, str):\n        return batch.lower()\n    else:\n        return list(map(str_lower, batch))\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/str_token_reverser.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List, Union\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\nStrTokenReverserInfo = Union[List[str], List['StrTokenReverserInfo']]\n\n\n@register('str_token_reverser')\nclass StrTokenReverser(Component):\n    \"\"\"Component for converting strings to strings with reversed token positions\n\n    Args:\n        tokenized: The parameter is only needed to reverse tokenized strings.\n    \"\"\"\n\n    def __init__(self, tokenized: bool = False, *args, **kwargs) -> None:\n        self.tokenized = tokenized\n\n    @staticmethod\n    def _reverse_str(raw_string):\n        splitted = raw_string.split()\n        splitted.reverse()\n        string = ' '.join(splitted)\n        return string\n\n    @staticmethod\n    def _reverse_tokens(raw_tokens):\n        raw_tokens.reverse()\n        return raw_tokens\n\n    def __call__(self, batch: Union[str, list, tuple]) -> StrTokenReverserInfo:\n        \"\"\"Recursively search for strings in a list and convert them to strings with reversed token positions\n\n        Args:\n            batch: a string or a list containing strings\n\n        Returns:\n            the same structure where all strings tokens are reversed\n        \"\"\"\n        if isinstance(batch, (list, tuple)):\n            batch = list(batch)\n\n       
 if self.tokenized:\n            if isinstance(batch, (list, tuple)):\n                if isinstance(batch[-1], str):\n                    return self._reverse_tokens(batch)\n                else:\n                    return [self(line) for line in batch]\n            raise RuntimeError(f'The objects passed to the reverser are not list or tuple! '\n                               f' But they are {type(batch)}.'\n                               f' If you want to passed str type directly use option tokenized = False')\n        else:\n            if isinstance(batch, (list, tuple)):\n                return [self(line) for line in batch]\n            else:\n                return self._reverse_str(batch)\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/str_utf8_encoder.py",
    "content": "# originally based on https://github.com/allenai/bilm-tf/blob/master/bilm/data.py\n\n# Modifications copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom collections import Counter, OrderedDict\nfrom itertools import chain\nfrom logging import getLogger\nfrom typing import Union, List, Tuple\n\nimport numpy as np\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.estimator import Estimator\n\nlog = getLogger(__name__)\n\nStrUTF8EncoderInfo = Union[List[str], List['StrUTF8EncoderInfo']]\n\n\n@register('str_utf8_encoder')\nclass StrUTF8Encoder(Estimator):\n    \"\"\"Component for encoding all strings to utf8 codes\n\n    Args:\n        max_word_length: Max length of words of input and output batches.\n        pad_special_char_use: Whether to use special char for padding or not.\n        word_boundary_special_char_use: Whether to add word boundaries by special chars or not.\n        sentence_boundary_special_char_use: Whether to add sentence boundaries by special chars or not.\n        reversed_sentense_tokens: Whether to use reversed sequences of tokens or not.\n        bos: Name of a special token of the beginning of a sentence.\n        eos: Name of a special token of the end of a sentence.\n    \"\"\"\n\n    def __init__(self,\n                 max_word_length: int = 50,\n                 
pad_special_char_use: bool = False,\n                 word_boundary_special_char_use: bool = False,\n                 sentence_boundary_special_char_use: bool = False,\n                 reversed_sentense_tokens: bool = False,\n                 bos: str = '<S>',\n                 eos: str = '</S>',\n                 **kwargs) -> None:\n        super().__init__(**kwargs)\n\n        if word_boundary_special_char_use and max_word_length < 3:\n            raise ConfigError(f\"`max_word_length` should be more than 3!\")\n        if max_word_length < 1:\n            raise ConfigError(f\"`max_word_length` should be more than 1!\")\n\n        self._max_word_length = max_word_length\n        self._reverse = reversed_sentense_tokens\n\n        self._pad_special_char_use = pad_special_char_use\n        self._word_boundary_special_char_use = word_boundary_special_char_use\n        self._sentence_boundary_special_char_use = sentence_boundary_special_char_use\n\n        # char ids 0-255 come from utf-8 encoding bytes\n        # assign 256-300 to special chars\n        self.bos_char = 256  # <begin sentence>\n        self.eos_char = 257  # <end sentence>\n        self.bow_char = 258  # <begin word>\n        self.eow_char = 259  # <end word>\n        self.pad_char = 260  # <padding>\n\n        self._len = 261  # an upper bound of all indexes\n\n        # the charcter representation of the begin/end of sentence characters\n        def _make_bos_eos(indx):\n            indx = np.array([indx], dtype=np.int32)\n            if self._word_boundary_special_char_use:\n                code = np.pad(indx, (1, 1), 'constant', constant_values=(self.bow_char, self.eow_char))\n            else:\n                code = indx\n            if self._pad_special_char_use:\n                code = np.pad(code, (0, self._max_word_length - code.shape[0]), 'constant',\n                              constant_values=(self.pad_char))\n            else:\n                pass\n            return code\n\n        
self.bos_chars = _make_bos_eos(self.bos_char)\n        self.eos_chars = _make_bos_eos(self.eos_char)\n\n        if self._sentence_boundary_special_char_use:\n            self._eos_chars = [self.eos_chars]\n            self._bos_chars = [self.bos_chars]\n        else:\n            self._eos_chars = []\n            self._bos_chars = []\n\n        if self.load_path:\n            self.load()\n        else:\n            self.tokens = []\n        self._word_char_ids = OrderedDict()\n\n        for token in self.tokens:\n            self._word_char_ids[token] = self._convert_word_to_char_ids(token)\n        self._word_char_ids[bos] = self.bos_chars\n        self._word_char_ids[eos] = self.eos_chars\n\n    def __call__(self, batch: Union[List[str], Tuple[str]]) -> StrUTF8EncoderInfo:\n        \"\"\"Recursively search for strings in a list and utf8 encode\n\n        Args:\n            batch: a string or a list containing strings\n\n        Returns:\n            the same structure where all strings are utf8 encoded\n        \"\"\"\n        if isinstance(batch, (list, tuple)):\n            if isinstance(batch[-1], str):\n                return self._encode_chars(batch)\n            else:\n                return [self(line) for line in batch]\n        raise RuntimeError(f'The objects passed to the reverser are not list or tuple of str! 
'\n                           f' But they are {type(batch)}.')\n\n    def load(self) -> None:\n        if self.load_path:\n            if self.load_path.is_file():\n                log.debug(f\"[loading vocabulary from {self.load_path}]\")\n                self.tokens = []\n                for ln in self.load_path.open('r', encoding='utf8'):\n                    token = ln.strip().split()[0]\n                    self.tokens.append(token)\n            else:\n                raise ConfigError(f\"Provided `load_path` for {self.__class__.__name__} doesn't exist!\")\n        else:\n            raise ConfigError(f\"`load_path` for {self} is not provided!\")\n\n    def save(self) -> None:\n        log.info(f\"[saving vocabulary to {self.save_path}]\")\n        with self.save_path.open('wt', encoding='utf8') as f:\n            for token in self._word_char_ids.keys():\n                f.write('{}\\n'.format(token))\n\n    def fit(self, *args) -> None:\n        words = chain(*args)\n        # filter(None, <>) -- to filter empty words\n        freqs = Counter(filter(None, chain(*words)))\n        for token, _ in freqs.most_common():\n            if not (token in self._word_char_ids):\n                self._word_char_ids[token] = self._convert_word_to_char_ids(token)\n\n    def _convert_word_to_char_ids(self, word):\n\n        code = np.zeros([self._max_word_length], dtype=np.int32)\n        if self._pad_special_char_use:\n            code[:] = self.pad_char\n        if self._word_boundary_special_char_use:\n            word_encoded = word.encode('utf-8', 'ignore')[:self._max_word_length - 2]\n            code[0] = self.bow_char\n\n            for k, chr_id in enumerate(word_encoded, start=1):\n                code[k] = chr_id\n\n            code[len(word_encoded) + 1] = self.eow_char\n        else:\n            word_encoded = word.encode('utf-8', 'ignore')[:self._max_word_length]\n\n            for k, chr_id in enumerate(word_encoded):\n                code[k] = chr_id\n\n    
    if not self._pad_special_char_use:\n            if self._word_boundary_special_char_use:\n                code = code[:len(word_encoded) + 2]\n            else:\n                code = code[:len(word_encoded)]\n        return code\n\n    def _word_to_char_ids(self, word):\n        if word in self._word_char_ids:\n            return self._word_char_ids[word]\n        else:\n            return self._convert_word_to_char_ids(word)\n\n    def _encode_chars(self, sentence):\n        \"\"\"\n        Encode the sentence as a white space delimited string of tokens.\n        \"\"\"\n        chars_ids = [self._word_to_char_ids(cur_word)\n                     for cur_word in sentence]\n        return self._wrap_in_s_char(chars_ids)\n\n    def _wrap_in_s_char(self, chars_ids):\n        chars_ids = chars_ids if self._pad_special_char_use else list(chars_ids)\n        if self._reverse:\n            ret = self._eos_chars + chars_ids + self._bos_chars\n        else:\n            ret = self._bos_chars + chars_ids + self._eos_chars\n        return np.vstack(ret) if self._pad_special_char_use else ret\n\n    def __len__(self):\n        return self._len\n\n    @property\n    def len(self):\n        \"\"\"\n        An upper bound of all indexes.\n        \"\"\"\n        return len(self)\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/torch_transformers_preprocessor.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nimport random\nimport re\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Tuple, List, Optional, Union, Dict, Set, Any\n\nimport nltk\nimport numpy as np\nimport torch\nfrom transformers import AutoTokenizer\nfrom transformers.data.processors.utils import InputFeatures\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.utils import zero_pad\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.models.preprocessors.mask import Mask\n\nlog = getLogger(__name__)\n\n\n@register('torch_transformers_multiplechoice_preprocessor')\nclass TorchTransformersMultiplechoicePreprocessor(Component):\n    \"\"\"Tokenize text on subtokens, encode subtokens with their indices, create tokens and segment masks.\n\n    Args:\n        vocab_file: path to vocabulary\n        do_lower_case: set True if lowercasing is needed\n        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens\n\n    Attributes:\n        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens\n        tokenizer: instance of Bert FullTokenizer\n\n    \"\"\"\n\n    def __init__(self,\n                 vocab_file: 
str,\n                 do_lower_case: bool = True,\n                 max_seq_length: int = 512,\n                 **kwargs) -> None:\n        self.max_seq_length = max_seq_length\n        if Path(vocab_file).is_file():\n            vocab_file = str(expand_path(vocab_file))\n            self.tokenizer = AutoTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case, **kwargs)\n        else:\n            self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case, **kwargs)\n\n    def tokenize_mc_examples(self,\n                             contexts: List[List[str]],\n                             choices: List[List[str]]) -> Dict[str, torch.tensor]:\n\n        num_choices = len(contexts[0])\n        batch_size = len(contexts)\n\n        # tokenize examples in groups of `num_choices`\n        examples = []\n        for context_list, choice_list in zip(contexts, choices):\n            for context, choice in zip(context_list, choice_list):\n                tokenized_input = self.tokenizer.encode_plus(text=context,\n                                                             text_pair=choice,\n                                                             return_attention_mask=True,\n                                                             add_special_tokens=True,\n                                                             truncation=True)\n\n                examples.append(tokenized_input)\n\n        padded_examples = self.tokenizer.pad(\n            examples,\n            padding=True,\n            max_length=self.max_seq_length,\n            return_tensors='pt',\n        )\n\n        padded_examples = {k: v.view(batch_size, num_choices, -1) for k, v in padded_examples.items()}\n\n        return padded_examples\n\n    def __call__(self, texts_a: List[List[str]], texts_b: List[List[str]] = None) -> Dict[str, torch.tensor]:\n        \"\"\"Tokenize and create masks.\n\n        texts_a and texts_b are separated by [SEP] token\n\n        
Args:\n            texts_a: list of texts,\n            texts_b: list of texts, it could be None, e.g. single sentence classification task\n\n        Returns:\n            batch of :class:`transformers.data.processors.utils.InputFeatures` with subtokens, subtoken ids, \\\n                subtoken mask, segment mask, or tuple of batch of InputFeatures and Batch of subtokens\n        \"\"\"\n        input_features = []\n        if texts_a and texts_b and texts_a[0] and texts_b[0]:\n            input_features = self.tokenize_mc_examples(texts_a, texts_b)\n        return input_features\n\n\n@register('torch_transformers_preprocessor')\nclass TorchTransformersPreprocessor(Component):\n    \"\"\"Tokenize text on subtokens, encode subtokens with their indices, create tokens and segment masks.\n\n    Args:\n        vocab_file: A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co or\n            a path to a `directory` containing vocabulary files required by the tokenizer.\n        do_lower_case: set True if lowercasing is needed\n        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens\n\n    Attributes:\n        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens\n        tokenizer: instance of Bert FullTokenizer\n\n    \"\"\"\n\n    def __init__(self,\n                 vocab_file: str,\n                 do_lower_case: bool = True,\n                 max_seq_length: int = 512,\n                 **kwargs) -> None:\n        self.max_seq_length = max_seq_length\n        self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case, **kwargs)\n\n    def __call__(self, texts_a: List, texts_b: Optional[List[str]] = None) -> Union[List[InputFeatures],\n                                                                                    Tuple[List[InputFeatures],\n                                                                                    
List[List[str]]]]:\n        \"\"\"Tokenize and create masks.\n        texts_a and texts_b are separated by [SEP] token\n        Args:\n            texts_a: list of texts,\n            texts_b: list of texts, it could be None, e.g. single sentence classification task\n        Returns:\n            batch of :class:`transformers.data.processors.utils.InputFeatures` with subtokens, subtoken ids, \\\n                subtoken mask, segment mask, or tuple of batch of InputFeatures and Batch of subtokens\n        \"\"\"\n\n        # in case of iterator's strange behaviour\n        if isinstance(texts_a, tuple):\n            texts_a = list(texts_a)\n        elif isinstance(texts_a, str):\n            raise TypeError(f'Received string {texts_a} as an input! Check the iterator output')\n        elif texts_a == []:\n            return {}\n\n        texts_a = [k for k in texts_a if k is not None]  # handle dummy output\n\n        input_features = self.tokenizer(text=texts_a,\n                                        text_pair=texts_b,\n                                        add_special_tokens=True,\n                                        max_length=self.max_seq_length,\n                                        padding='max_length',\n                                        return_attention_mask=True,\n                                        truncation=True,\n                                        return_tensors='pt')\n        return input_features\n\n\n@register('torch_transformers_entity_ranker_preprocessor')\nclass TorchTransformersEntityRankerPreprocessor(Component):\n    \"\"\"Class for tokenization of text into subtokens, encoding of subtokens with indices and obtaining positions of\n    special [ENT]-tokens\n    Args:\n        vocab_file: path to vocabulary\n        do_lower_case: set True if lowercasing is needed\n        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens\n        special_tokens: list of special tokens\n        
special_token_id: id of special token\n        return_special_tokens_pos: whether to return positions of found special tokens\n    \"\"\"\n\n    def __init__(self,\n                 vocab_file: str,\n                 do_lower_case: bool = False,\n                 max_seq_length: int = 512,\n                 special_tokens: List[str] = None,\n                 special_token_id: int = None,\n                 return_special_tokens_pos: bool = False,\n                 **kwargs) -> None:\n        self.max_seq_length = max_seq_length\n        self.do_lower_case = do_lower_case\n        if Path(vocab_file).is_file():\n            vocab_file = str(expand_path(vocab_file))\n            self.tokenizer = AutoTokenizer(vocab_file=vocab_file,\n                                           do_lower_case=do_lower_case)\n        else:\n            self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)\n        if special_tokens is not None:\n            special_tokens_dict = {'additional_special_tokens': special_tokens}\n            self.tokenizer.add_special_tokens(special_tokens_dict)\n        self.special_token_id = special_token_id\n        self.return_special_tokens_pos = return_special_tokens_pos\n\n    def __call__(self, texts_a: List[str]) -> Tuple[Any, List[int]]:\n        \"\"\"Tokenize and find special tokens positions.\n        Args:\n            texts_a: list of texts,\n        Returns:\n            batch of :class:`transformers.data.processors.utils.InputFeatures` with subtokens, subtoken ids, \\\n                subtoken mask, segment mask, or tuple of batch of InputFeatures and Batch of subtokens\n            batch of indices of special token ids in input ids sequence\n        \"\"\"\n        # in case of iterator's strange behaviour\n        if isinstance(texts_a, tuple):\n            texts_a = list(texts_a)\n        if self.do_lower_case:\n            texts_a = [text.lower() for text in texts_a]\n        lengths = []\n        
input_ids_batch = []\n        for text_a in texts_a:\n            encoding = self.tokenizer.encode_plus(\n                text_a, add_special_tokens=True, pad_to_max_length=True, return_attention_mask=True)\n            input_ids = encoding[\"input_ids\"]\n            input_ids_batch.append(input_ids)\n            lengths.append(len(input_ids))\n\n        max_length = min(max(lengths), self.max_seq_length)\n        input_features = self.tokenizer(text=texts_a,\n                                        add_special_tokens=True,\n                                        max_length=max_length,\n                                        padding='max_length',\n                                        return_attention_mask=True,\n                                        truncation=True,\n                                        return_tensors='pt')\n        special_tokens_pos = []\n        for input_ids_list in input_ids_batch:\n            found_n = -1\n            for n, input_id in enumerate(input_ids_list):\n                if input_id == self.special_token_id:\n                    found_n = n\n                    break\n            if found_n == -1:\n                found_n = 0\n            special_tokens_pos.append(found_n)\n\n        if self.return_special_tokens_pos:\n            return input_features, special_tokens_pos\n        else:\n            return input_features\n\n\n@register('torch_squad_transformers_preprocessor')\nclass TorchSquadTransformersPreprocessor(Component):\n    \"\"\"Tokenize text on subtokens, encode subtokens with their indices, create tokens and segment masks.\n\n    Args:\n        vocab_file: path to vocabulary\n        do_lower_case: set True if lowercasing is needed\n        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens\n\n    Attributes:\n        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens\n        tokenizer: instance of Bert FullTokenizer\n\n    \"\"\"\n\n    def 
__init__(self,\n                 vocab_file: str,\n                 do_lower_case: bool = True,\n                 max_seq_length: int = 512,\n                 add_token_type_ids: bool = False,\n                 **kwargs) -> None:\n        self.max_seq_length = max_seq_length\n        self.add_token_type_ids = add_token_type_ids\n        if Path(vocab_file).is_file():\n            vocab_file = str(expand_path(vocab_file))\n            self.tokenizer = AutoTokenizer(vocab_file=vocab_file,\n                                           do_lower_case=do_lower_case)\n        else:\n            self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)\n\n    def __call__(self, question_batch: List[str], context_batch: Optional[List[str]] = None) -> Union[\n        List[InputFeatures],\n        Tuple[List[InputFeatures],\n              List[List[str]]]]:\n        \"\"\"Tokenize and create masks.\n\n        texts_a_batch and texts_b_batch are separated by [SEP] token\n\n        Args:\n            texts_a_batch: list of texts,\n            texts_b_batch: list of texts, it could be None, e.g. 
single sentence classification task\n\n        Returns:\n            batch of :class:`transformers.data.processors.utils.InputFeatures` with subtokens, subtoken ids, \\\n                subtoken mask, segment mask, or tuple of batch of InputFeatures, batch of subtokens and batch of\n                split paragraphs\n        \"\"\"\n\n        if context_batch is None:\n            context_batch = [None] * len(question_batch)\n\n        input_features_batch, tokens_batch, split_context_batch = [], [], []\n        for question, context in zip(question_batch, context_batch):\n            question_list, context_list = [], []\n            context_subtokens = self.tokenizer.tokenize(context)\n            question_subtokens = self.tokenizer.tokenize(question)\n            max_chunk_len = self.max_seq_length - len(question_subtokens) - 3\n            if 0 < max_chunk_len < len(context_subtokens):\n                number_of_chunks = math.ceil(len(context_subtokens) / max_chunk_len)\n                sentences = nltk.sent_tokenize(context)\n                for chunk in np.array_split(sentences, number_of_chunks):\n                    context_list += [' '.join(chunk)]\n                    question_list += [question]\n            else:\n                context_list += [context]\n                question_list += [question]\n\n            input_features_list, tokens_list = [], []\n            for question_elem, context_elem in zip(question_list, context_list):\n                encoded_dict = self.tokenizer.encode_plus(\n                    text=question_elem, text_pair=context_elem,\n                    add_special_tokens=True,\n                    max_length=self.max_seq_length,\n                    truncation=True,\n                    padding='max_length',\n                    return_attention_mask=True,\n                    return_tensors='pt')\n                if 'token_type_ids' not in encoded_dict:\n                    if self.add_token_type_ids:\n                        
input_ids = encoded_dict['input_ids']\n                        seq_len = input_ids.size(1)\n                        sep = torch.where(input_ids == self.tokenizer.sep_token_id)[1][0].item()\n                        len_a = min(sep + 1, seq_len)\n                        len_b = seq_len - len_a\n                        encoded_dict['token_type_ids'] = torch.cat((torch.zeros(1, len_a, dtype=int),\n                                                                    torch.ones(1, len_b, dtype=int)), dim=1)\n                    else:\n                        encoded_dict['token_type_ids'] = torch.tensor([0])\n\n                curr_features = InputFeatures(input_ids=encoded_dict['input_ids'],\n                                              attention_mask=encoded_dict['attention_mask'],\n                                              token_type_ids=encoded_dict['token_type_ids'],\n                                              label=None)\n                input_features_list.append(curr_features)\n                tokens_list.append(self.tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0]))\n\n            input_features_batch.append(input_features_list)\n            tokens_batch.append(tokens_list)\n            split_context_batch.append(context_list)\n\n        return input_features_batch, tokens_batch, split_context_batch\n\n\n@register('rel_ranking_preprocessor')\nclass RelRankingPreprocessor(Component):\n    \"\"\"Class for tokenization of text and relation labels\n    Args:\n        vocab_file: path to vocabulary\n        add_special_tokens: special_tokens_list\n        do_lower_case: set True if lowercasing is needed\n        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens\n    \"\"\"\n\n    def __init__(self,\n                 vocab_file: str,\n                 do_lower_case: bool = True,\n                 max_seq_length: int = 512,\n                 **kwargs) -> None:\n        self.max_seq_length = max_seq_length\n        
self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)\n\n    def __call__(self, questions_batch: List[List[str]], rels_batch: List[List[str]] = None) -> Dict[str, torch.tensor]:\n        \"\"\"Tokenize questions and relations\n        texts_a and texts_b are separated by [SEP] token\n        Args:\n            questions_batch: list of texts,\n            rels_batch: list of relations list\n        Returns:\n            batch of :class:`transformers.data.processors.utils.InputFeatures` with subtokens, subtoken ids, \\\n                subtoken mask, segment mask, or tuple of batch of InputFeatures and Batch of subtokens\n        \"\"\"\n        lengths, proc_rels_batch = [], []\n        for question, rels_list in zip(questions_batch, rels_batch):\n            if isinstance(rels_list, list):\n                rels_str = \" \".join(rels_list)\n            else:\n                rels_str = rels_list\n            encoding = self.tokenizer.encode_plus(text=question, text_pair=rels_str,\n                                                  return_attention_mask=True, add_special_tokens=True,\n                                                  truncation=True)\n            lengths.append(len(encoding[\"input_ids\"]))\n            proc_rels_batch.append(rels_str)\n        max_len = max(lengths)\n        input_ids_batch, attention_mask_batch, token_type_ids_batch = [], [], []\n        for question, rels_list in zip(questions_batch, proc_rels_batch):\n            encoding = self.tokenizer.encode_plus(text=question, text_pair=rels_list,\n                                                  truncation=True, max_length=max_len,\n                                                  pad_to_max_length=True, return_attention_mask=True)\n            input_ids_batch.append(encoding[\"input_ids\"])\n            attention_mask_batch.append(encoding[\"attention_mask\"])\n            if \"token_type_ids\" in encoding:\n                
token_type_ids_batch.append(encoding[\"token_type_ids\"])\n            else:\n                token_type_ids_batch.append([0])\n        input_features = {\"input_ids\": torch.LongTensor(input_ids_batch),\n                          \"attention_mask\": torch.LongTensor(attention_mask_batch),\n                          \"token_type_ids\": torch.LongTensor(token_type_ids_batch)}\n        return input_features\n\n\n@register('path_ranking_preprocessor')\nclass PathRankingPreprocessor(Component):\n    def __init__(self,\n                 vocab_file: str,\n                 additional_special_tokens: List[str] = None,\n                 do_lower_case: bool = True,\n                 max_seq_length: int = 67,\n                 **kwargs) -> None:\n        self.max_seq_length = max_seq_length\n        self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)\n        self.additional_special_tokens = additional_special_tokens\n        if self.additional_special_tokens:\n            self.tokenizer.add_special_tokens({'additional_special_tokens': additional_special_tokens})\n\n    def __call__(self, questions_batch: List[str], rels_batch: List[List[List[str]]]):\n        lengths, proc_rels_batch = [], []\n        for question, rels_list in zip(questions_batch, rels_batch):\n            proc_rels_list = []\n            for rels in rels_list:\n                if isinstance(rels, str):\n                    rels = [rels]\n                rels_str = \"\"\n                if len(rels) == 1:\n                    if self.additional_special_tokens:\n                        rels_str = f\"<one_rel> {rels[0]} </one_rel>\"\n                    else:\n                        rels_str = rels[0]\n                elif len(rels) == 2:\n                    if rels[0] == rels[1]:\n                        rels_str = f\"<double> {rels[0]} </double>\"\n                    else:\n                        rels_str = f\"<first_rel> {rels[0]} <mid> {rels[1]} </second_rel>\"\n    
            encoding = self.tokenizer.encode_plus(text=question, text_pair=rels_str,\n                                                      return_attention_mask=True, add_special_tokens=True,\n                                                      truncation=True)\n                lengths.append(len(encoding[\"input_ids\"]))\n                proc_rels_list.append(rels_str)\n            proc_rels_batch.append(proc_rels_list)\n\n        max_len = min(max(lengths), self.max_seq_length)\n        input_ids_batch, attention_mask_batch, token_type_ids_batch = [], [], []\n        for question, rels_list in zip(questions_batch, proc_rels_batch):\n            input_ids_list, attention_mask_list, token_type_ids_list = [], [], []\n            for rels_str in rels_list:\n                encoding = self.tokenizer.encode_plus(text=question, text_pair=rels_str,\n                                                      truncation=True, max_length=max_len, add_special_tokens=True,\n                                                      pad_to_max_length=True, return_attention_mask=True)\n                input_ids_list.append(encoding[\"input_ids\"])\n                attention_mask_list.append(encoding[\"attention_mask\"])\n                if \"token_type_ids\" in encoding:\n                    token_type_ids_list.append(encoding[\"token_type_ids\"])\n                else:\n                    token_type_ids_list.append([0])\n            input_ids_batch.append(input_ids_list)\n            attention_mask_batch.append(attention_mask_list)\n            token_type_ids_batch.append(token_type_ids_list)\n        input_features = {\"input_ids\": input_ids_batch, \"attention_mask\": attention_mask_batch,\n                          \"token_type_ids\": token_type_ids_batch}\n        return input_features\n\n\n@register('torch_transformers_ner_preprocessor')\nclass TorchTransformersNerPreprocessor(Component):\n    \"\"\"\n    Takes tokens and splits them into bert subtokens, encodes subtokens with 
their indices.\n    Creates a mask of subtokens (one for the first subtoken, zero for the others).\n\n    If tags are provided, calculates tags for subtokens.\n\n    Args:\n        vocab_file: path to vocabulary\n        do_lower_case: set True if lowercasing is needed\n        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens\n        max_subword_length: replace token to <unk> if it's length is larger than this\n            (defaults to None, which is equal to +infinity)\n        token_masking_prob: probability of masking token while training\n        provide_subword_tags: output tags for subwords or for words\n        subword_mask_mode: subword to select inside word tokens, can be \"first\" or \"last\"\n            (default=\"first\")\n        return_features: if True, returns answer in features format\n\n    Attributes:\n        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens\n        max_subword_length: rmax lenght of a bert subtoken\n        tokenizer: instance of Bert FullTokenizer\n    \"\"\"\n\n    def __init__(self,\n                 vocab_file: str,\n                 do_lower_case: bool = False,\n                 max_seq_length: int = 512,\n                 max_subword_length: int = None,\n                 token_masking_prob: float = 0.0,\n                 provide_subword_tags: bool = False,\n                 subword_mask_mode: str = \"first\",\n                 return_features: bool = False,\n                 **kwargs):\n        self._re_tokenizer = re.compile(r\"[\\d]+[\\d\\.,]+[\\d]+|[\\w'\\.:@]+|[^\\w ]\")\n        self.provide_subword_tags = provide_subword_tags\n        self.mode = kwargs.get('mode')\n        self.max_seq_length = max_seq_length\n        self.max_subword_length = max_subword_length\n        self.subword_mask_mode = subword_mask_mode\n        if Path(vocab_file).is_file():\n            vocab_file = str(expand_path(vocab_file))\n            self.tokenizer = 
AutoTokenizer(vocab_file=vocab_file,\n                                           do_lower_case=do_lower_case)\n        else:\n            self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)\n        self.token_masking_prob = token_masking_prob\n        self.return_features = return_features\n\n    def __call__(self,\n                 tokens: Union[List[List[str]], List[str]],\n                 tags: List[List[str]] = None,\n                 **kwargs):\n        tokens_offsets_batch = [[] for _ in tokens]\n        if isinstance(tokens[0], str):\n            tokens_batch = []\n            tokens_offsets_batch = []\n            for s in tokens:\n                tokens_list = []\n                tokens_offsets_list = []\n                matches = tuple(re.finditer(self._re_tokenizer, s))\n                for i, elem in enumerate(matches):\n                    if (i == len(matches) - 1) and (elem[0][-1] == '.'):\n                        tokens_list.append(elem[0][:-1])\n                        tokens_list.append('.')\n                        tokens_offsets_list.append((elem.start(), elem.end() - 1))\n                        tokens_offsets_list.append((elem.end() - 1, elem.end()))\n                    else:\n                        tokens_list.append(elem[0])\n                        tokens_offsets_list.append((elem.start(), elem.end()))\n                tokens_batch.append(tokens_list)\n                tokens_offsets_batch.append(tokens_offsets_list)\n            tokens = tokens_batch\n        subword_tokens, subword_tok_ids, startofword_markers, subword_tags = [], [], [], []\n        for i in range(len(tokens)):\n            toks = tokens[i]\n            ys = ['O'] * len(toks) if tags is None else tags[i]\n            assert len(toks) == len(ys), \\\n                f\"toks({len(toks)}) should have the same length as ys({len(ys)})\"\n            sw_toks, sw_marker, sw_ys = \\\n                self._ner_bert_tokenize(toks,\n            
                            ys,\n                                        self.tokenizer,\n                                        self.max_subword_length,\n                                        mode=self.mode,\n                                        subword_mask_mode=self.subword_mask_mode,\n                                        token_masking_prob=self.token_masking_prob)\n            if self.max_seq_length is not None:\n                if len(sw_toks) > self.max_seq_length:\n                    raise RuntimeError(f\"input sequence after bert tokenization\"\n                                       f\" shouldn't exceed {self.max_seq_length} tokens.\")\n            subword_tokens.append(sw_toks)\n            subword_tok_ids.append(self.tokenizer.convert_tokens_to_ids(sw_toks))\n            startofword_markers.append(sw_marker)\n            subword_tags.append(sw_ys)\n            assert len(sw_marker) == len(sw_toks) == len(subword_tok_ids[-1]) == len(sw_ys), \\\n                f\"length of sow_marker({len(sw_marker)}), tokens({len(sw_toks)}),\" \\\n                f\" token ids({len(subword_tok_ids[-1])}) and ys({len(ys)})\" \\\n                f\" for tokens = `{toks}` should match\"\n\n        subword_tok_ids = zero_pad(subword_tok_ids, dtype=int, padding=0)\n        startofword_markers = zero_pad(startofword_markers, dtype=int, padding=0)\n        attention_mask = Mask()(subword_tokens)\n\n        if tags is not None:\n            if self.provide_subword_tags:\n                return tokens, subword_tokens, subword_tok_ids, \\\n                       attention_mask, startofword_markers, subword_tags\n            else:\n                nonmasked_tags = [[t for t in ts if t != 'X'] for ts in tags]\n                for swts, swids, swms, ts in zip(subword_tokens,\n                                                 subword_tok_ids,\n                                                 startofword_markers,\n                                                 
nonmasked_tags):\n                    if (len(swids) != len(swms)) or (len(ts) != sum(swms)):\n                        log.warning('Not matching lengths of the tokenization!')\n                        log.warning(f'Tokens len: {len(swts)}\\n Tokens: {swts}')\n                        log.warning(f'Markers len: {len(swms)}, sum: {sum(swms)}')\n                        log.warning(f'Masks: {swms}')\n                        log.warning(f'Tags len: {len(ts)}\\n Tags: {ts}')\n            if self.return_features:\n                feature_list = ({'input_ids': torch.Tensor(subword_tok_ids),\n                                 'attention_mask': torch.Tensor(attention_mask),\n                                 'token_type_ids': torch.Tensor(startofword_markers),\n                                 'labels': torch.Tensor(nonmasked_tags)})\n                return feature_list\n            else:\n                return tokens, subword_tokens, subword_tok_ids, \\\n                    attention_mask, startofword_markers, nonmasked_tags\n        if self.return_features:\n            feature_list = ({'input_ids': torch.Tensor(subword_tok_ids),\n                             'attention_mask': torch.Tensor(attention_mask),\n                             'token_type_ids': torch.Tensor(startofword_markers)\n                             })\n            return feature_list\n        else:\n            return tokens, subword_tokens, subword_tok_ids, \\\n                startofword_markers, attention_mask, tokens_offsets_batch\n\n    @staticmethod\n    def _ner_bert_tokenize(tokens: List[str],\n                           tags: List[str],\n                           tokenizer: AutoTokenizer,\n                           max_subword_len: int = None,\n                           mode: str = None,\n                           subword_mask_mode: str = \"first\",\n                           token_masking_prob: float = None) -> Tuple[List[str], List[int], List[str]]:\n        do_masking = (mode == 'train') 
and (token_masking_prob is not None)\n        do_cutting = (max_subword_len is not None)\n        tokens_subword = ['[CLS]']\n        startofword_markers = [0]\n        tags_subword = ['X']\n        for token, tag in zip(tokens, tags):\n            token_marker = int(tag != 'X')\n            subwords = tokenizer.tokenize(token)\n            if not subwords or (do_cutting and (len(subwords) > max_subword_len)):\n                tokens_subword.append('[UNK]')\n                startofword_markers.append(token_marker)\n                tags_subword.append(tag)\n            else:\n                if do_masking and (random.random() < token_masking_prob):\n                    tokens_subword.extend(['[MASK]'] * len(subwords))\n                else:\n                    tokens_subword.extend(subwords)\n                if subword_mask_mode == \"last\":\n                    startofword_markers.extend([0] * (len(subwords) - 1) + [token_marker])\n                else:\n                    startofword_markers.extend([token_marker] + [0] * (len(subwords) - 1))\n                tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))\n\n        tokens_subword.append('[SEP]')\n        startofword_markers.append(0)\n        tags_subword.append('X')\n        return tokens_subword, startofword_markers, tags_subword\n\n\n@register('torch_bert_ranker_preprocessor')\nclass TorchBertRankerPreprocessor(TorchTransformersPreprocessor):\n    \"\"\"Tokenize text to sub-tokens, encode sub-tokens with their indices, create tokens and segment masks for ranking.\n\n    Builds features for a pair of context with each of the response candidates.\n    \"\"\"\n\n    def __call__(self, batch: List[List[str]]) -> List[List[InputFeatures]]:\n        \"\"\"Tokenize and create masks.\n\n        Args:\n            batch: list of elements where the first element represents the batch with contexts\n                and the rest of elements represent response candidates batches\n\n        Returns:\n            
list of feature batches with subtokens, subtoken ids, subtoken mask, segment mask.\n        \"\"\"\n\n        if isinstance(batch[0], str):\n            batch = [batch]\n\n        cont_resp_pairs = []\n        if len(batch[0]) == 1:\n            contexts = batch[0]\n            responses_empt = [None] * len(batch)\n            cont_resp_pairs.append(zip(contexts, responses_empt))\n        else:\n            contexts = [el[0] for el in batch]\n            for i in range(1, len(batch[0])):\n                responses = []\n                for el in batch:\n                    responses.append(el[i])\n                cont_resp_pairs.append(zip(contexts, responses))\n\n        input_features = []\n\n        for s in cont_resp_pairs:\n            sub_list_features = []\n            for context, response in s:\n                encoded_dict = self.tokenizer.encode_plus(\n                    text=context, text_pair=response, add_special_tokens=True, max_length=self.max_seq_length,\n                    pad_to_max_length=True, return_attention_mask=True, return_tensors='pt')\n\n                curr_features = InputFeatures(input_ids=encoded_dict['input_ids'],\n                                              attention_mask=encoded_dict['attention_mask'],\n                                              token_type_ids=encoded_dict['token_type_ids'],\n                                              label=None)\n                sub_list_features.append(curr_features)\n            input_features.append(sub_list_features)\n\n        return input_features\n\n\n@dataclass\nclass RecordFlatExample:\n    \"\"\"Dataclass to store a flattened ReCoRD example. Contains `probability` for\n    a given `entity` candidate, as well as its label.\n    \"\"\"\n    index: str\n    label: int\n    probability: float\n    entity: str\n\n\n@dataclass\nclass RecordNestedExample:\n    \"\"\"Dataclass to store a nested ReCoRD example. 
Contains a single predicted entity, as well as\n    a list of correct answers.\n    \"\"\"\n    index: str\n    prediction: str\n    answers: List[str]\n\n\n@register(\"torch_record_postprocessor\")\nclass TorchRecordPostprocessor:\n    \"\"\"Combines flat classification examples into nested examples. When called returns nested examples\n    that weren't previously returned during current iteration over examples.\n\n    Args:\n        is_binary: signifies whether the classifier uses binary classification head\n    Attributes:\n        record_example_accumulator: underling accumulator that transforms flat examples\n        total_examples: overall number of flat examples that must be processed during current iteration\n    \"\"\"\n\n    def __init__(self, is_binary: bool = False, *args, **kwargs):\n        self.record_example_accumulator: RecordExampleAccumulator = RecordExampleAccumulator()\n        self.total_examples: Optional[int, None] = None\n        self.is_binary: bool = is_binary\n\n    def __call__(self,\n                 idx: List[str],\n                 y: List[int],\n                 y_pred_probas: np.ndarray,\n                 entities: List[str],\n                 num_examples: List[int],\n                 *args,\n                 **kwargs) -> List[RecordNestedExample]:\n        \"\"\"Postprocessor call\n\n        Args:\n            idx: list of string indices\n            y: list of integer labels\n            y_pred_probas: array of predicted probabilities\n            num_examples: list of duplicated total numbers of examples\n\n        Returns:\n            List[RecordNestedExample]: processed but not previously returned examples (may be empty in some cases)\n        \"\"\"\n        if isinstance(y_pred_probas, list):\n            y_pred_probas = [k for k in y_pred_probas if k is not None]\n            y = [k for k in y if k is not None]\n            y_pred_probas = np.array(y_pred_probas)\n        if y == []:\n            return []\n        if not 
self.is_binary:\n            # if we have outputs for both classes `0` and `1`\n            y_pred_probas = y_pred_probas[:, 1]\n        if self.total_examples != num_examples[0]:\n            # start over if num_examples is different\n            # implying that a different split is being evaluated\n            self.reset_accumulator()\n            self.total_examples = num_examples[0]\n        for index, label, probability, entity in zip(idx, y, y_pred_probas, entities):\n            self.record_example_accumulator.add_flat_example(index, label, probability, entity)\n            self.record_example_accumulator.collect_nested_example(index)\n            if self.record_example_accumulator.examples_processed >= self.total_examples:\n                # start over if all examples were processed\n                self.reset_accumulator()\n\n        return self.record_example_accumulator.return_examples()\n\n    def reset_accumulator(self):\n        \"\"\"Reinitialize the underlying accumulator from scratch\n        \"\"\"\n        self.record_example_accumulator = RecordExampleAccumulator()\n\n\nclass RecordExampleAccumulator:\n    \"\"\"ReCoRD example accumulator\n\n    Attributes:\n        examples_processed: total number of examples processed so far\n        record_counter: number of examples processed for each index\n        nested_len: expected number of flat examples for a given index\n        flat_examples: stores flat examples\n        nested_examples: stores nested examples\n        collected_indices: indices of collected nested examples\n        returned_indices: indices that have been returned\n    \"\"\"\n\n    def __init__(self):\n        self.examples_processed: int = 0\n        self.record_counter: Dict[str, int] = defaultdict(lambda: 0)\n        self.nested_len: Dict[str, int] = dict()\n        self.flat_examples: Dict[str, List[RecordFlatExample]] = defaultdict(lambda: [])\n        self.nested_examples: Dict[str, RecordNestedExample] = dict()\n        
self.collected_indices: Set[str] = set()\n        self.returned_indices: Set[str] = set()\n\n    def add_flat_example(self, index: str, label: int, probability: float, entity: str):\n        \"\"\"Add a single flat example to the accumulator\n\n        Args:\n            index: example index\n            label: example label (`-1` means that label is not available)\n            probability: predicted probability\n            entity: candidate entity\n        \"\"\"\n        self.flat_examples[index].append(RecordFlatExample(index, label, probability, entity))\n        if index not in self.nested_len:\n            self.nested_len[index] = self.get_expected_len(index)\n        self.record_counter[index] += 1\n        self.examples_processed += 1\n\n    def ready_to_nest(self, index: str) -> bool:\n        \"\"\"Checks whether all the flat examples for a given index were collected at this point.\n        Args:\n            index: the index of the candidate nested example\n        Returns:\n            bool: indicates whether the collected flat examples can be combined into a nested example\n        \"\"\"\n        return self.record_counter[index] == self.nested_len[index]\n\n    def collect_nested_example(self, index: str):\n        \"\"\"Combines a list of flat examples denoted by the given index into a single nested example\n        provided that all the necessary flat example have been collected by this time.\n        Args:\n            index: the index of the candidate nested example\n        \"\"\"\n        if self.ready_to_nest(index):\n            example_list: List[RecordFlatExample] = self.flat_examples[index]\n            entities: List[str] = []\n            labels: List[int] = []\n            probabilities: List[float] = []\n            answers: List[str] = []\n\n            for example in example_list:\n                entities.append(example.entity)\n                labels.append(example.label)\n                
probabilities.append(example.probability)\n                if example.label == 1:\n                    answers.append(example.entity)\n\n            prediction_index = np.argmax(probabilities)\n            prediction = entities[prediction_index]\n\n            self.nested_examples[index] = RecordNestedExample(index, prediction, answers)\n            self.collected_indices.add(index)\n\n    def return_examples(self) -> List[RecordNestedExample]:\n        \"\"\"Determines which nested examples were not yet returned during the current evaluation\n        cycle and returns them. May return an empty list if there are no new nested examples\n        to return yet.\n        Returns:\n            List[RecordNestedExample]: zero or more nested examples\n        \"\"\"\n        indices_to_return: Set[str] = self.collected_indices.difference(self.returned_indices)\n        examples_to_return: List[RecordNestedExample] = []\n        for index in indices_to_return:\n            examples_to_return.append(self.nested_examples[index])\n        self.returned_indices.update(indices_to_return)\n        log.debug(f'Returning {examples_to_return}')\n        return examples_to_return\n\n    @staticmethod\n    def get_expected_len(index: str) -> int:\n        \"\"\"\n        Calculates the total number of flat examples denoted by the given index\n        Args:\n            index: the index to calculate the number of examples for\n        Returns:\n            int: the expected number of examples for this index\n        \"\"\"\n        return int(index.split(\"-\")[-1])\n"
  },
  {
    "path": "deeppavlov/models/preprocessors/transformers_preprocessor.py",
    "content": "# Copyright 2020 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom logging import getLogger\nfrom typing import List, Union, Tuple\n\nimport numpy as np\nfrom transformers import BertTokenizer\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\nlog = getLogger(__name__)\n\n\ndef _pad(data: List[List[Union[int, float]]], value: Union[int, float] = 0):\n    max_len = max(map(len, data))\n    res = np.ones([len(data), max_len], dtype=type(value)) * value\n    for i, item in enumerate(data):\n        res[i][:len(item)] = item\n    return res\n\n\n@register('transformers_bert_preprocessor')\nclass TransformersBertPreprocessor(Component):\n    def __init__(self, vocab_file: str,\n                 do_lower_case: bool = False,\n                 max_seq_length: int = 512,\n                 tokenize_chinese_chars: bool = True,\n                 **kwargs):\n        vocab_file = expand_path(vocab_file)\n        self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case,\n                                       tokenize_chinese_chars=tokenize_chinese_chars)\n        self.max_seq_length = max_seq_length\n\n    def __call__(self, tokens_batch: Union[List[str], List[List[str]]]) ->\\\n            Tuple[List[List[str]], List[List[str]], np.ndarray, np.ndarray, np.ndarray]:\n\n 
       if isinstance(tokens_batch[0], str):  # skip for already tokenized text\n            tokens_batch = [self.tokenizer.basic_tokenizer.tokenize(sentence, self.tokenizer.all_special_tokens)\n                            for sentence in tokens_batch]\n        startofword_markers_batch = []\n        subtokens_batch = []\n        for tokens in tokens_batch:\n            startofword_markers = [0]\n            subtokens = ['[CLS]']\n            for token in tokens:\n                for i, subtoken in enumerate(self.tokenizer.wordpiece_tokenizer.tokenize(token)):\n                    startofword_markers.append(int(i == 0))\n                    subtokens.append(subtoken)\n            startofword_markers.append(0)\n            subtokens.append('[SEP]')\n            if len(subtokens) > self.max_seq_length:\n                raise RuntimeError(f\"input sequence after bert tokenization\"\n                                   f\" cannot exceed {self.max_seq_length} tokens.\")\n\n            startofword_markers_batch.append(startofword_markers)\n            subtokens_batch.append(subtokens)\n\n        encoded = self.tokenizer.batch_encode_plus([[subtokens, None] for subtokens in subtokens_batch],\n                                                   add_special_tokens=False)\n\n        return (tokens_batch, subtokens_batch,\n                _pad(encoded['input_ids'], value=self.tokenizer.pad_token_id),\n                _pad(startofword_markers_batch), _pad(encoded['attention_mask']))\n"
  },
  {
    "path": "deeppavlov/models/ranking/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/ranking/metrics.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\n\nfrom deeppavlov.core.common.metrics_registry import register_metric\n\n\n@register_metric('rank_response')\ndef rank_response(y_true, y_pred):\n    num_examples = float(len(y_pred))\n    predictions = np.array(y_pred)\n    predictions = np.flip(np.argsort(predictions, -1), -1)\n    rank_tot = 0\n    for el in predictions:\n        for i, x in enumerate(el):\n            if x == 0:\n                rank_tot += i\n                break\n    return float(rank_tot) / num_examples\n\n\n@register_metric('r@1_insQA')\ndef r_at_1_insQA(y_true, y_pred):\n    return recall_at_k_insQA(y_true, y_pred, k=1)\n\n\ndef recall_at_k_insQA(y_true, y_pred, k):\n    labels = np.repeat(np.expand_dims(np.asarray(y_true), axis=1), k, axis=1)\n    predictions = np.array(y_pred)\n    predictions = np.flip(np.argsort(predictions, -1), -1)[:, :k]\n    flags = np.zeros_like(predictions)\n    for i in range(predictions.shape[0]):\n        for j in range(predictions.shape[1]):\n            if predictions[i][j] in np.arange(labels[i][j]):\n                flags[i][j] = 1.\n    return np.mean((np.sum(flags, -1) >= 1.).astype(float))\n"
  },
  {
    "path": "deeppavlov/models/relation_extraction/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/relation_extraction/losses.py",
    "content": "\"\"\"\nThis code is copied from ATLOP algorithm (https://github.com/wzhouad/ATLOP/blob/main/losses.py)\n\"\"\"\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch import Tensor\n\n\nclass ATLoss(nn.Module):\n    def __init__(self):\n        super().__init__()\n\n    def forward(self, logits: Tensor, labels: Tensor) -> float:\n        \"\"\"\n        Args:\n            logits: predicted probabilities (shape: batch size x num classes)\n            labels: one-hot encoded true labels (shape: batch size x num classes)\n        \"\"\"\n\n        # TH label\n        th_label = torch.zeros_like(labels, dtype=torch.float).to(labels)\n        th_label[:, 0] = 1.0\n        labels[:, 0] = 0.0\n\n        p_mask = labels + th_label          # = 1 for the gold labels + for 0 (negative) class, 0 otherwise\n        n_mask = 1 - labels         # = 0 for the gold labels, 1 otherwise\n\n        # Rank positive classes to TH\n        logit1 = logits - (1 - p_mask) * 1e30   # org logits remain for gold labels + 0 class, others are reduced by 1\n        loss1 = -(F.log_softmax(logit1, dim=-1) * labels).sum(1)\n\n        # Rank TH to negative classes\n        logit2 = logits - (1 - n_mask) * 1e30  # org logits remain for not gold and not 0-class, others are reduced by 1\n        loss2 = -(F.log_softmax(logit2, dim=-1) * th_label).sum(1)\n\n        # Sum two parts\n        loss = loss1 + loss2\n        loss = loss.mean()\n        return loss\n\n    def get_label(self, logits: Tensor, num_labels: int = -1, threshold: float = None) -> Tensor:\n        \"\"\" Calculated the labels \"\"\"\n        if threshold:\n            th_logit = torch.full((len(logits), 1), threshold)\n        else:\n            th_logit = logits[:, 0].unsqueeze(1)        # vector of predicted probabilities for class 0 (negative class)\n        output = torch.zeros_like(logits).to(logits)\n        mask = (logits > th_logit)    # for each sample: True, if prob for a class 
> prob for neg class, False otherwise\n        if num_labels > 0:\n            top_v, _ = torch.topk(logits, num_labels, dim=1)        # the num_labels largest elements per sample; sorted\n            top_v = top_v[:, -1]            # the smallest of the top num_labels values for each sample\n            mask = (logits >= top_v.unsqueeze(1)) & mask    # mask + additionally: logits should be bigger than minimum\n        output[mask] = 1.0\n        output[:, 0] = (output.sum(1) == 0.).to(logits)         # no relation if no label matched\n        return output\n"
  },
  {
    "path": "deeppavlov/models/relation_extraction/relation_extraction_bert.py",
    "content": "from logging import getLogger\nfrom typing import List, Optional, Union\n\nimport numpy as np\nimport torch\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.torch_model import TorchModel\nfrom deeppavlov.models.classifiers.re_bert import BertWithAdaThresholdLocContextPooling\n\nlog = getLogger(__name__)\n\n\n@register('re_classifier')\nclass REBertModel(TorchModel):\n\n    def __init__(\n            self,\n            n_classes: int,\n            num_ner_tags: int,\n            pretrained_bert: str = None,\n            return_probas: bool = False,\n            threshold: Optional[float] = None,\n            **kwargs\n    ) -> None:\n        \"\"\"\n        Transformer-based model on PyTorch for relation extraction. It predicts a relation hold between entities in a\n        text sample (one or several sentences).\n        Args:\n            n_classes: number of output classes\n            num_ner_tags: number of NER tags\n            pretrained_bert: key title of pretrained Bert model (e.g. 
\"bert-base-uncased\")\n            return_probas: set this to `True` if you need the probabilities instead of raw answers\n            threshold: manually set value for defining the positively predicted classes (instead of adaptive one)\n        \"\"\"\n        self.n_classes = n_classes\n        self.return_probas = return_probas\n\n        if self.n_classes == 0:\n            raise ConfigError(\"Please provide a valid number of classes.\")\n\n        model = BertWithAdaThresholdLocContextPooling(\n            n_classes=self.n_classes,\n            pretrained_bert=pretrained_bert,\n            bert_tokenizer_config_file=pretrained_bert,\n            num_ner_tags=num_ner_tags,\n            threshold=threshold,\n        )\n\n        super().__init__(model, **kwargs)\n\n    def train_on_batch(\n            self, input_ids: List, attention_mask: List, entity_pos: List, entity_tags: List, labels: List\n    ) -> float:\n        \"\"\"\n        Trains the relation extraction BERT model on the given batch.\n        Returns:\n            dict with loss and learning rate values.\n        \"\"\"\n\n        _input = {\n            'input_ids': torch.LongTensor(input_ids).to(self.device),\n            'attention_mask': torch.LongTensor(attention_mask).to(self.device),\n            'entity_pos': entity_pos,\n            'ner_tags': entity_tags,\n            'labels': labels\n        }\n\n        self.model.train()\n        self.model.zero_grad()\n        self.optimizer.zero_grad()      # zero the parameter gradients\n\n        hidden_states = self.model(**_input)\n        loss = hidden_states[0]\n        self._make_step(loss)\n\n        return loss.item()\n\n    def __call__(\n            self, input_ids: List, attention_mask: List, entity_pos: List, entity_tags: List\n    ) -> Union[List[int], List[np.ndarray]]:\n        \"\"\" Get model predictions using features as input \"\"\"\n\n        self.model.eval()\n\n        _input = {\n            'input_ids': 
torch.LongTensor(input_ids).to(self.device),\n            'attention_mask': torch.LongTensor(attention_mask).to(self.device),\n            'entity_pos': entity_pos,\n            'ner_tags': entity_tags\n        }\n\n        with torch.no_grad():\n            indices, probas = self.model(**_input)\n\n        if self.return_probas:\n            pred = probas.cpu().numpy()\n            pred[np.isnan(pred)] = 0\n            pred_without_no_rel = []        # eliminate no_relation predictions\n            for elem in pred:\n                elem[0] = 0.0\n                pred_without_no_rel.append(elem)\n            new_pred = np.argmax(pred_without_no_rel, axis=1)\n            one_hot = [[0.0] * self.n_classes] * len(new_pred)\n            for i in range(len(new_pred)):\n                one_hot[i][new_pred[i]] = 1.0\n            pred = np.array(one_hot)\n        else:\n            pred = indices.cpu().numpy()\n            pred[np.isnan(pred)] = 0\n        return pred\n"
  },
  {
    "path": "deeppavlov/models/sklearn/__init__.py",
    "content": "from .sklearn_component import *\n"
  },
  {
    "path": "deeppavlov/models/sklearn/sklearn_component.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport inspect\nimport pickle\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import List, Tuple, Union, Callable\n\nimport numpy as np\nfrom scipy.sparse import issparse, csr_matrix\nfrom scipy.sparse import spmatrix\nfrom scipy.sparse import vstack, hstack\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register, cls_from_str\nfrom deeppavlov.core.models.estimator import Estimator\n\nlog = getLogger(__name__)\n\n\n@register(\"sklearn_component\")\nclass SklearnComponent(Estimator):\n    \"\"\"\n    Class implements wrapper for sklearn components for feature extraction,\n    feature selection, classification, regression etc.\n\n    Args:\n        model_class: string with full name of sklearn model to use, e.g. ``sklearn.linear_model:LogisticRegression``\n        save_path: save path for model, e.g. full name ``model_path/model.pkl`` \\\n            or prefix ``model_path/model`` (still model will be saved to ``model_path/model.pkl``)\n        load_path: load path for model, e.g. full name ``model_path/model.pkl`` \\\n            or prefix ``model_path/model`` (still model will be loaded from ``model_path/model.pkl``)\n        infer_method: string name of class method to use for infering model, \\\n            e.g. 
``predict``, ``predict_proba``, ``predict_log_proba``, ``transform``\n        ensure_list_output: whether to ensure that output for each sample is iterable (but not string)\n        kwargs: dictionary with parameters for the sklearn model\n\n    Attributes:\n        model: sklearn model instance\n        model_class: string with full name of sklearn model to use, e.g. ``sklearn.linear_model:LogisticRegression``\n        model_params: dictionary with parameters for the sklearn model without pipe parameters\n        pipe_params: dictionary with parameters for pipe: ``in``, ``out``, ``fit_on``, ``main``, ``name``\n        save_path: save path for model, e.g. full name ``model_path/model.pkl`` \\\n            or prefix ``model_path/model`` (still model will be saved to ``model_path/model.pkl``)\n        load_path: load path for model, e.g. full name ``model_path/model.pkl`` \\\n            or prefix ``model_path/model`` (still model will be loaded from ``model_path/model.pkl``)\n        infer_method: string name of class method to use for infering model, \\\n            e.g. 
``predict``, ``predict_proba``, ``predict_log_proba``, ``transform``\n        ensure_list_output: whether to ensure that output for each sample is iterable (but not string)\n    \"\"\"\n\n    def __init__(self, model_class: str,\n                 save_path: Union[str, Path] = None,\n                 load_path: Union[str, Path] = None,\n                 infer_method: str = \"predict\",\n                 ensure_list_output: bool = False,\n                 **kwargs) -> None:\n        \"\"\"\n        Initialize component with given parameters\n        \"\"\"\n\n        super().__init__(save_path=save_path, load_path=load_path, **kwargs)\n        self.model_class = model_class\n        self.model_params = kwargs\n        self.model = None\n        self.ensure_list_output = ensure_list_output\n        self.pipe_params = {}\n        for required in [\"in\", \"out\", \"fit_on\", \"main\", \"name\"]:\n            self.pipe_params[required] = self.model_params.pop(required, None)\n\n        self.load()\n        self.infer_method = getattr(self.model, infer_method)\n\n    def fit(self, *args) -> None:\n        \"\"\"\n        Fit model on the given data\n\n        Args:\n            *args: list of x-inputs and, optionally, one y-input (the last one) to fit on.\n                Possible input (x0, ..., xK, y) or (x0, ..., xK) '\n                where K is the number of input data elements (the length of list ``in`` from config). \\\n                In case of several inputs (K > 1) input features will be stacked. \\\n                For example, one has x0: (n_samples, n_features0), ..., xK: (n_samples, n_featuresK), \\\n                then model will be trained on x: (n_samples, n_features0 + ... 
+ n_featuresK).\n\n        Returns:\n            None\n        \"\"\"\n        n_inputs = len(self.pipe_params[\"in\"]) if isinstance(self.pipe_params[\"in\"], list) else 1\n        x_features = self.compose_input_data(args[:n_inputs])\n        if len(args) > n_inputs:\n            y_ = np.squeeze(np.array(args[-1]))\n        else:\n            y_ = None\n\n        try:\n            log.info(\"Fitting model {}\".format(self.model_class))\n            self.model.fit(x_features, y_)\n        except TypeError or ValueError:\n            if issparse(x_features):\n                log.info(\"Converting input for model {} to dense array\".format(self.model_class))\n                self.model.fit(x_features.todense(), y_)\n            else:\n                log.info(\"Converting input for model {} to sparse array\".format(self.model_class))\n                self.model.fit(csr_matrix(x_features), y_)\n\n        return\n\n    def __call__(self, *args):\n        \"\"\"\n        Infer on the given data according to given in the config infer method, \\\n            e.g. ``\"predict\", \"predict_proba\", \"transform\"``\n\n        Args:\n            *args: list of inputs\n\n        Returns:\n            predictions, e.g. 
list of labels, array of probability distribution, sparse array of vectorized samples\n        \"\"\"\n        x_features = self.compose_input_data(args)\n\n        try:\n            predictions = self.infer_method(x_features)\n        except TypeError or ValueError:\n            if issparse(x_features):\n                log.debug(\"Converting input for model {} to dense array\".format(self.model_class))\n                predictions = self.infer_method(x_features.todense())\n            else:\n                log.debug(\"Converting input for model {} to sparse array\".format(self.model_class))\n                predictions = self.infer_method(csr_matrix(x_features))\n\n        if isinstance(predictions, list):\n            #  ``predict_proba`` sometimes returns list of n_outputs (each output corresponds to a label)\n            #  but we will return (n_samples, n_labels)\n            #  where each value is a probability of a sample to belong with the label\n            predictions_ = [[predictions[j][i][1] for j in range(len(predictions))] for i in range(x_features.shape[0])]\n            predictions = np.array(predictions_)\n\n        if self.ensure_list_output and len(predictions.shape) == 1:\n            predictions = predictions.reshape(-1, 1)\n\n        if issparse(predictions):\n            return predictions\n        else:\n            return predictions.tolist()\n\n    def init_from_scratch(self) -> None:\n        \"\"\"\n        Initialize ``self.model`` as some sklearn model from scratch with given in ``self.model_params`` parameters.\n\n        Returns:\n            None\n        \"\"\"\n        log.debug(\"Initializing model {} from scratch\".format(self.model_class))\n        model_function = cls_from_str(self.model_class)\n\n        if model_function is None:\n            raise ConfigError(\"Model with {} model_class was not found.\".format(self.model_class))\n\n        given_params = {}\n        if self.model_params:\n            available_params = 
self.get_function_params(model_function)\n            for param_name in self.model_params.keys():\n                if param_name in available_params:\n                    try:\n                        given_params[param_name] = cls_from_str(self.model_params[param_name])\n                    except (AttributeError, ValueError, ConfigError):\n                        given_params[param_name] = self.model_params[param_name]\n\n        self.model = model_function(**given_params)\n        return\n\n    def load(self, fname: str = None) -> None:\n        \"\"\"\n        Initialize ``self.model`` as some sklearn model from saved re-initializing ``self.model_params`` parameters. \\\n            If in new given parameters ``warm_start`` is set to True and given model admits ``warm_start`` parameter, \\\n            model will be initilized from saved with opportunity to continue fitting.\n\n        Args:\n            fname: string name of path to model to load from\n\n        Returns:\n            None\n        \"\"\"\n        if fname is None:\n            fname = self.load_path\n\n        fname = Path(fname).with_suffix('.pkl')\n\n        if fname.exists():\n            log.debug(\"Loading model {} from {}\".format(self.model_class, str(fname)))\n            with open(fname, \"rb\") as f:\n                self.model = pickle.load(f)\n\n            warm_start = self.model_params.get(\"warm_start\", None)\n            self.model_params = {param: getattr(self.model, param) for param in self.get_class_attributes(self.model)}\n            self.model_class = self.model.__module__ + self.model.__class__.__name__\n            log.debug(\"Model {} loaded  with parameters\".format(self.model_class))\n\n            if warm_start and \"warm_start\" in self.model_params.keys():\n                self.model_params[\"warm_start\"] = True\n                log.debug(\"Fitting of loaded model can be continued because `warm_start` is set to True\")\n            else:\n                
log.warning(\"Fitting of loaded model can not be continued. Model can be fitted from scratch.\"\n                            \"If one needs to continue fitting, please, look at `warm_start` parameter\")\n        else:\n            log.warning(\"Cannot load model from {}\".format(str(fname)))\n            self.init_from_scratch()\n\n        return\n\n    def save(self, fname: str = None) -> None:\n        \"\"\"\n        Save ``self.model`` to the file from ``fname`` or, if not given, ``self.save_path``. \\\n            If ``self.save_path`` does not have ``.pkl`` extension, then it will be replaced \\\n            to ``str(Path(self.save_path).stem) + \".pkl\"``\n\n        Args:\n            fname:  string name of path to model to save to\n\n        Returns:\n            None\n        \"\"\"\n        if fname is None:\n            fname = self.save_path\n\n        fname = Path(fname).with_suffix('.pkl')\n\n        log.info(\"Saving model to {}\".format(str(fname)))\n        with open(fname, \"wb\") as f:\n            pickle.dump(self.model, f, protocol=4)\n        return\n\n    @staticmethod\n    def compose_input_data(x: List[Union[Tuple[Union[np.ndarray, list, spmatrix, str]],\n                                         List[Union[np.ndarray, list, spmatrix, str]],\n                                         np.ndarray, spmatrix]]) -> Union[spmatrix, np.ndarray]:\n        \"\"\"\n        Stack given list of different types of inputs to the one matrix. 
If one of the inputs is a sparse matrix, \\\n            then output will be also a sparse matrix\n\n        Args:\n            x: list of data elements\n\n        Returns:\n            sparse or dense array of stacked data\n        \"\"\"\n        x_features = []\n        for i in range(len(x)):\n            if ((isinstance(x[i], tuple) or isinstance(x[i], list) or isinstance(x[i], np.ndarray) and len(x[i]))\n                    or (issparse(x[i]) and x[i].shape[0])):\n                if issparse(x[i][0]):\n                    x_features.append(vstack(list(x[i])))\n                elif isinstance(x[i][0], np.ndarray) or isinstance(x[i][0], list):\n                    x_features.append(np.vstack(list(x[i])))\n                elif isinstance(x[i][0], str):\n                    x_features.append(np.array(x[i]))\n                else:\n                    raise ConfigError('Not implemented this type of vectors')\n            else:\n                raise ConfigError(\"Input vectors cannot be empty\")\n\n        sparse = False\n        for inp in x_features:\n            if issparse(inp):\n                sparse = True\n        if sparse:\n            x_features = hstack(list(x_features))\n        else:\n            x_features = np.hstack(list(x_features))\n\n        return x_features\n\n    @staticmethod\n    def get_function_params(f: Callable) -> List[str]:\n        \"\"\"\n        Get list of names of given function's parameters\n\n        Args:\n            f: function\n\n        Returns:\n            list of names of given function's parameters\n        \"\"\"\n        return inspect.getfullargspec(f)[0]\n\n    @staticmethod\n    def get_class_attributes(cls: type) -> List[str]:\n        \"\"\"\n        Get list of names of given class' attributes\n\n        Args:\n            cls: class\n\n        Returns:\n            list of names of given class' attributes\n        \"\"\"\n        return list(cls.__dict__.keys())\n"
  },
  {
    "path": "deeppavlov/models/spelling_correction/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/spelling_correction/brillmoore/__init__.py",
    "content": "from .error_model import ErrorModel\n"
  },
  {
    "path": "deeppavlov/models/spelling_correction/brillmoore/error_model.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport csv\nimport itertools\nfrom collections import defaultdict, Counter\nfrom heapq import heappop, heappushpop, heappush\nfrom logging import getLogger\nfrom math import log, exp\nfrom typing import List, Iterable, Tuple\n\nfrom tqdm import tqdm\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.estimator import Estimator\nfrom deeppavlov.vocabs.typos import StaticDictionary\n\nlogger = getLogger(__name__)\n\n\n@register('spelling_error_model')\nclass ErrorModel(Estimator):\n    \"\"\"Component that uses statistics based error model to find best candidates in a static dictionary.\n    Based on An Improved Error Model for Noisy Channel Spelling Correction by Eric Brill and Robert C. 
Moore\n\n    Args:\n        dictionary: a :class:`~deeppavlov.vocabs.typos.StaticDictionary` object\n        window: maximum context window size\n        candidates_count: maximum number of replacement candidates to return for every token in the input\n\n    Attributes:\n        costs: logarithmic probabilities of character sequences replacements\n        dictionary: a :class:`~deeppavlov.vocabs.typos.StaticDictionary` object\n        window: maximum context window size\n        candidates_count: maximum number of replacement candidates to return for every token in the input\n    \"\"\"\n\n    def __init__(self, dictionary: StaticDictionary, window: int = 1, candidates_count: int = 1, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        self.costs = defaultdict(itertools.repeat(float('-inf')).__next__)\n        self.dictionary = dictionary\n        self.window = window\n        if self.window == 0:\n            self.find_candidates = self._find_candidates_window_0\n        else:\n            self.find_candidates = self._find_candidates_window_n\n        self.costs[('', '')] = log(1)\n        self.costs[('⟬', '⟬')] = log(1)\n        self.costs[('⟭', '⟭')] = log(1)\n\n        for c in self.dictionary.alphabet:\n            self.costs[(c, c)] = log(1)\n        # if self.ser_path.is_file():\n        self.load()\n\n        self.candidates_count = candidates_count\n\n    def _find_candidates_window_0(self, word, prop_threshold=1e-6):\n        threshold = log(prop_threshold)\n        d = {}\n        prefixes_heap = [(0, {''})]\n        candidates = [(float('-inf'), '') for _ in range(self.candidates_count)]\n        word = '⟬{}⟭'.format(word.lower().replace('ё', 'е'))\n        word_len = len(word) + 1\n        while prefixes_heap and -prefixes_heap[0][0] > candidates[0][0]:\n            _, prefixes = heappop(prefixes_heap)\n            for prefix in prefixes:\n                res = []\n                for i in range(word_len):\n                    c = 
word[i - 1:i]\n                    res.append(max(\n                        (res[-1] + self.costs[('', c)]) if i else float('-inf'),\n                        d[prefix[:-1]][i] + self.costs[(prefix[-1], '')] if prefix else float(\n                            '-inf'),\n                        (d[prefix[:-1]][i - 1] + (self.costs[(prefix[-1], c)]))\n                        if prefix and i else float('-inf')\n                    ) if i or prefix else 0)\n                d[prefix] = res\n                if prefix in self.dictionary.words_set:\n                    heappushpop(candidates, (res[-1], prefix))\n                potential = max(res)\n                if potential > threshold:\n                    heappush(prefixes_heap, (-potential, self.dictionary.words_trie[prefix]))\n        return [(w.strip('⟬⟭'), score) for score, w in sorted(candidates, reverse=True) if\n                score > threshold]\n\n    def _find_candidates_window_n(self, word, prop_threshold=1e-6):\n        threshold = log(prop_threshold)\n        word = '⟬{}⟭'.format(word.lower().replace('ё', 'е'))\n        word_len = len(word) + 1\n        inf = float('-inf')\n        d = defaultdict(list)\n        d[''] = [0.] 
+ [inf] * (word_len - 1)\n        prefixes_heap = [(0, self.dictionary.words_trie[''])]\n        candidates = [(inf, '')] * self.candidates_count\n        while prefixes_heap and -prefixes_heap[0][0] > candidates[0][0]:\n            _, prefixes = heappop(prefixes_heap)\n            for prefix in prefixes:\n                prefix_len = len(prefix)\n                d[prefix] = res = [inf]\n                for i in range(1, word_len):\n                    c_res = [inf]\n                    for li in range(1, min(prefix_len + 1, self.window + 2)):\n                        for ri in range(1, min(i + 1, self.window + 2)):\n                            prev = d[prefix[:-li]][i - ri]\n                            if prev > threshold:\n                                edit = (prefix[-li:], word[i - ri:i])\n                                if edit in self.costs:\n                                    c_res.append(prev +\n                                                 self.costs[edit])\n                    res.append(max(c_res))\n                if prefix in self.dictionary.words_set:\n                    heappushpop(candidates, (res[-1], prefix))\n                potential = max(res)\n                # potential = max(\n                #     [e for i in range(self.window + 2) for e in d[prefix[:prefix_len - i]]])\n                if potential > threshold:\n                    heappush(prefixes_heap, (-potential, self.dictionary.words_trie[prefix]))\n        return [(w.strip('⟬⟭'), score) for score, w in sorted(candidates, reverse=True) if\n                score > threshold]\n\n    def _infer_instance(self, instance: List[str]) -> List[List[Tuple[float, str]]]:\n        candidates = []\n        for incorrect in instance:\n            if any([c not in self.dictionary.alphabet for c in incorrect]):\n                candidates.append([(0, incorrect)])\n            else:\n                res = self.find_candidates(incorrect, prop_threshold=1e-6)\n                if res:\n             
       candidates.append([(score, candidate) for candidate, score in res])\n                else:\n                    candidates.append([(0, incorrect)])\n        return candidates\n\n    def __call__(self, data: Iterable[Iterable[str]], *args, **kwargs) -> List[List[List[Tuple[float, str]]]]:\n        \"\"\"Propose candidates for tokens in sentences\n\n        Args:\n            data: batch of tokenized sentences\n\n        Returns:\n            batch of lists of probabilities and candidates for every token\n        \"\"\"\n        data = list(data)\n        if len(data) > 1:\n            data = tqdm(data, desc='Infering a batch with the error model', leave=False)\n        return [self._infer_instance(instance) for instance in data]\n\n    @staticmethod\n    def _distance_edits(seq1, seq2):\n        l1, l2 = len(seq1), len(seq2)\n        d = [[(i, ()) for i in range(l2 + 1)]]\n        d += [[(i, ())] + [(0, ())] * l2 for i in range(1, l1 + 1)]\n\n        for i in range(1, l1 + 1):\n            for j in range(1, l2 + 1):\n                edits = [\n                    (d[i - 1][j][0] + 1, d[i - 1][j][1] + ((seq1[i - 1], ''),)),\n                    (d[i][j - 1][0] + 1, d[i][j - 1][1] + (('', seq2[j - 1]),)),\n                    (d[i - 1][j - 1][0] + (seq1[i - 1] != seq2[j - 1]),\n                     d[i - 1][j - 1][1] + ((seq1[i - 1], seq2[j - 1]),))\n                ]\n                if i > 1 and j > 1 and seq1[i - 1] == seq2[j - 2] and seq1[i - 2] == seq2[j - 1]:\n                    edits.append((d[i - 2][j - 2][0] + (seq1[i - 1] != seq2[j - 1]),\n                                  d[i - 2][j - 2][1] + ((seq1[i - 2:i], seq2[j - 2:j]),)))\n                d[i][j] = min(edits, key=lambda x: x[0])\n\n        return d[-1][-1]\n\n    def fit(self, x: List[str], y: List[str]):\n        \"\"\"Calculate character sequences replacements probabilities\n\n        Args:\n            x: words with spelling errors\n            y: words without spelling errors\n        
\"\"\"\n        changes = []\n        entries = []\n        data = list(zip(x, y))\n        window = 4\n        for error, correct in tqdm(data, desc='Training the error model'):\n            correct = '⟬{}⟭'.format(' '.join(correct))\n            error = '⟬{}⟭'.format(' '.join(error))\n            d, ops = self._distance_edits(correct, error)\n            if d <= 2:\n                w_ops = set()\n                for pos in range(len(ops)):\n                    left, right = list(zip(*ops))\n                    for l in range(pos, max(0, pos - window) - 1, -1):\n                        for r in range(pos + 1, min(len(ops), l + 2 + window)):\n                            w_ops.add(((''.join(left[l:r]), ''.join(right[l:r])), l, r))\n                ops = [x[0] for x in w_ops]\n\n                entries += [op[0] for op in ops]\n                changes += [op for op in ops]\n\n        e_count = Counter(entries)\n        c_count = Counter(changes)\n        incorrect_prior = 1\n        correct_prior = 19\n        for (w, s), c in c_count.items():\n            c = c + (incorrect_prior if w != s else correct_prior)\n            e = e_count[w] + incorrect_prior + correct_prior\n            p = c / e\n            self.costs[(w, s)] = log(p)\n\n    def save(self):\n        \"\"\"Save replacements probabilities to a file\n\n        \"\"\"\n        logger.info(\"[saving error_model to `{}`]\".format(self.save_path))\n\n        with open(self.save_path, 'w', newline='', encoding='utf8') as tsv_file:\n            writer = csv.writer(tsv_file, delimiter='\\t')\n            for (w, s), log_p in self.costs.items():\n                writer.writerow([w, s, exp(log_p)])\n\n    def load(self):\n        \"\"\"Load replacements probabilities from a file\n\n        \"\"\"\n        if self.load_path:\n            if self.load_path.is_file():\n                logger.debug(\"loading error_model from `{}`\".format(self.load_path))\n                with open(self.load_path, 'r', newline='', 
encoding='utf8') as tsv_file:\n                    reader = csv.reader(tsv_file, delimiter='\\t')\n                    for w, s, p in reader:\n                        self.costs[(w, s)] = log(float(p))\n            elif not self.load_path.parent.is_dir():\n                raise ConfigError(\"Provided `load_path` for {} doesn't exist!\".format(\n                    self.__class__.__name__))\n        else:\n            logger.warning('No load_path provided, initializing error model from scratch')\n"
  },
  {
    "path": "deeppavlov/models/spelling_correction/electors/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/spelling_correction/electors/kenlm_elector.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import List, Tuple\n\nimport kenlm\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\nlogger = getLogger(__name__)\n\n\n@register('kenlm_elector')\nclass KenlmElector(Component):\n    \"\"\"Component that chooses a candidate with the highest product of base and language model probabilities\n\n    Args:\n         load_path: path to the kenlm model file\n         beam_size: beam size for highest probability search\n\n    Attributes:\n        lm: kenlm object\n        beam_size: beam size for highest probability search\n    \"\"\"\n\n    def __init__(self, load_path: Path, beam_size: int = 4, *args, **kwargs):\n        self.lm = kenlm.Model(str(expand_path(load_path)))\n        self.beam_size = beam_size\n\n    def __call__(self, batch: List[List[List[Tuple[float, str]]]]) -> List[List[str]]:\n        \"\"\"Choose the best candidate for every token\n\n        Args:\n            batch: batch of probabilities and string values of candidates for every token in a sentence\n\n        Returns:\n            batch of corrected tokenized sentences\n        \"\"\"\n        return [self._infer_instance(candidates) for candidates in batch]\n\n    def 
_infer_instance(self, candidates: List[List[Tuple[float, str]]]):\n        candidates = candidates + [[(0, '</s>')]]\n        state = kenlm.State()\n        self.lm.BeginSentenceWrite(state)\n        beam = [(0, state, [])]\n        for sublist in candidates:\n            new_beam = []\n            for beam_score, beam_state, beam_words in beam:\n                for score, candidate in sublist:\n                    prev_state = beam_state\n                    c_score = 0\n                    cs = candidate.split()\n                    for candidate in cs:\n                        state = kenlm.State()\n                        c_score += self.lm.BaseScore(prev_state, candidate, state)\n                        prev_state = state\n                    new_beam.append((beam_score + score + c_score, state, beam_words + cs))\n            new_beam.sort(reverse=True)\n            beam = new_beam[:self.beam_size]\n        score, state, words = beam[0]\n        return words[:-1]\n"
  },
  {
    "path": "deeppavlov/models/spelling_correction/electors/top1_elector.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom typing import List, Tuple\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\nlogger = getLogger(__name__)\n\n\n@register('top1_elector')\nclass TopOneElector(Component):\n    \"\"\"Component that chooses a candidate with highest base probability for every token\n\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        pass\n\n    def __call__(self, batch: List[List[List[Tuple[float, str]]]]) -> List[List[str]]:\n        \"\"\"Choose the best candidate for every token\n\n        Args:\n            batch: batch of probabilities and string values of candidates for every token in a sentence\n\n        Returns:\n            batch of corrected tokenized sentences\n        \"\"\"\n        return [[max(sublist)[1] for sublist in candidates] for candidates in batch]\n"
  },
  {
    "path": "deeppavlov/models/spelling_correction/levenshtein/__init__.py",
    "content": "from .searcher_component import LevenshteinSearcherComponent\n"
  },
  {
    "path": "deeppavlov/models/spelling_correction/levenshtein/levenshtein_searcher.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport copy\nimport itertools\n\nimport numpy as np\nfrom sortedcontainers import SortedListWithKey\n\nfrom .tabled_trie import Trie, make_trie\n\n\nclass LevenshteinSearcher:\n    \"\"\"\n    Класс для поиска близких слов\n    в соответствии с расстоянием Левенштейна\n\n    \"\"\"\n\n    def __init__(self, alphabet, dictionary, operation_costs=None,\n                 allow_spaces=False, euristics='none'):\n        self.alphabet = alphabet\n        self.allow_spaces = allow_spaces\n        if isinstance(euristics, int):\n            if euristics < 0:\n                raise ValueError(\"Euristics should be non-negative integer or None\")\n            else:\n                self.euristics = euristics if euristics != 0 else None\n        elif euristics in [\"none\", \"None\", None]:\n            self.euristics = None\n        else:\n            raise ValueError(\"Euristics should be non-negative integer or None\")\n        if isinstance(dictionary, Trie):\n            # словарь передан уже в виде бора\n            self.dictionary = dictionary\n        else:\n            self.dictionary = make_trie(alphabet, dictionary, make_cashed=True,\n                                        precompute_symbols=self.euristics,\n                                        allow_spaces=self.allow_spaces)\n        self.transducer = SegmentTransducer(\n          
  alphabet, operation_costs=operation_costs, allow_spaces=allow_spaces)\n        self._precompute_euristics()\n        self._define_h_function()\n\n    def __contains__(self, word):\n        return word in self.dictionary\n\n    def search(self, word, d, allow_spaces=True, return_cost=True):\n        \"\"\"\n        Finds all dictionary words in d-window from word\n        \"\"\"\n        if not all((c in self.alphabet\n                    or (c == \" \" and self.allow_spaces)) for c in word):\n            return []\n            # raise ValueError(\"{0} contains an incorrect symbol\".format(word))\n        return self._trie_search(\n            word, d, allow_spaces=allow_spaces, return_cost=return_cost)\n\n    def _trie_search(self, word, d, transducer=None,\n                     allow_spaces=True, return_cost=True):\n        \"\"\"\n        Находит все слова в префиксном боре, расстояние до которых\n        в соответствии с заданным преобразователем не превышает d\n        \"\"\"\n        if transducer is None:\n            # разобраться с пробелами\n            transducer = self.transducer.inverse()\n        allow_spaces &= self.allow_spaces\n        trie = self.dictionary\n        #  инициализация переменных\n        used_agenda_keys = set()\n        agenda = SortedListWithKey(key=(lambda x: x[1]))\n        h = self.h_func(word, trie.root)\n        # agenda[self.agenda_key(\"\", 0, trie.root)] = (0.0, 0.0, h)\n        key, value = (\"\", 0, trie.root), (0.0, 0.0, h)\n        agenda.add((key, value))\n        answer = dict()\n        k = 0\n        # очередь с приоритетом с промежуточными результатами\n        while len(agenda) > 0:\n            key, value = agenda.pop(0)\n            if key in used_agenda_keys:\n                continue\n            used_agenda_keys.add(key)\n            low, pos, index = key\n            cost, g, h = value\n            # g --- текущая стоимость, h --- нижняя оценка будущей стоимости\n            # cost = g + h --- нижняя 
оценка суммарной стоимости\n            k += 1\n            max_upperside_length = min(len(word) - pos, transducer.max_up_length)\n            for upperside_length in range(max_upperside_length + 1):\n                new_pos = pos + upperside_length\n                curr_up = word[pos: new_pos]\n                if curr_up not in transducer.operation_costs:\n                    continue\n                for curr_low, curr_cost in transducer.operation_costs[curr_up].items():\n                    new_g = g + curr_cost\n                    if new_g > d:  # если g > d, то h можно не вычислять\n                        continue\n                    if curr_low == \" \":\n                        if allow_spaces and trie.is_final(index):\n                            new_index = trie.root\n                        else:\n                            new_index = Trie.NO_NODE\n                    else:\n                        new_index = trie.descend(index, curr_low)\n                    if new_index is Trie.NO_NODE:\n                        continue\n                    new_low = low + curr_low\n                    new_h = self.h_func(word[new_pos:], new_index)\n                    new_cost = new_g + new_h\n                    if new_cost > d:\n                        continue\n                    new_key = (new_low, new_pos, new_index)\n                    new_value = (new_cost, new_g, new_h)\n                    if new_pos == len(word) and trie.is_final(new_index):\n                        old_g = answer.get(new_low, None)\n                        if old_g is None or new_g < old_g:\n                            answer[new_low] = new_g\n                    agenda.add((new_key, new_value))\n        answer = sorted(answer.items(), key=(lambda x: x[1]))\n        if return_cost:\n            return answer\n        else:\n            return [elem[0] for elem in answer]\n\n    def _precompute_euristics(self):\n        \"\"\"\n        Предвычисляет будущие символы и стоимости 
операций с ними\n        для h-эвристики\n        \"\"\"\n        if self.euristics is None:\n            return\n        # вычисление минимальной стоимости операции,\n        # приводящей к появлению ('+') или исчезновению ('-') данного символа\n        removal_costs = {a: np.inf for a in self.alphabet}\n        insertion_costs = {a: np.inf for a in self.alphabet}\n        if self.allow_spaces:\n            removal_costs[' '] = np.inf\n            insertion_costs[' '] = np.inf\n        for up, costs in self.transducer.operation_costs.items():\n            for low, cost in costs.items():\n                if up == low:\n                    continue\n                if up != '':\n                    removal_cost = cost / len(up)\n                    for a in up:\n                        removal_costs[a] = min(removal_costs[a], removal_cost)\n                if low != '':\n                    insertion_cost = cost / len(low)\n                    for a in low:\n                        insertion_costs[a] = min(insertion_costs[a], insertion_cost)\n        # предвычисление возможных будущих символов в узлах дерева\n        # precompute_future_symbols(self.dictionary, self.euristics, self.allow_spaces)\n        # предвычисление стоимостей потери символа в узлах дерева\n        self._absense_costs_by_node = _precompute_absense_costs(\n            self.dictionary, removal_costs, insertion_costs,\n            self.euristics, self.allow_spaces)\n        # массив для сохранения эвристик\n        self._temporary_euristics = [dict() for i in range(len(self.dictionary))]\n\n    def _define_h_function(self):\n        if self.euristics in [None, 0]:\n            self.h_func = (lambda *x: 0.0)\n        else:\n            self.h_func = self._euristic_h_function\n\n    def _euristic_h_function(self, suffix, index):\n        \"\"\"\n        Вычисление h-эвристики из работы Hulden,2009 для текущей вершины словаря\n\n        Аргументы:\n        ----------\n        suffix : string\n        
    непрочитанный суффикс входного слова\n        index : int\n            индекс текущего узла в словаре\n\n        Возвращает:\n        -----------\n        cost : float\n            оценка снизу для стоимости замены,\n            приводящей к входному слову с суффиксом suffix,\n            если прочитанный префикс слова без опечатки\n            привёл в вершину с номером index\n        \"\"\"\n        if self.euristics > 0:\n            suffix = suffix[:self.euristics]\n        # кэширование результатов\n        index_temporary_euristics = self._temporary_euristics[index]\n        cost = index_temporary_euristics.get(suffix, None)\n        if cost is not None:\n            return cost\n        # извлечение нужных данных из массивов\n        absense_costs = self._absense_costs_by_node[index]\n        data = self.dictionary.data[index]\n        costs = np.zeros(dtype=np.float64, shape=(self.euristics,))\n        # costs[j] --- оценка штрафа при предпросмотре вперёд на j символов\n        for i, a in enumerate(suffix):\n            costs[i:] += absense_costs[a][i:]\n        cost = max(costs)\n        index_temporary_euristics[suffix] = cost\n        return cost\n\n    def _minimal_replacement_cost(self, first, second):\n        first_symbols, second_symbols = set(), set()\n        removal_cost, insertion_cost = 0, 0\n        for a, b in itertools.zip_longest(first, second, fillvalue=None):\n            if a is not None:\n                first_symbols.add(a)\n            if b is not None:\n                second_symbols.add(b)\n            removal_cost = max(removal_cost, len(first_symbols - second_symbols))\n            insertion_cost = max(insertion_cost, len(second_symbols - first_symbols))\n        return min(removal_cost, insertion_cost)\n\n\ndef _precompute_absense_costs(dictionary, removal_costs, insertion_costs, n,\n                              allow_spaces=False):\n    \"\"\"\n    Вычисляет минимальную стоимость появления нового символа в узлах словаря\n  
  в соответствии со штрафами из costs\n\n    Аргументы:\n    ---------------\n    dictionary : Trie\n        словарь, хранящийся в виде ациклического автомата\n\n    removal_costs : dict\n        штрафы за удаление символов\n\n    insertion_costs : dict\n        штрафы за вставку символов\n\n    n : int\n        глубина ``заглядывания вперёд'' в словаре\n\n    Возвращает\n    ---------------\n    answer : list of dicts, len(answer)=len(dictionary)\n        answer[i][a][j] равно минимальному штрафу за появление символа a\n        в j-ой позиции в вершине с номером i\n    \"\"\"\n    answer = [dict() for node in dictionary.data]\n    if n == 0:\n        return answer\n    curr_alphabet = copy.copy(dictionary.alphabet)\n    if allow_spaces:\n        curr_alphabet += [' ']\n    for l, (costs_in_node, node) in enumerate(zip(answer, dictionary.data)):\n        # определение минимальной стоимости удаления символов\n        curr_node_removal_costs = np.empty(dtype=np.float64, shape=(n,))\n        if len(node[0]) > 0:\n            curr_node_removal_costs[0] = min(removal_costs[symbol] for symbol in node[0])\n            for j, symbols in enumerate(node[1:], 1):\n                if len(symbols) == 0:\n                    curr_node_removal_costs[j:] = curr_node_removal_costs[j - 1]\n                    break\n                curr_cost = min(removal_costs[symbol] for symbol in symbols)\n                curr_node_removal_costs[j] = min(curr_node_removal_costs[j - 1], curr_cost)\n        else:\n            curr_node_removal_costs[:] = np.inf\n        # определение минимальной стоимости вставки\n        for a in curr_alphabet:\n            curr_symbol_costs = np.empty(dtype=np.float64, shape=(n,))\n            curr_symbol_costs.fill(insertion_costs[a])\n            for j, symbols in enumerate(node):\n                if a in symbols:\n                    curr_symbol_costs[j:] = 0.0\n                    break\n                curr_symbol_costs[j] = min(curr_symbol_costs[j], 
curr_node_removal_costs[j])\n            costs_in_node[a] = curr_symbol_costs\n    return answer\n\n\nclass SegmentTransducer:\n    \"\"\"\n    Класс, реализующий взвешенный конечный преобразователь,\n    осуществляющий замены из заданного списка операций\n\n    Аргументы:\n    ----------\n    alphabet : list\n        алфавит\n\n    operation_costs : dict or None(optional, default=None)\n        словарь вида {(up,low) : cost}\n\n    allow_spaces : bool(optional, default=False)\n        разрешены ли элементы трансдукции, содержащие пробел\n        (используется только если явно не заданы operation costs\n        и они равны значению по умолчанию)\n\n    \"\"\"\n\n    def __init__(self, alphabet, operation_costs=None, allow_spaces=False):\n        self.alphabet = alphabet\n        if operation_costs is None:\n            self._make_default_operation_costs(allow_spaces=allow_spaces)\n        elif not isinstance(operation_costs, dict):\n            raise TypeError(\"Operation costs must be a dictionary\")\n        else:\n            self.operation_costs = operation_costs\n        self._make_reversed_operation_costs()\n        self._make_maximal_key_lengths()\n        # self.maximal_value_lengths = {}\n        # for up, probs in self.operation_costs.items():\n        # СЛИШКОМ МНОГО ВЫЗОВОВ, НАДО КАК-ТО ЗАПОМНИТЬ\n        # МАКСИМАЛЬНЫЕ ДЛИНЫ КЛЮЧЕЙ ПРИ ОБРАЩЕНИИ\n        # max_low_length = max(len(low) for low in probs) if (len(probs) > 0) else -1\n        # self.maximal_value_lengths[up] = self.maximal_key_length\n\n    def get_operation_cost(self, up, low):\n        \"\"\"\n        Возвращает стоимость элементарной трансдукции up->low\n        или np.inf, если такой элементарной трансдукции нет\n\n        Аргументы:\n        ----------\n        up, low : string\n            элементы элементарной трансдукции\n\n        Возвращает:\n        -----------\n        cost : float\n            стоимость элементарной трансдукции up->low\n            (np.inf, если такая 
трансдукция отсутствует)\n        \"\"\"\n        up_costs = self.operation_costs.get(up, None)\n        if up_costs is None:\n            return np.inf\n        cost = up_costs.get(low, np.inf)\n        return cost\n\n    def inverse(self):\n        \"\"\"\n        Строит пробразователь, задающий обратное конечное преобразование\n        \"\"\"\n        # УПРОСТИТЬ ОБРАЩЕНИЕ!!!\n        inversed_transducer = SegmentTransducer(self.alphabet, operation_costs=dict())\n        inversed_transducer.operation_costs = self._reversed_operation_costs\n        inversed_transducer._reversed_operation_costs = self.operation_costs\n        inversed_transducer.max_low_length = self.max_up_length\n        inversed_transducer.max_up_length = self.max_low_length\n        inversed_transducer.max_low_lengths_by_up = self.max_up_lengths_by_low\n        inversed_transducer.max_up_lengths_by_low = self.max_low_lengths_by_up\n        return inversed_transducer\n\n    def distance(self, first, second, return_transduction=False):\n        \"\"\"\n        Вычисляет трансдукцию минимальной стоимости,\n        отображающую first в second\n\n        Аргументы:\n        -----------\n        first : string\n        second : string\n            Верхний и нижний элементы трансдукции\n\n        return_transduction : bool (optional, default=False)\n            следует ли возвращать трансдукцию минимального веса\n            (см. 
возвращаемое значение)\n\n        Возвращает:\n        -----------\n        (final_cost, transductions) : tuple(float, list)\n            если return_transduction=True, то возвращает\n            минимальную стоимость трансдукции, переводящей first в second\n            и список трансдукций с данной стоимостью\n\n        final_cost : float\n            если return_transduction=False, то возвращает\n            минимальную стоимость трансдукции, переводящей first в second\n        \"\"\"\n        if return_transduction:\n            add_pred = (lambda x, y: (y == np.inf or x < y))\n        else:\n            add_pred = (lambda x, y: (y == np.inf or x <= y))\n        clear_pred = (lambda x, y: x < y < np.inf)\n        update_func = lambda x, y: min(x, y)\n        costs, backtraces = self._fill_levenshtein_table(first, second,\n                                                         update_func, add_pred, clear_pred)\n        final_cost = costs[-1][-1]\n        if final_cost == np.inf:\n            transductions = [None]\n        elif return_transduction:\n            transductions = self._backtraces_to_transductions(first, second, backtraces,\n                                                              final_cost, return_cost=False)\n        if return_transduction:\n            return final_cost, transductions\n        else:\n            return final_cost\n\n    def transduce(self, first, second, threshold):\n        \"\"\"\n        Возвращает все трансдукции, переводящие first в second,\n        чья стоимость не превышает threshold\n\n        Возвращает:\n        ----------\n        result : list\n            список вида [(трансдукция, стоимость)]\n        \"\"\"\n        add_pred = (lambda x, y: x <= threshold)\n        clear_pred = (lambda x, y: False)\n        update_func = (lambda x, y: min(x, y))\n        costs, backtraces = self._fill_levenshtein_table(first, second,\n                                                         update_func, add_pred, 
clear_pred,\n                                                         threshold=threshold)\n        result = self._backtraces_to_transductions(first, second,\n                                                   backtraces, threshold, return_cost=True)\n        return result\n\n    def lower_transductions(self, word, max_cost, return_cost=True):\n        \"\"\"\n        Возвращает все трансдукции с верхним элементом word,\n        чья стоимость не превышает max_cost\n\n    `   Возвращает:\n        ----------\n        result : list\n            список вида [(трансдукция, стоимость)], если return_cost=True\n            список трансдукций, если return_cost=False\n            список отсортирован в порядке возрастания стоимости трансдукции\n        \"\"\"\n        prefixes = [[] for i in range(len(word) + 1)]\n        prefixes[0].append(((), 0.0))\n        for pos in range(len(prefixes)):\n            # вставки\n            prefixes[pos] = self._perform_insertions(prefixes[pos], max_cost)\n            max_upperside_length = min(len(word) - pos, self.max_up_length)\n            for upperside_length in range(1, max_upperside_length + 1):\n                up = word[pos: pos + upperside_length]\n                for low, low_cost in self.operation_costs.get(up, dict()).items():\n                    for transduction, cost in prefixes[pos]:\n                        new_cost = cost + low_cost\n                        if new_cost <= max_cost:\n                            new_transduction = transduction + (up, low)\n                            prefixes[pos + upperside_length].append((new_transduction, new_cost))\n        answer = sorted(prefixes[-1], key=(lambda x: x[0]))\n        if return_cost:\n            return answer\n        else:\n            return [elem[0] for elem in answer]\n\n    def lower(self, word, max_cost, return_cost=True):\n        transductions = self.lower_transductions(word, max_cost, return_cost=True)\n        answer = dict()\n        for transduction, cost 
in transductions:\n            low = \"\".join(elem[1] for elem in transductions)\n            curr_cost = answer.get(low, None)\n            if curr_cost is None or cost < curr_cost:\n                answer[low] = cost\n        answer = sorted(answer.items(), key=(lambda x: x[1]))\n        if return_cost:\n            return answer\n        else:\n            return [elem[0] for elem in answer]\n\n    def upper(self, word, max_cost, return_cost=True):\n        inversed_transducer = self.inverse()\n        return inversed_transducer.lower(word, max_cost, return_cost)\n\n    def upper_transductions(self, word, max_cost, return_cost=True):\n        inversed_transducer = self.inverse()\n        return inversed_transducer.lower_transductions(word, max_cost, return_cost)\n\n    def _fill_levenshtein_table(self, first, second, update_func, add_pred, clear_pred,\n                                threshold=None):\n        \"\"\"\n        Функция, динамически заполняющая таблицу costs стоимости трансдукций,\n        costs[i][j] --- минимальная стоимость трансдукции,\n        переводящей first[:i] в second[:j]\n\n        Аргументы:\n        ----------\n        first, second : string\n            Верхний и нижний элементы трансдукции\n        update_func : callable, float*float -> bool\n            update_func(x, y) возвращает новое значение в ячейке таблицы costs,\n            если старое значение --- y, а потенциально новое значение --- x\n            везде update_func = min\n        add_pred : callable : float*float -> bool\n            add_pred(x, y) возвращает, производится ли добавление\n            нового элемента p стоимости x в ячейку backtraces[i][j]\n            в зависимости от значения costs[i][j]=y и текущей стоимости x\n        clear_pred : callable : float*float -> bool\n            clear_pred(x, y) возвращает, производится ли очистка\n            ячейки backtraces[i][j] в зависимости от значения costs[i][j]=y\n            и текущей стоимости x элемента p, 
добавляемого в эту ячейку\n\n        Возвращает:\n        -----------\n        costs : array, dtype=float, shape=(len(first)+1, len(second)+1)\n            массив, в ячейке с индексами i, j которого хранится\n            минимальная стоимость трансдукции, переводящей first[:i] в second[:j]\n        backtraces : array, dtype=list, shape=(len(first)+1, len(second)+1)\n            массив, в ячейке с индексами i, j которого хранятся\n            обратные ссылки на предыдущую ячейку в оптимальной трансдукции,\n            приводящей в ячейку backtraces[i][j]\n        \"\"\"\n        m, n = len(first), len(second)\n        # если threshold=None, то в качестве порога берётся удвоенная стоимость\n        # трансдукции, отображающей символы на одинаковых позициях друг в друга\n        if threshold is None:\n            threshold = 0.0\n            for a, b in zip(first, second):\n                threshold += self.get_operation_cost(a, b)\n            if m > n:\n                for a in first[n:]:\n                    threshold += self.get_operation_cost(a, '')\n            elif m < n:\n                for b in second[m:]:\n                    threshold += self.get_operation_cost('', b)\n            threshold *= 2\n        # инициализация возвращаемых массивов\n        costs = np.zeros(shape=(m + 1, n + 1), dtype=np.float64)\n        costs[:] = np.inf\n        backtraces = [None] * (m + 1)\n        for i in range(m + 1):\n            backtraces[i] = [[] for j in range(n + 1)]\n        costs[0][0] = 0.0\n        for i in range(m + 1):\n            for i_right in range(i, min(i + self.max_up_length, m) + 1):\n                up = first[i: i_right]\n                max_low_length = self.max_low_lengths_by_up.get(up, -1)\n                if max_low_length == -1:  # no up key in transduction\n                    continue\n                up_costs = self.operation_costs[up]\n                for j in range(n + 1):\n                    if costs[i][j] > threshold:\n                   
     continue\n                    if len(backtraces[i][j]) == 0 and i + j > 0:\n                        continue  # не нашлось обратных ссылок\n                    for j_right in range((j if i_right > i else j + 1),\n                                         min(j + max_low_length, n) + 1):\n                        low = second[j: j_right]\n                        curr_cost = up_costs.get(low, np.inf)\n                        old_cost = costs[i_right][j_right]\n                        new_cost = costs[i][j] + curr_cost\n                        if new_cost > threshold:\n                            continue\n                        if add_pred(new_cost, old_cost):\n                            if clear_pred(new_cost, old_cost):\n                                backtraces[i_right][j_right] = []\n                            costs[i_right][j_right] = update_func(new_cost, old_cost)\n                            backtraces[i_right][j_right].append((i, j))\n        return costs, backtraces\n\n    def _make_reversed_operation_costs(self):\n        \"\"\"\n        Заполняет массив _reversed_operation_costs\n        на основе имеющегося массива operation_costs\n        \"\"\"\n        _reversed_operation_costs = dict()\n        for up, costs in self.operation_costs.items():\n            for low, cost in costs.items():\n                if low not in _reversed_operation_costs:\n                    _reversed_operation_costs[low] = dict()\n                _reversed_operation_costs[low][up] = cost\n        self._reversed_operation_costs = _reversed_operation_costs\n\n    def _make_maximal_key_lengths(self):\n        \"\"\"\n        Вычисляет максимальную длину элемента low\n        в элементарной трансдукции (up, low) для каждого up\n        и максимальную длину элемента up\n        в элементарной трансдукции (up, low) для каждого low\n        \"\"\"\n        self.max_up_length = \\\n            (max(len(up) for up in self.operation_costs)\n             if len(self.operation_costs) 
> 0 else -1)\n        self.max_low_length = \\\n            (max(len(low) for low in self._reversed_operation_costs)\n             if len(self._reversed_operation_costs) > 0 else -1)\n        self.max_low_lengths_by_up, self.max_up_lengths_by_low = dict(), dict()\n        for up, costs in self.operation_costs.items():\n            self.max_low_lengths_by_up[up] = \\\n                max(len(low) for low in costs) if len(costs) > 0 else -1\n        for low, costs in self._reversed_operation_costs.items():\n            self.max_up_lengths_by_low[low] = \\\n                max(len(up) for up in costs) if len(costs) > 0 else -1\n\n    def _backtraces_to_transductions(self, first, second, backtraces, threshold, return_cost=False):\n        \"\"\"\n        Восстанавливает трансдукции по таблице обратных ссылок\n\n        Аргументы:\n        ----------\n        first, second : string\n            верхние и нижние элементы трансдукции\n        backtraces : array-like, dtype=list, shape=(len(first)+1, len(second)+1)\n            таблица обратных ссылок\n        threshold : float\n            порог для отсева трансдукций,\n            возвращаются только трансдукции стоимостью <= threshold\n        return_cost : bool (optional, default=False)\n            если True, то вместе с трансдукциями возвращается их стоимость\n\n        Возвращает:\n        -----------\n        result : list\n            список вида [(трансдукция, стоимость)], если return_cost=True\n            и вида [трансдукция], если return_cost=False,\n            содержащий все трансдукции, переводящие first в second,\n            чья стоимость не превышает threshold\n        \"\"\"\n        m, n = len(first), len(second)\n        agenda = [None] * (m + 1)\n        for i in range(m + 1):\n            agenda[i] = [[] for j in range(n + 1)]\n        agenda[m][n] = [((), 0.0)]\n        for i_right in range(m, -1, -1):\n            for j_right in range(n, -1, -1):\n                current_agenda = 
agenda[i_right][j_right]\n                if len(current_agenda) == 0:\n                    continue\n                for (i, j) in backtraces[i_right][j_right]:\n                    up, low = first[i:i_right], second[j:j_right]\n                    add_cost = self.operation_costs[up][low]\n                    for elem, cost in current_agenda:\n                        new_cost = cost + add_cost\n                        if new_cost <= threshold:  # удаление трансдукций большой стоимости\n                            agenda[i][j].append((((up, low),) + elem, new_cost))\n        if return_cost:\n            return agenda[0][0]\n        else:\n            return [elem[0] for elem in agenda[0][0]]\n\n    def _perform_insertions(self, initial, max_cost):\n        \"\"\"\n        возвращает все трансдукции стоимости <= max_cost,\n        которые можно получить из элементов initial\n\n        Аргументы:\n        ----------\n        initial : list of tuples\n            список исходных трансдукций вида [(трансдукция, стоимость)]\n        max_cost : float\n            максимальная стоимость трансдукции\n\n        Возвращает:\n        -----------\n        final : list of tuples\n            финальный список трансдукций вида [(трансдукция, стоимость)]\n        \"\"\"\n        queue = list(initial)\n        final = initial\n        while len(queue) > 0:\n            transduction, cost = queue[0]\n            queue = queue[1:]\n            for string, string_cost in self.operation_costs[\"\"].items():\n                new_cost = cost + string_cost\n                if new_cost <= max_cost:\n                    new_transduction = transduction + (\"\", string)\n                    final.append((new_transduction, new_cost))\n                    queue.append((new_transduction, new_cost))\n        return final\n\n    def _make_default_operation_costs(self, allow_spaces=False):\n        \"\"\"\n        sets 1.0 cost for every replacement, insertion, deletion and transposition\n        
\"\"\"\n        self.operation_costs = dict()\n        self.operation_costs[\"\"] = {c: 1.0 for c in list(self.alphabet) + [' ']}\n        for a in self.alphabet:\n            current_costs = {c: 1.0 for c in self.alphabet}\n            current_costs[a] = 0.0\n            current_costs[\"\"] = 1.0\n            if allow_spaces:\n                current_costs[\" \"] = 1.0\n            self.operation_costs[a] = current_costs\n        # транспозиции\n        for a, b in itertools.permutations(self.alphabet, 2):\n            self.operation_costs[a + b] = {b + a: 1.0}\n        # пробелы\n        if allow_spaces:\n            self.operation_costs[\" \"] = {c: 1.0 for c in self.alphabet}\n            self.operation_costs[\" \"][\"\"] = 1.0\n"
  },
  {
    "path": "deeppavlov/models/spelling_correction/levenshtein/searcher_component.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport string\nfrom logging import getLogger\nfrom math import log10\nfrom typing import Iterable, List, Tuple, Optional\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\nfrom .levenshtein_searcher import LevenshteinSearcher\n\nlogger = getLogger(__name__)\n\n\n@register('spelling_levenshtein')\nclass LevenshteinSearcherComponent(Component):\n    \"\"\"Component that finds replacement candidates for tokens at a set Damerau-Levenshtein distance\n\n    Args:\n        words: list of every correct word\n        max_distance: maximum allowed Damerau-Levenshtein distance between source words and candidates\n        error_probability: assigned probability for every edit\n        vocab_penalty: assigned probability of an out of vocabulary token being the correct one without changes\n\n    Attributes:\n        max_distance: maximum allowed Damerau-Levenshtein distance between source words and candidates\n        error_probability: assigned logarithmic probability for every edit\n        vocab_penalty: assigned logarithmic probability of an out of vocabulary token being the correct one without\n         changes\n    \"\"\"\n\n    _punctuation = frozenset(string.punctuation)\n\n    def __init__(self, words: Iterable[str], max_distance: int = 1, error_probability: float = 1e-4,\n         
        vocab_penalty: Optional[float] = None, **kwargs):\n        words = list({word.strip().lower().replace('ё', 'е') for word in words})\n        alphabet = sorted({letter for word in words for letter in word})\n        self.max_distance = max_distance\n        self.error_probability = log10(error_probability)\n        self.vocab_penalty = self.error_probability if vocab_penalty is None else log10(vocab_penalty)\n        self.searcher = LevenshteinSearcher(alphabet, words, allow_spaces=True, euristics=2)\n\n    def _infer_instance(self, tokens: Iterable[str]) -> List[List[Tuple[float, str]]]:\n        candidates = []\n        for word in tokens:\n            if word in self._punctuation:\n                candidates.append([(0, word)])\n            else:\n                c = {candidate: self.error_probability * distance\n                     for candidate, distance in self.searcher.search(word, d=self.max_distance)}\n                c[word] = c.get(word, self.vocab_penalty)\n                candidates.append([(score, candidate) for candidate, score in c.items()])\n        return candidates\n\n    def __call__(self, batch: Iterable[Iterable[str]], *args, **kwargs) -> List[List[List[Tuple[float, str]]]]:\n        \"\"\"Propose candidates for tokens in sentences\n\n        Args:\n            batch: batch of tokenized sentences\n\n        Returns:\n            batch of lists of probabilities and candidates for every token\n        \"\"\"\n        return [self._infer_instance(tokens) for tokens in batch]\n"
  },
  {
    "path": "deeppavlov/models/spelling_correction/levenshtein/tabled_trie.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport copy\nfrom collections import defaultdict\n\nimport numpy as np\n\n\nclass Trie:\n    \"\"\"\n    Реализация префиксного бора (точнее, корневого направленного ациклического графа)\n\n    Атрибуты\n    --------\n    alphabet: list, алфавит\n    alphabet_codes: dict, словарь символ:код\n    compressed: bool, индикатор сжатия\n    cashed: bool, индикатор кэширования запросов к функции descend\n    root: int, индекс корня\n    graph: array, type=int, shape=(число вершин, размер алфавита), матрица потомков\n    graph[i][j] = k <-> вершина k --- потомок вершины i по ребру, помеченному символом alphabet[j]\n    data: array, type=object, shape=(число вершин), массив с данными, хранящямися в вершинах\n    final: array, type=bool, shape=(число вершин), массив индикаторов\n    final[i] = True <-> i --- финальная вершина\n    \"\"\"\n    NO_NODE = -1\n    SPACE_CODE = -1\n\n    ATTRS = ['is_numpied', 'precompute_symbols', 'allow_spaces',\n             'is_terminated', 'to_make_cashed']\n\n    def __init__(self, alphabet, make_sorted=True, make_alphabet_codes=True,\n                 is_numpied=False, to_make_cashed=False,\n                 precompute_symbols=None, allow_spaces=False, dict_storage=False):\n        self.alphabet = sorted(alphabet) if make_sorted else alphabet\n        self.alphabet_codes = ({a: i for i, a in 
enumerate(self.alphabet)}\n                               if make_alphabet_codes else self.alphabet)\n        self.alphabet_codes[\" \"] = Trie.SPACE_CODE\n        self.is_numpied = is_numpied\n        self.to_make_cashed = to_make_cashed\n        self.dict_storage = dict_storage\n        self.precompute_symbols = precompute_symbols\n        self.allow_spaces = allow_spaces\n        self.initialize()\n\n    def initialize(self):\n        self.root = 0\n        self.graph = [self._make_default_node()]\n        self.data, self.final = [None], [False]\n        self.nodes_number = 1\n        self.descend = self._descend_simple\n        self.is_terminated = False\n\n    def _make_default_node(self):\n        if self.dict_storage:\n            return defaultdict(lambda: -1)\n        elif self.is_numpied:\n            return np.full(shape=(len(self.alphabet),),\n                           fill_value=Trie.NO_NODE, dtype=int)\n        else:\n            return [Trie.NO_NODE] * len(self.alphabet)\n\n    def save(self, outfile):\n        \"\"\"\n        Сохраняет дерево для дальнейшего использования\n        \"\"\"\n        with open(outfile, \"w\", encoding=\"utf8\") as fout:\n            attr_values = [getattr(self, attr) for attr in Trie.ATTRS]\n            attr_values.append(any(x is not None for x in self.data))\n            fout.write(\"{}\\n{}\\t{}\\n\".format(\n                \" \".join(\"T\" if x else \"F\" for x in attr_values),\n                self.nodes_number, self.root))\n            fout.write(\" \".join(str(a) for a in self.alphabet) + \"\\n\")\n            for index, label in enumerate(self.final):\n                letters = self._get_letters(index, return_indexes=True)\n                children = self._get_children(index)\n                fout.write(\"{}\\t{}\\n\".format(\n                    \"T\" if label else \"F\", \" \".join(\"{}:{}\".format(*elem)\n                                                    for elem in zip(letters, children))))\n            
if self.precompute_symbols is not None:\n                for elem in self.data:\n                    fout.write(\":\".join(\",\".join(\n                        map(str, symbols)) for symbols in elem) + \"\\n\")\n        return\n\n    def make_cashed(self):\n        \"\"\"\n        Включает кэширование запросов к descend\n        \"\"\"\n        self._descendance_cash = [dict() for _ in self.graph]\n        self.descend = self._descend_cashed\n\n    def make_numpied(self):\n        self.graph = np.array(self.graph)\n        self.final = np.asarray(self.final, dtype=bool)\n        self.is_numpied = True\n\n    def add(self, s):\n        \"\"\"\n        Добавление строки s в префиксный бор\n        \"\"\"\n        if self.is_terminated:\n            raise TypeError(\"Impossible to add string to fitted trie\")\n        if s == \"\":\n            self._set_final(self.root)\n            return\n        curr = self.root\n        for i, a in enumerate(s):\n            code = self.alphabet_codes[a]\n            next = self.graph[curr][code]\n            if next == Trie.NO_NODE:\n                curr = self._add_descendant(curr, s[i:])\n                break\n            else:\n                curr = next\n        self._set_final(curr)\n        return self\n\n    def fit(self, words):\n        for s in words:\n            self.add(s)\n        self.terminate()\n\n    def terminate(self):\n        if self.is_numpied:\n            self.make_numpied()\n        self.terminated = True\n        if self.precompute_symbols is not None:\n            precompute_future_symbols(self, self.precompute_symbols,\n                                      allow_spaces=self.allow_spaces)\n        if self.to_make_cashed:\n            self.make_cashed()\n\n    def __contains__(self, s):\n        if any(a not in self.alphabet for a in s):\n            return False\n        # word = tuple(self.alphabet_codes[a] for a in s)\n        node = self.descend(self.root, s)\n        return (node != 
Trie.NO_NODE) and self.is_final(node)\n\n    def words(self):\n        \"\"\"\n        Возвращает итератор по словам, содержащимся в боре\n        \"\"\"\n        branch, word, indexes = [self.root], [], [0]\n        letters_with_children = [self._get_children_and_letters(self.root)]\n        while len(branch) > 0:\n            if self.is_final(branch[-1]):\n                yield \"\".join(word)\n            while indexes[-1] == len(letters_with_children[-1]):\n                indexes.pop()\n                letters_with_children.pop()\n                branch.pop()\n                if len(indexes) == 0:\n                    raise StopIteration()\n                word.pop()\n            next_letter, next_child = letters_with_children[-1][indexes[-1]]\n            indexes[-1] += 1\n            indexes.append(0)\n            word.append(next_letter)\n            branch.append(next_child)\n            letters_with_children.append(self._get_children_and_letters(branch[-1]))\n\n    def is_final(self, index):\n        \"\"\"\n        Аргументы\n        ---------\n        index: int, номер вершины\n\n        Возвращает\n        ----------\n        True: если index --- номер финальной вершины\n        \"\"\"\n        return self.final[index]\n\n    def find_partitions(self, s, max_count=1):\n        \"\"\"\n        Находит все разбиения s = s_1 ... 
s_m на словарные слова s_1, ..., s_m\n        для m <= max_count\n        \"\"\"\n        curr_agenda = [(self.root, [], 0)]\n        for i, a in enumerate(s):\n            next_agenda = []\n            for curr, borders, cost in curr_agenda:\n                if cost >= max_count:\n                    continue\n                child = self.graph[curr][self.alphabet_codes[a]]\n                # child = self.graph[curr][a]\n                if child == Trie.NO_NODE:\n                    continue\n                next_agenda.append((child, borders, cost))\n                if self.is_final(child):\n                    next_agenda.append((self.root, borders + [i + 1], cost + 1))\n            curr_agenda = next_agenda\n        answer = []\n        for curr, borders, cost in curr_agenda:\n            if curr == self.root:\n                borders = [0] + borders\n                answer.append([s[left:borders[i + 1]] for i, left in enumerate(borders[:-1])])\n        return answer\n\n    def __len__(self):\n        return self.nodes_number\n\n    def __repr__(self):\n        answer = \"\"\n        for i, (final, data) in enumerate(zip(self.final, self.data)):\n            letters, children = self._get_letters(i), self._get_children(i)\n            answer += \"{0}\".format(i)\n            if final:\n                answer += \"F\"\n            for a, index in zip(letters, children):\n                answer += \" {0}:{1}\".format(a, index)\n            answer += \"\\n\"\n            if data is not None:\n                answer += \"data:{0} {1}\\n\".format(len(data), \" \".join(str(elem) for elem in data))\n        return answer\n\n    def _add_descendant(self, parent, s, final=False):\n        for a in s:\n            code = self.alphabet_codes[a]\n            parent = self._add_empty_child(parent, code, final)\n        return parent\n\n    def _add_empty_child(self, parent, code, final=False):\n        \"\"\"\n        Добавление ребёнка к вершине parent по символу с кодом 
code\n        \"\"\"\n        self.graph[parent][code] = self.nodes_number\n        self.graph.append(self._make_default_node())\n        self.data.append(None)\n        self.final.append(final)\n        self.nodes_number += 1\n        return (self.nodes_number - 1)\n\n    def _descend_simple(self, curr, s):\n        \"\"\"\n        Спуск из вершины curr по строке s\n        \"\"\"\n        for a in s:\n            curr = self.graph[curr][self.alphabet_codes[a]]\n            if curr == Trie.NO_NODE:\n                break\n        return curr\n\n    def _descend_cashed(self, curr, s):\n        \"\"\"\n        Спуск из вершины curr по строке s с кэшированием\n        \"\"\"\n        if s == \"\":\n            return curr\n        curr_cash = self._descendance_cash[curr]\n        answer = curr_cash.get(s, None)\n        if answer is not None:\n            return answer\n        # для оптимизации дублируем код\n        res = curr\n        for a in s:\n            res = self.graph[res][self.alphabet_codes[a]]\n            # res = self.graph[res][a]\n            if res == Trie.NO_NODE:\n                break\n        curr_cash[s] = res\n        return res\n\n    def _set_final(self, curr):\n        \"\"\"\n        Делает состояние curr завершающим\n        \"\"\"\n        self.final[curr] = True\n\n    def _get_letters(self, index, return_indexes=False):\n        \"\"\"\n        Извлекает все метки выходных рёбер вершины с номером index\n        \"\"\"\n        if self.dict_storage:\n            answer = list(self.graph[index].keys())\n        else:\n            answer = [i for i, elem in enumerate(self.graph[index])\n                      if elem != Trie.NO_NODE]\n        if not return_indexes:\n            answer = [(self.alphabet[i] if i >= 0 else \" \") for i in answer]\n        return answer\n\n    def _get_children_and_letters(self, index, return_indexes=False):\n        if self.dict_storage:\n            answer = list(self.graph[index].items())\n        else:\n   
         answer = [elem for elem in enumerate(self.graph[index])\n                      if elem[1] != Trie.NO_NODE]\n        if not return_indexes:\n            for i, (letter_index, child) in enumerate(answer):\n                answer[i] = (self.alphabet[letter_index], child)\n        return answer\n\n    def _get_children(self, index):\n        \"\"\"\n        Извлекает всех потомков вершины с номером index\n        \"\"\"\n        if self.dict_storage:\n            return list(self.graph[index].values())\n        else:\n            return [elem for elem in self.graph[index] if elem != Trie.NO_NODE]\n\n\nclass TrieMinimizer:\n    def __init__(self):\n        pass\n\n    def minimize(self, trie, dict_storage=False, make_cashed=False, make_numpied=False,\n                 precompute_symbols=None, allow_spaces=False, return_groups=False):\n        N = len(trie)\n        if N == 0:\n            raise ValueError(\"Trie should be non-empty\")\n        node_classes = np.full(shape=(N,), fill_value=-1, dtype=int)\n        order = self.generate_postorder(trie)\n        # processing the first node\n        index = order[0]\n        node_classes[index] = 0\n        class_representatives = [index]\n        node_key = ((), (), trie.is_final(index))\n        classes, class_keys = {node_key: 0}, [node_key]\n        curr_index = 1\n        for index in order[1:]:\n            letter_indexes = tuple(trie._get_letters(index, return_indexes=True))\n            children = trie._get_children(index)\n            children_classes = tuple(node_classes[i] for i in children)\n            key = (letter_indexes, children_classes, trie.is_final(index))\n            key_class = classes.get(key, None)\n            if key_class is not None:\n                node_classes[index] = key_class\n            else:\n                # появился новый класс\n                class_keys.append(key)\n                classes[key] = node_classes[index] = curr_index\n                
class_representatives.append(curr_index)\n                curr_index += 1\n        # построение нового дерева\n        compressed = Trie(trie.alphabet, is_numpied=make_numpied,\n                          dict_storage=dict_storage, allow_spaces=allow_spaces,\n                          precompute_symbols=precompute_symbols)\n        L = len(classes)\n        new_final = [elem[2] for elem in class_keys[::-1]]\n        if dict_storage:\n            new_graph = [defaultdict(int) for _ in range(L)]\n        elif make_numpied:\n            new_graph = np.full(shape=(L, len(trie.alphabet)),\n                                fill_value=Trie.NO_NODE, dtype=int)\n            new_final = np.array(new_final, dtype=bool)\n        else:\n            new_graph = [[Trie.NO_NODE for a in trie.alphabet] for i in range(L)]\n        for (indexes, children, final), class_index in \\\n                sorted(classes.items(), key=(lambda x: x[1])):\n            row = new_graph[L - class_index - 1]\n            for i, child_index in zip(indexes, children):\n                row[i] = L - child_index - 1\n        compressed.graph = new_graph\n        compressed.root = L - node_classes[trie.root] - 1\n        compressed.final = new_final\n        compressed.nodes_number = L\n        compressed.data = [None] * L\n        if make_cashed:\n            compressed.make_cashed()\n        if precompute_symbols is not None:\n            if (trie.is_terminated and trie.precompute_symbols\n                    and trie.allow_spaces == allow_spaces):\n                # копируем будущие символы из исходного дерева\n                # нужно, чтобы возврат из финальных состояний в начальное был одинаковым в обоих деревьях\n                for i, node_index in enumerate(class_representatives[::-1]):\n                    # будущие символы для представителя i-го класса\n                    compressed.data[i] = copy.copy(trie.data[node_index])\n            else:\n                
precompute_future_symbols(compressed, precompute_symbols, allow_spaces)\n        if return_groups:\n            node_classes = [L - i - 1 for i in node_classes]\n            return compressed, node_classes\n        else:\n            return compressed\n\n    def generate_postorder(self, trie):\n        \"\"\"\n        Обратная топологическая сортировка\n        \"\"\"\n        order, stack = [], []\n        stack.append(trie.root)\n        colors = ['white'] * len(trie)\n        while len(stack) > 0:\n            index = stack[-1]\n            color = colors[index]\n            if color == 'white':  # вершина ещё не обрабатывалась\n                colors[index] = 'grey'\n                for child in trie._get_children(index):\n                    # проверяем, посещали ли мы ребёнка раньше\n                    if child != Trie.NO_NODE and colors[child] == 'white':\n                        stack.append(child)\n            else:\n                if color == 'grey':\n                    colors[index] = 'black'\n                    order.append(index)\n                stack = stack[:-1]\n        return order\n\n\ndef load_trie(infile):\n    with open(infile, \"r\", encoding=\"utf8\") as fin:\n        line = fin.readline().strip()\n        flags = [x == 'T' for x in line.split()]\n        if len(flags) != len(Trie.ATTRS) + 1:\n            raise ValueError(\"Wrong file format\")\n        nodes_number, root = map(int, fin.readline().strip().split())\n        alphabet = fin.readline().strip().split()\n        trie = Trie(alphabet)\n        for i, attr in enumerate(Trie.ATTRS):\n            setattr(trie, attr, flags[i])\n        read_data = flags[-1]\n        final = [False] * nodes_number\n        # print(len(alphabet), nodes_number)\n        if trie.dict_storage:\n            graph = [defaultdict(lambda: -1) for _ in range(nodes_number)]\n        elif trie.is_numpied:\n            final = np.array(final)\n            graph = np.full(shape=(nodes_number, len(alphabet)),\n   
                         fill_value=Trie.NO_NODE, dtype=int)\n        else:\n            graph = [[Trie.NO_NODE for a in alphabet] for i in range(nodes_number)]\n        for i in range(nodes_number):\n            line = fin.readline().strip()\n            if \"\\t\" in line:\n                label, transitions = line.split(\"\\t\")\n                final[i] = (label == \"T\")\n            else:\n                label = line\n                final[i] = (label == \"T\")\n                continue\n            transitions = [x.split(\":\") for x in transitions.split()]\n            for code, value in transitions:\n                graph[i][int(code)] = int(value)\n        trie.graph = graph\n        trie.root = root\n        trie.final = final\n        trie.nodes_number = nodes_number\n        trie.data = [None] * nodes_number\n        if read_data:\n            for i in range(nodes_number):\n                line = fin.readline().strip(\"\\n\")\n                trie.data[i] = [set(elem.split(\",\")) for elem in line.split(\":\")]\n        if trie.to_make_cashed:\n            trie.make_cashed()\n        return trie\n\n\ndef make_trie(alphabet, words, compressed=True, is_numpied=False,\n              make_cashed=False, precompute_symbols=False,\n              allow_spaces=False, dict_storage=False):\n    trie = Trie(alphabet, is_numpied=is_numpied, to_make_cashed=make_cashed,\n                precompute_symbols=precompute_symbols, dict_storage=dict_storage)\n    trie.fit(words)\n    if compressed:\n        tm = TrieMinimizer()\n        trie = tm.minimize(trie, dict_storage=dict_storage, make_cashed=make_cashed,\n                           make_numpied=is_numpied, precompute_symbols=precompute_symbols,\n                           allow_spaces=allow_spaces)\n    return trie\n\n\ndef precompute_future_symbols(trie, n, allow_spaces=False):\n    \"\"\"\n    Collecting possible continuations of length <= n for every node\n    \"\"\"\n    if n == 0:\n        return\n    if 
trie.is_terminated and trie.precompute_symbols:\n        # символы уже предпосчитаны\n        return\n    for index, final in enumerate(trie.final):\n        trie.data[index] = [set() for i in range(n)]\n    for index, (node_data, final) in enumerate(zip(trie.data, trie.final)):\n        node_data[0] = set(trie._get_letters(index))\n        if allow_spaces and final:\n            node_data[0].add(\" \")\n    for d in range(1, n):\n        for index, (node_data, final) in enumerate(zip(trie.data, trie.final)):\n            children = set(trie._get_children(index))\n            for child in children:\n                node_data[d] |= trie.data[child][d - 1]\n            # в случае, если разрешён возврат по пробелу в стартовое состояние\n            if allow_spaces and final:\n                node_data[d] |= trie.data[trie.root][d - 1]\n    trie.terminated = True\n"
  },
  {
    "path": "deeppavlov/models/tokenizers/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/tokenizers/lazy_tokenizer.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\n\nfrom nltk import word_tokenize\n\nfrom deeppavlov.core.common.registry import register\n\nlog = getLogger(__name__)\n\n\n@register('lazy_tokenizer')\ndef lazy_tokenizer(batch):\n    \"\"\"Tokenizes if there is something to tokenize.\"\"\"\n\n    if len(batch) > 0 and isinstance(batch[0], str):\n        batch = [word_tokenize(utt) for utt in batch]\n    return batch\n"
  },
  {
    "path": "deeppavlov/models/tokenizers/nltk_moses_tokenizer.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom typing import Union, List\n\nfrom sacremoses import MosesDetokenizer, MosesTokenizer\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\n\n@register(\"nltk_moses_tokenizer\")\nclass NLTKMosesTokenizer(Component):\n    \"\"\"Class for splitting texts on tokens using NLTK wrapper over MosesTokenizer\n\n    Attributes:\n        escape: whether escape characters for use in html markup\n        tokenizer: tokenizer instance from nltk.tokenize.moses\n        detokenizer: detokenizer instance from nltk.tokenize.moses\n\n    Args:\n        escape: whether escape characters for use in html markup\n    \"\"\"\n\n    def __init__(self, escape: bool = False, *args, **kwargs):\n        self.escape = escape\n        self.tokenizer = MosesTokenizer()\n        self.detokenizer = MosesDetokenizer()\n\n    def __call__(self, batch: List[Union[str, List[str]]]) -> List[Union[List[str], str]]:\n        \"\"\"Tokenize given batch of strings or detokenize given batch of lists of tokens\n\n        Args:\n            batch: list of text samples or list of lists of tokens\n\n        Returns:\n            list of lists of tokens or list of text samples\n        \"\"\"\n        if isinstance(batch[0], str):\n            return [self.tokenizer.tokenize(line, escape=self.escape) for line in batch]\n 
       else:\n            return [self.detokenizer.detokenize(line, return_str=True, unescape=self.escape)\n                    for line in batch]\n"
  },
  {
    "path": "deeppavlov/models/tokenizers/nltk_tokenizer.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List\n\nimport nltk\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\n\n@register(\"nltk_tokenizer\")\nclass NLTKTokenizer(Component):\n    \"\"\"Class for splitting texts on tokens using NLTK\n\n    Args:\n        tokenizer: tokenization mode for `nltk.tokenize`\n        download: whether to download nltk data\n\n    Attributes:\n        tokenizer: tokenizer instance from nltk.tokenizers\n    \"\"\"\n\n    def __init__(self, tokenizer: str = \"wordpunct_tokenize\", download: bool = False,\n                 *args, **kwargs):\n        if download:\n            nltk.download()\n        self.tokenizer = getattr(nltk.tokenize, tokenizer, None)\n        if not callable(self.tokenizer):\n            raise AttributeError(\"Tokenizer {} is not defined in nltk.tokenizer\".format(tokenizer))\n\n    def __call__(self, batch: List[str]) -> List[List[str]]:\n        \"\"\"Tokenize given batch\n\n        Args:\n            batch: list of text samples\n\n        Returns:\n            list of lists of tokens\n        \"\"\"\n        return [self.tokenizer(sent) for sent in batch]\n"
  },
  {
    "path": "deeppavlov/models/tokenizers/spacy_tokenizer.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom typing import List, Generator, Any, Optional, Union, Tuple, Iterable\n\nimport spacy\nimport spacy.language\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.models.tokenizers.utils import detokenize, ngramize\n\nlogger = getLogger(__name__)\n\n\n# TODO: make proper handling through spacy.cli.download in the stage of python -m deeppavlov download\ndef _try_load_spacy_model(model_name: str, disable: Iterable[str] = ()):\n    disable = set(disable)\n    try:\n        model = spacy.load(model_name, disable=disable)\n    except OSError as e:\n        try:\n            model = __import__(model_name).load(disable=disable)\n            if not isinstance(model, spacy.language.Language):\n                raise RuntimeError(f'{model_name} is not a spacy model module')\n        except Exception:\n            raise e\n    return model\n\n\n@register('stream_spacy_tokenizer')\nclass StreamSpacyTokenizer(Component):\n    \"\"\"Tokenize or lemmatize a list of documents. 
Default spacy model is **en_core_web_sm**.\n    Return a list of tokens or lemmas for a whole document.\n    If is called onto ``List[str]``, performs detokenizing procedure.\n\n    Args:\n        disable: spacy pipeline elements to disable, serves a purpose of performing; if nothing\n        filter_stopwords: whether to ignore stopwords during tokenizing/lemmatizing and ngrams creation\n        batch_size: a batch size for spaCy buffering\n        ngram_range: size of ngrams to create; only unigrams are returned by default\n        lemmas: whether to perform lemmatizing or not\n        lowercase: whether to perform lowercasing or not; is performed by default by :meth:`_tokenize`\n         and :meth:`_lemmatize` methods\n        alphas_only: whether to filter out non-alpha tokens; is performed by default by\n         :meth:`_filter` method\n        spacy_model: a string name of spacy model to use; DeepPavlov searches for this name in\n         downloaded spacy models; default model is **en_core_web_sm**, it downloads automatically\n         during DeepPavlov installation\n\n\n    Attributes:\n        stopwords: a list of stopwords that should be ignored during tokenizing/lemmatizing\n         and ngrams creation\n        model: a loaded spacy model\n        batch_size: a batch size for spaCy buffering\n        ngram_range: size of ngrams to create; only unigrams are returned by default\n        lemmas: whether to perform lemmatizing or not\n        lowercase: whether to perform lowercasing or not; is performed by default by :meth:`_tokenize`\n         and :meth:`_lemmatize` methods\n        alphas_only: whether to filter out non-alpha tokens; is performed by default by :meth:`_filter`\n         method\n\n    \"\"\"\n\n    def __init__(self, disable: Optional[Iterable[str]] = None, filter_stopwords: bool = False,\n                 batch_size: Optional[int] = None, ngram_range: Optional[List[int]] = None,\n                 lemmas: bool = False, lowercase: 
Optional[bool] = None, alphas_only: Optional[bool] = None,\n                 spacy_model: str = 'en_core_web_sm', **kwargs):\n\n        if disable is None:\n            disable = ['parser', 'ner']\n        if ngram_range is None:\n            ngram_range = [1, 1]\n        self.model = _try_load_spacy_model(spacy_model, disable=disable)\n        self.stopwords = self.model.Defaults.stop_words if filter_stopwords else set()\n        self.batch_size = batch_size\n        self.ngram_range = tuple(ngram_range)  # cast JSON array to tuple\n        self.lemmas = lemmas\n        self.lowercase = lowercase\n        self.alphas_only = alphas_only\n\n    def __call__(self, batch: Union[List[str], List[List[str]]]) -> Union[List[List[str]], List[str]]:\n        \"\"\"Tokenize or detokenize strings, depends on the type structure of passed arguments.\n\n        Args:\n            batch: a batch of documents to perform tokenizing/lemmatizing;\n             or a batch of lists of tokens/lemmas to perform detokenizing\n\n        Returns:\n            a batch of lists of tokens/lemmas; or a batch of detokenized strings\n\n        Raises:\n            TypeError: If the first element of ``batch`` is neither List, nor str.\n\n        \"\"\"\n        if isinstance(batch[0], str):\n            if self.lemmas:\n                return list(self._lemmatize(batch))\n            else:\n                return list(self._tokenize(batch))\n        if isinstance(batch[0], list):\n            return [detokenize(doc) for doc in batch]\n        raise TypeError(\n            \"StreamSpacyTokenizer.__call__() is not implemented for `{}`\".format(type(batch[0])))\n\n    def _tokenize(self, data: List[str], ngram_range: Optional[Tuple[int, int]] = None, batch_size: int = 10000,\n                  lowercase: bool = True) -> Generator[List[str], Any, None]:\n        \"\"\"Tokenize a list of documents.\n\n        Args:\n            data: a list of documents to tokenize\n            ngram_range: size of 
ngrams to create; only unigrams are returned by default\n            batch_size: a batch size for spaCy buffering\n            lowercase: whether to perform lowercasing or not; is performed by default by\n                :meth:`_tokenize` and :meth:`_lemmatize` methods\n\n        Yields:\n            list of lists of ngramized tokens or list of detokenized strings\n\n        Returns:\n            None\n\n        \"\"\"\n        _batch_size = self.batch_size or batch_size\n        _ngram_range = ngram_range or self.ngram_range\n\n        if self.lowercase is None:\n            _lowercase = lowercase\n        else:\n            _lowercase = self.lowercase\n\n        for i, doc in enumerate(\n                self.model.tokenizer.pipe(data, batch_size=_batch_size)):\n            if _lowercase:\n                tokens = [t.lower_ for t in doc]\n            else:\n                tokens = [t.text for t in doc]\n            filtered = self._filter(tokens)\n            processed_doc = ngramize(filtered, ngram_range=_ngram_range, doc=data[i])\n            yield from processed_doc\n\n    def _lemmatize(self, data: List[str], ngram_range: Optional[Tuple[int, int]] = None, batch_size: int = 10000,\n                   lowercase: bool = True) -> Generator[List[str], Any, None]:\n        \"\"\"Lemmatize a list of documents.\n\n        Args:\n            data: a list of documents to tokenize\n            ngram_range: size of ngrams to create; only unigrams are returned by default\n            batch_size: a batch size for spaCy buffering\n\n       Yields:\n           list of lists of ngramized lemmas or list of detokenized strings\n\n        Returns:\n            None\n\n        \"\"\"\n        _batch_size = self.batch_size or batch_size\n        _ngram_range = ngram_range or self.ngram_range\n\n        if self.lowercase is None:\n            _lowercase = lowercase\n        else:\n            _lowercase = self.lowercase\n\n        for i, doc in enumerate(\n                
self.model.pipe(data, batch_size=_batch_size)):\n            lemmas = [t.lemma_ for t in doc]\n            if _lowercase:\n                lemmas = [t.lower() for t in lemmas]\n            lemm_doc = \" \".join(lemmas)\n            filtered = self._filter(lemmas)\n            processed_doc = ngramize(filtered, ngram_range=_ngram_range, doc=lemm_doc)\n            yield from processed_doc\n\n    def _filter(self, items: List[str], alphas_only: bool = True) -> List[str]:\n        \"\"\"Filter a list of tokens/lemmas.\n\n        Args:\n            items: a list of tokens/lemmas to filter\n            alphas_only: whether to filter out non-alpha tokens\n\n        Returns:\n            a list of filtered tokens/lemmas\n\n        \"\"\"\n        if self.alphas_only is None:\n            _alphas_only = alphas_only\n        else:\n            _alphas_only = self.alphas_only\n\n        if _alphas_only:\n            filter_fn = lambda x: x.isalpha() and not x.isspace() and x not in self.stopwords\n        else:\n            filter_fn = lambda x: not x.isspace() and x not in self.stopwords\n\n        return list(filter(filter_fn, items))\n\n    def set_stopwords(self, stopwords: List[str]) -> None:\n        \"\"\"Redefine a list of stopwords.\n\n        Args:\n            stopwords: a list of stopwords\n\n        Returns:\n            None\n\n        \"\"\"\n        self.stopwords = stopwords\n"
  },
  {
    "path": "deeppavlov/models/tokenizers/split_tokenizer.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\n\n\n@register(\"split_tokenizer\")\nclass SplitTokenizer(Component):\n    \"\"\"\n    Generates utterance's tokens by mere python's ``str.split()``.\n\n    Doesn't have any parameters.\n    \"\"\"\n\n    def __init__(self, **kwargs) -> None:\n        pass\n\n    def __call__(self, batch: List[str]) -> List[List[str]]:\n        \"\"\"\n        Tokenize given batch\n\n        Args:\n            batch: list of texts to tokenize\n\n        Returns:\n            tokenized batch\n        \"\"\"\n        if isinstance(batch, (list, tuple)):\n            return [sample.split() for sample in batch]\n        else:\n            raise NotImplementedError('not implemented for types other than'\n                                      ' list or tuple')\n"
  },
  {
    "path": "deeppavlov/models/tokenizers/utils.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nfrom typing import List, Generator, Any\n\n\ndef detokenize(tokens):\n    \"\"\"\n    Detokenizing a text undoes the tokenizing operation, restores\n    punctuation and spaces to the places that people expect them to be.\n    Ideally, `detokenize(tokenize(text))` should be identical to `text`,\n    except for line breaks.\n    \"\"\"\n    text = ' '.join(tokens)\n    step0 = text.replace('. . .', '...')\n    step1 = step0.replace(\"`` \", '\"').replace(\" ''\", '\"')\n    step2 = step1.replace(\" ( \", \" (\").replace(\" ) \", \") \")\n    step3 = re.sub(r' ([.,:;?!%]+)([ \\'\"`])', r\"\\1\\2\", step2)\n    step4 = re.sub(r' ([.,:;?!%]+)$', r\"\\1\", step3)\n    step5 = step4.replace(\" '\", \"'\").replace(\" n't\", \"n't\") \\\n        .replace(\" nt\", \"nt\").replace(\"can not\", \"cannot\")\n    step6 = step5.replace(\" ` \", \" '\")\n    return step6.strip()\n\n\ndef ngramize(items: List[str], ngram_range=(1, 1), doc: str = None) -> Generator[List[str], Any, None]:\n    \"\"\"\n    Make ngrams from a list of tokens/lemmas\n    :param items: list of tokens, lemmas or other strings to form ngrams\n    :param ngram_range: range for producing ngrams, ex. 
for unigrams + bigrams should be set to\n    (1, 2), for bigrams only should be set to (2, 2)\n    :return: ngrams (as strings) generator\n    \"\"\"\n\n    ngrams = []\n    ranges = [(0, i) for i in range(ngram_range[0], ngram_range[1] + 1)]\n    for r in ranges:\n        ngrams += list(zip(*[items[j:] for j in range(*r)]))\n\n    formatted_ngrams = [' '.join(item) for item in ngrams]\n    if doc is not None:\n        doc_lower = doc.lower()\n        formatted_ngrams = [ngram for ngram in formatted_ngrams if (ngram in doc or ngram in doc_lower)]\n\n    yield formatted_ngrams\n"
  },
  {
    "path": "deeppavlov/models/torch_bert/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/models/torch_bert/crf.py",
    "content": "import numpy as np\nimport torch\nfrom torch import nn\nfrom torchcrf import CRF as CRFbase\n\n\nclass CRF(CRFbase):\n    \"\"\"Class with Conditional Random Field from PyTorch-CRF library\n       with modified training function\n    \"\"\"\n\n    def __init__(self, num_tags: int, batch_first: bool = False) -> None:\n        super().__init__(num_tags=num_tags, batch_first=batch_first)\n        nn.init.zeros_(self.transitions)\n        nn.init.zeros_(self.start_transitions)\n        nn.init.zeros_(self.end_transitions)\n        self.stats = torch.zeros((num_tags, num_tags), dtype=torch.float)\n        self.zeros = torch.zeros((num_tags, num_tags), dtype=torch.float)\n        self.neg = torch.full((num_tags, num_tags), -1000.0)\n\n    def forward(self, tags_batch: torch.LongTensor, y_masks: np.ndarray):\n        seq_lengths = np.sum(y_masks, axis=1)\n        for seq_len, tags_list in zip(seq_lengths, tags_batch):\n            if seq_len > 1:\n                for i in range(seq_len - 1):\n                    self.stats[int(tags_list[i])][int(tags_list[i + 1])] += 1.0\n        with torch.no_grad():\n            self.transitions.copy_(torch.where(self.stats > 0, self.zeros, self.neg))\n"
  },
  {
    "path": "deeppavlov/models/torch_bert/multitask_transformer.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom collections.abc import Iterable\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Dict, Optional\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\nfrom torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss\nfrom transformers import AutoConfig, AutoModel\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.torch_model import TorchModel\nfrom deeppavlov.models.torch_bert.torch_transformers_sequence_tagger import token_from_subtoken, \\\n    token_labels_to_subtoken_labels\n\nlog = getLogger(__name__)\n\n\nclass FocalLoss(nn.Module):\n    \"Non weighted version of Focal Loss\"\n\n    def __init__(self, alpha=.5, gamma=2, categorical_loss=False, weight=None):\n        super(FocalLoss, self).__init__()\n        self.alpha = torch.tensor([alpha, 1 - alpha]).cuda()\n        self.gamma = gamma\n        self.categorical = categorical_loss\n        self.weight = weight\n\n    def forward(self, inputs, targets):\n        if self.categorical:\n            loss = CrossEntropyLoss(weight=self.weight, reduction='none')(inputs, targets)\n        else:\n            loss = BCEWithLogitsLoss(weight=self.weight, reduction='none')(inputs, targets)\n        targets = targets.type(torch.long)\n        at = 
self.alpha.gather(0, targets.data.view(-1))\n        pt = torch.exp(-loss)\n        F_loss = at * (1 - pt) ** self.gamma * loss\n        return F_loss.mean()\n\n\ndef SoftCrossEntropyLoss(inputs, targets):\n    logprobs = torch.nn.functional.log_softmax(inputs, dim=1)\n    return -(targets * logprobs).sum() / inputs.shape[0]\n\n\ndef we_transform_input(name):\n    return name in ['sequence_labeling', 'multiple_choice']\n\n\nclass BertForMultiTask(nn.Module):\n    \"\"\"\n    BERT model for multiple choice,sequence labeling, ner, classification or regression\n    This module is composed of the BERT model with a linear layer on top of\n    the pooled output.\n    Params:\n    task_num_classes\n    task_types\n    backbone_model - na\n    \"\"\"\n\n    def __init__(self, tasks_num_classes, multilabel, task_types,\n                 weights, backbone_model='bert_base_uncased',\n                 dropout=None, new_model=False,focal=False,\n                 max_seq_len=320, model_takes_token_type_ids=True):\n\n        super(BertForMultiTask, self).__init__()\n        config = AutoConfig.from_pretrained(backbone_model, output_hidden_states=True, output_attentions=True)\n        self.bert = AutoModel.from_pretrained(pretrained_model_name_or_path=backbone_model,\n                                                config=config)\n        self.classes = tasks_num_classes  # classes for every task\n        self.weights = weights\n        self.multilabel = multilabel\n        self.new_model = new_model\n        self.model_takes_token_type_ids = model_takes_token_type_ids\n        if dropout is not None:\n            self.dropout = nn.Dropout(dropout)\n        elif hasattr(config, 'hidden_dropout_prob'):\n            self.dropout = nn.Dropout(config.hidden_dropout_prob)\n        elif hasattr(config, 'seq_classif_dropout'):\n            self.dropout = nn.Dropout(config.seq_classif_dropout)\n        elif hasattr(config, 'dropout'):\n            self.dropout = 
nn.Dropout(config.dropout)\n        else:\n            self.dropout = nn.Dropout(0)\n        self.max_seq_len = max_seq_len\n        self.activation = nn.Tanh()\n        self.task_types = task_types\n        self.focal=focal\n        OUT_DIM = config.hidden_size\n        if self.new_model and self.new_model!=2:\n            OUT_DIM = OUT_DIM * 2\n        self.bert.final_classifier = nn.ModuleList(\n            [\n                nn.Linear(OUT_DIM, num_labels) if self.task_types[i] not in ['multiple_choice',\n                                                                             'regression', 'binary_head']\n                else nn.Linear(OUT_DIM, 1) for i, num_labels in enumerate(self.classes)\n            ]\n        )\n        if self.new_model:# or True:\n            self.bert.pooling_layer = nn.Linear(OUT_DIM, OUT_DIM)\n        else:\n            self.bert.pooler = nn.Linear(OUT_DIM, OUT_DIM)\n\n    def get_logits(self, task_id, input_ids, attention_mask, token_type_ids):\n        name = self.task_types[task_id]\n        outputs = None\n        if we_transform_input(name):\n            input_ids = input_ids.view(-1, input_ids.size(-1))\n            attention_mask = attention_mask.view(-1, attention_mask.size(-1))\n            if token_type_ids is not None:\n                token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))\n        if token_type_ids is None or not self.model_takes_token_type_ids:\n            outputs = self.bert(input_ids=input_ids.long(),\n                                attention_mask=attention_mask.long())\n        else:\n            try:\n                outputs = self.bert(input_ids=input_ids.long(),\n                                token_type_ids=token_type_ids.long(),\n                                attention_mask=attention_mask.long())\n            except Exception as e:\n                if \"forward() got an unexpected keyword argument 'token_type_ids'\" in str(e):\n                    outputs = 
self.bert(input_ids=input_ids.long(),\n                                        attention_mask=attention_mask.long())\n                    self.model_takes_token_type_ids=False\n                else:\n                    raise e\n        if name == 'sequence_labeling':\n            return outputs.last_hidden_state\n        elif self.new_model == 2:\n            return outputs.last_hidden_state[:, task_id]\n        elif self.new_model:\n            return torch.cat([outputs.last_hidden_state[:, 0], outputs.last_hidden_state[:, task_id + 1]], axis=1)\n        else:\n            return outputs.last_hidden_state[:, 0]\n\n    def predict_on_top(self, task_id, last_hidden_state, labels=None):\n        name = self.task_types[task_id]\n        if name == 'sequence_labeling':\n            #  last hidden state is all token tensor\n            final_output = self.dropout(last_hidden_state)\n            logits = self.bert.final_classifier[task_id](final_output)\n            if labels is not None:\n                active_logits = logits.view(-1, self.classes[task_id])\n                if self.multilabel[task_id]:\n                    loss_fct = BCEWithLogitsLoss()\n                    loss = loss_fct(active_logits, labels)\n                elif not self.multilabel[task_id]:\n                    loss_fct = CrossEntropyLoss()\n                    loss = loss_fct(active_logits, labels.view(-1))\n                return loss, logits\n            else:\n                return logits\n        elif name in ['classification', 'regression', 'multiple_choice']:\n            #  last hidden state is a first token tensor\n            if self.new_model:  # or True:\n                pooled_output = self.bert.pooling_layer(last_hidden_state)\n            else:\n                pooled_output = self.bert.pooler(last_hidden_state)\n            pooled_output = self.activation(pooled_output)\n            pooled_output = self.dropout(pooled_output)\n            logits = 
self.bert.final_classifier[task_id](pooled_output)\n            if name == 'multiple_choice':\n                logits = logits.view((-1, self.classes[task_id]))\n                if labels is not None:\n                    l1, l2 = len(logits), len(labels)\n                    if len(logits) != len(labels):\n                        raise Exception(f'Len of logits {l1} and labels {l2} not match')\n            if labels is not None:\n                if name != \"regression\":\n                    if self.multilabel[task_id]:\n                        loss_fct = BCEWithLogitsLoss()\n                        loss = loss_fct(logits, labels)\n                    elif not self.multilabel[task_id]:\n                        if self.focal:\n                            if self.weights[task_id] is None:\n                                loss_fct = FocalLoss()\n                            else:\n                                loss_fct = FocalLoss(weight=torch.tensor([self.weights[task_id]]).cuda())\n                            loss = loss_fct(logits, labels.view(-1))\n                        else:\n                            if self.weights[task_id] is None:\n                                loss_fct = CrossEntropyLoss()\n                            else:\n                                loss_fct = CrossEntropyLoss(weight=torch.Tensor([self.weights[task_id]]).cuda())\n                            loss = loss_fct(logits, labels.view(-1))\n                    return loss, logits\n                elif name == \"regression\":\n                    loss_fct = MSELoss()\n                    loss = loss_fct(logits, labels.unsqueeze(1))\n                    return loss, logits\n            else:\n                return logits\n        elif name == 'binary_head':\n            last_hidden_state = self.dropout(last_hidden_state)\n            pooled_output = self.bert.pooler(last_hidden_state)\n            pooled_output = self.activation(pooled_output)\n            pooled_output = 
self.dropout(pooled_output)\n            logits = self.bert.final_classifier[task_id](pooled_output)\n            if labels is not None:\n                if self.focal:\n                    if self.weights[task_id] is None:\n                        loss_fct = FocalLoss()\n                    else:\n                        loss_fct = FocalLoss(weight=torch.tensor([self.weights[task_id]]).cuda())\n                else:\n                    if self.weights[task_id] is None:\n                        loss_fct = BCEWithLogitsLoss()\n                    else:\n                        loss_fct = BCEWithLogitsLoss(weight=torch.Tensor([self.weights[task_id]]).cuda())\n                if len(labels.shape) == 1 and len(logits.shape) == 2:\n                    labels = labels.unsqueeze(1)\n                loss = loss_fct(logits, labels)\n                return loss, logits\n            else:\n                return logits\n        else:\n            raise Exception(f'Unsupported name {name}')\n\n    def forward(self, task_id, input_ids, attention_mask, token_type_ids, labels=None):\n        last_hidden_state = self.get_logits(task_id, input_ids, attention_mask, token_type_ids)\n        return self.predict_on_top(task_id, last_hidden_state, labels)\n\n\n@register('multitask_transformer')\nclass MultiTaskTransformer(TorchModel):\n    \"\"\"\n    Multi-Task transformer-agnostic model\n    Args:\n        tasks: Dict of task names along with the labels for each task,\n        max_seq_len(int): maximum length of the input token sequence.\n        gradient_accumulation_steps(default:1): number of gradient accumulation steps,\n        steps_per_epoch(int): number of steps taken per epoch. Specify if gradient_accumulation_steps > 1\n        backbone_model(str): name of HuggingFace.Transformers backbone model. 
Default: 'bert-base-cased'\n        multilabel(default: False): set to true for multilabel classification,\n        return_probas(default: False): set true to return prediction probabilities,\n        freeze_embeddings(default: False): set true to freeze BERT embeddings\n        dropout(default: None): dropout for the final model layer.\n        If not set, defaults to the parameter hidden_dropout_prob of original model\n        cuda_cache_size(default:3): predicts cache size. Recommended if we need classify one samples for many tasks. 0 if we don't use cache\n        cuda_cache(default:True): if True, store cache on GPU\n        seed(default:42): Torch manual_random_seed\n    \"\"\"\n\n    def __init__(\n            self,\n            tasks: Dict[str, Dict],\n            max_seq_len: int = 320,\n            gradient_accumulation_steps: Optional[int] = 1,\n            steps_per_epoch: Optional[int] = None,\n            backbone_model: str = \"bert-base-cased\",\n            focal: bool = False,\n            return_probas: bool = False,\n            freeze_embeddings: bool = False,\n            new_model=False,\n            dropout: Optional[float] = None,\n            binary_threshold: float = 0.5,\n            seed: int = 42,\n            *args,\n            **kwargs,\n    ) -> None:\n        self.return_probas = return_probas\n        self.task_names = list(tasks.keys())\n        self.task_types = []\n        self.max_seq_len = max_seq_len\n        self.tasks_num_classes = []\n        self.task_names = []\n        self.multilabel = []\n        weights = []\n        self.types_to_cache = []\n        for task in tasks:\n            self.task_names.append(task)\n            self.tasks_num_classes.append(tasks[task].get('options', 1))\n            weights.append(tasks[task].get('weight', None))\n            self.task_types.append(tasks[task]['type'])\n            self.multilabel.append(tasks[task].get('multilabel', False))\n            
self.types_to_cache.append(tasks[task].get('type_to_cache', -1))\n        if self.return_probas and 'sequence_labeling' in self.task_types:\n            log.warning(f'Return_probas for sequence_labeling not supported yet. Returning ids for this task')\n        self.n_tasks = len(tasks)\n        self.train_losses = [[] for _ in self.task_names]\n        self.gradient_accumulation_steps = gradient_accumulation_steps\n        self.steps_per_epoch = steps_per_epoch\n        self.steps_taken = 0\n        self.prev_id = None\n        self.printed = False\n        self.freeze_embeddings = freeze_embeddings\n        self.binary_threshold = binary_threshold\n        self._reset_cache()\n        torch.manual_seed(seed)\n\n        model = BertForMultiTask(\n            backbone_model=backbone_model,\n            tasks_num_classes=self.tasks_num_classes,\n            weights=weights,\n            multilabel=self.multilabel,\n            task_types=self.task_types,\n            new_model=new_model,\n            focal=focal,\n            dropout=dropout)\n\n        super().__init__(model, **kwargs)\n\n    def _reset_cache(self):\n        self.preds_cache = {index_: None for index_ in self.types_to_cache if index_ != -1}\n\n    def load(self, fname: Optional[str] = None, *args, **kwargs) -> None:\n        \"\"\"\n        Loads weights.\n        \"\"\"\n        super().load(fname)\n        if self.freeze_embeddings:\n            for n, p in self.model.bert.named_parameters():\n                if not ('final_classifier' in n or 'pool' in n):\n                    p.requires_grad = False\n\n    def _make_input(self, task_features, task_id, labels=None):\n        batch_input_size = None\n        if len(task_features) == 1 and isinstance(task_features, list):\n            task_features = task_features[0]\n\n        if isinstance(labels, Iterable) and all([k is None for k in labels]):\n            labels = None\n        _input = {}\n        element_list = [\"input_ids\", 
\"attention_mask\", \"token_type_ids\"]\n        for elem in element_list:\n            if elem in task_features:\n                _input[elem] = task_features[elem]\n                batch_input_size = _input[elem].shape[0]\n            elif hasattr(task_features, elem):\n                _input[elem] = getattr(task_features, elem)\n                batch_input_size = _input[elem].shape[0]\n            if elem in _input:\n                if we_transform_input(self.task_types[task_id]):\n                    _input[elem] = _input[elem].view(\n                        (-1, _input[elem].size(-1)))\n\n        if labels is not None:\n            if self.task_types[task_id] in [\"regression\", \"binary_head\"]:\n                _input[\"labels\"] = torch.tensor(\n                    np.array(labels, dtype=float), dtype=torch.float32\n                )\n            elif self.task_types[task_id] == 'multiple_choice':\n                labels = torch.Tensor(labels).long()\n                _input['labels'] = labels\n            elif self.task_types[task_id] == 'sequence_labeling':\n                subtoken_labels = [token_labels_to_subtoken_labels(y_el, y_mask, input_mask)\n                                   for y_el, y_mask, input_mask in zip(labels, _input['token_type_ids'].numpy(),\n                                                                       _input['attention_mask'].numpy())]\n                _input['labels'] = torch.from_numpy(\n                    np.array(subtoken_labels)).to(torch.int64)\n            else:\n                if not self.multilabel[task_id]:\n                    _input[\"labels\"] = torch.from_numpy(np.array(labels))\n                elif self.multilabel[task_id]:\n                    # We assume that labels already are one hot encoded\n                    num_classes = self.tasks_num_classes[task_id]\n                    _input['labels'] = torch.zeros((len(labels), num_classes))\n                    for i in range(len(labels)):\n                   
     for label_ind in labels[i]:\n                            _input['labels'][i][label_ind] = 1\n            element_list = element_list + ['labels']\n        for elem in element_list:\n            if elem not in _input:\n                _input[elem] = None\n            else:\n                _input[elem] = _input[elem].to(self.device)\n        if 'labels' in _input and self.task_types[task_id] != 'multiple_choice':\n            error_msg = f'Len of labels {len(_input[\"labels\"])} does not match len of ids {len(_input[\"input_ids\"])}'\n            if len(_input['labels']) != len(_input['input_ids']):\n                raise Exception(error_msg)\n        return _input, batch_input_size\n\n    def __call__(self, *args):\n        \"\"\"Make prediction for given features (texts).\n        Args:\n            features: batch of InputFeatures for all tasks\n        Returns:\n            predicted classes or probabilities of each class\n        \"\"\"\n        # IMPROVE ARGS CHECKING AFTER DEBUG\n        log.debug(f'Calling {args}')\n        self.validation_predictions = [None for _ in range(len(args))]\n        for task_id in range(len(self.task_names)):\n            if len(args[task_id]):\n                _input, batch_input_size = self._make_input(task_features=args[task_id], task_id=task_id)\n\n                if 'input_ids' not in _input:\n                    raise Exception(f'No input_ids in _input {_input}')\n                cache_key = self.types_to_cache[task_id]\n                if cache_key != -1 and self.preds_cache[cache_key] is not None:\n                    last_hidden_state = self.preds_cache[cache_key]\n                else:\n                    with torch.no_grad():\n                        if self.is_data_parallel:\n                            last_hidden_state = self.model.module.get_logits(task_id, **_input)\n                        else:\n                            last_hidden_state = self.model.get_logits(task_id, **_input)\n                       
 if cache_key != -1:\n                            self.preds_cache[cache_key] = last_hidden_state\n                with torch.no_grad():\n                    if self.is_data_parallel:\n                        logits = self.model.module.predict_on_top(task_id, last_hidden_state)\n                    else:\n                        logits = self.model.predict_on_top(task_id, last_hidden_state)\n                if self.task_types[task_id] == 'sequence_labeling':\n                    y_mask = _input['token_type_ids'].cpu()\n                    logits = token_from_subtoken(logits.cpu(), y_mask)\n                    predicted_ids = torch.argmax(logits, dim=-1).int().tolist()\n                    seq_lengths = torch.sum(y_mask, dim=1).int().tolist()\n                    pred = [prediction[:max_seq_len] for max_seq_len, prediction in zip(seq_lengths, predicted_ids)]\n                elif self.task_types[task_id] in ['regression', 'binary_head']:\n                    pred = logits[:, 0]\n                    if self.task_types[task_id] == 'binary_head':\n                        pred = torch.sigmoid(logits).squeeze(1)\n                        if not self.return_probas:\n                            pred = (pred > self.binary_threshold).int()\n                    pred = pred.cpu().numpy()\n                else:\n                    if self.multilabel[task_id]:\n                        probs = torch.sigmoid(logits)\n                        if self.return_probas:\n                            pred = probs\n                            pred = pred.cpu().numpy()\n                        else:\n                            numbers_of_sample, numbers_of_class = (probs > self.binary_threshold).nonzero(as_tuple=True)\n                            numbers_of_sample, numbers_of_class = numbers_of_sample.cpu().numpy(), numbers_of_class.cpu().numpy()\n                            pred = [[] for _ in range(len(logits))]\n                            for sample_num, class_num in 
zip(numbers_of_sample, numbers_of_class):\n                                pred[sample_num].append(int(class_num))\n                    else:\n                        if self.multilabel[task_id]:\n                            probs = torch.sigmoid(logits)\n                            if self.return_probas:\n                                pred = probs\n                                pred = pred.cpu().numpy()\n                            else:\n                                numbers_of_sample, numbers_of_class = (probs > self.binary_threshold).nonzero(as_tuple=True)\n                                numbers_of_sample, numbers_of_class = numbers_of_sample.cpu().numpy(), numbers_of_class.cpu().numpy()\n                                pred = [[] for _ in range(len(logits))]\n                                for sample_num, class_num in zip(numbers_of_sample, numbers_of_class):\n                                    pred[sample_num].append(int(class_num))\n                        else:\n                            if self.return_probas:\n                                pred = torch.softmax(logits, dim=-1)\n                            else:\n                                pred = torch.argmax(logits, dim=1)\n                            pred = pred.cpu().numpy()\n                self.validation_predictions[task_id] = pred\n        if len(args) == 1:\n            return self.validation_predictions[0]\n        for i in range(len(self.validation_predictions)):\n            if self.validation_predictions[i] is None:\n                self.validation_predictions[i] = []\n        self._reset_cache()\n        log.debug(self.validation_predictions)\n        return self.validation_predictions\n\n    def train_on_batch(self, *args):\n        \"\"\"Train model on given batch.\n        This method calls train_op using features and y (labels).\n        Args:\n            features: batch of InputFeatures\n            y: batch of labels (class id)\n        Returns:\n            dict with 
loss for each task\n        \"\"\"\n        log.debug(f'Training for {args}')\n        error_msg = f'Len of arguments {len(args)} is WRONG. ' \\\n                    f'Correct is {2 * self.n_tasks} as n_tasks is {self.n_tasks}'\n        if len(args) != 2 * self.n_tasks:\n            raise Exception(error_msg)\n        ids_to_iterate = [k for k in range(self.n_tasks) if len(args[k]) > 0]\n        if len(ids_to_iterate) == 0:\n            raise Exception(f'No examples given! Given args {args}')\n        elif len(ids_to_iterate) > 1:\n            raise Exception('Samples from more than 1 task in train_on_batch')\n        task_id = ids_to_iterate[0]\n        _input, batch_size = self._make_input(task_features=args[task_id], task_id=task_id,\n                                              labels=args[task_id + self.n_tasks])\n        if _input == {}:\n            raise Exception('Empty input!')\n\n        if self.prev_id is None:\n            self.prev_id = task_id\n        elif self.prev_id != task_id and not self.printed:\n            log.info('Seen samples from different tasks')\n            self.printed = True\n        if 'token_type_ids' not in _input:\n            _input['token_type_ids'] = None\n        loss, logits = self.model(task_id=task_id, **_input)\n        if self.is_data_parallel:\n            loss = loss.mean()\n        loss = loss / self.gradient_accumulation_steps\n        loss.backward()\n\n        # Clip the norm of the gradients to 1.0.\n        # This is to help prevent the \"exploding gradients\" problem.\n        if self.clip_norm:\n            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm)\n\n        if (self.steps_taken + 1) % self.gradient_accumulation_steps == 0 or (\n                self.steps_per_epoch is not None and (self.steps_taken + 1) % self.steps_per_epoch == 0):\n            self.optimizer.step()\n            self.optimizer.zero_grad()\n        self.train_losses[task_id] = loss.item()\n        
self.steps_taken += 1\n        log.debug(f'train {task_id} {logits}')\n        return {\"losses\": self.train_losses}\n"
  },
  {
    "path": "deeppavlov/models/torch_bert/torch_bert_ranker.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import List, Dict, Union, Optional\n\nimport numpy as np\nimport torch\nfrom transformers import AutoModelForSequenceClassification, AutoConfig\nfrom transformers.data.processors.utils import InputFeatures\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.torch_model import TorchModel\n\nlog = getLogger(__name__)\n\n\n@register('torch_bert_ranker')\nclass TorchBertRankerModel(TorchModel):\n    \"\"\"BERT-based model for interaction-based text ranking on PyTorch.\n\n    Linear transformation is trained over the BERT pooled output from [CLS] token.\n    Predicted probabilities of classes are used as a similarity measure for ranking.\n\n    Args:\n        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. 
\"bert-base-uncased\")\n        bert_config_file: path to Bert configuration file (not used if pretrained_bert is key title)\n        n_classes: number of classes\n        return_probas: set True if class probabilities are returned instead of the most probable label\n    \"\"\"\n\n    def __init__(self, pretrained_bert: str = None,\n                 bert_config_file: Optional[str] = None,\n                 n_classes: int = 2,\n                 return_probas: bool = True,\n                 **kwargs) -> None:\n\n        self.return_probas = return_probas\n\n        if self.return_probas and n_classes == 1:\n            raise RuntimeError('Set return_probas to False for regression task!')\n\n        if pretrained_bert:\n            log.debug(f\"From pretrained {pretrained_bert}.\")\n            if Path(expand_path(pretrained_bert)).exists():\n                pretrained_bert = str(expand_path(pretrained_bert))\n            config = AutoConfig.from_pretrained(pretrained_bert,\n                                                # num_labels=self.n_classes,\n                                                output_attentions=False,\n                                                output_hidden_states=False)\n\n            model = AutoModelForSequenceClassification.from_pretrained(pretrained_bert, config=config)\n\n            # TODO: make better exception handling here and at\n            # deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel.load\n            try:\n                hidden_size = model.classifier.out_proj.in_features\n\n                if n_classes != model.num_labels:\n                    model.classifier.out_proj.weight = torch.nn.Parameter(torch.randn(n_classes, hidden_size))\n                    model.classifier.out_proj.bias = torch.nn.Parameter(torch.randn(n_classes))\n                    model.classifier.out_proj.out_features = n_classes\n                    model.num_labels = n_classes\n\n            except 
AttributeError:\n                hidden_size = model.classifier.in_features\n\n                if n_classes != model.num_labels:\n                    model.classifier.weight = torch.nn.Parameter(torch.randn(n_classes, hidden_size))\n                    model.classifier.bias = torch.nn.Parameter(torch.randn(n_classes))\n                    model.classifier.out_features = n_classes\n                    model.num_labels = n_classes\n\n\n        elif bert_config_file and expand_path(bert_config_file).is_file():\n            self.bert_config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))\n            model = AutoModelForSequenceClassification.from_config(config=self.bert_config)\n\n        else:\n            raise ConfigError(\"No pre-trained BERT model is given.\")\n\n        super().__init__(model, **kwargs)\n\n    def train_on_batch(self, features_li: List[List[InputFeatures]], y: Union[List[int], List[List[int]]]) -> Dict:\n        \"\"\"Train the model on the given batch.\n\n        Args:\n            features_li: list with the single element containing the batch of InputFeatures\n            y: batch of labels (class id or one-hot encoding)\n\n        Returns:\n            dict with loss and learning rate values\n        \"\"\"\n        features = features_li[0]\n\n        input_ids = [f.input_ids for f in features]\n        input_masks = [f.attention_mask for f in features]\n\n        b_input_ids = torch.cat(input_ids, dim=0).to(self.device)\n        b_input_masks = torch.cat(input_masks, dim=0).to(self.device)\n        b_labels = torch.from_numpy(np.array(y)).to(self.device)\n\n        self.optimizer.zero_grad()\n\n        loss, logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_masks,\n                                  labels=b_labels, return_dict=False)\n        self._make_step(loss)\n\n        return {'loss': loss.item()}\n\n    def __call__(self, features_li: List[List[InputFeatures]]) -> Union[List[int], 
List[List[float]]]:\n        \"\"\"Calculate scores for the given context over candidate responses.\n\n        Args:\n            features_li: list of elements where each element contains the batch of features\n             for contexts with particular response candidates\n\n        Returns:\n            predicted scores for contexts over response candidates\n        \"\"\"\n        if len(features_li) == 1 and len(features_li[0]) == 1:\n            msg = f\"It is not intended to use the {self.__class__} in the interact mode.\"\n            log.error(msg)\n            return [msg]\n\n        predictions = []\n        for features in features_li:\n\n            input_ids = [f.input_ids for f in features]\n            input_masks = [f.attention_mask for f in features]\n\n            b_input_ids = torch.cat(input_ids, dim=0).to(self.device)\n            b_input_masks = torch.cat(input_masks, dim=0).to(self.device)\n\n            with torch.no_grad():\n                # Forward pass, calculate logit predictions\n                logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_masks)\n                logits = logits[0]\n\n            if self.return_probas:\n                pred = torch.nn.functional.softmax(logits, dim=-1)[:, 1]\n                pred = pred.detach().cpu().numpy()\n            else:\n                logits = logits.detach().cpu().numpy()\n                pred = np.argmax(logits, axis=1)\n\n            predictions.append(pred)\n\n        if len(features_li) == 1:\n            predictions = predictions[0]\n        else:\n            predictions = np.hstack([np.expand_dims(el, 1) for el in predictions])\n\n        return predictions\n"
  },
  {
    "path": "deeppavlov/models/torch_bert/torch_transformers_classifier.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import List, Dict, Union, Optional, Tuple\n\nimport numpy as np\nimport torch\nfrom torch.nn import BCEWithLogitsLoss\nfrom transformers import AutoModelForSequenceClassification, AutoConfig, AutoModel, AutoTokenizer\nfrom transformers.modeling_outputs import SequenceClassifierOutput\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.torch_model import TorchModel\n\nlog = getLogger(__name__)\n\n\n@register('torch_transformers_classifier')\nclass TorchTransformersClassifierModel(TorchModel):\n    \"\"\"Bert-based model for text classification on PyTorch.\n\n    It uses output from [CLS] token and predicts labels using linear transformation.\n\n    Args:\n        n_classes: number of classes\n        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. 
\"bert-base-uncased\")\n        multilabel: set True if it is multi-label classification\n        return_probas: set True if return class probabilites instead of most probable label needed\n        attention_probs_keep_prob: keep_prob for Bert self-attention layers\n        hidden_keep_prob: keep_prob for Bert hidden layers\n        bert_config_file: path to Bert configuration file (not used if pretrained_bert is key title)\n        is_binary: whether classification task is binary or multi-class\n        num_special_tokens: number of special tokens used by classification model\n    \"\"\"\n\n    def __init__(self, n_classes,\n                 pretrained_bert,\n                 multilabel: bool = False,\n                 return_probas: bool = False,\n                 attention_probs_keep_prob: Optional[float] = None,\n                 hidden_keep_prob: Optional[float] = None,\n                 bert_config_file: Optional[str] = None,\n                 is_binary: Optional[bool] = False,\n                 num_special_tokens: int = None,\n                 **kwargs) -> None:\n\n        self.return_probas = return_probas\n        self.multilabel = multilabel\n        self.n_classes = n_classes\n        self.is_binary = is_binary\n\n        if self.multilabel and not self.return_probas:\n            raise RuntimeError('Set return_probas to True for multilabel classification!')\n\n        if self.return_probas and self.n_classes == 1:\n            raise RuntimeError('Set return_probas to False for regression task!')\n\n        if pretrained_bert:\n            log.debug(f\"From pretrained {pretrained_bert}.\")\n            config = AutoConfig.from_pretrained(pretrained_bert,\n                                                # num_labels=self.n_classes,\n                                                output_attentions=False,\n                                                output_hidden_states=False)\n\n            if self.is_binary:\n                config.add_pooling_layer 
= False\n                model = AutoModelForBinaryClassification(pretrained_bert, config)\n            else:\n                model = AutoModelForSequenceClassification.from_pretrained(pretrained_bert, config=config)\n\n                # TODO need a better solution here and at\n                # deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel.load\n                try:\n                    hidden_size = model.classifier.out_proj.in_features\n\n                    if self.n_classes != model.num_labels:\n                        model.classifier.out_proj.weight = torch.nn.Parameter(torch.randn(self.n_classes,\n                                                                                               hidden_size))\n                        model.classifier.out_proj.bias = torch.nn.Parameter(torch.randn(self.n_classes))\n                        model.classifier.out_proj.out_features = self.n_classes\n                        model.num_labels = self.n_classes\n\n                except AttributeError:\n                    hidden_size = model.classifier.in_features\n\n                    if self.n_classes != model.num_labels:\n                        model.classifier.weight = torch.nn.Parameter(torch.randn(self.n_classes, hidden_size))\n                        model.classifier.bias = torch.nn.Parameter(torch.randn(self.n_classes))\n                        model.classifier.out_features = self.n_classes\n                        model.num_labels = self.n_classes\n\n        elif bert_config_file and Path(bert_config_file).is_file():\n            bert_config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))\n            if attention_probs_keep_prob is not None:\n                bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob\n            if hidden_keep_prob is not None:\n                bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob\n            model = 
AutoModelForSequenceClassification.from_config(config=bert_config)\n        else:\n            raise ConfigError(\"No pre-trained BERT model is given.\")\n\n        tokenizer = AutoTokenizer.from_pretrained(pretrained_bert)\n        if num_special_tokens is not None:\n            model.resize_token_embeddings(len(tokenizer) + num_special_tokens)\n\n        super().__init__(model, **kwargs)\n\n    def train_on_batch(self, features: Dict[str, torch.tensor], y: Union[List[int], List[List[int]]]) -> Dict:\n        \"\"\"Train model on given batch.\n        This method calls train_op using features and y (labels).\n\n        Args:\n            features: batch of InputFeatures\n            y: batch of labels (class id or one-hot encoding)\n\n        Returns:\n            dict with loss and learning_rate values\n        \"\"\"\n\n        _input = {key: value.to(self.device) for key, value in features.items()}\n\n        if self.n_classes > 1 and not self.is_binary:\n            _input[\"labels\"] = torch.from_numpy(np.array(y)).to(self.device)\n\n        # regression\n        else:\n            _input[\"labels\"] = torch.from_numpy(np.array(y, dtype=np.float32)).unsqueeze(1).to(self.device)\n\n        self.optimizer.zero_grad()\n\n        tokenized = {key: value for (key, value) in _input.items()\n                     if key in self.accepted_keys}\n\n        loss = self.model(**tokenized).loss\n        if self.is_data_parallel:\n            loss = loss.mean()\n        self._make_step(loss)\n\n        return {'loss': loss.item()}\n\n    def __call__(self, features: Dict[str, torch.tensor]) -> Union[List[int], List[List[float]]]:\n        \"\"\"Make prediction for given features (texts).\n\n        Args:\n            features: batch of InputFeatures\n\n        Returns:\n            predicted classes or probabilities of each class\n\n        \"\"\"\n\n        _input = {key: value.to(self.device) for key, value in features.items()}\n\n        with torch.no_grad():\n           
 tokenized = {key: value for (key, value) in _input.items()\n                         if key in self.accepted_keys}\n\n            # Forward pass, calculate logit predictions\n            logits = self.model(**tokenized)\n            logits = logits[0]\n\n        if self.return_probas:\n            if self.is_binary:\n                pred = torch.sigmoid(logits).squeeze(1)\n            elif not self.multilabel:\n                pred = torch.nn.functional.softmax(logits, dim=-1)\n            else:\n                pred = torch.nn.functional.sigmoid(logits)\n            pred = pred.detach().cpu().numpy()\n        elif self.n_classes > 1:\n            logits = logits.detach().cpu().numpy()\n            pred = np.argmax(logits, axis=1)\n        # regression\n        else:\n            pred = logits.squeeze(-1).detach().cpu().numpy()\n\n        return pred\n\n    # TODO move to the super class\n    @property\n    def accepted_keys(self) -> Tuple[str]:\n        if self.is_data_parallel:\n            accepted_keys = self.model.module.forward.__code__.co_varnames\n        else:\n            accepted_keys = self.model.forward.__code__.co_varnames\n        return accepted_keys\n\n\nclass AutoModelForBinaryClassification(torch.nn.Module):\n\n    def __init__(self, pretrained_bert, config):\n        super().__init__()\n        self.pretrained_bert = pretrained_bert\n        self.config = config\n\n        self.model = AutoModel.from_pretrained(self.pretrained_bert, self.config)\n        self.classifier = BinaryClassificationHead(config)\n\n        self.classifier.init_weights()\n\n    def forward(self,\n                input_ids=None,\n                attention_mask=None,\n                token_type_ids=None,\n                position_ids=None,\n                head_mask=None,\n                inputs_embeds=None,\n                labels=None,\n                output_attentions=None,\n                output_hidden_states=None,\n                return_dict=None):\n\n        
return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n\n        outputs = self.model(input_ids,\n                             attention_mask=attention_mask,\n                             token_type_ids=token_type_ids,\n                             position_ids=position_ids,\n                             head_mask=head_mask,\n                             inputs_embeds=inputs_embeds,\n                             output_attentions=output_attentions,\n                             output_hidden_states=output_hidden_states,\n                             return_dict=return_dict)\n\n        sequence_output = outputs[0]\n        logits = self.classifier(sequence_output)\n\n        loss = None\n        if labels is not None:\n            loss_fct = BCEWithLogitsLoss()\n            loss = loss_fct(logits, labels)\n        if not return_dict:\n            output = (logits,) + outputs[2:]\n            return ((loss,) + output) if loss is not None else output\n\n        return SequenceClassifierOutput(loss=loss,\n                                        logits=logits,\n                                        hidden_states=outputs.hidden_states,\n                                        attentions=outputs.attentions)\n\n\nclass BinaryClassificationHead(torch.nn.Module):\n    def __init__(self, config):\n        super().__init__()\n\n        self.config = config\n\n        self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)\n        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)\n        self.out_proj = torch.nn.Linear(config.hidden_size, 1)\n\n    def init_weights(self):\n        self.dense.weight.data.normal_(mean=0.0, std=self.config.initializer_range)\n        if self.dense.bias is not None:\n            self.dense.bias.data.zero_()\n\n    def forward(self, features, **kwargs):\n        x = features[:, 0, :]\n        x = self.dropout(x)\n        x = self.dense(x)\n        x = torch.tanh(x)\n        x = 
self.dropout(x)\n        x = self.out_proj(x)\n        return x\n"
  },
  {
    "path": "deeppavlov/models/torch_bert/torch_transformers_el_ranker.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Tuple, Union, Any\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch import Tensor\nfrom transformers import AutoConfig, AutoTokenizer, AutoModel\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.torch_model import TorchModel\nfrom deeppavlov.models.preprocessors.torch_transformers_preprocessor import TorchTransformersEntityRankerPreprocessor\n\nlog = getLogger(__name__)\n\n\n@register('torch_transformers_el_ranker')\nclass TorchTransformersElRanker(TorchModel):\n    \"\"\"Class for ranking of entities by context and description\n    Args:\n        encoder_save_path: path to save the encoder checkpoint\n        bilinear_save_path: path to save bilinear layer checkpoint\n        block_size: size of block in bilinear layer\n        emb_size: entity embedding size\n        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. 
\"bert-base-uncased\")\n        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name\n        return_probas: set this to `True` if you need the probabilities instead of raw answers\n    \"\"\"\n\n    def __init__(\n            self,\n            encoder_save_path: str,\n            bilinear_save_path: str,\n            block_size: int,\n            emb_size: int,\n            pretrained_bert: str = None,\n            return_probas: bool = False,\n            **kwargs\n    ):\n        self.return_probas = return_probas\n\n        model = SiameseBertElModel(\n            pretrained_bert=pretrained_bert,\n            encoder_save_path=encoder_save_path,\n            bilinear_save_path=bilinear_save_path,\n            bert_config_file=pretrained_bert,\n            block_size=block_size,\n            emb_size=emb_size\n        )\n\n        super().__init__(model, **kwargs)\n\n    def train_on_batch(self, q_features: List[Dict],\n                       c_features: List[Dict],\n                       entity_tokens_pos: List[int],\n                       labels: List[int]) -> float:\n        \"\"\"\n\n        Args:\n            q_features: batch of indices of text subwords\n            c_features: batch of indices of entity description subwords\n            entity_tokens_pos: list of indices of special tokens\n            labels: 1 if entity is appropriate to context, 0 - otherwise\n\n        Returns:\n            the value of loss\n        \"\"\"\n        _input = {'labels': labels}\n        _input['entity_tokens_pos'] = entity_tokens_pos\n        for elem in ['input_ids', 'attention_mask']:\n            inp_elem = [getattr(f, elem) for f in q_features]\n            _input[f\"q_{elem}\"] = torch.LongTensor(inp_elem).to(self.device)\n        for elem in ['input_ids', 'attention_mask']:\n            inp_elem = [getattr(f, elem) for f in c_features]\n            _input[f\"c_{elem}\"] = 
torch.LongTensor(inp_elem).to(self.device)\n\n        self.model.train()\n        self.model.zero_grad()\n        self.optimizer.zero_grad()  # zero the parameter gradients\n\n        loss, softmax_scores = self.model(**_input)\n        self._make_step(loss)\n\n        return loss.item()\n\n    def __call__(self, q_features: List[Dict],\n                 c_features: List[Dict],\n                 entity_tokens_pos: List[int]) -> Union[List[int], List[np.ndarray]]:\n        \"\"\" Predicts entity labels (1 if the entity description is appropriate to the context, 0 - otherwise)\n\n        Args:\n            q_features: batch of indices of text subwords\n            c_features: batch of indices of entity description subwords\n            entity_tokens_pos: list of indices of special tokens\n\n        Returns:\n            Label indices or class probabilities for each token (not subtoken)\n\n        \"\"\"\n        self.model.eval()\n\n        _input = {'entity_tokens_pos': entity_tokens_pos}\n        for elem in ['input_ids', 'attention_mask']:\n            inp_elem = [getattr(f, elem) for f in q_features]\n            _input[f\"q_{elem}\"] = torch.LongTensor(inp_elem).to(self.device)\n        for elem in ['input_ids', 'attention_mask']:\n            inp_elem = [getattr(f, elem) for f in c_features]\n            _input[f\"c_{elem}\"] = torch.LongTensor(inp_elem).to(self.device)\n\n        with torch.no_grad():\n            softmax_scores = self.model(**_input)\n            if self.return_probas:\n                pred = softmax_scores\n            else:\n                pred = torch.argmax(softmax_scores, dim=1).cpu().numpy()\n\n        return pred\n\n    def save(self, fname: Optional[str] = None, *args, **kwargs) -> None:\n        if fname is None:\n            fname = self.save_path\n        if not fname.parent.is_dir():\n            raise ConfigError(\"Provided save path is incorrect!\")\n        weights_path = Path(fname).with_suffix(f\".pth.tar\")\n        
log.info(f\"Saving model to {weights_path}.\")\n        torch.save({\n            \"model_state_dict\": self.model.cpu().state_dict(),\n            \"optimizer_state_dict\": self.optimizer.state_dict(),\n            \"epochs_done\": self.epochs_done\n        }, weights_path)\n        self.model.to(self.device)\n        self.model.save()\n\n\nclass TextEncoder(nn.Module):\n    \"\"\"Class for obtaining the BERT output for CLS-token and special entity token\n    Args:\n        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. \"bert-base-uncased\")\n        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name\n        device: device to use\n    \"\"\"\n\n    def __init__(self, pretrained_bert: str = None,\n                 bert_config_file: str = None,\n                 device: torch.device = torch.device('cpu')):\n        super().__init__()\n        self.pretrained_bert = pretrained_bert\n        self.bert_config_file = bert_config_file\n        self.encoder, self.config, self.bert_config = None, None, None\n        self.device = device\n        self.load()\n        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_bert)\n        self.encoder.resize_token_embeddings(len(self.tokenizer) + 1)\n        self.encoder.to(self.device)\n\n    def forward(self,\n                input_ids: Tensor,\n                attention_mask: Tensor,\n                entity_tokens_pos: List[int] = None\n                ) -> Union[Tuple[Any, Tensor], Tuple[Tensor]]:\n        if entity_tokens_pos is not None:\n            q_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)\n            q_hidden_states = q_outputs.last_hidden_state\n\n            entity_emb = []\n            for i in range(len(entity_tokens_pos)):\n                pos = entity_tokens_pos[i]\n                entity_emb.append(q_hidden_states[i, pos])\n\n            entity_emb = torch.stack(entity_emb, dim=0).to(self.device)\n  
          return entity_emb\n        else:\n            c_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)\n            c_cls_emb = c_outputs.last_hidden_state[:, :1, :].squeeze(1)\n            return c_cls_emb\n\n    def load(self) -> None:\n        if self.pretrained_bert:\n            log.debug(f\"From pretrained {self.pretrained_bert}.\")\n            self.config = AutoConfig.from_pretrained(\n                self.pretrained_bert, output_hidden_states=True\n            )\n            self.encoder = AutoModel.from_pretrained(self.pretrained_bert, config=self.config)\n\n        elif self.bert_config_file and Path(self.bert_config_file).is_file():\n            self.config = AutoConfig.from_json_file(str(expand_path(self.bert_config_file)))\n            self.encoder = AutoModel.from_config(config=self.bert_config)\n        else:\n            raise ConfigError(\"No pre-trained BERT model is given.\")\n        self.encoder.to(self.device)\n\n\nclass BilinearRanking(nn.Module):\n    \"\"\"Class for calculation of bilinear form of two vectors\n    Args:\n        n_classes: number of classes for classification\n        emb_size: entity embedding size\n        block_size: size of block in bilinear layer\n    \"\"\"\n\n    def __init__(self, n_classes: int = 2, emb_size: int = 768, block_size: int = 8):\n        super().__init__()\n        self.n_classes = n_classes\n        self.emb_size = emb_size\n        self.block_size = block_size\n        self.bilinear = nn.Linear(self.emb_size * self.block_size, self.n_classes)\n        self.softmax = nn.Softmax(dim=1)\n\n    def forward(self, text1: Tensor, text2: Tensor):\n        b1 = text1.view(-1, self.emb_size // self.block_size, self.block_size)\n        b2 = text2.view(-1, self.emb_size // self.block_size, self.block_size)\n        bl = (b1.unsqueeze(3) * b2.unsqueeze(2)).view(-1, self.emb_size * self.block_size)\n        logits = self.bilinear(bl)\n        softmax_logits = self.softmax(logits)\n  
      log_softmax = F.log_softmax(logits, dim=-1)\n        return softmax_logits, log_softmax\n\n\nclass SiameseBertElModel(nn.Module):\n    \"\"\"Class with model for ranking of entities by context and description\n    Args:\n        emb_size: entity embedding size\n        block_size: size of block in bilinear layer\n        encoder_save_path: path to save the encoder checkpoint\n        bilinear_save_path: path to save bilinear layer checkpoint\n        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. \"bert-base-uncased\")\n        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name\n        device: device to use\n    \"\"\"\n\n    def __init__(\n            self,\n            emb_size: int,\n            block_size: int,\n            encoder_save_path: str,\n            bilinear_save_path: str,\n            pretrained_bert: str = None,\n            bert_config_file: str = None,\n            device: torch.device = torch.device('cpu')\n    ):\n        super().__init__()\n        self.pretrained_bert = pretrained_bert\n        self.encoder_save_path = encoder_save_path\n        self.bilinear_save_path = bilinear_save_path\n        self.bert_config_file = bert_config_file\n        self.device = device\n\n        # initialize parameters that would be filled later\n        self.encoder = TextEncoder(pretrained_bert=self.pretrained_bert, device=self.device)\n        self.bilinear_ranker = BilinearRanking(emb_size, block_size)\n\n    def forward(\n            self,\n            q_input_ids: Tensor,\n            q_attention_mask: Tensor,\n            c_input_ids: Tensor,\n            c_attention_mask: Tensor,\n            entity_tokens_pos: List,\n            labels: List[int] = None\n    ) -> Union[Tuple[Any, Tensor], Tuple[Tensor]]:\n\n        entity_emb = self.encoder(input_ids=q_input_ids, attention_mask=q_attention_mask,\n                                  entity_tokens_pos=entity_tokens_pos)\n        
c_cls_emb = self.encoder(input_ids=c_input_ids, attention_mask=c_attention_mask)\n        softmax_scores, log_softmax = self.bilinear_ranker(entity_emb, c_cls_emb)\n\n        if labels is not None:\n            labels_one_hot = [[0.0, 0.0] for _ in labels]\n            for i in range(len(labels)):\n                labels_one_hot[i][labels[i]] = 1.0\n            labels_one_hot = torch.Tensor(labels_one_hot).to(self.device)\n\n            bs, dim = labels_one_hot.shape\n            per_sample_loss = -torch.bmm(labels_one_hot.view(bs, 1, dim), log_softmax.view(bs, dim, 1)).squeeze(\n                2).squeeze(1)\n            loss = torch.mean(per_sample_loss)\n            return loss, softmax_scores\n        else:\n            return softmax_scores\n\n    def save(self) -> None:\n        encoder_weights_path = expand_path(self.encoder_save_path).with_suffix(f\".pth.tar\")\n        log.info(f\"Saving encoder to {encoder_weights_path}.\")\n        torch.save({\"model_state_dict\": self.encoder.cpu().state_dict()}, encoder_weights_path)\n        bilinear_weights_path = expand_path(self.bilinear_save_path).with_suffix(f\".pth.tar\")\n        log.info(f\"Saving bilinear weights to {bilinear_weights_path}.\")\n        torch.save({\"model_state_dict\": self.bilinear_ranker.cpu().state_dict()}, bilinear_weights_path)\n        self.encoder.to(self.device)\n        self.bilinear_ranker.to(self.device)\n\n\n@register('torch_transformers_entity_ranker_infer')\nclass TorchTransformersEntityRankerInfer:\n    \"\"\"Class for inference with the model for ranking of entities from a knowledge base by context and description\n    Args:\n        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. 
\"bert-base-uncased\")\n        encoder_weights_path: path to save the encoder checkpoint\n        bilinear_weights_path: path to save bilinear layer checkpoint\n        spaecial_token_id: id of special token\n        do_lower_case: whether to lower case the text\n        batch_size: batch size when model infering\n        emb_size: entity embedding size\n        block_size: size of block in bilinear layer\n        device: `cpu` or `gpu` device to use\n    \"\"\"\n\n    def __init__(self, pretrained_bert,\n                 encoder_weights_path,\n                 bilinear_weights_path,\n                 special_token_id: int,\n                 do_lower_case: bool = False,\n                 batch_size: int = 5,\n                 emb_size: int = 300,\n                 block_size: int = 8,\n                 device: str = \"gpu\", **kwargs):\n        self.device = torch.device(\"cuda\" if torch.cuda.is_available() and device == \"gpu\" else \"cpu\")\n        self.pretrained_bert = pretrained_bert\n        self.preprocessor = TorchTransformersEntityRankerPreprocessor(vocab_file=self.pretrained_bert,\n                                                                      do_lower_case=do_lower_case,\n                                                                      special_tokens=[\"[ENT]\"])\n        self.encoder, self.config = None, None\n        self.config = AutoConfig.from_pretrained(self.pretrained_bert, output_hidden_states=True)\n        self.emb_size = emb_size\n        self.block_size = block_size\n        self.encoder = TextEncoder(pretrained_bert=self.pretrained_bert, device=self.device)\n        self.encoder_weights_path = str(expand_path(encoder_weights_path))\n        self.bilinear_weights_path = str(expand_path(bilinear_weights_path))\n        encoder_checkpoint = torch.load(self.encoder_weights_path, map_location=self.device)\n        self.encoder.load_state_dict(encoder_checkpoint[\"model_state_dict\"])\n        self.encoder.to(self.device)\n        
self.bilinear_ranking = BilinearRanking(emb_size=self.emb_size, block_size=self.block_size)\n        bilinear_checkpoint = torch.load(self.bilinear_weights_path, map_location=self.device)\n        self.bilinear_ranking.load_state_dict(bilinear_checkpoint[\"model_state_dict\"])\n        self.bilinear_ranking.to(self.device)\n        self.special_token_id = special_token_id\n        self.batch_size = batch_size\n\n    def __call__(self, contexts_batch: List[str],\n                 candidate_entities_batch: List[List[str]],\n                 candidate_entities_descr_batch: List[List[str]]):\n        entity_emb_batch = []\n\n        num_batches = len(contexts_batch) // self.batch_size + int(len(contexts_batch) % self.batch_size > 0)\n        for ii in range(num_batches):\n            contexts_list = contexts_batch[ii * self.batch_size:(ii + 1) * self.batch_size]\n            context_features = self.preprocessor(contexts_list)\n            context_input_ids = context_features[\"input_ids\"].to(self.device)\n            context_attention_mask = context_features[\"attention_mask\"].to(self.device)\n            special_tokens_pos = []\n            for input_ids_list in context_input_ids:\n                found_n = -1\n                for n, input_id in enumerate(input_ids_list):\n                    if input_id == self.special_token_id:\n                        found_n = n\n                        break\n                if found_n == -1:\n                    found_n = 0\n                special_tokens_pos.append(found_n)\n\n            cur_entity_emb_batch = self.encoder(input_ids=context_input_ids,\n                                                attention_mask=context_attention_mask,\n                                                entity_tokens_pos=special_tokens_pos)\n\n            entity_emb_batch += cur_entity_emb_batch.detach().cpu().numpy().tolist()\n\n        scores_batch = []\n        for entity_emb, candidate_entities_list, candidate_entities_descr_list in \\\n  
              zip(entity_emb_batch, candidate_entities_batch, candidate_entities_descr_batch):\n            if candidate_entities_list:\n                entity_emb = [entity_emb for _ in candidate_entities_list]\n                entity_emb = torch.Tensor(entity_emb).to(self.device)\n                descr_features = self.preprocessor(candidate_entities_descr_list)\n                descr_input_ids = descr_features[\"input_ids\"].to(self.device)\n                descr_attention_mask = descr_features[\"attention_mask\"].to(self.device)\n                candidate_entities_emb = self.encoder(input_ids=descr_input_ids,\n                                                      attention_mask=descr_attention_mask)\n                scores_list, _ = self.bilinear_ranking(entity_emb, candidate_entities_emb)\n                scores_list = scores_list.detach().cpu().numpy()\n                scores_list = [score[1] for score in scores_list]\n                entities_with_scores = [(entity, score) for entity, score in zip(candidate_entities_list, scores_list)]\n                entities_with_scores = sorted(entities_with_scores, key=lambda x: x[1], reverse=True)\n                scores_batch.append(entities_with_scores)\n            else:\n                scores_batch.append([])\n\n        return scores_batch\n"
  },
  {
    "path": "deeppavlov/models/torch_bert/torch_transformers_multiplechoice.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import List, Dict, Union, Optional\n\nimport numpy as np\nimport torch\nfrom transformers import AutoModelForMultipleChoice, AutoConfig\n\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.torch_model import TorchModel\n\nlog = getLogger(__name__)\n\n\n@register('torch_transformers_multiplechoice')\nclass TorchTransformersMultiplechoiceModel(TorchModel):\n    \"\"\"Bert-based model for text classification on PyTorch.\n\n    It uses output from [CLS] token and predicts labels using linear transformation.\n\n    Args:\n        n_classes: number of classes\n        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. 
\"bert-base-uncased\")\n        multilabel: set True if it is multi-label classification\n        return_probas: set True if return class probabilites instead of most probable label needed\n        attention_probs_keep_prob: keep_prob for Bert self-attention layers\n        hidden_keep_prob: keep_prob for Bert hidden layers\n        bert_config_file: path to Bert configuration file (not used if pretrained_bert is key title)\n    \"\"\"\n\n    def __init__(self, n_classes,\n                 pretrained_bert,\n                 multilabel: bool = False,\n                 return_probas: bool = False,\n                 attention_probs_keep_prob: Optional[float] = None,\n                 hidden_keep_prob: Optional[float] = None,\n                 bert_config_file: Optional[str] = None,\n                 **kwargs) -> None:\n\n        self.return_probas = return_probas\n        self.multilabel = multilabel\n        self.n_classes = n_classes\n\n        if self.multilabel and not self.return_probas:\n            raise RuntimeError('Set return_probas to True for multilabel classification!')\n\n        if self.return_probas and self.n_classes == 1:\n            raise RuntimeError('Set return_probas to False for regression task!')\n\n        if pretrained_bert:\n            log.debug(f\"From pretrained {pretrained_bert}.\")\n            config = AutoConfig.from_pretrained(pretrained_bert, num_labels=self.n_classes,\n                                                output_attentions=False, output_hidden_states=False)\n\n            model = AutoModelForMultipleChoice.from_pretrained(pretrained_bert, config=config)\n\n        elif bert_config_file and Path(bert_config_file).is_file():\n            bert_config = AutoConfig.from_json_file(str(expand_path(bert_config_file)))\n            if attention_probs_keep_prob is not None:\n                bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob\n            if hidden_keep_prob is not None:\n                
bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob\n            model = AutoModelForMultipleChoice.from_config(config=bert_config)\n        else:\n            raise ConfigError(\"No pre-trained BERT model is given.\")\n\n        super().__init__(model, **kwargs)\n\n    def train_on_batch(self, features: Dict[str, torch.tensor], y: Union[List[int], List[List[int]]]) -> Dict:\n        \"\"\"Train model on given batch.\n        This method calls train_op using features and y (labels).\n\n        Args:\n            features: batch of InputFeatures\n            y: batch of labels (class id or one-hot encoding)\n\n        Returns:\n            dict with the loss value\n        \"\"\"\n\n        _input = {key: value.to(self.device) for key, value in features.items()}\n\n        _input[\"labels\"] = torch.tensor(y).long().to(self.device)\n\n        self.optimizer.zero_grad()\n\n        tokenized = {key: value for (key, value) in _input.items() if key in self.model.forward.__code__.co_varnames}\n\n        loss = self.model(**tokenized).loss\n        self._make_step(loss)\n\n        return {'loss': loss.item()}\n\n    def __call__(self, features: Dict[str, torch.tensor]) -> Union[List[int], List[List[float]]]:\n        \"\"\"Make prediction for given features (texts).\n\n        Args:\n            features: batch of InputFeatures\n\n        Returns:\n            predicted classes or probabilities of each class\n\n        \"\"\"\n\n        _input = {key: value.to(self.device) for key, value in features.items()}\n\n        with torch.no_grad():\n            tokenized = {key: value for (key, value) in _input.items()\n                         if key in self.model.forward.__code__.co_varnames}\n\n            # Forward pass, calculate logit predictions\n            logits = self.model(**tokenized)\n            logits = logits[0]\n\n        if self.return_probas:\n            if not self.multilabel:\n                pred = torch.nn.functional.softmax(logits, 
dim=-1)\n            else:\n                pred = torch.nn.functional.sigmoid(logits)\n            pred = pred.detach().cpu().numpy()\n        elif self.n_classes > 1:\n            logits = logits.detach().cpu().numpy()\n            pred = np.argmax(logits, axis=1)\n        else:  # regression\n            pred = logits.squeeze(-1).detach().cpu().numpy()\n\n        return pred\n"
  },
  {
    "path": "deeppavlov/models/torch_bert/torch_transformers_nll_ranking.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import List, Dict, Tuple, Union, Any\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch import Tensor\nfrom transformers import AutoConfig, AutoModel, AutoTokenizer\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.torch_model import TorchModel\n\nlog = getLogger(__name__)\n\n\n@register('torch_transformers_nll_ranker')\nclass TorchTransformersNLLRanker(TorchModel):\n    \"\"\"Class for ranking of relations using the model trained with NLL loss\n    Args:\n        pretrained_bert: pretrained transformer checkpoint path or key title (e.g. 
\"bert-base-uncased\")\n        encoder_save_path: path to save the encoder checkpoint\n        linear_save_path: path to save linear layer checkpoint\n        return_probas: set this to `True` if you need the probabilities instead of raw answers\n    \"\"\"\n\n    def __init__(\n            self,\n            pretrained_bert: str = None,\n            encoder_save_path: str = None,\n            linear_save_path: str = None,\n            return_probas: bool = False,\n            **kwargs\n    ):\n        self.return_probas = return_probas\n\n        model = NLLRanking(\n            pretrained_bert=pretrained_bert,\n            encoder_save_path=encoder_save_path,\n            linear_save_path=linear_save_path,\n            bert_tokenizer_config_file=pretrained_bert,\n        )\n\n        super().__init__(model, **kwargs)\n\n    def train_on_batch(self, input_features: Dict[str, Any], positive_idx: List[int]) -> float:\n        _input = {'positive_idx': positive_idx,\n                  \"input_ids\": torch.LongTensor(input_features[\"input_ids\"]).to(self.device),\n                  \"attention_mask\": torch.LongTensor(input_features[\"attention_mask\"]).to(self.device),\n                  \"token_type_ids\": torch.LongTensor(input_features[\"token_type_ids\"]).to(self.device)}\n\n        self.model.train()\n        self.model.zero_grad()\n        self.optimizer.zero_grad()  # zero the parameter gradients\n\n        loss, softmax_scores = self.model(**_input)\n        loss.backward()\n        self.optimizer.step()\n\n        # Clip the norm of the gradients to prevent the \"exploding gradients\" problem\n        if self.clip_norm:\n            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm)\n\n        return loss.item()\n\n    def __call__(self, input_features: Dict[str, Any]) -> Union[List[int], List[np.ndarray]]:\n        self.model.eval()\n        _input = {\"input_ids\": torch.LongTensor(input_features[\"input_ids\"]).to(self.device),\n    
              \"attention_mask\": torch.LongTensor(input_features[\"attention_mask\"]).to(self.device),\n                  \"token_type_ids\": torch.LongTensor(input_features[\"token_type_ids\"]).to(self.device)}\n\n        with torch.no_grad():\n            output = self.model(**_input)\n            if isinstance(output, tuple) and len(output) == 2:\n                loss, softmax_scores = output\n            else:\n                softmax_scores = output\n        if self.return_probas:\n            softmax_scores = softmax_scores.cpu().numpy().tolist()\n            return softmax_scores\n        else:\n            pred = torch.argmax(softmax_scores, dim=1)\n            pred = pred.cpu()\n            pred = pred.numpy()\n            return pred\n\n\nclass NLLRanking(nn.Module):\n    \"\"\"Class which implements the relation ranking model\n    Args:\n        pretrained_bert: pretrained transformer checkpoint path or key title (e.g. \"bert-base-uncased\")\n        encoder_save_path: path to save the encoder checkpoint\n        linear_save_path: path to save linear layer checkpoint\n        bert_tokenizer_config_file: path to configuration file of transformer tokenizer\n        device: cpu or gpu\n    \"\"\"\n\n    def __init__(\n            self,\n            pretrained_bert: str = None,\n            encoder_save_path: str = None,\n            linear_save_path: str = None,\n            bert_tokenizer_config_file: str = None,\n            device: str = \"gpu\"\n    ):\n        super().__init__()\n        self.pretrained_bert = pretrained_bert\n        self.encoder_save_path = encoder_save_path\n        self.linear_save_path = linear_save_path\n        self.device = torch.device(\"cuda\" if torch.cuda.is_available() and device == \"gpu\" else \"cpu\")\n\n        # initialize parameters that would be filled later\n        self.encoder, self.config, self.bert_config = None, None, None\n        self.load()\n\n        if Path(bert_tokenizer_config_file).is_file():\n        
    vocab_file = str(expand_path(bert_tokenizer_config_file))\n            tokenizer = AutoTokenizer(vocab_file=vocab_file)\n        else:\n            tokenizer = AutoTokenizer.from_pretrained(pretrained_bert)\n        self.encoder.resize_token_embeddings(len(tokenizer) + 7)\n\n    def forward(\n            self,\n            input_ids: Tensor,\n            attention_mask: Tensor,\n            token_type_ids: Tensor,\n            positive_idx: List[List[int]] = None\n    ) -> Union[Tuple[Any, Tensor], Tuple[Tensor]]:\n\n        bs, samples_num, seq_len = input_ids.size()\n        input_ids = input_ids.reshape(bs * samples_num, -1)\n        attention_mask = attention_mask.reshape(bs * samples_num, -1)\n        token_type_ids = token_type_ids.reshape(bs * samples_num, -1)\n        if hasattr(self.config, \"type_vocab_size\"):\n            encoder_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask,\n                                          token_type_ids=token_type_ids)\n        else:\n            encoder_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)\n        cls_emb = encoder_output.last_hidden_state[:, :1, :].squeeze(1)\n        scores = self.fc(cls_emb)\n        scores = scores.reshape(bs, samples_num)\n\n        if positive_idx is not None:\n            scores = F.log_softmax(scores, dim=1)\n            positive_idx = []\n            for i in range(bs):\n                positive_idx.append(0)\n            loss = F.nll_loss(scores, torch.tensor(positive_idx).to(scores.device), reduction=\"mean\")\n            return loss, scores\n        else:\n            return scores\n\n    def load(self) -> None:\n        if self.pretrained_bert:\n            log.info(f\"From pretrained {self.pretrained_bert}.\")\n            self.config = AutoConfig.from_pretrained(\n                self.pretrained_bert, output_hidden_states=True\n            )\n            self.encoder = AutoModel.from_pretrained(self.pretrained_bert, 
config=self.config)\n            self.fc = nn.Linear(self.config.hidden_size, 1)\n        else:\n            raise ConfigError(\"No pre-trained BERT model is given.\")\n\n        self.encoder.to(self.device)\n        self.fc.to(self.device)\n"
  },
  {
    "path": "deeppavlov/models/torch_bert/torch_transformers_sequence_tagger.py",
    "content": "# Copyright 2019 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import List, Union, Dict, Optional, Tuple\n\nimport numpy as np\nimport torch\nfrom transformers import AutoModelForTokenClassification, AutoConfig\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.torch_model import TorchModel\nfrom deeppavlov.models.torch_bert.crf import CRF\n\nlog = getLogger(__name__)\n\n\ndef token_from_subtoken(units: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:\n    \"\"\" Assemble token level units from subtoken level units\n\n    Args:\n        units: torch.Tensor of shape [batch_size, SUBTOKEN_seq_length, n_features]\n        mask: mask of token beginnings. For example: for tokens\n\n                [[``[CLS]`` ``My``, ``capybara``, ``[SEP]``],\n                [``[CLS]`` ``Your``, ``aar``, ``##dvark``, ``is``, ``awesome``, ``[SEP]``]]\n\n            the mask will be\n\n                [[0, 1, 1, 0, 0, 0, 0],\n                [0, 1, 1, 0, 1, 1, 0]]\n\n    Returns:\n        word_level_units: Units assembled from ones in the mask. 
For the\n            example above these units will correspond to the following\n\n                [[``My``, ``capybara``],\n                [``Your``, ``aar``, ``is``, ``awesome``,]]\n\n            the shape of this tensor will be [batch_size, TOKEN_seq_length, n_features]\n    \"\"\"\n    shape = units.size()\n    batch_size = shape[0]\n    nf = shape[2]\n    nf_int = units.size()[-1]\n\n    token_seq_lengths = torch.sum(mask, 1).to(torch.int64)\n\n    n_words = torch.sum(token_seq_lengths)\n\n    max_token_seq_len = torch.max(token_seq_lengths)\n\n    idxs = torch.stack(torch.nonzero(mask, as_tuple=True), dim=1)\n\n    sample_ids_in_batch = torch.nn.functional.pad(input=idxs[:, 0], pad=[1, 0])\n\n    a = torch.logical_not(torch.eq(sample_ids_in_batch[1:], sample_ids_in_batch[:-1]).to(torch.int64))\n\n    q = a * torch.arange(n_words).to(torch.int64)\n    count_to_substract = torch.nn.functional.pad(torch.masked_select(q, q.to(torch.bool)), [1, 0])\n\n    new_word_indices = torch.arange(n_words).to(torch.int64) - torch.gather(\n        count_to_substract, dim=0, index=torch.cumsum(a, 0))\n\n    n_total_word_elements = (batch_size * max_token_seq_len).to(torch.int32)\n    word_indices_flat = (idxs[:, 0] * max_token_seq_len + new_word_indices).to(torch.int64)\n    x_mask = torch.sum(torch.nn.functional.one_hot(word_indices_flat, n_total_word_elements), 0)\n    x_mask = x_mask.to(torch.bool)\n\n    full_range = torch.arange(batch_size * max_token_seq_len).to(torch.int64)\n    nonword_indices_flat = torch.masked_select(full_range, torch.logical_not(x_mask))\n\n    def gather_nd(params, indices):\n        assert type(indices) == torch.Tensor\n        return params[indices.transpose(0, 1).long().numpy().tolist()]\n\n    elements = gather_nd(units, idxs)\n\n    sh = tuple(torch.stack([torch.sum(max_token_seq_len - token_seq_lengths), torch.tensor(nf)], 0).numpy())\n    paddings = torch.zeros(sh, dtype=torch.float64)\n\n    def dynamic_stitch(indices, data):\n        # 
https://discuss.pytorch.org/t/equivalent-of-tf-dynamic-partition/53735/2\n        n = sum(idx.numel() for idx in indices)\n        res = [None] * n\n        for i, data_ in enumerate(data):\n            idx = indices[i].view(-1)\n            if idx.numel() > 0:\n                d = data_.view(idx.numel(), -1)\n                k = 0\n                for idx_ in idx:\n                    res[idx_] = d[k].to(torch.float64)\n                    k += 1\n        return res\n\n    tensor_flat = torch.stack(dynamic_stitch([word_indices_flat, nonword_indices_flat], [elements, paddings]))\n\n    tensor = torch.reshape(tensor_flat, (batch_size, max_token_seq_len.item(), nf_int))\n\n    return tensor\n\n\ndef token_labels_to_subtoken_labels(labels, y_mask, input_mask):\n    subtoken_labels = []\n    labels_ind = 0\n    n_tokens_with_special = int(np.sum(input_mask))\n\n    for el in y_mask[1:n_tokens_with_special - 1]:\n        if el == 1:\n            subtoken_labels += [labels[labels_ind]]\n            labels_ind += 1\n        else:\n            subtoken_labels += [labels[labels_ind - 1]]\n\n    subtoken_labels = [0] + subtoken_labels + [0] * (len(input_mask) - n_tokens_with_special + 1)\n    return subtoken_labels\n\n\n@register('torch_transformers_sequence_tagger')\nclass TorchTransformersSequenceTagger(TorchModel):\n    \"\"\"Transformer-based model on PyTorch for text tagging. It predicts a label for every token (not subtoken)\n    in the text. You can use it for sequence labeling tasks, such as morphological tagging or named entity recognition.\n\n    Args:\n        n_tags: number of distinct tags\n        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. 
\"bert-base-uncased\")\n        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name\n        attention_probs_keep_prob: keep_prob for Bert self-attention layers\n        hidden_keep_prob: keep_prob for Bert hidden layers\n        use_crf: whether to use Conditional Random Field to decode tags\n    \"\"\"\n\n    def __init__(self,\n                 n_tags: int,\n                 pretrained_bert: str,\n                 bert_config_file: Optional[str] = None,\n                 attention_probs_keep_prob: Optional[float] = None,\n                 hidden_keep_prob: Optional[float] = None,\n                 use_crf: bool = False,\n                 **kwargs) -> None:\n\n        if pretrained_bert:\n            config = AutoConfig.from_pretrained(pretrained_bert, num_labels=n_tags,\n                                                output_attentions=False, output_hidden_states=False)\n            model = AutoModelForTokenClassification.from_pretrained(pretrained_bert, config=config)\n        elif bert_config_file and Path(bert_config_file).is_file():\n            bert_config = AutoConfig.from_json_file(str(expand_path(bert_config_file)))\n\n            if attention_probs_keep_prob is not None:\n                bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob\n            if hidden_keep_prob is not None:\n                bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob\n            model = AutoModelForTokenClassification(config=bert_config)\n        else:\n            raise ConfigError(\"No pre-trained BERT model is given.\")\n\n        self.crf = CRF(n_tags) if use_crf else None\n\n        super().__init__(model, **kwargs)\n\n    def train_on_batch(self,\n                       input_ids: Union[List[List[int]], np.ndarray],\n                       input_masks: Union[List[List[int]], np.ndarray],\n                       y_masks: Union[List[List[int]], np.ndarray],\n                       y: 
List[List[int]],\n                       *args, **kwargs) -> Dict[str, float]:\n        \"\"\"\n\n        Args:\n            input_ids: batch of indices of subwords\n            input_masks: batch of masks which determine what should be attended\n            args: arguments passed to _build_feed_dict\n                and corresponding to additional input\n                and output tensors of the derived class.\n            kwargs: keyword arguments passed to _build_feed_dict\n                and corresponding to additional input\n                and output tensors of the derived class.\n\n        Returns:\n            dict with the field 'loss'\n        \"\"\"\n        b_input_ids = torch.from_numpy(input_ids).to(self.device)\n        b_input_masks = torch.from_numpy(input_masks).to(self.device)\n        subtoken_labels = [token_labels_to_subtoken_labels(y_el, y_mask, input_mask)\n                           for y_el, y_mask, input_mask in zip(y, y_masks, input_masks)]\n        b_labels = torch.from_numpy(np.array(subtoken_labels)).to(torch.int64).to(self.device)\n        self.optimizer.zero_grad()\n\n        loss = self.model(input_ids=b_input_ids,\n                          attention_mask=b_input_masks,\n                          labels=b_labels).loss\n        if self.crf is not None:\n            self.crf(y, y_masks)\n        if self.is_data_parallel:\n            loss = loss.mean()\n        self._make_step(loss)\n\n        return {'loss': loss.item()}\n\n    def __call__(self,\n                 input_ids: Union[List[List[int]], np.ndarray],\n                 input_masks: Union[List[List[int]], np.ndarray],\n                 y_masks: Union[List[List[int]], np.ndarray]) -> Tuple[List[List[int]], List[np.ndarray]]:\n        \"\"\" Predicts tag indices for a given subword tokens batch\n\n        Args:\n            input_ids: indices of the subwords\n            input_masks: mask that determines where to attend and where 
not to\n            y_masks: mask which determines the first subword units in the word\n\n        Returns:\n            Label indices or class probabilities for each token (not subtoken)\n\n        \"\"\"\n        b_input_ids = torch.from_numpy(input_ids).to(self.device)\n        b_input_masks = torch.from_numpy(input_masks).to(self.device)\n\n        with torch.no_grad():\n            # Forward pass, calculate logit predictions\n            logits = self.model(b_input_ids, attention_mask=b_input_masks)\n\n            # Move logits and labels to CPU and to numpy arrays\n            logits = token_from_subtoken(logits[0].detach().cpu(), torch.from_numpy(y_masks))\n\n        probas = torch.nn.functional.softmax(logits, dim=-1)\n        probas = probas.detach().cpu().numpy()\n        if self.crf is not None:\n            logits = logits.transpose(1, 0).to(self.device)\n            pred = self.crf.decode(logits)\n        else:\n            logits = logits.detach().cpu().numpy()\n            pred = np.argmax(logits, axis=-1)\n        seq_lengths = np.sum(y_masks, axis=1)\n        pred = [p[:l] for l, p in zip(seq_lengths, pred)]\n\n        return pred, probas\n\n    def load(self, fname=None):\n        super().load(fname)\n        if self.crf is not None:\n            self.crf = self.crf.to(self.device)\n            if self.load_path:\n                weights_path_crf = Path(f\"{self.load_path}_crf\").resolve()\n                weights_path_crf = weights_path_crf.with_suffix(\".pth.tar\")\n                if weights_path_crf.exists():\n                    checkpoint = torch.load(weights_path_crf, map_location=self.device)\n                    self.crf.load_state_dict(checkpoint[\"model_state_dict\"], strict=False)\n                else:\n                    log.warning(f\"Init from scratch. 
Load path {weights_path_crf} does not exist.\")\n\n    def save(self, fname: Optional[str] = None, *args, **kwargs) -> None:\n        super().save(fname, *args, **kwargs)\n        if self.crf is not None:\n            if fname is None:\n                fname = self.save_path\n            weights_path_crf = Path(f\"{fname}_crf\").resolve()\n            weights_path_crf = weights_path_crf.with_suffix(\".pth.tar\")\n            torch.save({\"model_state_dict\": self.crf.cpu().state_dict()}, weights_path_crf)\n            self.crf.to(self.device)\n"
  },
  {
    "path": "deeppavlov/models/torch_bert/torch_transformers_squad.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom collections import namedtuple\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import List, Tuple, Optional, Dict\n\nimport numpy as np\nimport torch\nfrom transformers import AutoModelForQuestionAnswering, AutoConfig, AutoModel\nfrom transformers.data.processors.utils import InputFeatures\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.torch_model import TorchModel\n\nlogger = getLogger(__name__)\n\n\ndef softmax_mask(val, mask):\n    inf = 1e30\n    return -inf * (1 - mask.to(torch.float32)) + val\n\n\nclass PassageReaderClassifier(torch.nn.Module):\n    \"\"\"The model with a Transformer encoder and two linear layers: the first for prediction of answer start and end\n    positions, the second defines the probability of the paragraph to contain the answer.\n\n    Args:\n        config: path to Transformer configuration file\n    \"\"\"\n\n    def __init__(self, config):\n        super().__init__()\n        self.encoder = AutoModel.from_config(config=config)\n        self.qa_outputs = torch.nn.Linear(config.hidden_size, 2)\n        self.qa_classifier = torch.nn.Linear(config.hidden_size, 1)\n\n    def forward(self, input_ids, attention_mask, token_type_ids):\n     
   out = self.encoder(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)\n        logits = self.qa_outputs(out[0])\n        start_logits, end_logits = logits.split(1, dim=-1)\n        start_logits = start_logits.squeeze(-1)\n        end_logits = end_logits.squeeze(-1)\n        rank_logits = self.qa_classifier(out[0][:, 0, :])\n        outputs = namedtuple(\"outputs\", \"start_logits end_logits rank_logits\")\n        return outputs(start_logits=start_logits, end_logits=end_logits, rank_logits=rank_logits)\n\n\n@register('torch_transformers_squad')\nclass TorchTransformersSquad(TorchModel):\n    \"\"\"Bert-based on PyTorch model for SQuAD-like problem setting:\n    It predicts start and end position of answer for given question and context.\n\n    [CLS] token is used as no_answer. If model selects [CLS] token as most probable\n    answer, it means that there is no answer in given context.\n\n    Start and end position of answer are predicted by linear transformation\n    of Bert outputs.\n\n    Args:\n        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. 
\"bert-base-uncased\")\n        attention_probs_keep_prob: keep_prob for Bert self-attention layers\n        hidden_keep_prob: keep_prob for Bert hidden layers\n        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name\n        psg_cls: whether to use a separate linear layer to define if a passage contains the answer to the question\n        batch_size: batch size for inference of squad model\n    \"\"\"\n\n    def __init__(self,\n                 pretrained_bert: str,\n                 attention_probs_keep_prob: Optional[float] = None,\n                 hidden_keep_prob: Optional[float] = None,\n                 bert_config_file: Optional[str] = None,\n                 psg_cls: bool = False,\n                 batch_size: int = 10,\n                 **kwargs) -> None:\n        self.batch_size = batch_size\n        self.psg_cls = psg_cls\n\n        if pretrained_bert:\n            logger.debug(f\"From pretrained {pretrained_bert}.\")\n            config = AutoConfig.from_pretrained(pretrained_bert, output_attentions=False, output_hidden_states=False)\n            if self.psg_cls:\n                model = PassageReaderClassifier(config=config)\n            else:\n                model = AutoModelForQuestionAnswering.from_pretrained(pretrained_bert, config=config)\n\n        elif bert_config_file and Path(bert_config_file).is_file():\n            bert_config = AutoConfig.from_json_file(str(expand_path(bert_config_file)))\n            if attention_probs_keep_prob is not None:\n                bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob\n            if hidden_keep_prob is not None:\n                bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob\n            if self.psg_cls:\n                model = PassageReaderClassifier(config=self.bert_config)\n            else:\n                model = AutoModelForQuestionAnswering(config=self.bert_config)\n        else:\n            raise 
ConfigError(\"No pre-trained BERT model is given.\")\n        super().__init__(model, **kwargs)\n\n    def train_on_batch(self, features: List[List[InputFeatures]],\n                       y_st: List[List[int]], y_end: List[List[int]]) -> Dict:\n        \"\"\"Train model on given batch.\n        This method calls train_op using features and labels from y_st and y_end\n\n        Args:\n            features: batch of InputFeatures instances\n            y_st: batch of lists of ground truth answer start positions\n            y_end: batch of lists of ground truth answer end positions\n\n        Returns:\n            dict with loss and learning_rate values\n\n        \"\"\"\n        input_ids = [f[0].input_ids for f in features]\n        input_masks = [f[0].attention_mask for f in features]\n        input_type_ids = [f[0].token_type_ids for f in features]\n\n        b_input_ids = torch.cat(input_ids, dim=0).to(self.device)\n        b_input_masks = torch.cat(input_masks, dim=0).to(self.device)\n        b_input_type_ids = torch.cat(input_type_ids, dim=0).to(self.device)\n\n        y_st = [x[0] for x in y_st]\n        y_end = [x[0] for x in y_end]\n        b_y_st = torch.from_numpy(np.array(y_st)).to(self.device)\n        b_y_end = torch.from_numpy(np.array(y_end)).to(self.device)\n\n        input_ = {\n            'input_ids': b_input_ids,\n            'attention_mask': b_input_masks,\n            'token_type_ids': b_input_type_ids,\n            'start_positions': b_y_st,\n            'end_positions': b_y_end,\n            'return_dict': True\n        }\n\n        self.optimizer.zero_grad()\n        input_ = {arg_name: arg_value for arg_name, arg_value in input_.items() if arg_name in self.accepted_keys}\n        loss = self.model(**input_).loss\n        if self.is_data_parallel:\n            loss = loss.mean()\n        self._make_step(loss)\n\n        return {'loss': loss.item()}\n\n    @property\n    def accepted_keys(self) -> Tuple[str]:\n        if 
self.is_data_parallel:\n            accepted_keys = self.model.module.forward.__code__.co_varnames\n        else:\n            accepted_keys = self.model.forward.__code__.co_varnames\n        return accepted_keys\n\n    def __call__(self, features_batch: List[List[InputFeatures]]) -> Tuple[\n        List[List[int]], List[List[int]], List[List[float]], List[List[float]], List[int]]:\n        \"\"\"get predictions using features as input\n\n        Args:\n            features_batch: batch of InputFeatures instances\n\n        Returns:\n            start_pred_batch: answer start positions\n            end_pred_batch: answer end positions\n            logits_batch: answer logits\n            scores_batch: answer confidences\n            ind_batch: indices of paragraph pieces where the answer was found\n\n        \"\"\"\n        predictions = {}\n        # TODO: refactor batchification\n        indices, input_ids, input_masks, input_type_ids = [], [], [], []\n        for n, features_list in enumerate(features_batch):\n            for f in features_list:\n                input_ids.append(f.input_ids)\n                input_masks.append(f.attention_mask)\n                input_type_ids.append(f.token_type_ids)\n                indices.append(n)\n\n        num_batches = len(indices) // self.batch_size + int(len(indices) % self.batch_size > 0)\n        for i in range(num_batches):\n            b_input_ids = torch.cat(input_ids[i * self.batch_size:(i + 1) * self.batch_size], dim=0).to(self.device)\n            b_input_masks = torch.cat(input_masks[i * self.batch_size:(i + 1) * self.batch_size], dim=0).to(self.device)\n            b_input_type_ids = torch.cat(input_type_ids[i * self.batch_size:(i + 1) * self.batch_size],\n                                         dim=0).to(self.device)\n            input_ = {\n                'input_ids': b_input_ids,\n                'attention_mask': b_input_masks,\n                'token_type_ids': b_input_type_ids,\n                
'return_dict': True\n            }\n\n            with torch.no_grad():\n                input_ = {arg_name: arg_value for arg_name, arg_value in input_.items()\n                          if arg_name in self.accepted_keys}\n                # Forward pass, calculate logit predictions\n                outputs = self.model(**input_)\n\n                logits_st = outputs.start_logits\n                logits_end = outputs.end_logits\n\n                bs = b_input_ids.size()[0]\n                seq_len = b_input_ids.size()[-1]\n                mask = torch.cat([torch.ones(bs, 1, dtype=torch.int32),\n                                  torch.zeros(bs, seq_len - 1, dtype=torch.int32)], dim=-1).to(self.device)\n                logit_mask = b_input_type_ids + mask\n                logits_st = softmax_mask(logits_st, logit_mask)\n                logits_end = softmax_mask(logits_end, logit_mask)\n\n                start_probs = torch.nn.functional.softmax(logits_st, dim=-1)\n                end_probs = torch.nn.functional.softmax(logits_end, dim=-1)\n                if self.psg_cls:\n                    scores = outputs.rank_logits.squeeze(1)\n                else:\n                    scores = torch.tensor(1) - start_probs[:, 0] * end_probs[:, 0]\n\n                outer = torch.matmul(start_probs.view(*start_probs.size(), 1),\n                                     end_probs.view(end_probs.size()[0], 1, end_probs.size()[1]))\n                outer_logits = torch.exp(logits_st.view(*logits_st.size(), 1) + logits_end.view(\n                    logits_end.size()[0], 1, logits_end.size()[1]))\n\n                context_max_len = torch.max(torch.sum(b_input_type_ids, dim=1)).to(torch.int64)\n\n                max_ans_length = torch.min(torch.tensor(20).to(self.device), context_max_len).to(torch.int64).item()\n\n                outer = torch.triu(outer, diagonal=0) - torch.triu(outer, diagonal=outer.size()[1] - max_ans_length)\n                outer_logits = torch.triu(outer_logits, 
diagonal=0) - torch.triu(\n                    outer_logits, diagonal=outer_logits.size()[1] - max_ans_length)\n\n                start_pred = torch.argmax(torch.max(outer, dim=2)[0], dim=1)\n                end_pred = torch.argmax(torch.max(outer, dim=1)[0], dim=1)\n                logits = torch.max(torch.max(outer_logits, dim=2)[0], dim=1)[0]\n\n            # Move logits and labels to CPU and to numpy arrays\n            start_pred = start_pred.detach().cpu().numpy()\n            end_pred = end_pred.detach().cpu().numpy()\n            logits = logits.detach().cpu().numpy().tolist()\n            scores = scores.detach().cpu().numpy().tolist()\n\n            for j, (start_pred_elem, end_pred_elem, logits_elem, scores_elem) in \\\n                    enumerate(zip(start_pred, end_pred, logits, scores)):\n                ind = indices[i * self.batch_size + j]\n                if ind in predictions:\n                    predictions[ind] += [(start_pred_elem, end_pred_elem, logits_elem, scores_elem)]\n                else:\n                    predictions[ind] = [(start_pred_elem, end_pred_elem, logits_elem, scores_elem)]\n\n        start_pred_batch, end_pred_batch, logits_batch, scores_batch, ind_batch = [], [], [], [], []\n        for ind in sorted(predictions.keys()):\n            prediction = predictions[ind]\n            max_ind = np.argmax([pred[2] for pred in prediction])\n            start_pred_batch.append(prediction[max_ind][0])\n            end_pred_batch.append(prediction[max_ind][1])\n            logits_batch.append(prediction[max_ind][2])\n            scores_batch.append(prediction[max_ind][3])\n            ind_batch.append(max_ind)\n\n        return start_pred_batch, end_pred_batch, logits_batch, scores_batch, ind_batch\n"
  },
  {
    "path": "deeppavlov/models/torch_bert/torch_transformers_syntax_parser.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import List, Dict, Union, Optional, Tuple\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom transformers import AutoConfig, AutoModel\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.errors import ConfigError\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.torch_model import TorchModel\nfrom deeppavlov.models.torch_bert.torch_transformers_sequence_tagger import token_from_subtoken\n\nlogger = getLogger(__name__)\n\n\nclass Biaffine(nn.Module):\n    def __init__(self, in1_features: int, in2_features: int, out_features: int):\n        super().__init__()\n        self.bilinear = PairwiseBilinear(in1_features + 1, in2_features + 1, out_features)\n        self.bilinear.weight.data.zero_()\n        self.bilinear.bias.data.zero_()\n\n    def forward(self, input1: torch.Tensor, input2: torch.Tensor) -> torch.Tensor:\n        input1 = torch.cat([input1, input1.new_ones(*input1.size()[:-1], 1)], dim=input1.dim() - 1)\n        input2 = torch.cat([input2, input2.new_ones(*input2.size()[:-1], 1)], dim=input2.dim() - 1)\n        return self.bilinear(input1, input2)\n\n\nclass PairwiseBilinear(nn.Module):\n    \"\"\"\n    
https://github.com/stanfordnlp/stanza/blob/v1.1.1/stanza/models/common/biaffine.py#L5  # noqa\n    \"\"\"\n\n    def __init__(self, in1_features: int, in2_features: int, out_features: int, bias: bool = True):\n        super().__init__()\n        self.in1_features = in1_features\n        self.in2_features = in2_features\n        self.out_features = out_features\n        self.weight = nn.Parameter(torch.Tensor(in1_features, out_features, in2_features))\n        if bias:\n            self.bias = nn.Parameter(torch.Tensor(out_features))\n        else:\n            self.register_parameter(\"bias\", None)\n        self.reset_parameters()\n\n    def reset_parameters(self):\n        bound = 1 / math.sqrt(self.weight.size(0))\n        nn.init.uniform_(self.weight, -bound, bound)\n        if self.bias is not None:\n            nn.init.uniform_(self.bias, -bound, bound)\n\n    def forward(self, input1: torch.Tensor, input2: torch.Tensor) -> torch.Tensor:\n        d1, d2, out = self.in1_features, self.in2_features, self.out_features\n        n1, n2 = input1.size(1), input2.size(1)\n        # (b * n1, d1) @ (d1, out * d2) => (b * n1, out * d2)\n        x1W = torch.mm(input1.view(-1, d1), self.weight.view(d1, out * d2))\n        # (b, n1 * out, d2) @ (b, d2, n2) => (b, n1 * out, n2)\n        x1Wx2 = x1W.view(-1, n1 * out, d2).bmm(input2.transpose(1, 2))\n        y = x1Wx2.view(-1, n1, self.out_features, n2).transpose(2, 3)\n        if self.bias is not None:\n            y.add_(self.bias)\n        return y  # (b, n1, n2, out)\n\n    def extra_repr(self) -> str:\n        return \"in1_features={}, in2_features={}, out_features={}, bias={}\".format(\n            self.in1_features, self.in2_features, self.out_features, self.bias is not None\n        )\n\n\n@torch.no_grad()\ndef mask_arc(lengths: torch.Tensor, mask_diag: bool = True) -> Optional[torch.Tensor]:\n    b, n = lengths.numel(), lengths.max()\n    if torch.all(lengths == n):\n        if not mask_diag:\n            return 
None\n        mask = torch.ones(b, n, n + 1)\n    else:\n        mask = torch.zeros(b, n, n + 1)\n        for i, length in enumerate(lengths):\n            mask[i, :length, :length + 1] = 1\n    if mask_diag:\n        mask.masked_fill_(torch.eye(n, dtype=torch.bool), 0)\n    return mask\n\n\nclass SyntaxParserNetwork(torch.nn.Module):\n    \"\"\"The model which defines heads in syntax tree and dependencies for text tokens.\n       Text token ids are fed into Transformer encoder, hidden states are passed into dense layers followed by\n       two biaffine layers (first for prediction of pairwise probabilities of a token to be the head for other token,\n       second - for prediction of syntax dependency of a token).\n    \"\"\"\n\n    def __init__(self, n_deps: int, pretrained_bert: str, encoder_layer_ids: List[int] = (-1,),\n                 bert_config_file: Optional[str] = None, attention_probs_keep_prob: Optional[float] = None,\n                 hidden_keep_prob: Optional[float] = None, state_size: int = 256, device: str = \"gpu\"):\n        super().__init__()\n\n        self.device = torch.device(\"cuda\" if torch.cuda.is_available() and device == \"gpu\" else \"cpu\")\n        self.n_deps = n_deps\n        self.encoder_layer_ids = encoder_layer_ids\n        self.state_size = state_size\n        if pretrained_bert:\n            logger.debug(f\"From pretrained {pretrained_bert}.\")\n            config = AutoConfig.from_pretrained(pretrained_bert, output_attentions=False, output_hidden_states=False)\n            self.encoder = AutoModel.from_pretrained(pretrained_bert, config=config)\n\n        elif bert_config_file and Path(bert_config_file).is_file():\n            bert_config = AutoConfig.from_json_file(str(expand_path(bert_config_file)))\n            if attention_probs_keep_prob is not None:\n                bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob\n            if hidden_keep_prob is not None:\n                
bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob\n            self.encoder = AutoModel(config=bert_config)\n        else:\n            raise ConfigError(\"No pre-trained BERT model is given.\")\n\n        self.head_embs1 = torch.nn.Linear(config.hidden_size, state_size)\n        self.dep_embs1 = torch.nn.Linear(config.hidden_size, state_size)\n        self.head_embs2 = torch.nn.Linear(config.hidden_size, state_size)\n        self.dep_embs2 = torch.nn.Linear(config.hidden_size, state_size)\n        self.zero_emb1 = torch.nn.Parameter(torch.randn(state_size, ), requires_grad=True)\n        self.zero_emb2 = torch.nn.Parameter(torch.randn(state_size, ), requires_grad=True)\n        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)\n        self.biaf_head = Biaffine(state_size, state_size, 1)\n        self.biaf_dep = Biaffine(state_size, state_size, n_deps)\n\n    def forward(self, input_ids, attention_mask, subtoken_mask, y_heads=None, y_dep=None):\n        input_ids = torch.from_numpy(input_ids).to(self.device)\n        attention_mask = torch.from_numpy(attention_mask).to(self.device)\n        subtoken_mask = torch.from_numpy(subtoken_mask)\n\n        outputs = self.encoder(input_ids, attention_mask, output_hidden_states=True)\n        hidden_states = outputs.hidden_states\n        layer_output_list = []\n        for layer_id in self.encoder_layer_ids:\n            layer_id = layer_id + 1 if layer_id != -1 else layer_id\n            layer_output_list.append(hidden_states[layer_id])\n        layer_output = torch.stack(layer_output_list)\n        layer_output = torch.sum(layer_output, dim=0)\n\n        layer_output = token_from_subtoken(layer_output, subtoken_mask)\n        bs, seq_len, dim = layer_output.size()\n\n        layer_output = layer_output.float().to(self.device)\n        lengths = torch.sum(subtoken_mask, dim=-1)\n\n        head1 = self.head_embs1(layer_output)\n        dep1 = self.dep_embs1(layer_output)\n        dep1_zero = 
[self.zero_emb1 for _ in range(bs)]\n        dep1_zero = torch.stack(dep1_zero).unsqueeze(1).to(self.device)\n        dep1 = torch.cat([dep1_zero, dep1], dim=1)\n\n        head2 = self.head_embs2(layer_output)\n        dep2 = self.dep_embs2(layer_output)\n        dep2_zero = [self.zero_emb2 for _ in range(bs)]\n        dep2_zero = torch.stack(dep2_zero).unsqueeze(1).to(self.device)\n        dep2 = torch.cat([dep2_zero, dep2], dim=1)\n\n        head1 = self.dropout(head1)\n        dep1 = self.dropout(dep1)\n        head2 = self.dropout(head2)\n        dep2 = self.dropout(dep2)\n\n        logits_head_init = self.biaf_head(head1, dep1).squeeze_(3)\n        logits_deprel = self.biaf_dep(head2, dep2)\n        mask = mask_arc(lengths, mask_diag=False)\n        if mask is not None:\n            logits_head_init.masked_fill_(mask.logical_not().to(logits_head_init.device), -10.0)\n        logits_head = F.softmax(logits_head_init, dim=-1)\n\n        head_loss, dep_loss = None, None\n        if y_heads is not None:\n            y_heads = tuple(torch.LongTensor(yh).to(self.device) for yh in y_heads)\n            y_heads_pd = nn.utils.rnn.pad_sequence(y_heads, batch_first=True, padding_value=-1)\n\n            logits_head_flatten = logits_head.contiguous().view(-1, logits_head.size(-1))\n            y_heads_flatten = y_heads_pd.contiguous().view(-1)\n            head_loss = F.cross_entropy(logits_head_flatten, y_heads_flatten, ignore_index=-1, reduction=\"sum\")\n            head_loss.div_((y_heads_flatten != -1).sum())\n\n            y_dep = tuple(torch.LongTensor(ydp).to(self.device) for ydp in y_dep)\n            y_dep_pd = nn.utils.rnn.pad_sequence(y_dep, batch_first=True, padding_value=-1)\n            y_heads_new = y_heads_pd.masked_fill(y_heads_pd == -1, 0)\n            gather_index = y_heads_new.view(*y_heads_new.size(), 1, 1).expand(-1, -1, -1, logits_deprel.size(-1))\n\n            logits_deprel = torch.gather(logits_deprel, dim=2, index=gather_index)\n            
logits_deprel_flatten = logits_deprel.contiguous().view(-1, logits_deprel.size(-1))\n            y_dep_flatten = y_dep_pd.contiguous().view(-1)\n            dep_loss = F.cross_entropy(logits_deprel_flatten, y_dep_flatten, ignore_index=-1, reduction=\"sum\")\n            dep_loss.div_((y_dep_flatten != -1).sum())\n        else:\n            logits_head = logits_head.detach().cpu().numpy()\n            head_ids = np.argmax(logits_head, axis=-1).tolist()\n\n            head_ids_new = torch.LongTensor(head_ids)\n            steps = torch.arange(head_ids_new.size(1))\n            logits_deprel = [logits_deprel[i, steps, heads] for i, heads in enumerate(head_ids_new)]\n            logits_deprel = torch.stack(logits_deprel, dim=0)\n            deprels = logits_deprel.argmax(dim=2).detach().cpu().numpy().tolist()\n\n            head_probas = [head_probas_list[:l, :l + 1] for l, head_probas_list in zip(lengths, logits_head)]\n            deprels = [deprel[:l] for l, deprel in zip(lengths, deprels)]\n\n        if y_heads is not None:\n            return head_loss + dep_loss\n        else:\n            return head_probas, deprels\n\n\n@register('torch_transformers_syntax_parser')\nclass TorchTransformersSyntaxParser(TorchModel):\n    \"\"\"Transformer-based model on PyTorch for syntax parsing. It predicts probabilities of heads and\n       dependency ids for text tokens. \n\n    Args:\n        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. 
\"bert-base-uncased\")\n        n_deps: number of syntax dependencies\n        encoder_layer_ids: list of indexes of encoder layers which will be used for further predicting of heads and\n            dependencies with biaffine layer\n        state_size: size of dense layers which follow after transformer encoder\n        attention_probs_keep_prob: keep_prob for Bert self-attention layers\n        hidden_keep_prob: keep_prob for Bert hidden layers\n        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name\n    \"\"\"\n\n    def __init__(self, pretrained_bert: str,\n                 n_deps: int,\n                 encoder_layer_ids: List[int] = (-1,),\n                 state_size: int = 256,\n                 attention_probs_keep_prob: Optional[float] = None,\n                 hidden_keep_prob: Optional[float] = None,\n                 bert_config_file: Optional[str] = None,\n                 **kwargs) -> None:\n\n        model = SyntaxParserNetwork(n_deps, pretrained_bert, encoder_layer_ids,\n                                    bert_config_file, attention_probs_keep_prob, hidden_keep_prob,\n                                    state_size)\n        super().__init__(model, **kwargs)\n\n    def train_on_batch(self, input_ids: Union[List[List[int]], np.ndarray],\n                       input_masks: Union[List[List[int]], np.ndarray],\n                       y_masks: Union[List[List[int]], np.ndarray],\n                       y_heads: List[List[int]], y_dep: List[List[int]]) -> Dict:\n        \"\"\"\n\n        Args:\n            input_ids: indices of the subwords\n            input_masks: mask that determines where to attend and where not to\n            y_masks: mask which determines the first subword units in the the word\n            y_heads: for each token - id fo token which is the head in syntax tree for the token\n            y_dep: syntax dependencies for each tokens\n        \"\"\"\n        
self.optimizer.zero_grad()\n        loss = self.model(input_ids, input_masks, y_masks, y_heads, y_dep)\n        loss.backward()\n        # Clip the norm of the gradients to 1.0.\n        # This is to help prevent the \"exploding gradients\" problem.\n        if self.clip_norm:\n            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm)\n\n        self.optimizer.step()\n\n        return {'loss': loss.item()}\n\n    def __call__(self, input_ids: Union[List[List[int]], np.ndarray],\n                 input_masks: Union[List[List[int]], np.ndarray],\n                 y_masks: Union[List[List[int]], np.ndarray]) -> Tuple[List[List[List[float]]], List[List[int]]]:\n        \"\"\" Predicts probas of heads and dependency ids for tokens\n\n        Args:\n            input_ids: indices of the subwords\n            input_masks: mask that determines where to attend and where not to\n            y_masks: mask which determines the first subword units in the word\n\n        Returns:\n            Probas of heads and dependency ids for each token (not subtoken)\n\n        \"\"\"\n        with torch.no_grad():\n            head_probas, dep_ids = self.model(input_ids, input_masks, y_masks)\n        return head_probas, dep_ids\n"
  },
  {
    "path": "deeppavlov/models/vectorizers/__init__.py",
    "content": "\n"
  },
  {
    "path": "deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom collections import Counter\nfrom logging import getLogger\nfrom typing import List, Any, Generator, Tuple, KeysView, ValuesView, Dict, Optional\n\nimport numpy as np\nimport scipy as sp\nfrom scipy import sparse\nfrom sklearn.utils import murmurhash3_32\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.core.models.estimator import Estimator\n\nlogger = getLogger(__name__)\n\nSparse = sp.sparse.csr_matrix\n\n\ndef hash_(token: str, hash_size: int) -> int:\n    \"\"\"Convert a token to a hash of given size.\n    Args:\n        token: a word\n        hash_size: hash size\n\n    Returns:\n        int, hashed token\n\n    \"\"\"\n    return murmurhash3_32(token, positive=True) % hash_size\n\n\n@register('hashing_tfidf_vectorizer')\nclass HashingTfIdfVectorizer(Estimator):\n    \"\"\"Create a tfidf matrix from collection of documents of size [n_documents X n_features(hash_size)].\n\n    Args:\n        tokenizer: a tokenizer class\n        hash_size: a hash size, power of two\n        doc_index: a dictionary of document ids and their titles\n        save_path: a path to **.npz** file where tfidf matrix is saved\n        load_path: a path to **.npz** file where tfidf matrix is loaded from\n\n    Attributes:\n        hash_size: a hash size\n        tokenizer: 
instance of a tokenizer class\n        term_freqs: a dictionary with tfidf terms and their frequencies\n        doc_index: ids provided by a user or generated automatically\n        rows: tfidf matrix rows corresponding to terms\n        cols: tfidf matrix cols corresponding to docs\n        data: tfidf matrix data corresponding to tfidf values\n\n    \"\"\"\n\n    def __init__(self, tokenizer: Component, hash_size=2 ** 24, doc_index: Optional[dict] = None,\n                 save_path: Optional[str] = None, load_path: Optional[str] = None, **kwargs):\n\n        super().__init__(save_path=save_path, load_path=load_path, mode=kwargs.get('mode', 'infer'))\n\n        self.hash_size = hash_size\n        self.tokenizer = tokenizer\n        self.rows = []\n        self.cols = []\n        self.data = []\n\n        if kwargs.get('mode', 'infer') == 'infer':\n            self.tfidf_matrix, opts = self.load()\n            self.ngram_range = opts['ngram_range']\n            self.hash_size = opts['hash_size']\n            self.term_freqs = opts['term_freqs'].squeeze()\n            self.doc_index = opts['doc_index']\n            self.index2doc = self.get_index2doc()\n        else:\n            self.term_freqs = None\n            self.doc_index = doc_index or {}\n\n    def __call__(self, questions: List[str]) -> Sparse:\n        \"\"\"Transform input list of documents to tfidf vectors.\n\n        Args:\n            questions: a list of input strings\n\n        Returns:\n            transformed documents as a csr_matrix with shape [n_documents X :attr:`hash_size`]\n\n        \"\"\"\n\n        sp_tfidfs = []\n\n        for question in questions:\n            ngrams = list(self.tokenizer([question]))\n            hashes = [hash_(ngram, self.hash_size) for ngram in ngrams[0]]\n\n            hashes_unique, q_hashes = np.unique(hashes, return_counts=True)\n            tfs = np.log1p(q_hashes)\n\n            if len(q_hashes) == 0:\n                sp_tfidfs.append(Sparse((1, 
self.hash_size)))\n                continue\n\n            size = len(self.doc_index)\n            Ns = self.term_freqs[hashes_unique]\n            idfs = np.log((size - Ns + 0.5) / (Ns + 0.5))\n            idfs[idfs < 0] = 0\n\n            tfidf = np.multiply(tfs, idfs).astype(\"float32\")\n\n            indptr = np.array([0, len(hashes_unique)])\n            sp_tfidf = Sparse((tfidf, hashes_unique, indptr), shape=(1, self.hash_size)\n                              )\n            sp_tfidfs.append(sp_tfidf)\n\n        transformed = sp.sparse.vstack(sp_tfidfs)\n        return transformed\n\n    def get_index2doc(self) -> Dict[Any, int]:\n        \"\"\"Invert doc_index.\n\n        Returns:\n            inverted doc_index dict\n\n        \"\"\"\n        return dict(zip(self.doc_index.values(), self.doc_index.keys()))\n\n    def get_counts(self, docs: List[str], doc_ids: List[Any]) \\\n            -> Generator[Tuple[KeysView, ValuesView, List[int]], Any, None]:\n        \"\"\"Get term counts for a list of documents.\n\n        Args:\n            docs: a list of input documents\n            doc_ids: a list of document ids corresponding to input documents\n\n        Yields:\n            a tuple of term hashes, count values and column ids\n\n        Returns:\n            None\n\n        \"\"\"\n        logger.debug(\"Tokenizing batch...\")\n        batch_ngrams = list(self.tokenizer(docs))\n        logger.debug(\"Counting hash...\")\n        doc_id = iter(doc_ids)\n        for ngrams in batch_ngrams:\n            counts = Counter([hash_(gram, self.hash_size) for gram in ngrams])\n            hashes = counts.keys()\n            values = counts.values()\n            _id = self.doc_index[next(doc_id)]\n            if values:\n                col_id = [_id] * len(values)\n            else:\n                col_id = []\n            yield hashes, values, col_id\n\n    def get_count_matrix(self, row: List[int], col: List[int], data: List[int], size: int) \\\n            -> 
Sparse:\n        \"\"\"Get count matrix.\n\n        Args:\n            row: tfidf matrix rows corresponding to terms\n            col:  tfidf matrix cols corresponding to docs\n            data: tfidf matrix data corresponding to tfidf values\n            size: :attr:`doc_index` size\n\n        Returns:\n            a count csr_matrix\n\n        \"\"\"\n        count_matrix = Sparse((data, (row, col)), shape=(self.hash_size, size))\n        count_matrix.sum_duplicates()\n        return count_matrix\n\n    @staticmethod\n    def get_tfidf_matrix(count_matrix: Sparse) -> Tuple[Sparse, np.array]:\n        \"\"\"Convert a count matrix into a tfidf matrix.\n\n        Args:\n            count_matrix: a count matrix\n\n        Returns:\n            a tuple of tfidf matrix and term frequencies\n\n        \"\"\"\n\n        binary = (count_matrix > 0).astype(int)\n        term_freqs = np.array(binary.sum(1)).squeeze()\n        idfs = np.log((count_matrix.shape[1] - term_freqs + 0.5) / (term_freqs + 0.5))\n        idfs[idfs < 0] = 0\n        idfs = sp.sparse.diags(idfs, 0)\n        tfs = count_matrix.log1p()\n        tfidfs = idfs.dot(tfs)\n        return tfidfs, term_freqs\n\n    def save(self) -> None:\n        \"\"\"Save tfidf matrix into **.npz** format.\n\n        Returns:\n            None\n\n        \"\"\"\n        logger.info(\"Saving tfidf matrix to {}\".format(self.save_path))\n        count_matrix = self.get_count_matrix(self.rows, self.cols, self.data,\n                                             size=len(self.doc_index))\n        tfidf_matrix, term_freqs = self.get_tfidf_matrix(count_matrix)\n        self.term_freqs = term_freqs\n\n        opts = {'hash_size': self.hash_size,\n                'ngram_range': self.tokenizer.ngram_range,\n                'doc_index': self.doc_index,\n                'term_freqs': self.term_freqs}\n\n        data = {\n            'data': tfidf_matrix.data,\n            'indices': tfidf_matrix.indices,\n            'indptr': 
tfidf_matrix.indptr,\n            'shape': tfidf_matrix.shape,\n            'opts': opts\n        }\n        np.savez(self.save_path, **data)\n\n        # release memory\n        self.reset()\n\n    def reset(self) -> None:\n        \"\"\"Clear :attr:`rows`, :attr:`cols` and :attr:`data`\n\n        Returns:\n            None\n\n        \"\"\"\n        self.rows.clear()\n        self.cols.clear()\n        self.data.clear()\n\n    def load(self) -> Tuple[Sparse, Dict]:\n        \"\"\"Load a tfidf matrix as csr_matrix.\n\n        Returns:\n            a tuple of tfidf matrix and csr data.\n\n        Raises:\n            FileNotFoundError if :attr:`load_path` doesn't exist.\n\n        Todo:\n            * implement loading from URL\n\n        \"\"\"\n        if not self.load_path.exists():\n            raise FileNotFoundError(\"HashingTfIdfVectorizer path doesn't exist!\")\n\n        logger.debug(\"Loading tfidf matrix from {}\".format(self.load_path))\n        loader = np.load(self.load_path, allow_pickle=True)\n        matrix = Sparse((loader['data'], loader['indices'],\n                         loader['indptr']), shape=loader['shape'])\n        return matrix, loader['opts'].item(0)\n\n    def partial_fit(self, docs: List[str], doc_ids: List[Any], doc_nums: List[int]) -> None:\n        \"\"\"Partially fit on one batch.\n\n        Args:\n            docs: a list of input documents\n            doc_ids: a list of document ids corresponding to input documents\n            doc_nums: a list of document integer ids as they appear in a database\n\n        Returns:\n            None\n\n        \"\"\"\n        for doc_id, i in zip(doc_ids, doc_nums):\n            self.doc_index[doc_id] = i\n\n        for batch_rows, batch_data, batch_cols in self.get_counts(docs, doc_ids):\n            self.rows.extend(batch_rows)\n            self.cols.extend(batch_cols)\n            self.data.extend(batch_data)\n\n    def fit(self, docs: List[str], doc_ids: List[Any], doc_nums: List[int]) 
-> None:\n        \"\"\"Fit the vectorizer.\n\n        Args:\n            docs: a list of input documents\n            doc_ids: a list of document ids corresponding to input documents\n            doc_nums: a list of document integer ids as they appear in a database\n\n        Returns:\n            None\n\n        \"\"\"\n        self.doc_index = {}\n        self.rows = []\n        self.cols = []\n        self.data = []\n        return self.partial_fit(docs, doc_ids, doc_nums)\n"
  },
  {
    "path": "deeppavlov/paramsearch.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\nimport sys\nfrom copy import deepcopy\nfrom itertools import product\nfrom logging import getLogger\nfrom pathlib import Path\n\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\n\nfrom deeppavlov.core.commands.train import train_evaluate_model_from_config, get_iterator_from_config, \\\n    read_data_by_config\nfrom deeppavlov.core.commands.utils import parse_config\nfrom deeppavlov.core.common.cross_validation import calc_cv_score\nfrom deeppavlov.core.common.file import save_json, find_config, read_json\nfrom deeppavlov.core.common.params_search import ParamsSearch\n\np = (Path(__file__) / \"..\" / \"..\").resolve()\nsys.path.append(str(p))\n\nlog = getLogger(__name__)\n\nparser = argparse.ArgumentParser()\nparser.add_argument(\"config_path\", help=\"path to a pipeline json config\", type=str)\nparser.add_argument(\"--folds\", help=\"number of folds\", type=str, default=None)\nparser.add_argument(\"--search_type\", help=\"search type: grid or random search\", type=str, default='grid')\n\n\ndef get_best_params(combinations, scores, param_names, target_metric):\n    max_id = np.argmax(scores)\n    best_params = dict(zip(param_names, combinations[max_id]))\n    best_params[target_metric] = scores[max_id]\n\n    return best_params\n\n\ndef main():\n    params_helper = ParamsSearch()\n\n    args = 
parser.parse_args()\n    is_loo = False\n    n_folds = None\n    if args.folds == 'loo':\n        is_loo = True\n    elif args.folds is None:\n        n_folds = None\n    elif args.folds.isdigit():\n        n_folds = int(args.folds)\n    else:\n        raise NotImplementedError('Not implemented this type of CV')\n\n    # read config\n    pipeline_config_path = find_config(args.config_path)\n    config_init = read_json(pipeline_config_path)\n    config = parse_config(config_init)\n    data = read_data_by_config(config)\n    target_metric = parse_config(config_init)['train']['metrics'][0]\n    if isinstance(target_metric, dict):\n        target_metric = target_metric['name']\n\n    # get all params for search\n    param_paths = list(params_helper.find_model_path(config, 'search_choice'))\n    param_values = []\n    param_names = []\n    for path in param_paths:\n        value = params_helper.get_value_from_config(config, path)\n        param_name = path[-1]\n        param_value_search = value['search_choice']\n        param_names.append(param_name)\n        param_values.append(param_value_search)\n\n    # find optimal params\n    if args.search_type == 'grid':\n        # generate params combnations for grid search\n        combinations = list(product(*param_values))\n\n        # calculate cv scores\n        scores = []\n        for comb in combinations:\n            config = deepcopy(config_init)\n            for param_path, param_value in zip(param_paths, comb):\n                params_helper.insert_value_or_dict_into_config(config, param_path, param_value)\n            config = parse_config(config)\n\n            if (n_folds is not None) | is_loo:\n                # CV for model evaluation\n                score_dict = calc_cv_score(config, data=data, n_folds=n_folds, is_loo=is_loo)\n                score = score_dict[next(iter(score_dict))]\n            else:\n                # train/valid for model evaluation\n                data_to_evaluate = data.copy()\n      
          if len(data_to_evaluate['valid']) == 0:\n                    data_to_evaluate['train'], data_to_evaluate['valid'] = train_test_split(data_to_evaluate['train'],\n                                                                                            test_size=0.2)\n                iterator = get_iterator_from_config(config, data_to_evaluate)\n                score = train_evaluate_model_from_config(config, iterator=iterator)['valid'][target_metric]\n\n            scores.append(score)\n\n        # get model with best score\n        best_params_dict = get_best_params(combinations, scores, param_names, target_metric)\n        log.info('Best model params: {}'.format(best_params_dict))\n    else:\n        raise NotImplementedError('Not implemented this type of search')\n\n    # save config\n    best_config = config_init\n    for i, param_name in enumerate(best_params_dict.keys()):\n        if param_name != target_metric:\n            params_helper.insert_value_or_dict_into_config(best_config, param_paths[i], best_params_dict[param_name])\n\n    best_model_filename = pipeline_config_path.with_suffix('.cvbest.json')\n    save_json(best_config, best_model_filename)\n    log.info('Best model saved in json-file: {}'.format(best_model_filename))\n\n\n# try to run:\n# --config_path path_to_config.json --folds 2\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "deeppavlov/requirements/datasets.txt",
    "content": "datasets>=1.16.0,<2.5.0;python_version<=\"3.10\"\ndatasets==2.2.*;python_version==\"3.11.*\"\n"
  },
  {
    "path": "deeppavlov/requirements/dependency_decoding.txt",
    "content": "ufal.chu-liu-edmonds\n"
  },
  {
    "path": "deeppavlov/requirements/en_core_web_sm.txt",
    "content": "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl\nspacy\n"
  },
  {
    "path": "deeppavlov/requirements/faiss.txt",
    "content": "faiss-cpu==1.7.2;python_version<=\"3.10\"\nfaiss-cpu==1.7.4;python_version==\"3.11.*\"\n"
  },
  {
    "path": "deeppavlov/requirements/fasttext.txt",
    "content": "fasttext==0.9.*\n"
  },
  {
    "path": "deeppavlov/requirements/hdt.txt",
    "content": "hdt==2.3\n"
  },
  {
    "path": "deeppavlov/requirements/kenlm.txt",
    "content": "pypi-kenlm==0.1.20220713;python_version<=\"3.10\"\nkenlm==0.2.*;python_version==\"3.11.*\"\n"
  },
  {
    "path": "deeppavlov/requirements/lxml.txt",
    "content": "lxml==4.9.*\n"
  },
  {
    "path": "deeppavlov/requirements/opt_einsum.txt",
    "content": "opt-einsum==3.3.*\n"
  },
  {
    "path": "deeppavlov/requirements/protobuf.txt",
    "content": "protobuf<=3.20\n"
  },
  {
    "path": "deeppavlov/requirements/pytorch.txt",
    "content": "torch>=1.6.0,<1.14.0\n"
  },
  {
    "path": "deeppavlov/requirements/rapidfuzz.txt",
    "content": "rapidfuzz==2.1.*\n"
  },
  {
    "path": "deeppavlov/requirements/razdel.txt",
    "content": "razdel==0.5.0\n"
  },
  {
    "path": "deeppavlov/requirements/ru_core_news_sm.txt",
    "content": "https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.5.0/ru_core_news_sm-3.5.0-py3-none-any.whl\nspacy\n"
  },
  {
    "path": "deeppavlov/requirements/sacremoses.txt",
    "content": "sacremoses==0.0.53\n"
  },
  {
    "path": "deeppavlov/requirements/sentencepiece.txt",
    "content": "sentencepiece==0.2.0\n"
  },
  {
    "path": "deeppavlov/requirements/slovnet.txt",
    "content": "slovnet==0.5.*\nnavec\n"
  },
  {
    "path": "deeppavlov/requirements/sortedcontainers.txt",
    "content": "sortedcontainers==2.4.*\n"
  },
  {
    "path": "deeppavlov/requirements/torchcrf.txt",
    "content": "pytorch-crf==0.7.*\n"
  },
  {
    "path": "deeppavlov/requirements/transformers.txt",
    "content": "transformers>=4.13.0,<4.25.0;python_version<\"3.8\"\ntransformers==4.30.0;python_version>=\"3.8\"\n"
  },
  {
    "path": "deeppavlov/requirements/udapi.txt",
    "content": "udapi==0.3.*\n"
  },
  {
    "path": "deeppavlov/requirements/whapi.txt",
    "content": "bs4\nwhapi==0.6.*\n"
  },
  {
    "path": "deeppavlov/settings.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\n\nfrom deeppavlov.core.common.paths import get_settings_path, populate_settings_dir\n\nparser = argparse.ArgumentParser()\n\nparser.add_argument(\"-d\", \"--default\", action=\"store_true\", help=\"return to defaults\")\n\n\ndef main():\n    \"\"\"DeepPavlov console configuration utility.\"\"\"\n    args = parser.parse_args()\n    path = get_settings_path()\n\n    if args.default:\n        if populate_settings_dir(force=True):\n            print(f'Populated {path} with default settings files')\n        else:\n            print(f'{path} is already a default settings directory')\n    else:\n        print(f'Current DeepPavlov settings path: {path}')\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "deeppavlov/utils/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/utils/benchmarks/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/utils/benchmarks/benchmarks.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\nfrom collections import defaultdict\nfrom logging import getLogger\n\nimport numpy as np\nfrom tqdm import tqdm\n\nfrom deeppavlov import build_model\nfrom deeppavlov.core.commands.train import read_data_by_config, get_iterator_from_config\nfrom deeppavlov.core.commands.utils import parse_config, expand_path\nfrom deeppavlov.core.common.file import save_jsonl\n\nlog = getLogger(__name__)\n\nparser = argparse.ArgumentParser()\n\nparser.add_argument('config_path', help='path to a pipeline json config', type=str)\nparser.add_argument('benchmark_name', help='benchmark name to be submitted',\n                    choices=['glue', 'superglue', 'russian_superglue'])\nparser.add_argument('-o', '--output-file', default=None, help='path to save output', type=str)\nparser.add_argument('-d', '--download', action='store_true', help='download model components')\n\nGLUE_TASKS = {\n    'cola': 'CoLA',\n    'mnli-m': 'MNLI-m',\n    'mnli-mm': 'MNLI-mm',\n    'mrpc': 'MRPC',\n    'qnli': 'QNLI',\n    'qqp': 'QQP',\n    'rte': 'RTE',\n    'sst2': 'SST-2',\n    'stsb': 'STS-B',\n    'wnli': 'WNLI'\n}\n\nSUPER_GLUE_TASKS = {\n    'copa': 'COPA',\n    'multirc': 'MultiRC',\n    'boolq': 'BoolQ',\n    'record': 'ReCoRD',\n    'wic': 'WiC'\n}\n\nRSG_TASKS = {\n    'lidirus': 'LiDiRus',\n    'rcb': 'RCB',\n    'parus': 'PARus',\n    'muserc': 
'MuSeRC',\n    'terra': 'TERRa',\n    'russe': 'RUSSE',\n    'rwsd': 'RWSD',\n    'danetqa': 'DaNetQA',\n    'rucos': 'RuCoS'\n}\n\n\ndef split_config(config_path, download):\n    \"\"\"Gets model, data iterator and a task name from the configuration file.\n    \n    Args:\n        config_path: Path to the model configuration file.\n        download: If True, the model will be downloaded from the DeepPavlov server.\n    \"\"\"\n\n    config = parse_config(config_path)\n    data = read_data_by_config(config)\n    iterator = get_iterator_from_config(config, data)\n    task_name = config['dataset_reader']['name']\n    if task_name == 'mnli':\n        task_name = 'mnli-m' if config['dataset_reader']['valid'] == 'validation_matched' else 'mnli-mm'\n    data_gen = iterator.gen_batches(1, data_type='test', shuffle=False)\n    model = build_model(config, download=download)\n    return model, data_gen, task_name\n\n\ndef get_predictions(model, data_gen, replace_word=None, round_res=False):\n    \"\"\"Gets model predictions and replaces model output with replace_word.\n    \n    Args:\n        model: The model itself.\n        data_gen: Iterator with data to be submitted.\n        replace_word: Model outputs to be replaced with 1, other outputs are replaced with 0.\n            If None, model outputs are not replaced.\n        round_res: If True, model outputs are rounded (used in stsb).\n    \"\"\"\n\n    submission = {'index': [], 'prediction': []}\n    for idx, (x, _) in enumerate(tqdm(data_gen)):\n        prediction = model.compute(x)[0]\n        if replace_word:\n            prediction = 1 if prediction == replace_word else 0\n        if round_res:\n            prediction = round(prediction, 3)\n        submission['index'].append(idx)\n        submission['prediction'].append(prediction)\n    return submission\n\n\ndef submit_glue(config_path, output_path, download):\n    \"\"\"Creates submission file for the GLUE tasks.\n    Args:\n        config_path: Path to the model 
configuration file.\n        output_path: Path to output file. If None, file name is selected according corresponding task name.\n        download: If True, the model will be downloaded from the DeepPavlov server.\n    \"\"\"\n\n    model, data_gen, task_name = split_config(config_path, download)\n\n    if task_name == 'cola':\n        submission = get_predictions(model, data_gen, 'acceptable')\n\n    elif task_name.startswith('mnli'):\n        submission = get_predictions(model, data_gen)\n\n    elif task_name == 'mrpc':\n        submission = get_predictions(model, data_gen, 'equivalent')\n\n    elif task_name == 'sst2':\n        submission = get_predictions(model, data_gen, 'positive')\n\n    elif task_name == 'stsb':\n        submission = get_predictions(model, data_gen, None, True)\n\n    elif task_name == 'wnli':\n        submission = get_predictions(model, data_gen, 'entailment')\n\n    elif task_name in GLUE_TASKS:\n        submission = get_predictions(model, data_gen)\n    else:\n        raise ValueError(f'Unexpected GLUE task name: {task_name}')\n\n    save_path = output_path or f'{GLUE_TASKS[task_name]}.tsv'\n    save_path = expand_path(save_path)\n    save_path.parent.mkdir(parents=True, exist_ok=True)\n    save_array = np.vstack(([list(submission.keys())], np.array(list(submission.values())).transpose()))\n    np.savetxt(save_path, save_array, delimiter='\\t', fmt='%s')\n    log.info(f'Prediction saved to {save_path}')\n\n\ndef commonsense_reasoning_prediction(model, data_gen):\n    \"\"\"Common part for ReCoRD and RuCoS tasks that gets their predictions in needed format.\n    \n    Args:\n        model: The model itself.\n        data_gen: Iterator with data to be submitted.\n    \"\"\"\n\n    submission = []\n    output = defaultdict(\n        lambda: {\n            'predicted': [],\n            'probability': []\n        }\n    )\n\n    for x, _ in tqdm(data_gen):\n        indices, _, _, entities, _ = x[0]\n        prediction = model.compute(x)[:, 
1]\n        output[indices]['predicted'].append(entities)\n        output[indices]['probability'].append(prediction)\n\n    for key, value in output.items():\n        answer_index = np.argmax(value['probability'])\n        answer = value['predicted'][answer_index]\n        submission.append({'idx': int(key.split('-')[1]), 'label': answer})\n    return submission\n\n\ndef multi_sentence_comprehension_prediction(model, data_gen):\n    \"\"\"Common part for MultiRC and MuSeRC tasks that gets their predictions in needed format.\n    \n    Args:\n        model: The model itself.\n        data_gen: Iterator with data to be submitted.\n    \"\"\"\n\n    output = {}\n\n    for x, _ in tqdm(data_gen):\n        contexts, answers, indices = x[0]\n\n        prediction = model([contexts], [answers], indices)\n\n        paragraph_idx = indices['paragraph']\n        question_idx = indices['question']\n        answer_idx = indices['answer']\n\n        label = int(prediction[0] == 'True')\n        if paragraph_idx not in output:\n            output[paragraph_idx] = {\n                'idx': paragraph_idx,\n                'passage': {\n                    'questions': [\n                        {\n                            'idx': question_idx,\n                            'answers': [{'idx': answer_idx, 'label': label}]\n                        }\n                    ]\n                }\n            }\n\n        questions = output[paragraph_idx]['passage']['questions']\n        question_indices = set(el['idx'] for el in questions)\n        if question_idx not in question_indices:\n            output[paragraph_idx]['passage']['questions'].append({\n                'idx': question_idx,\n                'answers': [{'idx': answer_idx, 'label': label}]\n            })\n        else:\n            for question in questions:\n                if question['idx'] == question_idx:\n                    question['answers'].append({'idx': answer_idx, 'label': label})\n\n    submission = 
list(output.values())\n    return submission\n\n\ndef submit_superglue(config_path, output_path, download):\n    \"\"\"Creates submission file for the SuperGLUE tasks.\n\n    Args:\n        config_path: Path to the model configuration file.\n        output_path: Path to output file. If None, file name is selected according corresponding task name.\n        download: If True, the model will be downloaded from the DeepPavlov server.\n    \"\"\"\n\n    model, data_gen, task_name = split_config(config_path, download)\n    submission = []\n\n    if task_name == 'record':\n        submission = commonsense_reasoning_prediction(model, data_gen)\n\n    elif task_name == 'copa':\n        for idx, (x, _) in enumerate(tqdm(data_gen)):\n            prediction = model.compute(x)[0]\n            label = int(prediction == 'choice2')\n            submission.append({'idx': idx, 'label': label})\n\n    elif task_name == 'multirc':\n        submission = multi_sentence_comprehension_prediction(model, data_gen)\n\n    elif task_name in SUPER_GLUE_TASKS:\n        for idx, (x, _) in enumerate(tqdm(data_gen)):\n            prediction = model.compute(x)\n\n            while isinstance(prediction, list):\n                prediction = prediction[0]\n\n            submission.append({'idx': idx, 'label': prediction})\n    else:\n        raise ValueError(f'Unexpected SuperGLUE task name: {task_name}')\n\n    save_path = output_path if output_path is not None else f'{SUPER_GLUE_TASKS[task_name]}.jsonl'\n    save_path = expand_path(save_path)\n    save_path.parent.mkdir(parents=True, exist_ok=True)\n    save_jsonl(submission, save_path)\n    log.info(f'Prediction saved to {save_path}')\n\n\ndef submit_rsg(config_path, output_path, download):\n    \"\"\"Creates submission file for the Russian SuperGLUE tasks.\n\n    Args:\n        config_path: Path to the model configuration file.\n        output_path: Path to output file. 
If None, the file name is selected according to the corresponding task name.\n        download: If True, the model will be downloaded from the DeepPavlov server.\n    \"\"\"\n\n    model, data_gen, task_name = split_config(config_path, download)\n    submission = []\n\n    if task_name == 'rucos':\n        submission = commonsense_reasoning_prediction(model, data_gen)\n\n    elif task_name == 'parus':\n        for idx, (x, _) in enumerate(tqdm(data_gen)):\n            prediction = model.compute(x)[0]\n            label = int(prediction == 'choice2')\n            submission.append({'idx': idx, 'label': label})\n\n    elif task_name == 'muserc':\n        submission = multi_sentence_comprehension_prediction(model, data_gen)\n\n    elif task_name in RSG_TASKS:\n        for idx, (x, _) in enumerate(tqdm(data_gen)):\n            prediction = model.compute(x)\n\n            while isinstance(prediction, list):\n                prediction = prediction[0]\n\n            submission.append({'idx': idx, 'label': prediction})\n    else:\n        raise ValueError(f'Unexpected Russian SuperGLUE task name: {task_name}')\n\n    save_path = output_path if output_path is not None else f'{RSG_TASKS[task_name]}.jsonl'\n    save_path = expand_path(save_path)\n    save_path.parent.mkdir(parents=True, exist_ok=True)\n    save_jsonl(submission, save_path)\n    log.info(f'Prediction saved to {save_path}')\n\n\ndef main():\n    args = parser.parse_args()\n    if args.benchmark_name == 'glue':\n        submit_glue(args.config_path, args.output_file, args.download)\n    elif args.benchmark_name == 'superglue':\n        submit_superglue(args.config_path, args.output_file, args.download)\n    elif args.benchmark_name == 'russian_superglue':\n        submit_rsg(args.config_path, args.output_file, args.download)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "deeppavlov/utils/connector/__init__.py",
    "content": "from .dialog_logger import DialogLogger\n"
  },
  {
    "path": "deeppavlov/utils/connector/dialog_logger.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport json\nfrom datetime import datetime\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import Any, Optional, Hashable\n\nfrom deeppavlov.core.common.file import read_json\nfrom deeppavlov.core.common.paths import get_settings_path\nfrom deeppavlov.core.data.utils import jsonify_data\n\nLOGGER_CONFIG_FILENAME = 'dialog_logger_config.json'\nLOG_TIMESTAMP_FORMAT = '%Y-%m-%d_%H-%M-%S_%f'\n\nlog = getLogger(__name__)\n\n\nclass DialogLogger:\n    \"\"\"DeepPavlov dialog logging facility.\n\n    DialogLogger is an entity which provides tools for dialogs logging.\n\n    Args:\n        enabled: DialogLogger on/off flag.\n        logger_name: Dialog logger name that is used for organising log files.\n\n    Attributes:\n        logger_name: Dialog logger name which is used for organising log files.\n        log_max_size: Maximum size of log file, kb.\n        self.log_file: Current log file object.\n    \"\"\"\n    def __init__(self, enabled: bool = False, logger_name: Optional[str] = None) -> None:\n        self.config: dict = read_json(get_settings_path() / LOGGER_CONFIG_FILENAME)\n        self.enabled: bool = enabled or self.config['enabled']\n\n        if self.enabled:\n            self.logger_name: str = logger_name or self.config['logger_name']\n            self.log_max_size: int = 
self.config['logfile_max_size_kb']\n            self.log_file = self._get_log_file()\n            self.log_file.writelines('\"Dialog logger initiated\"\\n')\n\n    @staticmethod\n    def _get_timestamp_utc_str() -> str:\n        \"\"\"Returns str converted current UTC timestamp.\n\n        Returns:\n            utc_timestamp_str: str converted current UTC timestamp.\n        \"\"\"\n        utc_timestamp_str = datetime.strftime(datetime.utcnow(), LOG_TIMESTAMP_FORMAT)\n        return utc_timestamp_str\n\n    def _get_log_file(self):\n        \"\"\"Returns opened file object for writing dialog logs.\n\n        Returns:\n            log_file: opened Python file object.\n        \"\"\"\n        log_dir: Path = Path(self.config['log_path']).expanduser().resolve() / self.logger_name\n        log_dir.mkdir(parents=True, exist_ok=True)\n        log_file_path = Path(log_dir, f'{self._get_timestamp_utc_str()}_{self.logger_name}.log')\n        log_file = open(log_file_path, 'a', buffering=1, encoding='utf8')\n        return log_file\n\n    def _log(self, utterance: Any, direction: str, dialog_id: Optional[Hashable]=None):\n        \"\"\"Logs single dialog utterance to current dialog log file.\n\n        Args:\n            utterance: Dialog utterance.\n            direction: 'in' or 'out' utterance direction.\n            dialog_id: Dialog ID.\n        \"\"\"\n        if isinstance(utterance, str):\n            pass\n        elif isinstance(utterance, (list, dict)):\n            utterance = jsonify_data(utterance)\n        else:\n            utterance = str(utterance)\n\n        dialog_id = str(dialog_id) if not isinstance(dialog_id, str) else dialog_id\n\n        if self.log_file.tell() >= self.log_max_size * 1024:\n            self.log_file.close()\n            self.log_file = self._get_log_file()\n        else:\n            try:\n                log_msg = {}\n                log_msg['timestamp'] = self._get_timestamp_utc_str()\n                log_msg['dialog_id'] = 
dialog_id\n                log_msg['direction'] = direction\n                log_msg['message'] = utterance\n                log_str = json.dumps(log_msg, ensure_ascii=self.config['ensure_ascii'])\n                self.log_file.write(f'{log_str}\\n')\n            except IOError:\n                log.error('Failed to write dialog log.')\n\n    def log_in(self, utterance: Any, dialog_id: Optional[Hashable] = None) -> None:\n        \"\"\"Wraps _log method for all input utterances.\n        Args:\n            utterance: Dialog utterance.\n            dialog_id: Dialog ID.\n        \"\"\"\n        if self.enabled:\n            self._log(utterance, 'in', dialog_id)\n\n    def log_out(self, utterance: Any, dialog_id: Optional[Hashable] = None) -> None:\n        \"\"\"Wraps _log method for all output utterances.\n        Args:\n            utterance: Dialog utterance.\n            dialog_id: Dialog ID.\n        \"\"\"\n        if self.enabled:\n            self._log(utterance, 'out', dialog_id)\n"
  },
  {
    "path": "deeppavlov/utils/pip_wrapper/__init__.py",
    "content": "from .pip_wrapper import *\n"
  },
  {
    "path": "deeppavlov/utils/pip_wrapper/pip_wrapper.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport re\nimport subprocess\nimport sys\nfrom logging import getLogger\nfrom pathlib import Path\n\nfrom deeppavlov.core.commands.utils import expand_path, parse_config\nfrom deeppavlov.core.data.utils import get_all_elems_from_json\n\nlog = getLogger(__name__)\n\n_tf_re = re.compile(r'\\s*tensorflow\\s*([<=>;]|$)')\n\n\ndef install(*packages):\n    if any(_tf_re.match(package) for package in packages) \\\n            and b'tensorflow-gpu' in subprocess.check_output([sys.executable, '-m', 'pip', 'freeze'],\n                                                             env=os.environ.copy()):\n        log.warning('found tensorflow-gpu installed, so upgrading it instead of tensorflow')\n        packages = [_tf_re.sub(r'tensorflow-gpu\\1', package) for package in packages]\n    result = subprocess.check_call([sys.executable, '-m', 'pip', 'install',\n                                    *[re.sub(r'\\s', '', package) for package in packages]],\n                                   env=os.environ.copy())\n    return result\n\n\ndef get_config_requirements(config: [str, Path, dict]):\n    config = parse_config(config)\n\n    requirements = set()\n    for req in config.get('metadata', {}).get('requirements', []):\n        requirements.add(req)\n\n    config_references = [expand_path(config_ref) for config_ref in 
get_all_elems_from_json(config, 'config_path')]\n    requirements |= {req for config in config_references for req in get_config_requirements(config)}\n\n    return requirements\n\n\ndef install_from_config(config: [str, Path, dict]):\n    requirements_files = get_config_requirements(config)\n\n    if not requirements_files:\n        log.warning('No requirements found in config')\n        return\n\n    requirements = []\n    for rf in requirements_files:\n        with expand_path(rf).open(encoding='utf8') as f:\n            for line in f:\n                line = re.sub(r'\\s', '', line.strip())\n                if line and not line.startswith('#') and line not in requirements:\n                    requirements.append(line)\n\n    for r in requirements:\n        install(r)\n"
  },
  {
    "path": "deeppavlov/utils/server/__init__.py",
    "content": "from .server import get_server_params, get_ssl_params, redirect_root_to_docs, start_model_server\n"
  },
  {
    "path": "deeppavlov/utils/server/metrics.py",
    "content": "# Copyright 2020 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport time\nfrom typing import Tuple\n\nfrom prometheus_client import CONTENT_TYPE_LATEST, REGISTRY, generate_latest\nfrom prometheus_client import Counter, Gauge, Histogram\nfrom starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint\nfrom starlette.requests import Request\nfrom starlette.responses import Response\nfrom starlette.types import ASGIApp\n\nREQUESTS_COUNT = Counter('http_requests_count', 'Number of processed requests', ['endpoint', 'status_code'])\nREQUESTS_LATENCY = Histogram('http_requests_latency_seconds', 'Request latency histogram', ['endpoint'])\nREQUESTS_IN_PROGRESS = Gauge('http_requests_in_progress', 'Number of requests currently being processed', ['endpoint'])\n\n\ndef metrics(request: Request) -> Response:\n    return Response(generate_latest(REGISTRY), media_type=CONTENT_TYPE_LATEST)\n\n\nclass PrometheusMiddleware(BaseHTTPMiddleware):\n    def __init__(self, app: ASGIApp, ignore_paths: Tuple = ()) -> None:\n        super().__init__(app)\n        self.ignore_paths = ignore_paths\n\n    async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:\n        endpoint = request.url.path\n\n        if endpoint in self.ignore_paths:\n            return await call_next(request)\n\n        REQUESTS_IN_PROGRESS.labels(endpoint=endpoint).inc()\n\n        start_time = 
time.perf_counter()\n        status_code = 500\n\n        try:\n            response = await call_next(request)\n            status_code = response.status_code\n        finally:\n            if status_code == 200:\n                duration = time.perf_counter() - start_time\n                REQUESTS_LATENCY.labels(endpoint=endpoint).observe(duration)\n            REQUESTS_COUNT.labels(endpoint=endpoint, status_code=status_code).inc()\n            REQUESTS_IN_PROGRESS.labels(endpoint=endpoint).dec()\n\n        return response\n"
  },
  {
    "path": "deeppavlov/utils/server/server.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport asyncio\nimport os\nfrom collections import namedtuple\nfrom logging import getLogger\nfrom pathlib import Path\nfrom ssl import PROTOCOL_TLSv1_2\nfrom typing import Dict, List, Optional, Union\n\nimport uvicorn\nfrom fastapi import Body, FastAPI, HTTPException\nfrom fastapi.utils import generate_operation_id_for_path\nfrom pydantic import BaseConfig, BaseModel\nfrom pydantic.fields import Field, ModelField\nfrom pydantic.main import ModelMetaclass\nfrom starlette.middleware.cors import CORSMiddleware\nfrom starlette.responses import RedirectResponse\n\nfrom deeppavlov.core.commands.infer import build_model\nfrom deeppavlov.core.commands.utils import parse_config\nfrom deeppavlov.core.common.chainer import Chainer\nfrom deeppavlov.core.common.file import read_json\nfrom deeppavlov.core.common.log import log_config\nfrom deeppavlov.core.common.paths import get_settings_path\nfrom deeppavlov.core.data.utils import check_nested_dict_keys, jsonify_data\nfrom deeppavlov.utils.connector import DialogLogger\nfrom deeppavlov.utils.server.metrics import metrics, PrometheusMiddleware\n\nSERVER_CONFIG_PATH = get_settings_path() / 'server_config.json'\nSSLConfig = namedtuple('SSLConfig', ['version', 'keyfile', 'certfile'])\n\nlog = getLogger(__name__)\ndialog_logger = DialogLogger(logger_name='rest_api')\n\nCOMPATIBILITY_MODE = 
os.getenv('COMPATIBILITY_MODE', False)\n\nif COMPATIBILITY_MODE is not False:\n    log.warning('DeepPavlov riseapi mode will use the old model response data format used up and including 1.0.0rc1.\\n'\n                'COMPATIBILITY_MODE will be removed in the DeepPavlov 1.2.0.\\n'\n                'Please, update your client code according to the new format.')\n\napp = FastAPI()\n\napp.add_middleware(\n    PrometheusMiddleware,\n    ignore_paths=('/', '/metrics', '/api', '/probe', '/docs', '/openapi.json')\n)\n\napp.add_middleware(\n    CORSMiddleware,\n    allow_origins=['*'],\n    allow_credentials=True,\n    allow_methods=['*'],\n    allow_headers=['*']\n)\n\napp.add_route(\"/metrics\", metrics)\n\n\ndef get_server_params(model_config: Union[str, Path]) -> Dict:\n    server_config = read_json(SERVER_CONFIG_PATH)\n    model_config = parse_config(model_config)\n\n    server_params = server_config['common_defaults']\n\n    if check_nested_dict_keys(model_config, ['metadata', 'server_utils']):\n        model_tag = model_config['metadata']['server_utils']\n        if check_nested_dict_keys(server_config, ['model_defaults', model_tag]):\n            model_defaults = server_config['model_defaults'][model_tag]\n            for param_name in model_defaults.keys():\n                if model_defaults[param_name]:\n                    server_params[param_name] = model_defaults[param_name]\n\n    server_params['model_endpoint'] = server_params.get('model_endpoint', '/model')\n\n    arg_names = server_params['model_args_names'] or model_config['chainer']['in']\n    if isinstance(arg_names, str):\n        arg_names = [arg_names]\n    server_params['model_args_names'] = arg_names\n\n    return server_params\n\n\ndef get_ssl_params(server_params: dict,\n                   https: Optional[bool],\n                   ssl_key: Optional[str],\n                   ssl_cert: Optional[str]) -> SSLConfig:\n    https = https or server_params['https']\n    if https:\n        ssh_key_path = 
Path(ssl_key or server_params['https_key_path']).resolve()\n        if not ssh_key_path.is_file():\n            e = FileNotFoundError('Ssh key file not found: please provide correct path in --key param or '\n                                  'https_key_path param in server configuration file')\n            log.error(e)\n            raise e\n\n        ssh_cert_path = Path(ssl_cert or server_params['https_cert_path']).resolve()\n        if not ssh_cert_path.is_file():\n            e = FileNotFoundError('Ssh certificate file not found: please provide correct path in --cert param or '\n                                  'https_cert_path param in server configuration file')\n            log.error(e)\n            raise e\n\n        ssl_config = SSLConfig(version=PROTOCOL_TLSv1_2, keyfile=str(ssh_key_path), certfile=str(ssh_cert_path))\n    else:\n        ssl_config = SSLConfig(None, None, None)\n\n    return ssl_config\n\n\ndef redirect_root_to_docs(fast_app: FastAPI, func_name: str, endpoint: str, method: str) -> None:\n    \"\"\"Adds api route to server that redirects user from root to docs with opened `endpoint` description.\"\"\"\n\n    @fast_app.get('/', include_in_schema=False)\n    async def redirect_to_docs() -> RedirectResponse:\n        operation_id = generate_operation_id_for_path(name=func_name, path=endpoint, method=method)\n        response = RedirectResponse(url=f'/docs#/default/{operation_id}')\n        return response\n\n\ndef interact(model: Chainer, payload: Dict[str, Optional[List]]) -> List:\n    model_args = payload.values()\n    dialog_logger.log_in(payload)\n    error_msg = None\n    lengths = {len(model_arg) for model_arg in model_args if model_arg is not None}\n\n    if not lengths:\n        error_msg = 'got empty request'\n    elif 0 in lengths:\n        error_msg = 'got empty array as model argument'\n    elif len(lengths) > 1:\n        error_msg = 'got several different batch sizes'\n\n    if error_msg is not None:\n        
log.error(error_msg)\n        raise HTTPException(status_code=400, detail=error_msg)\n\n    batch_size = next(iter(lengths))\n    model_args = [arg or [None] * batch_size for arg in model_args]\n\n    prediction = model(*model_args)\n\n    # TODO: remove in 1.2.0\n    if COMPATIBILITY_MODE is not False:\n        if len(model.out_params) == 1:\n            prediction = [prediction]\n        prediction = list(zip(*prediction))\n\n    result = jsonify_data(prediction)\n    dialog_logger.log_out(result)\n    return result\n\n\ndef test_interact(model: Chainer, payload: Dict[str, Optional[List]]) -> List[str]:\n    model_args = [arg or [\"Test string.\"] for arg in payload.values()]\n    try:\n        _ = model(*model_args)\n        return [\"Test passed\"]\n    except Exception as e:\n        raise HTTPException(status_code=400, detail=repr(e))\n\n\ndef start_model_server(model_config: Path,\n                       https: Optional[bool] = None,\n                       ssl_key: Optional[str] = None,\n                       ssl_cert: Optional[str] = None,\n                       port: Optional[int] = None) -> None:\n\n    server_params = get_server_params(model_config)\n\n    host = server_params['host']\n    port = port or server_params['port']\n    model_endpoint = server_params['model_endpoint']\n    model_args_names = server_params['model_args_names']\n\n    ssl_config = get_ssl_params(server_params, https, ssl_key=ssl_key, ssl_cert=ssl_cert)\n\n    model = build_model(model_config)\n\n    def batch_decorator(cls: ModelMetaclass) -> ModelMetaclass:\n        cls.__annotations__ = {arg_name: list for arg_name in model_args_names}\n        cls.__fields__ = {arg_name: ModelField(name=arg_name, type_=list, class_validators=None,\n                                               model_config=BaseConfig, required=False, field_info=Field(None))\n                          for arg_name in model_args_names}\n        return cls\n\n    @batch_decorator\n    class 
Batch(BaseModel):\n        pass\n\n    redirect_root_to_docs(app, 'answer', model_endpoint, 'post')\n\n    model_endpoint_post_example = {arg_name: ['string'] for arg_name in model_args_names}\n\n    @app.post(model_endpoint, summary='A model endpoint')\n    async def answer(item: Batch = Body(..., example=model_endpoint_post_example)) -> List:\n        loop = asyncio.get_event_loop()\n        return await loop.run_in_executor(None, interact, model, item.dict())\n\n    @app.post('/probe', include_in_schema=False)\n    async def probe(item: Batch) -> List[str]:\n        loop = asyncio.get_event_loop()\n        return await loop.run_in_executor(None, test_interact, model, item.dict())\n\n    @app.get('/api', summary='Model argument names')\n    async def api() -> Dict[str, List[str]]:\n        if COMPATIBILITY_MODE is not False:\n            return model_args_names\n        return {\n            'in': model.in_x,\n            'out': model.out_params\n        }\n\n    uvicorn.run(app, host=host, port=port, log_config=log_config, ssl_version=ssl_config.version,\n                ssl_keyfile=ssl_config.keyfile, ssl_certfile=ssl_config.certfile, timeout_keep_alive=20)\n"
  },
  {
    "path": "deeppavlov/utils/settings/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/utils/settings/dialog_logger_config.json",
    "content": "{\n  \"enabled\": false,\n  \"logger_name\": \"default\",\n  \"log_path\": \"~/.deeppavlov/dialog_logs\",\n  \"logfile_max_size_kb\": 10240,\n  \"ensure_ascii\": false\n}"
  },
  {
    "path": "deeppavlov/utils/settings/log_config.json",
    "content": "{\n  \"version\": 1,\n  \"disable_existing_loggers\": false,\n  \"loggers\": {\n    \"deeppavlov\": {\n      \"level\": \"INFO\",\n      \"handlers\": [\n        \"stderr\"\n      ],\n      \"propagate\": true\n    },\n    \"uvicorn.access\": {\n      \"level\": \"INFO\",\n      \"handlers\": [\n        \"uvicorn_handler\"\n      ],\n      \"propagate\": true\n    },\n    \"uvicorn.error\": {\n      \"level\": \"INFO\",\n      \"handlers\": [\n        \"uvicorn_handler\"\n      ],\n      \"propagate\": true\n    },\n    \"train_report\": {\n      \"level\": \"INFO\",\n      \"handlers\": [\n        \"train_handler\"\n      ],\n      \"propagate\": true\n    },\n    \"filelock\": {\n      \"level\": \"WARNING\",\n      \"handlers\": [\n        \"stdout\"\n      ],\n      \"propagate\": true\n    }\n  },\n  \"formatters\": {\n    \"default\": {\n      \"format\": \"%(asctime)s.%(msecs)d %(levelname)s in '%(name)s'['%(module)s'] at line %(lineno)d: %(message)s\",\n      \"datefmt\": \"%Y-%m-%d %H:%M:%S\"\n    },\n    \"uvicorn_fmt\": {\n      \"format\": \"%(asctime)s %(message)s\",\n      \"datefmt\": \"%Y-%m-%d %H:%M:%S\"\n    },\n    \"message\": {\n      \"format\": \"%(message)s\"\n    }\n  },\n  \"handlers\": {\n    \"file\": {\n      \"class\": \"logging.FileHandler\",\n      \"level\": \"DEBUG\",\n      \"formatter\": \"default\",\n      \"filename\": \"~/.deeppavlov/log.log\"\n    },\n    \"stdout\": {\n      \"class\": \"logging.StreamHandler\",\n      \"level\": \"DEBUG\",\n      \"formatter\": \"default\",\n      \"stream\": \"ext://sys.stdout\"\n    },\n    \"stderr\": {\n      \"class\": \"logging.StreamHandler\",\n      \"level\": \"DEBUG\",\n      \"formatter\": \"default\",\n      \"stream\": \"ext://sys.stderr\"\n    },\n    \"uvicorn_handler\": {\n      \"class\": \"logging.StreamHandler\",\n      \"level\": \"INFO\",\n      \"formatter\": \"uvicorn_fmt\",\n      \"stream\": \"ext://sys.stdout\",\n      \"filters\": 
[\"probeFilter\"]\n    },\n    \"train_handler\": {\n      \"class\": \"logging.StreamHandler\",\n      \"level\": \"INFO\",\n      \"formatter\": \"message\",\n      \"stream\": \"ext://sys.stdout\"\n    }\n  },\n  \"filters\": {\n    \"probeFilter\": {\n      \"()\": \"deeppavlov.core.common.log.ProbeFilter\"\n    }\n  }\n}\n"
  },
  {
    "path": "deeppavlov/utils/settings/server_config.json",
    "content": "{\n  \"common_defaults\": {\n    \"host\": \"0.0.0.0\",\n    \"port\": 5000,\n    \"model_args_names\": [],\n    \"https\": false,\n    \"https_cert_path\": \"\",\n    \"https_key_path\": \"\",\n    \"socket_type\": \"TCP\",\n    \"unix_socket_file\": \"/tmp/deeppavlov_socket.s\",\n    \"socket_launch_message\": \"launching socket server at\"\n  }\n}\n"
  },
  {
    "path": "deeppavlov/utils/socket/__init__.py",
    "content": "from .socket import encode, start_socket_server\n"
  },
  {
    "path": "deeppavlov/utils/socket/socket.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport asyncio\nimport json\nfrom logging import getLogger\nfrom pathlib import Path\nfrom struct import pack, unpack\nfrom typing import Any, List, Optional, Tuple, Union\n\nfrom deeppavlov.core.commands.infer import build_model\nfrom deeppavlov.core.common.chainer import Chainer\nfrom deeppavlov.core.data.utils import jsonify_data\nfrom deeppavlov.utils.connector import DialogLogger\nfrom deeppavlov.utils.server import get_server_params\n\nHEADER_FORMAT = '<I'\n\nlog = getLogger(__name__)\ndialog_logger = DialogLogger(logger_name='socket_api')\n\n\ndef encode(data: Any) -> bytes:\n    \"\"\"Converts data to the socket server input formatted bytes array.\n\n    Serializes ``data`` to the JSON formatted bytes array and adds 4 bytes to the beginning of the array - packed\n    to bytes length of the JSON formatted bytes array. Header format is \"<I\"\n    (see https://docs.python.org/3/library/struct.html#struct-format-strings)\n\n    Args:\n        data: Object to pack to the bytes array.\n\n    Raises:\n        TypeError: If data is not JSON-serializable object.\n\n    Examples:\n        >>> from deeppavlov.utils.socket import encode\n        >>> encode({'a':1})\n        b'\\x08\\x00\\x00\\x00{\"a\": 1}'\n        >>> encode([42])\n        b'\\x04\\x00\\x00\\x00[42]'\n\n    \"\"\"\n    json_data = jsonify_data(data)\n    bytes_data = json.dumps(json_data).encode()\n    response = pack(HEADER_FORMAT, len(bytes_data)) + bytes_data\n    return response\n\n\nclass SocketServer:\n    \"\"\"Creates socket server that sends the received data to the DeepPavlov model and returns model response.\n\n    The server receives a bytes array consisting of the `header` and the `body`. The `header` is the first 4 bytes\n    of the array - `body` length in bytes represented by a packed unsigned int (byte order is little-endian).\n    `body` is dictionary serialized to JSON formatted bytes array that server sends to the model. The dictionary\n    keys should match model arguments names, the values should be lists or tuples of inferenced values.\n\n    Socket server request creation example:\n        >>> from deeppavlov.utils.socket import encode\n        >>> request = encode({\"x\":[\"Elon Musk launched his cherry Tesla roadster to the Mars orbit\"]})\n        >>> request\n        b'I\\x00\\x00\\x00{\"x\": [\"Elon Musk launched his cherry Tesla roadster to the Mars orbit\"]}'\n\n    Socket server response, like the request, consists of the header and the body. 
Response body is dictionary\n    {'status': status, 'payload': payload} serialized to a JSON formatted byte array, where:\n        status (str): 'OK' if the model successfully processed the data, else - error message.\n        payload: (Optional[List[Tuple]]): The model result if no error has occurred, otherwise None.\n\n    \"\"\"\n    _launch_msg: str\n    _loop: asyncio.AbstractEventLoop\n    _model: Chainer\n    _model_args_names: List\n\n    def __init__(self,\n                 model_config: Path,\n                 socket_type: str,\n                 port: Optional[int] = None,\n                 socket_file: Optional[Union[str, Path]] = None) -> None:\n        \"\"\"Initializes socket server.\n\n        Args:\n            model_config: Path to the config file.\n            socket_type: Socket family. \"TCP\" for the AF_INET socket server, \"UNIX\" for UNIX Domain Socket server.\n            port: Port number for the AF_INET address family. If parameter is not defined, the port number from the\n                utils/settings/server_config.json is used.\n            socket_file: Path to the file to which UNIX Domain Socket server connects. 
If parameter is not defined,\n                the path from the utils/settings/server_config.json is used.\n\n        Raises:\n            ValueError: If ``socket_type`` parameter is neither \"TCP\" nor \"UNIX\".\n\n        \"\"\"\n        server_params = get_server_params(model_config)\n        socket_type = socket_type or server_params['socket_type']\n        self._loop = asyncio.get_event_loop()\n\n        if socket_type == 'TCP':\n            host = server_params['host']\n            port = port or server_params['port']\n            self._launch_msg = f'{server_params[\"socket_launch_message\"]} http://{host}:{port}'\n            self._loop.create_task(asyncio.start_server(self._handle_client, host, port))\n        elif socket_type == 'UNIX':\n            socket_file = socket_file or server_params['unix_socket_file']\n            socket_path = Path(socket_file).resolve()\n            if socket_path.exists():\n                socket_path.unlink()\n            self._launch_msg = f'{server_params[\"socket_launch_message\"]} {socket_file}'\n            self._loop.create_task(asyncio.start_unix_server(self._handle_client, socket_file))\n        else:\n            raise ValueError(f'socket type \"{socket_type}\" is not supported')\n\n        self._model = build_model(model_config)\n        self._model_args_names = server_params['model_args_names']\n\n    def start(self) -> None:\n        \"\"\"Launches socket server\"\"\"\n        log.info(self._launch_msg)\n        try:\n            self._loop.run_forever()\n        except KeyboardInterrupt:\n            pass\n        except Exception as e:\n            log.error(f'got exception {e} while running server')\n        finally:\n            self._loop.close()\n\n    async def _handle_client(self, reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:\n        \"\"\"Handles connection from a client.\n\n        Validates requests, sends request body to DeepPavlov model, sends responses to client.\n\n        
\"\"\"\n        addr = writer.get_extra_info('peername')\n        log.info(f'handling connection from {addr}')\n        while True:\n            header = await reader.read(4)\n            if not header:\n                log.info(f'closing connection from {addr}')\n                writer.close()\n                break\n            elif len(header) != 4:\n                error_msg = f'header \"{header}\" length less than 4 bytes'\n                log.error(error_msg)\n                response = self._response(error_msg)\n            else:\n                data_len = unpack(HEADER_FORMAT, header)[0]\n                request_body = await reader.read(data_len)\n                try:\n                    data = json.loads(request_body)\n                    response = await self._interact(data)\n                except ValueError:\n                    error_msg = f'request \"{request_body}\" type is not json'\n                    log.error(error_msg)\n                    response = self._response(error_msg)\n            writer.write(response)\n            await writer.drain()\n\n    async def _interact(self, data: dict) -> bytes:\n        dialog_logger.log_in(data)\n        model_args = []\n        for param_name in self._model_args_names:\n            param_value = data.get(param_name)\n            if param_value is None or (isinstance(param_value, list) and len(param_value) > 0):\n                model_args.append(param_value)\n            else:\n                error_msg = f\"nonempty array expected but got '{param_name}'={repr(param_value)}\"\n                log.error(error_msg)\n                return self._response(error_msg)\n        lengths = {len(i) for i in model_args if i is not None}\n\n        if not lengths:\n            error_msg = 'got empty request'\n            log.error(error_msg)\n            return self._response(error_msg)\n        elif len(lengths) > 1:\n            error_msg = f'got several different batch sizes: {lengths}'\n            
log.error(error_msg)\n            return self._response(error_msg)\n\n        batch_size = list(lengths)[0]\n        model_args = [arg or [None] * batch_size for arg in model_args]\n\n        # in case when some parameters were not described in model_args\n        model_args += [[None] * batch_size for _ in range(len(self._model.in_x) - len(model_args))]\n\n        prediction = await self._loop.run_in_executor(None, self._model, *model_args)\n        if len(self._model.out_params) == 1:\n            prediction = [prediction]\n        prediction = list(zip(*prediction))\n        dialog_logger.log_out(prediction)\n        return self._response(payload=prediction)\n\n    @staticmethod\n    def _response(status: str = 'OK', payload: Optional[List[Tuple]] = None) -> bytes:\n        \"\"\"Puts arguments into dict and serialize it to JSON formatted byte array with header.\n\n        Args:\n            status: Response status. 'OK' if no error has occurred, otherwise error message.\n            payload: DeepPavlov model result if no error has occurred, otherwise None.\n\n        Returns:\n            dict({'status': status, 'payload': payload}) serialized to a JSON formatted byte array starting with the\n                4-byte header - the length of serialized dict in bytes.\n\n        \"\"\"\n        return encode({'status': status, 'payload': payload})\n\n\ndef start_socket_server(model_config: Path, socket_type: str, port: Optional[int],\n                        socket_file: Optional[Union[str, Path]]) -> None:\n    server = SocketServer(model_config, socket_type, port, socket_file)\n    server.start()\n"
  },
  {
    "path": "deeppavlov/vocabs/__init__.py",
    "content": ""
  },
  {
    "path": "deeppavlov/vocabs/typos.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport shutil\nfrom collections import defaultdict\nfrom logging import getLogger\nfrom pathlib import Path\n\nimport requests\nfrom lxml import html\n\nfrom deeppavlov.core.commands.utils import expand_path\nfrom deeppavlov.core.common.file import load_pickle, save_pickle\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.data.utils import is_done, mark_done\n\nlog = getLogger(__name__)\n\n\n@register('static_dictionary')\nclass StaticDictionary:\n    \"\"\"Trie vocabulary used in spelling correction algorithms\n\n    Args:\n        data_dir: path to the directory where the built trie will be stored. 
Relative paths are interpreted as\n            relative to pipeline's data directory\n        dictionary_name: logical name of the dictionary\n        raw_dictionary_path: path to the source file with the list of words\n\n    Attributes:\n        dict_name: logical name of the dictionary\n        alphabet: set of all the characters used in this dictionary\n        words_set: set of all the words\n        words_trie: trie structure of all the words\n    \"\"\"\n\n    def __init__(self, data_dir: [Path, str] = '', *args, dictionary_name: str = 'dictionary', **kwargs):\n        data_dir = expand_path(data_dir) / dictionary_name\n\n        alphabet_path = data_dir / 'alphabet.pkl'\n        words_path = data_dir / 'words.pkl'\n        words_trie_path = data_dir / 'words_trie.pkl'\n\n        if not is_done(data_dir):\n            log.debug('Trying to build a dictionary in {}'.format(data_dir))\n            if data_dir.is_dir():\n                shutil.rmtree(str(data_dir))\n            data_dir.mkdir(parents=True)\n\n            words = self._get_source(data_dir, *args, **kwargs)\n            words = {self._normalize(word) for word in words}\n\n            alphabet = {c for w in words for c in w}\n            alphabet.remove('⟬')\n            alphabet.remove('⟭')\n\n            save_pickle(alphabet, alphabet_path)\n            save_pickle(words, words_path)\n\n            words_trie = defaultdict(set)\n            for word in words:\n                for i in range(len(word)):\n                    words_trie[word[:i]].add(word[:i + 1])\n                words_trie[word] = set()\n            words_trie = {k: sorted(v) for k, v in words_trie.items()}\n\n            save_pickle(words_trie, words_trie_path)\n\n            mark_done(data_dir)\n            log.debug('built')\n        else:\n            log.debug('Loading a dictionary from {}'.format(data_dir))\n\n        self.alphabet = load_pickle(alphabet_path)\n        self.words_set = load_pickle(words_path)\n        
self.words_trie = load_pickle(words_trie_path)\n\n    @staticmethod\n    def _get_source(data_dir, raw_dictionary_path, *args, **kwargs):\n        raw_path = expand_path(raw_dictionary_path)\n        with raw_path.open(newline='', encoding='utf8') as f:\n            data = [line.strip().split('\\t')[0] for line in f]\n        return data\n\n    @staticmethod\n    def _normalize(word):\n        return '⟬{}⟭'.format(word.strip().lower().replace('ё', 'е'))\n\n\n@register('russian_words_vocab')\nclass RussianWordsVocab(StaticDictionary):\n    \"\"\"Implementation of :class:`~deeppavlov.vocabs.typos.StaticDictionary` that builds data from https://github.com/danakt/russian-words/\n\n    Args:\n        data_dir: path to the directory where the built trie will be stored. Relative paths are interpreted as\n            relative to pipeline's data directory\n\n    Attributes:\n        dict_name: logical name of the dictionary\n        alphabet: set of all the characters used in this dictionary\n        words_set: set of all the words\n        words_trie: trie structure of all the words\n    \"\"\"\n\n    def __init__(self, data_dir: [Path, str] = '', *args, **kwargs):\n        kwargs['dictionary_name'] = 'russian_words_vocab'\n        super().__init__(data_dir, *args, **kwargs)\n\n    @staticmethod\n    def _get_source(*args, **kwargs):\n        log.debug('Downloading russian vocab from https://github.com/danakt/russian-words/')\n        url = 'https://github.com/danakt/russian-words/raw/master/russian.txt'\n        page = requests.get(url)\n        return [word.strip() for word in page.content.decode('cp1251').strip().split('\\n')]\n\n\n@register('wikitionary_100K_vocab')\nclass Wiki100KDictionary(StaticDictionary):\n    \"\"\"Implementation of :class:`~deeppavlov.vocabs.typos.StaticDictionary` that builds data\n    from `Wikitionary <https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#Project_Gutenberg>`__\n\n    Args:\n        data_dir: path to the directory where 
the built trie will be stored. Relative paths are interpreted as\n            relative to pipeline's data directory\n\n    Attributes:\n        dict_name: logical name of the dictionary\n        alphabet: set of all the characters used in this dictionary\n        words_set: set of all the words\n        words_trie: trie structure of all the words\n    \"\"\"\n\n    def __init__(self, data_dir: [Path, str] = '', *args, **kwargs):\n        kwargs['dictionary_name'] = 'wikipedia_100K_vocab'\n        super().__init__(data_dir, *args, **kwargs)\n\n    @staticmethod\n    def _get_source(*args, **kwargs):\n        words = []\n        log.debug('Downloading english vocab from Wiktionary')\n        for i in range(1, 100000, 10000):\n            k = 10000 + i - 1\n            url = 'https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/PG/2005/08/{}-{}'.format(i, k)\n            page = requests.get(url)\n            tree = html.fromstring(page.content)\n            words += tree.xpath('//div[@class=\"mw-parser-output\"]/p/a/text()')\n        return words\n"
  },
  {
    "path": "deeppavlov/vocabs/wiki_sqlite.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom logging import getLogger\nfrom typing import List, Any, Optional, Union\n\nfrom deeppavlov.core.common.registry import register\nfrom deeppavlov.core.models.component import Component\nfrom deeppavlov.dataset_iterators.sqlite_iterator import SQLiteDataIterator\n\nlogger = getLogger(__name__)\n\n\n@register('wiki_sqlite_vocab')\nclass WikiSQLiteVocab(SQLiteDataIterator, Component):\n    \"\"\"Get content from SQLite database by document ids.\n\n    Args:\n        load_path: a path to local DB file\n        join_docs: whether to join extracted docs with ' ' or not\n        shuffle: whether to shuffle data or not\n\n    Attributes:\n        join_docs: whether to join extracted docs with ' ' or not\n\n    \"\"\"\n\n    def __init__(self, load_path: str, join_docs: bool = True, shuffle: bool = False, **kwargs) -> None:\n        SQLiteDataIterator.__init__(self, load_path=load_path, shuffle=shuffle)\n        self.join_docs = join_docs\n\n    def __call__(self, doc_ids: Optional[List[List[Any]]] = None, *args, **kwargs) -> List[Union[str, List[str]]]:\n        \"\"\"Get the contents of files, stacked by space or as they are.\n\n        Args:\n            doc_ids: a batch of lists of ids to get contents for\n\n        Returns:\n            a list of contents / list of lists of contents\n        \"\"\"\n        all_contents = []\n        
if not doc_ids:\n            logger.warning('No doc_ids are provided in WikiSqliteVocab, return all docs')\n            doc_ids = [self.get_doc_ids()]\n\n        for ids in doc_ids:\n            contents = [self.get_doc_content(doc_id) for doc_id in ids]\n            if self.join_docs:\n                contents = ' '.join(contents)\n            all_contents.append(contents)\n\n        return all_contents\n"
  },
  {
    "path": "docs/Makefile",
    "content": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line.\nSPHINXOPTS    = -WT\nSPHINXBUILD   = sphinx-build\nSPHINXPROJ    = DeepPavlov\nSOURCEDIR     = .\nBUILDDIR      = _build\n\n# Put it first so that \"make\" without argument is like \"make help\".\nhelp:\n\t@$(SPHINXBUILD) -M help \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n\n.PHONY: help Makefile\n\n# Catch-all target: route all unknown targets to Sphinx using the new\n# \"make mode\" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).\n%: Makefile\n\t@$(SPHINXBUILD) -M $@ \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)"
  },
  {
    "path": "docs/_static/deeppavlov.css",
    "content": ".wy-side-nav-search {\n    background-color: #0176bd;\n}\n\n.wy-nav-content {\n    max-width: 1000px;\n}\n\n.wy-side-nav-search>div.version {\n    color: #ffffff;\n}"
  },
  {
    "path": "docs/_static/my_blocks.css",
    "content": "button.copybtn svg {\r\n    width: 1.3em;\r\n    height: 1.3em;\r\n    padding: 0.1em;\r\n}\r\n\r\nbutton.copybtn {\r\n    top: 0.2em;\r\n    width: 1.4em;\r\n    height: 1.4em;   \r\n}\r\n\r\n.rst-content .linenodiv pre, .rst-content div[class^=highlight] pre, .rst-content pre.literal-block {\r\n    font-size: 13px;\r\n    line-height: 1.4;\r\n}\r\n"
  },
  {
    "path": "docs/_templates/footer.html",
    "content": "{#{% extends '!footer.html' %}#}\n\n<footer>\n  <!-- Yandex.Metrika counter -->\n  <script type=\"text/javascript\" >\n     (function(m,e,t,r,i,k,a){m[i]=m[i]||function(){(m[i].a=m[i].a||[]).push(arguments)};\n     m[i].l=1*new Date();k=e.createElement(t),a=e.getElementsByTagName(t)[0],k.async=1,k.src=r,a.parentNode.insertBefore(k,a)})\n     (window, document, \"script\", \"https://mc.yandex.ru/metrika/tag.js\", \"ym\");\n     ym(72484825, \"init\", {\n          clickmap:true,\n          trackLinks:true,\n          accurateTrackBounce:true,\n          webvisor:true\n     });\n  </script>\n  <noscript><div><img src=\"https://mc.yandex.ru/watch/72484825\" style=\"position:absolute; left:-9999px;\" alt=\"\" /></div></noscript>\n  <!-- /Yandex.Metrika counter -->\n  {% if (theme_prev_next_buttons_location == 'bottom' or theme_prev_next_buttons_location == 'both') and (next or prev) %}\n    <div class=\"rst-footer-buttons\" role=\"navigation\" aria-label=\"footer navigation\">\n      {% if next %}\n        <a href=\"{{ next.link|e }}\" class=\"btn btn-neutral float-right\" title=\"{{ next.title|striptags|e }}\" accesskey=\"n\" rel=\"next\">{{ _('Next') }} <span class=\"fa fa-arrow-circle-right\"></span></a>\n      {% endif %}\n      {% if prev %}\n        <a href=\"{{ prev.link|e }}\" class=\"btn btn-neutral float-left\" title=\"{{ prev.title|striptags|e }}\" accesskey=\"p\" rel=\"prev\"><span class=\"fa fa-arrow-circle-left\"></span> {{ _('Previous') }}</a>\n      {% endif %}\n    </div>\n  {% endif %}\n\n  <hr/>\n\n  <div role=\"contentinfo\">\n    {%- block extrafooter %}\n        <p>Problem? 
<a href=\"https://forum.deeppavlov.ai\">Ask a Question</a> or <a href=\"https://demo.deeppavlov.ai/\">try our Demo</a></p>\n        <p>\n            <a href=\"https://medium.com/deeppavlov\"><img style=\"width: 30px; height: 30px;\" src=\"{{ pathto('_static/social/Medium_Monogram.svg', 1) }}\" alt=\"medium\"></a>\n            <a href=\"https://twitter.com/deeppavlov\"><img style=\"width: 30px; height: 30px;\" src=\"{{ pathto('_static/social/Twitter_Social_Icon_Circle_Color.svg', 1) }}\" alt=\"twitter\"></a>\n            <a href=\"https://www.youtube.com/channel/UCJ-6K2HGA0hpQytlSM7FBVQ\"><img style=\"width: 30px; height: 30px;\" src=\"{{ pathto('_static/social/youtube_social_circle_red.png', 1) }}\" alt=\"youtube\"></a>\n            <a href=\"https://t.me/deeppavlov\"><img style=\"width: 30px; height: 30px;\" src=\"{{ pathto('_static/social/telegram.png', 1) }}\" alt=\"medium\"></a>\n        </p>\n    {% endblock %}\n    <p>\n    {%- if show_copyright %}\n      {%- if hasdoc('copyright') %}\n        {% set path = pathto('copyright') %}\n        {% set copyright = copyright|e %}\n        &copy; <a href=\"{{ path }}\">{% trans %}Copyright{% endtrans %}</a> {{ copyright }}\n      {%- else %}\n        {% set copyright = copyright|e %}\n        &copy; {% trans %}Copyright{% endtrans %} {{ copyright }}\n      {%- endif %}\n    {%- endif %}\n\n    {%- if build_id and build_url %}\n      <span class=\"build\">\n        {# Translators: Build is a noun, not a verb #}\n        {% trans %}Build{% endtrans %}\n        <a href=\"{{ build_url }}\">{{ build_id }}</a>.\n      </span>\n    {%- elif commit %}\n      <span class=\"commit\">\n        {% trans %}Revision{% endtrans %} <code>{{ commit }}</code>.\n      </span>\n    {%- elif last_updated %}\n      <span class=\"lastupdated\">\n        {% trans last_updated=last_updated|e %}Last updated on {{ last_updated }}.{% endtrans %}\n      </span>\n    {%- endif %}\n\n    </p>\n  </div>\n\n  {%- if show_sphinx %}\n    {% set 
sphinx_web = '<a href=\"http://sphinx-doc.org/\">Sphinx</a>' %}\n    {% set readthedocs_web = '<a href=\"https://readthedocs.org\">Read the Docs</a>'  %}\n      {% trans sphinx_web=sphinx_web, readthedocs_web=readthedocs_web %}Built with {{ sphinx_web }} using a{% endtrans %} <a href=\"https://github.com/rtfd/sphinx_rtd_theme\">{% trans %}theme{% endtrans %}</a> {% trans %}provided by {{ readthedocs_web }}{% endtrans %}.\n  {%- endif %}\n\n</footer>\n"
  },
  {
    "path": "docs/apiref/core/commands.rst",
    "content": "deeppavlov.core.commands\n========================\nBasic training and inference functions.\n\n.. automodule:: deeppavlov.core.commands.infer\n   :members:\n\n.. automodule:: deeppavlov.core.commands.train\n   :members:\n"
  },
  {
    "path": "docs/apiref/core/common.rst",
    "content": "deeppavlov.core.common\n======================\nRegistration and class initialization functionality, class method decorators.\n\n.. autoclass:: deeppavlov.core.common.chainer.Chainer\n   :members:\n\n   .. automethod:: __call__\n\n.. autoclass:: deeppavlov.core.common.base.Element\n\n    .. automethod:: __init__\n\n.. autoclass:: deeppavlov.core.common.base.Model\n\n    .. automethod:: __init__\n\n.. automodule:: deeppavlov.core.common.metrics_registry\n   :members:\n\n.. automodule:: deeppavlov.core.common.params\n   :members:\n\n.. automodule:: deeppavlov.core.common.registry\n   :members:\n"
  },
  {
    "path": "docs/apiref/core/data.rst",
    "content": "deeppavlov.core.data\n====================\nDatasetReader, Vocab, DataLearningIterator and DataFittingIterator classes.\n\n.. autoclass:: deeppavlov.core.data.dataset_reader.DatasetReader\n\n.. autoclass:: deeppavlov.core.data.data_fitting_iterator.DataFittingIterator\n\n.. autoclass:: deeppavlov.core.data.data_learning_iterator.DataLearningIterator\n\n.. autoclass:: deeppavlov.core.data.simple_vocab.SimpleVocabulary\n"
  },
  {
    "path": "docs/apiref/core/models.rst",
    "content": "deeppavlov.core.models\n======================\nAbstract model classes and interfaces.\n\n.. autoclass:: deeppavlov.core.models.component.Component\n\n.. autoclass:: deeppavlov.core.models.serializable.Serializable\n\n.. autoclass:: deeppavlov.core.models.estimator.Estimator\n\n.. autoclass:: deeppavlov.core.models.nn_model.NNModel\n\n.. autoclass:: deeppavlov.core.models.torch_model.TorchModel\n"
  },
  {
    "path": "docs/apiref/core/trainers.rst",
    "content": "deeppavlov.core.trainers\n========================\nTrainer classes.\n\n.. autoclass:: deeppavlov.core.trainers.FitTrainer\n   :members:\n\n.. autoclass:: deeppavlov.core.trainers.NNTrainer\n   :members:\n   :inherited-members:\n"
  },
  {
    "path": "docs/apiref/core.rst",
    "content": "core\n====\nDeepPavlov Core\n\n.. automodule:: deeppavlov.core\n   :members:\n\n.. toctree::\n   :glob:\n   :caption: Core\n\n   core/*\n"
  },
  {
    "path": "docs/apiref/dataset_iterators.rst",
    "content": "dataset_iterators\n=================\nConcrete DatasetIterator classes.\n\n.. autoclass:: deeppavlov.dataset_iterators.basic_classification_iterator.BasicClassificationDatasetIterator\n    :members:\n\n.. autoclass:: deeppavlov.dataset_iterators.siamese_iterator.SiameseIterator\n\n.. autoclass:: deeppavlov.dataset_iterators.sqlite_iterator.SQLiteDataIterator\n\n.. autoclass:: deeppavlov.dataset_iterators.squad_iterator.SquadIterator\n\n.. automodule:: deeppavlov.dataset_iterators.typos_iterator\n    :members:\n\n.. automodule:: deeppavlov.dataset_iterators.multitask_iterator\n    :members:\n"
  },
  {
    "path": "docs/apiref/dataset_readers.rst",
    "content": "dataset_readers\n===============\nConcrete DatasetReader classes.\n\n.. autoclass:: deeppavlov.dataset_readers.basic_classification_reader.BasicClassificationDatasetReader\n   :members:\n\n.. autoclass:: deeppavlov.dataset_readers.conll2003_reader.Conll2003DatasetReader\n\n.. autoclass:: deeppavlov.dataset_readers.faq_reader.FaqDatasetReader\n   :members:\n\n.. autoclass:: deeppavlov.dataset_readers.line_reader.LineReader\n   :members:\n\n.. autoclass:: deeppavlov.dataset_readers.paraphraser_reader.ParaphraserReader\n\n.. autoclass:: deeppavlov.dataset_readers.squad_dataset_reader.SquadDatasetReader\n   :members:\n\n.. automodule:: deeppavlov.dataset_readers.typos_reader\n   :members:\n\n.. automodule:: deeppavlov.dataset_readers.ubuntu_v2_reader\n   :members:\n\n.. automodule:: deeppavlov.dataset_readers.multitask_reader\n   :members:\n"
  },
  {
    "path": "docs/apiref/metrics.rst",
    "content": "metrics\n=======\nVarious metric functions.\n\n.. automodule:: deeppavlov.metrics\n   :members:\n\n.. autofunction:: deeppavlov.metrics.accuracy.sets_accuracy\n\n.. autofunction:: deeppavlov.metrics.fmeasure.round_f1\n\n.. autofunction:: deeppavlov.metrics.fmeasure.round_f1_macro\n\n.. autofunction:: deeppavlov.metrics.fmeasure.round_f1_weighted\n\n.. autofunction:: deeppavlov.metrics.fmeasure.ner_f1\n\n.. autofunction:: deeppavlov.metrics.fmeasure.ner_token_f1\n\n.. autofunction:: deeppavlov.metrics.log_loss.sk_log_loss\n\n.. autofunction:: deeppavlov.metrics.roc_auc_score.roc_auc_score\n"
  },
  {
    "path": "docs/apiref/models/api_requester.rst",
    "content": "deeppavlov.models.api_requester\n===============================\n\n.. automodule:: deeppavlov.models.api_requester\n    :members:\n\n.. autoclass:: deeppavlov.models.api_requester.api_requester.ApiRequester\n\n    .. automethod:: __call__\n    .. automethod:: get_async_response\n\n\n.. autoclass:: deeppavlov.models.api_requester.api_router.ApiRouter\n\n    .. automethod:: __call__\n"
  },
  {
    "path": "docs/apiref/models/classifiers.rst",
    "content": "deeppavlov.models.classifiers\n=============================\n\n.. automodule:: deeppavlov.models.classifiers\n   :members:\n\n.. autoclass:: deeppavlov.models.classifiers.torch_classification_model.TorchTextClassificationModel\n    :members:\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.classifiers.cos_sim_classifier.CosineSimilarityClassifier\n    :members:\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.classifiers.proba2labels.Proba2Labels\n    :members:\n\n    .. automethod:: __call__\n"
  },
  {
    "path": "docs/apiref/models/doc_retrieval.rst",
    "content": "deeppavlov.models.doc_retrieval\n===============================\n\nDocument retrieval classes.\n\n.. automodule:: deeppavlov.models.doc_retrieval\n\n.. autoclass:: deeppavlov.models.doc_retrieval.tfidf_ranker.TfidfRanker\n    :members:\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.doc_retrieval.logit_ranker.LogitRanker\n    :members:\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.doc_retrieval.pop_ranker.PopRanker\n    :members:\n\n    .. automethod:: __call__"
  },
  {
    "path": "docs/apiref/models/embedders.rst",
    "content": "deeppavlov.models.embedders\n============================\n\n.. autoclass:: deeppavlov.models.embedders.fasttext_embedder.FasttextEmbedder\n\n   .. automethod:: __call__\n   .. automethod:: __iter__\n\n.. autoclass:: deeppavlov.models.embedders.tfidf_weighted_embedder.TfidfWeightedEmbedder\n\n   .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.embedders.transformers_embedder.TransformersBertEmbedder\n\n   .. automethod:: __call__\n"
  },
  {
    "path": "docs/apiref/models/entity_extraction.rst",
    "content": "deeppavlov.models.entity_extraction\n===================================\n\n.. autoclass:: deeppavlov.models.entity_extraction.ner_chunker.NerChunker\n\n    .. automethod:: __init__\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.entity_extraction.entity_linking.EntityLinker\n\n    .. automethod:: __init__\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.entity_extraction.entity_detection_parser.EntityDetectionParser\n\n    .. automethod:: __init__\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.entity_extraction.entity_detection_parser.QuestionSignChecker\n"
  },
  {
    "path": "docs/apiref/models/kbqa.rst",
    "content": "deeppavlov.models.kbqa\n======================\n\n.. automodule:: deeppavlov.models.kbqa\n\n.. autoclass:: deeppavlov.models.kbqa.type_define.AnswerTypesExtractor\n\n    .. automethod:: __init__\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.kbqa.query_generator.QueryGenerator\n\n    .. automethod:: __init__\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.kbqa.query_generator_base.QueryGeneratorBase\n\n    .. automethod:: __init__\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.kbqa.rel_ranking_infer.RelRankerInfer\n\n    .. automethod:: __init__\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.kbqa.template_matcher.TemplateMatcher\n\n    .. automethod:: __init__\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.kbqa.ru_adj_to_noun.RuAdjToNoun\n\n    .. automethod:: __init__\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.kbqa.tree_to_sparql.TreeToSparql\n\n    .. automethod:: __init__\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.kbqa.wiki_parser.WikiParser\n\n    .. automethod:: __init__\n    .. automethod:: __call__\n"
  },
  {
    "path": "docs/apiref/models/preprocessors.rst",
    "content": "deeppavlov.models.preprocessors\n===============================\n\n.. autoclass:: deeppavlov.models.preprocessors.dirty_comments_preprocessor.DirtyCommentsPreprocessor\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.preprocessors.mask.Mask\n\n.. autoclass:: deeppavlov.models.preprocessors.one_hotter.OneHotter\n\n.. autoclass:: deeppavlov.models.preprocessors.sanitizer.Sanitizer\n\n.. autofunction:: deeppavlov.models.preprocessors.str_lower.str_lower\n\n.. autoclass:: deeppavlov.models.preprocessors.str_token_reverser.StrTokenReverser\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.preprocessors.str_utf8_encoder.StrUTF8Encoder\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.preprocessors.odqa_preprocessors.DocumentChunker\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.preprocessors.odqa_preprocessors.StringMultiplier\n\n    .. automethod:: __call__\n"
  },
  {
    "path": "docs/apiref/models/relation_extraction.rst",
    "content": "deeppavlov.models.relation_extraction\n=====================================\n\n.. autoclass:: deeppavlov.models.relation_extraction.relation_extraction_bert.REBertModel\n\n    .. automethod:: __init__\n    .. automethod:: __call__\n    .. automethod:: train_on_batch\n"
  },
  {
    "path": "docs/apiref/models/sklearn.rst",
    "content": "deeppavlov.models.sklearn\n=============================\n\n.. automodule:: deeppavlov.models.sklearn\n   :members:\n\n.. autoclass:: deeppavlov.models.sklearn.sklearn_component.SklearnComponent\n\n    .. automethod:: __call__\n    .. automethod:: fit\n    .. automethod:: init_from_scratch\n    .. automethod:: load\n    .. automethod:: save\n    .. automethod:: compose_input_data\n    .. automethod:: get_class_attributes\n    .. automethod:: get_function_params\n"
  },
  {
    "path": "docs/apiref/models/spelling_correction.rst",
    "content": "deeppavlov.models.spelling_correction\n=====================================\n\n.. autoclass:: deeppavlov.models.spelling_correction.brillmoore.ErrorModel\n\n    .. automethod:: __call__\n    .. automethod:: fit\n    .. automethod:: save\n    .. automethod:: load\n\n.. autoclass:: deeppavlov.models.spelling_correction.levenshtein.LevenshteinSearcherComponent\n\n    .. automethod:: __call__\n\n\n.. autoclass:: deeppavlov.models.spelling_correction.electors.top1_elector.TopOneElector\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.spelling_correction.electors.kenlm_elector.KenlmElector\n\n    .. automethod:: __call__\n"
  },
  {
    "path": "docs/apiref/models/tokenizers.rst",
    "content": "deeppavlov.models.tokenizers\n============================\n\n.. autoclass:: deeppavlov.models.tokenizers.nltk_moses_tokenizer.NLTKMosesTokenizer\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.tokenizers.nltk_tokenizer.NLTKTokenizer\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.tokenizers.split_tokenizer.SplitTokenizer\n\n.. autoclass:: deeppavlov.models.tokenizers.spacy_tokenizer.StreamSpacyTokenizer\n\n    .. automethod:: __call__"
  },
  {
    "path": "docs/apiref/models/torch_bert.rst",
    "content": "deeppavlov.models.torch_bert\n============================\n\n.. automodule:: deeppavlov.models.torch_bert\n   :members:\n\n.. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersPreprocessor\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersNerPreprocessor\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchBertRankerPreprocessor\n\n    .. automethod:: __call__\n\n.. autoclass:: deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel\n\n    .. automethod:: __call__\n    .. automethod:: train_on_batch\n\n.. autoclass:: deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger\n\n    .. automethod:: __call__\n    .. automethod:: train_on_batch\n\n.. autoclass:: deeppavlov.models.torch_bert.torch_transformers_squad.TorchTransformersSquad\n\n    .. automethod:: __call__\n    .. automethod:: train_on_batch\n\n.. autoclass:: deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel\n\n    .. automethod:: __call__\n    .. automethod:: train_on_batch\n"
  },
  {
    "path": "docs/apiref/models/vectorizers.rst",
    "content": "deeppavlov.models.vectorizers\n=============================\n\n\n.. autoclass:: deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer\n    :members:\n\n    .. automethod:: __call__\n"
  },
  {
    "path": "docs/apiref/models.rst",
    "content": "models\n======\nConcrete Model classes.\n\n.. automodule:: deeppavlov.models\n   :members:\n\n.. toctree::\n   :glob:\n   :caption: Models\n\n   models/*"
  },
  {
    "path": "docs/apiref/vocabs.rst",
    "content": "vocabs\n======\nConcrete Vocab classes.\n\n.. automodule:: deeppavlov.vocabs\n   :members:\n\n.. autoclass:: deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab\n   :members:\n\n   .. automethod:: __call__\n\n.. automodule:: deeppavlov.vocabs.typos\n   :members:\n"
  },
  {
    "path": "docs/conf.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Configuration file for the Sphinx documentation builder.\n#\n# This file does only contain a selection of the most common options. For a\n# full list see the documentation:\n# http://www.sphinx-doc.org/en/master/config\n\n# -- Path setup --------------------------------------------------------------\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n\nimport sphinx_rtd_theme\n\nimport deeppavlov\n\n# -- Project information -----------------------------------------------------\n\nproject = 'DeepPavlov'\ncopyright = '2018, ' + deeppavlov.__author__\nauthor = deeppavlov.__author__\n\n# The short X.Y version\nversion = deeppavlov.__version__\n# The full version, including alpha/beta/rc tags\nrelease = version\n\n\n# -- General configuration ---------------------------------------------------\n\n# If your documentation needs a minimal Sphinx version, state it here.\n#\n# needs_sphinx = '1.0'\n\n# Add any Sphinx extension module names here, as strings. 
They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = [\n    'sphinx.ext.autodoc',\n    'sphinx.ext.doctest',\n    'sphinx.ext.intersphinx',\n    'sphinx.ext.todo',\n    'sphinx.ext.coverage',\n    'sphinx.ext.napoleon',\n    'sphinx.ext.viewcode',\n    'sphinx.ext.mathjax',\n    'sphinx.ext.extlinks',\n    'nbsphinx',\n    'IPython.sphinxext.ipython_console_highlighting',\n    'sphinx_copybutton'\n]\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = ['_templates']\n\n# The suffix(es) of source filenames.\n# You can specify multiple suffix as a list of string:\n#\n# source_suffix = ['.rst', '.md']\nsource_suffix = '.rst'\n\n# The master toctree document.\nmaster_doc = 'index'\n\n# The language for content autogenerated by Sphinx. Refer to documentation\n# for a list of supported languages.\n#\n# This is also used if you do content translation via gettext catalogs.\n# Usually you set \"language\" from the command line for these cases.\nlanguage = 'en'\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\n# This pattern also affects html_static_path and html_extra_path .\nexclude_patterns = ['_build', \n                    'Thumbs.db', \n                    '.DS_Store', \n                    '**.ipynb_checkpoints'\n]\n\n# The name of the Pygments (syntax highlighting) style to use.\npygments_style = 'sphinx'\n\n\n# -- Options for HTML output -------------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  See the documentation for\n# a list of builtin themes.\n#\nhtml_theme = 'sphinx_rtd_theme'\nhtml_theme_path = [sphinx_rtd_theme.get_html_theme_path()]\n\n# Theme options are theme-specific and customize the look and feel of a theme\n# further.  
For a list of options available for each theme, see the\n# documentation.\n#\nhtml_theme_options = {\n    'collapse_navigation': False,\n    'display_version': True,\n    'logo_only': True,\n}\n\nhtml_logo = '_static/deeppavlov.png'\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\nhtml_static_path = ['_static']\nhtml_css_files = ['my_blocks.css', 'deeppavlov.css']\n\n# Custom sidebar templates, must be a dictionary that maps document names\n# to template names.\n#\n# The default sidebars (for documents that don't match any pattern) are\n# defined by theme itself.  Builtin themes are using these templates by\n# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',\n# 'searchbox.html']``.\n#\n# html_sidebars = {}\n\nnbsphinx_prolog = \"\"\"\n.. raw:: html\n\n    <style>\n    .nbinput .prompt,\n    .nboutput .prompt {\n        display: none;\n    }\n    </style>\n\"\"\"\nnbsphinx_execute = 'never'\n\n\n# -- Options for HTMLHelp output ---------------------------------------------\n\n# Output file base name for HTML help builder.\n\nhtmlhelp_basename = f'{project}-Docs'\n\n\n# -- Options for LaTeX output ------------------------------------------------\n\nlatex_engine = 'xelatex'\n\nlatex_elements = {\n\n    # The paper size ('letterpaper' or 'a4paper').\n    #\n    # 'papersize': 'letterpaper',\n\n    # The font size ('10pt', '11pt' or '12pt').\n    #\n    # 'pointsize': '10pt',\n\n    # Additional stuff for the LaTeX preamble.\n    #\n    # 'preamble': '',\n\n    # Latex figure (float) alignment\n    #\n    # 'figure_align': 'htbp',\n\n    'extraclassoptions': 'openany,oneside',\n\n    'fncychap': r'\\usepackage[Sonny]{fncychap}'\n\n}\n\n# Grouping the document tree into LaTeX files. 
List of tuples\n# (source start file, target name, title,\n#  author, documentclass [howto, manual, or own class]).\nlatex_documents = [\n    (master_doc, f'{project}.tex', f'{project} Documentation',\n     author, 'manual'),\n]\n\n\n# -- Options for manual page output ------------------------------------------\n\n# One entry per manual page. List of tuples\n# (source start file, name, description, authors, manual section).\nman_pages = [\n    (master_doc, project.lower(), f'{project} Documentation',\n     [author], 1)\n]\n\n\n# -- Options for Texinfo output ----------------------------------------------\n\n# Grouping the document tree into Texinfo files. List of tuples\n# (source start file, target name, title, author,\n#  dir menu entry, description, category)\ntexinfo_documents = [\n    (master_doc, project, f'{project} Documentation',\n     author, project, deeppavlov.__description__,\n     str(deeppavlov.__keywords__)),\n]\n\n\n# -- Extension configuration -------------------------------------------------\n\nautodoc_mock_imports = ['bs4', 'faiss', 'fasttext', 'hdt', 'kenlm', 'lxml', 'navec', 'nltk', 'opt_einsum', 'rapidfuzz',\n                        'razdel', 'sacremoses', 'slovnet', 'sortedcontainers', 'spacy', 'torch', 'torchcrf',\n                        'transformers', 'udapi', 'ufal', 'whapi']\n\nextlinks = {\n    'config': (f'https://github.com/deeppavlov/DeepPavlov/blob/{release}/deeppavlov/configs/%s', None),\n    'dp_file': (f'https://github.com/deeppavlov/DeepPavlov/blob/{release}/%s', None)\n}\n\n# -- Options for intersphinx extension ---------------------------------------\n\n# Configuration for intersphinx\nintersphinx_mapping = {\n    'python': ('https://docs.python.org/3.6', None),\n    'scipy': ('https://docs.scipy.org/doc/scipy/reference', None)\n}\n\n# -- Options for todo extension ----------------------------------------------\n\n# If true, `todo` and `todoList` produce output, else they produce nothing.\ntodo_include_todos = False\n"
  },
  {
    "path": "docs/devguides/contribution_guide.rst",
    "content": "\nContribution Guide\n=====================\n\nWe are happy that you share your research with us and want to improve our code!\n\nPlease follow the steps below to contribute to our project.\n\nIf you have any questions or suggestions about the contributing process,\nplease share them with us on the `forum <https://forum.deeppavlov.ai>`_.\nPlease note that we do not answer general questions in the github issues interface.\n\nIf you are a regular contributor in the DeepPavlov open source project,\nyou can receive an invitation to one of our events or an opportunity to become a part of our team.\n\nHow to contribute:\n\n#. Don't start the coding first.\n   You should do a quick search over `existing issues <https://github.com/deeppavlov/DeepPavlov/issues?q=is%3Aissue>`_\n   for the project to see if your suggestion was already discussed or even resolved.\n   If nothing relevant was found, please create a new one and state what exactly you would like\n   to implement or fix.\n   You may proceed with coding once someone on our team accepts your offer.\n\n#. `Fork <https://guides.github.com/activities/forking/>`_ the\n   `DeepPavlov repository <https://github.com/deeppavlov/DeepPavlov>`_\n\n#. Checkout the ``dev`` branch from\n   `the upstream <https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/configuring-a-remote-for-a-fork>`_\n   as a base for your code:\n\n    .. code:: bash\n\n        git clone https://github.com/<OWNER>/<REPOSITORY>.git\n        cd <REPOSITORY>\n        git remote add upstream https://github.com/deeppavlov/DeepPavlov.git\n        git fetch upstream\n        git checkout -b dev --track upstream/dev\n\n   afterwards to sync the ``dev`` branch with external updates you can run:\n\n    .. code:: bash\n\n        git checkout dev\n        git fetch upstream\n        git pull\n\n#. **Create a new branch and switch** to it. Give it a meaningful name:\n\n    .. 
code:: bash\n\n        git checkout -b what_my_code_does_branch\n\n#. **Install DeepPavlov** in editable mode:\n\n   .. code:: bash\n\n       pip install -e .\n\n   or\n\n   .. code:: bash\n\n       pip install -e .[docs,tests]\n\n   In editable mode changes of the files in the repository directory will automatically reflect in your\n   python environment. The last command with ``[docs,tests]`` will install additional requirements to build\n   documentation and run tests.\n\n#. **Write readable code** and keep it\n   `PEP8 <https://www.python.org/dev/peps/pep-0008/>`_-ed, **add docstrings**\n   and keep them consistent with the\n   `Google Style <http://google.github.io/styleguide/pyguide.html#381-docstrings>`_.\n   Pay attention that we support typing annotations in every function\n   declaration.\n\n   Accompany your code with **clear comments** to let other people understand the\n   flow of your mind.\n\n   If you create new models, refer to the :doc:`Register your model\n   </devguides/registry>` section to add it to the DeepPavlov registry of\n   models.\n\n#. We ask you to **add some tests**. This will help us maintain the\n   framework, and this will help users to understand the feature you introduce.\n   Examples of implemented tests are available in `tests/\n   <https://github.com/deeppavlov/DeepPavlov/tree/dev/tests>`_\n   directory.\n\n#. Please, **update the documentation**, if you committed significant changes\n   to our code. Make sure that documentation could be built after your changes\n   and check how it looks using:\n\n   .. code:: bash\n\n       cd docs\n       make html\n\n   The built documentation will be added to ``docs/_build`` directory. Open it with your browser.\n\n#. **Commit your changes and push** your feature branch to your GitHub fork:\n\n    .. 
code:: bash\n\n        git add my_files\n        git commit -m \"fix: resolve issue #271\"\n        git push origin what_my_code_does_branch\n\n    Follow the `semantic commit notation <https://seesparkbox.com/foundry/semantic_commit_messages>`_\n    for the name of the commit.\n\n#. Create a new `pull request <https://github.com/deeppavlov/DeepPavlov/pulls>`_\n   to get your feature branch merged into dev for others to use.\n   Don't forget to `reference <https://help.github.com/en/github/writing-on-github/autolinked-references-and-urls>`_\n   the GitHub issue associated with your task in the description.\n\n#. **Relax and wait** : )\n\nSome time after that your commit will be assigned to somebody from our team\nto check your code. \nAfter a code review and a successful completion of all tests, your pull request will be approved and\npushed into the framework.\n\nIf you still have any questions, either on the contribution process or about\nthe framework itself, please share them with us on our `forum <https://forum.deeppavlov.ai>`_.\nJoin our official `Telegram channel <https://t.me/deeppavlov>`_ to get notified about our updates & news.\n"
  },
  {
    "path": "docs/devguides/registry.rst",
    "content": "Register your model\n===================\n\nIn order to extend the library, you need to register your classes and functions; it is done in two steps.\n\n1. Decorate your :class:`~deeppavlov.core.models.component.Component`\n   (or :class:`~deeppavlov.core.data.dataset_reader.DatasetReader`,\n   or :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator`,\n   or :class:`~deeppavlov.core.data.data_fitting_iterator.DataFittingIterator`)\n   using :func:`~deeppavlov.core.common.registry.register` and/or metrics function\n   using :func:`~deeppavlov.core.common.metrics_registry.register_metric`.\n\n2. Rebuild the registry running from DeepPavlov root directory:\n\n::\n\n    python -m utils.prepare.registry\n\nThis script imports all the modules in deeppavlov package, builds the registry from them and writes it to a file.\n\n\nHowever, it is possible to use some classes and functions inside configuration files without registering them explicitly.\nThere are two options available here:\n\n- instead of ``{\"class_name\": \"registered_component_name\"}`` in config file use key-value pair similar to\n  ``{\"class_name\": \"my_package.my_module:MyClass\"}``\n\n- if your classes/functions are properly decorated but not included in the registry, use ``\"metadata\"`` section of\n  your config file specifying imports as ``\"metadata\": {\"imports\": [\"my_local_package.my_module\", \"global_package.module\"]}``;\n  then the second step described above will be unnecessary (local packages are imported from the current working\n  directory).\n"
  },
  {
    "path": "docs/features/hypersearch.rst",
    "content": "Hyperparameters optimization\n============================\n\nYou can search for best hyperparameters of your model in DeepPavlov by means of cross-validation.\n\nCross-validation\n~~~~~~~~~~~~~~~~\n\nYou can run cross-validation in DeepPavlov to select best parameters of your model.\nFor this purpose you have to run special command 'paramserach'. for example:\n\n.. code:: bash\n\n    python -m deeppavlov.paramsearch path_to_json_config.json --folds 5\n\n\nParameters\n----------\n\nCross validation command have several parameters:\n\n-  ``config_path``:\n    Specify config path, where you model is located.\n-  ``--folds``:\n    This parameter shows how many folds you need in cross validation.\n    Do you want to use leave one out cross validation instead of folds?\n    Just specify this: ``--folds loo``.\n    If you want not to cross-validate just omit this parameter.\n-  ``--search_type``:\n    This parameter is optional - default value is \"grid\" (grid search).\n\n\n.. note::\n\n    Folds will be created automatically from union of train and validation datasets.\n\n\nSpecial parameters in config\n----------------------------\nConfig file of model should be consist of parameters ranges for search.\nFor example, you try to optimize regularization coefficient in model,\nso you should add additional parameter in config with suffix '_range'.\nLet's see example for logistic regression model:\n\n.. code:: python\n\n      {\n        \"class_name\": \"faq_logreg_model\",\n        \"in\": \"q_vect\",\n        \"fit_on\": [\"q_vect\", \"y\"],\n        \"c\": {\"search_choice\": [1, 10, 100, 1000]},\n        \"out\": [\"answer\", \"score\"]\n      }\n\nIn this example parameter \"c\" described as search_choice, values for grid search:\n\n.. 
code:: python\n\n    {\"search_choice\": [value_0, ..., value_n]}\n\n\nResults\n-------\nAs a result you'll have new json config with best model parameters.\nIt'll be stored in the same directory as config file and will have suffix '_cvbest.json'.\nAlso you'll see final log messages about best model:\n\n.. code:: bash\n\n    INFO in '__main__'['paramsearch'] at line 169: Best model params: {'C': 10000, 'penalty': 'l1', 'accuracy': 0.81466}\n    INFO in '__main__'['paramsearch'] at line 184: Best model saved in json-file: path_to_model_config_cvbest.json\n"
  },
  {
    "path": "docs/features/models/KBQA.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Knowledge Base Question Answering (KBQA)\\n\",\n    \"\\n\",\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/KBQA.ipynb)\\n\",\n    \"\\n\",\n    \"# Table of contents \\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1. [Predict using Python](#4.1-Predict-using-Python)\\n\",\n    \"    \\n\",\n    \"    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\\n\",\n    \"\\n\",\n    \"    4.3. [Using entity linking and Wiki parser as standalone services for KBQA](#4.3-Using-entity-linking-and-Wiki-parser-as-standalone-tools-for-KBQA)\\n\",\n    \"     \\n\",\n    \"5. [Customize the model](#5.-Customize-the-model)\\n\",\n    \"    \\n\",\n    \"    5.1. [Description of config parameters](#5.1-Description-of-config-parameters)\\n\",\n    \"    \\n\",\n    \"    5.2. [Train KBQA components](#5.2-Train-KBQA-components)\\n\",\n    \"\\n\",\n    \"# 1. Introduction to the task\\n\",\n    \"\\n\",\n    \"The knowledge base:\\n\",\n    \"\\n\",\n    \"* is a comprehensive repository of information about given domain or a number of domains;\\n\",\n    \"\\n\",\n    \"* reflects the ways we model knowledge about given subject or subjects, in terms of concepts, entities, properties, and relationships;\\n\",\n    \"\\n\",\n    \"* enables us to use this structured knowledge where appropriate, e.g. 
answering factoid questions.\\n\",\n    \"\\n\",\n    \"Currently, we support Wikidata as a Knowledge Base (Knowledge Graph). In the future, we will expand support for custom knowledge bases.\\n\",\n    \"\\n\",\n    \"The question answerer:\\n\",\n    \"\\n\",\n    \"* validates questions against the preconfigured list of question templates, disambiguates entities using entity linking and answers questions asked in natural language;\\n\",\n    \"\\n\",\n    \"* can be used with Wikidata (English, Russian) and (in the future versions) with custom knowledge graphs.\\n\",\n    \"\\n\",\n    \"Here are some of the most popular types of questions supported by the model:\\n\",\n    \"\\n\",\n    \"* **Complex questions with numerical values:** “What position did Angela Merkel hold on November 10, 1994?”\\n\",\n    \"* **Complex question where the answer is a number or a date:** “When did Jean-Paul Sartre move to Le Havre?”\\n\",\n    \"* **Questions with counting of answer entities:** “How many sponsors are for Juventus F.C.?”\\n\",\n    \"* **Questions with ordering of answer entities by ascending or descending of some parameter:** “Which country has highest individual tax rate?”\\n\",\n    \"* **Simple questions:** “What is crew member Yuri Gagarin’s Vostok?”\\n\",\n    \"\\n\",\n    \"The following models are used to find the answer (the links are for the English language model):\\n\",\n    \"\\n\",\n    \"* [BERT model](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/classifiers/query_pr.json) for prediction of query template type. 
Model performs classification of questions into 8 classes corresponding to 8 query template types;\\n\",\n    \"* [BERT entity detection model](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/entity_extraction/entity_detection_en.json) for extraction of entity substrings from the questions;\\n\",\n    \"* Substring extracted by the entity detection model is used for [entity linking](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/entity_extraction/entity_linking_en.json). Entity linking performs matching the substring with one of the Wikidata entities. Matching is based on the Levenshtein distance between the substring and an entity title. The result of the matching procedure is a set of candidate entities. There is also the search for the entity among this set with one of the top-k relations predicted by classification model;\\n\",\n    \"* [BERT model](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/ranking/rel_ranking_bert_en.json) for ranking candidate relation paths;\\n\",\n    \"* Query generator model is used to fill query template with candidate entities and relations to find valid combinations of entities and relations for query template. Query generation model uses Wikidata HDT file.\\n\",\n    \"\\n\",\n    \"# 2. Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation](https://deeppavlov-test.readthedocs.io/en/latest/notebooks/Get%20Started%20with%20DeepPavlov.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! 
pip install --q deeppavlov\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then make sure that all the required packages for the model are installed.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov install kbqa_cq_en\\n\",\n    \"! python -m deeppavlov install kbqa_cq_ru\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`kbqa_cq_en` and `kbqa_cq_ru` here are the names of the model's *config_files*. [What is a Config File?](https://docs.deeppavlov.ai/en/master/intro/configuration.html) \\n\",\n    \"\\n\",\n    \"Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\\n\",\n    \"The full list of KBQA models with their config names can be found in the [table](#3.-Models-list).\\n\",\n    \"\\n\",\n    \"# 3. Models list\\n\",\n    \"\\n\",\n    \"The table presents a list of all of the KBQA-models available in DeepPavlov Library.\\n\",\n    \"\\n\",\n    \"| Config name  | Database | Language | RAM | GPU |\\n\",\n    \"| :--- | --- | --- | --- | --- |\\n\",\n    \"| [kbqa_cq_en](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/kbqa/kbqa_cq_en.json)    | Wikidata | En | 3.1 Gb | 3.4 Gb |\\n\",\n    \"| [kbqa_cq_ru](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/kbqa/kbqa_cq_ru.json)    | Wikidata | Ru | 4.3 Gb | 8.0 Gb |\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# 4. 
Use the model for prediction\\n\",\n    \"\\n\",\n    \"## 4.1 Predict using Python\\n\",\n    \"\\n\",\n    \"After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import configs, build_model\\n\",\n    \"\\n\",\n    \"kbqa = build_model('kbqa_cq_en', download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Input**: List[sentences]\\n\",\n    \"\\n\",\n    \"**Output**: List[answers]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"[['Robert Zemeckis'],\\n\",\n       \" [['Q187364']],\\n\",\n       \" [['SELECT ?answer WHERE { wd:Q134773 wdt:P57 ?answer. }']]]\"\n      ]\n     },\n     \"execution_count\": null,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"kbqa(['Who directed Forrest Gump?'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"[['United States senator'],\\n\",\n       \" [['Q4416090']],\\n\",\n       \" [['SELECT ?answer WHERE { wd:Q11613 p:P39 ?ent . ?ent ps:P39 ?answer . ?ent ?p ?x filter(contains(?x, n)). }']]]\"\n      ]\n     },\n     \"execution_count\": null,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"kbqa(['What position was held by Harry S. 
Truman on 1/3/1935?'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"[['FC Barcelona B, Argentina national under-20 football team'],\\n\",\n       \" [['Q10467', 'Q1187790']],\\n\",\n       \" [['SELECT ?answer WHERE { wd:Q615 p:P54 ?ent . ?ent ps:P54 ?answer . ?ent ?p ?x filter(contains(?x, n)). }']]]\"\n      ]\n     },\n     \"execution_count\": null,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"kbqa(['What teams did Lionel Messi play for in 2004?'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"KBQA model for complex question answering in Russian can be used from Python using the following code:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import configs, build_model\\n\",\n    \"\\n\",\n    \"kbqa = build_model('kbqa_cq_ru', download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"[['26 мая 1799, 06 июня 1799'],\\n\",\n       \" [['+1799-05-26^^T', '+1799-06-06^^T']],\\n\",\n       \" [['SELECT ?answer WHERE { wd:Q7200 wdt:P569 ?answer. 
}']]]\"\n      ]\n     },\n     \"execution_count\": null,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"kbqa(['Когда родился Пушкин?'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.2 Predict using CLI\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov interact kbqa_cq_en [-d]\\n\",\n    \"! python -m deeppavlov interact kbqa_cq_ru [-d]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`-d` is an optional download key (alternative to `download=True` in Python code). It is used to download the pre-trained model along with embeddings and all other files needed to run the model.\\n\",\n    \"\\n\",\n    \"Or make predictions for samples from *stdin*.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov predict kbqa_cq_en -f <file-name>\\n\",\n    \"! python -m deeppavlov predict kbqa_cq_ru -f <file-name>\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.3 Using entity linking and Wiki parser as standalone tools for KBQA\\n\",\n    \"\\n\",\n    \"Default configuration for KBQA was designed to use all of the supporting models together as a part of the KBQA pipeline. However, there might be a case when you want to work with some of these models in addition to KBQA.\\n\",\n    \"\\n\",\n    \"For example, you might want to use entity linking model as an annotator in your [multiskill AI Assistant](https://github.com/deeppavlov/dream). 
Or, you might want to use Wiki Parser component to directly run SPARQL queries against your copy of Wikidata. To support these usages, you can also deploy supporting models as standalone components.\\n\",\n    \"\\n\",\n    \"Don’t forget to replace the `url` parameter values in the examples below with correct URLs.\\n\",\n    \"\\n\",\n    \"Config [entity_linking_en](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/entity_extraction/entity_linking_en.json) can be used with the following commands:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov install entity_linking_en -d\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov riseapi entity_linking_en [-d] [-p <port>]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import requests\\n\",\n    \"\\n\",\n    \"payload = {\\\"entity_substr\\\": [[\\\"Forrest Gump\\\"]], \\\"tags\\\": [[\\\"PERSON\\\"]], \\\"probas\\\": [[0.9]],\\n\",\n    \"           \\\"sentences\\\": [[\\\"Who directed Forrest Gump?\\\"]]}\\n\",\n    \"response = requests.post(entity_linking_url, json=payload).json()\\n\",\n    \"print(response)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Config [wiki_parser](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/kbqa/wiki_parser.json) can be used with the following command:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! 
python -m deeppavlov riseapi wiki_parser [-d] [-p <port>]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Arguments of the annotator are `parser_info` (what we want to extract from Wikidata) and `query`.\\n\",\n    \"\\n\",\n    \"**Examples of queries:**\\n\",\n    \"\\n\",\n    \"To extract triplets for entities, the `query` argument should be the list of entities ids. `parser_info` should be the list of “find_triplets” strings.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"requests.post(wiki_parser_url, json = {\\\"parser_info\\\": [\\\"find_triplets\\\"], \\\"query\\\": [\\\"Q159\\\"]}).json()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"To extract all relations of the entities, the `query` argument should be the list of entities ids, and `parser_info` should be the list of “find_rels” strings.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"requests.post(wiki_parser_url, json = {\\\"parser_info\\\": [\\\"find_rels\\\"], \\\"query\\\": [\\\"Q159\\\"]}).json()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"To find labels for entities ids, the `query` argument should be the list of entities ids, and `parser_info` should be the list of “find_label” strings.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"requests.post(wiki_parser_url, json = {\\\"parser_info\\\": [\\\"find_label\\\"], \\\"query\\\": [[\\\"Q159\\\", \\\"\\\"]]}).json()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"In this example, the second element of the list (an empty string) can be replaced with 
a sentence.\\n\",\n    \"\\n\",\n    \"To execute SPARQL queries, the `query` argument should be the list of tuples with the info about SPARQL queries, and `parser_info` should be the list of “query_execute” strings.\\n\",\n    \"\\n\",\n    \"Let us consider an example of the question “What is the deepest lake in Russia?” with the corresponding SPARQL query `SELECT ?ent WHERE { ?ent wdt:P31 wd:T1 . ?ent wdt:R1 ?obj . ?ent wdt:R2 wd:E1 } ORDER BY ASC(?obj) LIMIT 5`\\n\",\n    \"\\n\",\n    \"Arguments:\\n\",\n    \"\\n\",\n    \"* *what_return*: ```[“?obj”]```,\\n\",\n    \"* *query_seq*: ```[[“?ent”, “P17”, “Q159”], [“?ent”, “P31”, “Q23397”], [“?ent”, “P4511”, “?obj”]]```,\\n\",\n    \"* *filter_info*: ```[]```,\\n\",\n    \"* *order_info*: ```order_info(variable=’?obj’, sorting_order=’asc’)```.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"requests.post(\\\"wiki_parser_url\\\", json = {\\\"parser_info\\\": [\\\"query_execute\\\"], \\\"query\\\": [[[\\\"?obj\\\"], [[\\\"Q159\\\", \\\"P36\\\", \\\"?obj\\\"]], [], [], True]]}).json()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"To use entity linking model in KBQA, you should add following API Requester component to the `pipe` in the *config_file*:\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"```\\n\",\n    \"{\\n\",\n    \"    \\\"class_name\\\": \\\"api_requester\\\",\\n\",\n    \"    \\\"id\\\": \\\"linker_entities\\\",\\n\",\n    \"    \\\"url\\\": \\\"entity_linking_url\\\",\\n\",\n    \"    \\\"out\\\": [\\\"entity_substr\\\", \\\"entity_ids\\\", \\\"entity_conf\\\", \\\"entity_pages\\\", \\\"entity_labels\\\"],\\n\",\n    \"    \\\"param_names\\\": [\\\"entity_substr\\\", \\\"tags\\\", \\\"probas\\\", \\\"sentences\\\"]\\n\",\n    \" }\\n\",\n    \" ```\"\n   ]\n  },\n  {\n   \"cell_type\": 
\"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"To use Wiki parser service in KBQA, you should add the following API Requester component to the `pipe` in the *config_file*:\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"```\\n\",\n    \"{\\n\",\n    \"    \\\"class_name\\\": \\\"api_requester\\\",\\n\",\n    \"    \\\"id\\\": \\\"wiki_p\\\",\\n\",\n    \"    \\\"url\\\": \\\"wiki_parser_url\\\",\\n\",\n    \"    \\\"out\\\": [\\\"wiki_parser_output\\\"],\\n\",\n    \"    \\\"param_names\\\": [\\\"parser_info\\\", \\\"query\\\"]\\n\",\n    \" }\\n\",\n    \" ```\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 5. Customize the model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 5.1 Description of config parameters\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Parameters of ``entity_linker`` component:\\n\",\n    \"\\n\",\n    \"- ``num_entities_to_return: int`` - the number of entity IDs, returned for each entity mention in text;\\n\",\n    \"- ``lemmatize: bool`` - whether to lemmatize entity mentions before searching candidate entity IDs in the inverted index;\\n\",\n    \"- ``use_descriptions: bool`` - whether to perform ranking of candidate entities by similarity of their descriptions to the context;\\n\",\n    \"- ``use_connections: bool`` - whether to use connections between candidate entities for different mentions for ranking;\\n\",\n    \"- ``use_tags: bool`` - whether to search only those entity IDs in the inverted index, which have the same tag as the entity mention;\\n\",\n    \"- ``prefixes: Dict[str, Any]`` - prefixes in the knowledge base for entities and relations;\\n\",\n    \"- ``alias_coef: float`` - the coefficient which is multiplied by the substring matching score of the entity if the entity mention in the text matches with the 
entity title.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Parameters of ``rel_ranking_infer`` component:\\n\",\n    \"\\n\",\n    \"- ``return_elements: List[str]`` - what elements should be returned by the component in the output tuple (answers are returned by default, optional elements are `\\\"confidences\\\"`, `\\\"answer_ids\\\"`, `\\\"entities_and_rels\\\"` (entities and relations from SPARQL queries), `\\\"queries\\\"` (SPARQL queries), `\\\"triplets\\\"` (triplets from SPARQL queries));\\n\",\n    \"- ``batch_size: int`` - candidate relations list will be split into N batches of the size `batch_size` for further ranking;\\n\",\n    \"- ``softmax: bool`` - whether to apply softmax function to the confidences list of candidate relations for a question;\\n\",\n    \"- ``use_api_requester: bool`` - true if wiki_parser [is called through api_requester](#4.3-Using-entity-linking-and-Wiki-parser-as-standalone-tools-for-KBQA);\\n\",\n    \"- ``rank: bool`` - whether to perform ranking of candidate relation paths;\\n\",\n    \"- ``nll_rel_ranking: bool`` - in DeepPavlov we have two types of relation ranking models: 1) the model which takes a question and a relation and is trained to classify question-relation by two classes (relevant / irrelevant relation) 2) the model which takes a question and a list of relations (one relevant relation and others - irrelevant) and is trained to define the relevant relation in the list with NLL loss; the output format in two cases is different;\\n\",\n    \"- ``nll_path_ranking: bool`` - the same case as `nll_rel_ranking` for ranking of relation paths;\\n\",\n    \"- ``top_possible_answers: int`` - SPARQL query execution can result in several valid answers, so `top_possible_answers` is the number of these answers which we leave in the output;\\n\",\n    \"- ``top_n: int`` - number of candidate SPARQL queries (and corresponding answers) in the output for a question;\\n\",\n    
\"- ``pos_class_num: int`` - if we use the model which classifies question-relation into two classes (relevant / irrelevant), we should set the index of the positive class (0 or 1);\\n\",\n    \"- ``rel_thres: float`` - we keep only relations whose confidence is above this threshold;\\n\",\n    \"- ``type_rels: List[str]`` - relations which connect entity and its type in the knowledge graph.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Parameters of ``query_generator`` component:\\n\",\n    \"\\n\",\n    \"- ``entities_to_leave: int`` - how many entity IDs to use to make a combination of entities and relations for filling in the slots of the SPARQL query template;\\n\",\n    \"- ``rels_to_leave: int`` - how many relations to use to make a combination of entities and relations for filling in the slots of the SPARQL query template;\\n\",\n    \"- ``max_comb_num: int`` - maximal number of combinations of entities and relations for filling in the slots of SPARQL query template;\\n\",\n    \"- ``map_query_str_to_kb: List[Tuple[str, str]]`` - a list of elements like [\\\"wd:\\\", \\\"http://we/\\\"], where the first element is a prefix of an entity (\\\"wd:\\\") or relation in the SPARQL query template, the second - the corresponding prefix in the knowledge base (\\\"http://we/\\\");\\n\",\n    \"- ``kb_prefixes: Dict[str, str]`` - a dictionary {\\\"entity\\\": \\\"wd:E\\\", \\\"rel\\\": \\\"wdt:R\\\", ...} - prefixes of entities, relations and types in the knowledge base;\\n\",\n    \"- ``gold_query_info: Dict[str, str]`` - names of unknown variables in SPARQL queries in the dataset (LC-QuAD2.0 or RuBQ2.0);\\n\",\n    \"- ``syntax_structure_known: bool`` - whether the syntax structure of the question is known (is True in kbqa_cq_ru.json, because this config performs syntax parsing with slovnet_syntax_parser).\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 5.2 Train 
KBQA components\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Train Query Prediction Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The dataset for training query prediction model consists of three *.csv* files: *train.csv*, *valid.csv* and *test.csv*. Each line in these files contains a question and the corresponding query template type, for example:\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"```\\n\",\n    \"\\\"What is the longest river in the UK?\\\", 6\\n\",\n    \"```\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Train Entity Detection Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The dataset is a pickle file. The dataset must be split into three parts: train, test, and validation. Each part is a list of tuples of question tokens and tags for each token. 
An example of training sample:\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"```\\n\",\n    \"(['What', 'is', 'the', 'complete', 'list', 'of', 'records', 'released', 'by', 'Jerry', 'Lee', 'Lewis', '?'],\\n\",\n    \" ['O', 'O', 'O', 'O', 'B-T', 'I-T', 'I-T', 'O', 'O', 'B-E', 'I-E', 'I-E', 'O'])\\n\",\n    \"```\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`B-T` corresponds to tokens of entity types substrings beginning, `I-T` - to tokens of inner part of entity types substrings, `B-E` and `I-E` - for entities, `O` - for other tokens.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Train Path Ranking Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The dataset (in pickle format) is a dict of three keys: \\\"train\\\", \\\"valid\\\" and \\\"test\\\". The value by each key is the list of samples, an example of a sample:\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"```\\n\",\n    \"(['What is the Main St. Exile label, which Nik Powell co-founded?', ['record label', 'founded by']], '1')\\n\",\n    \"```\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The sample contains the question, relations in the question and label (1 - if the relations correspond to the question, 0 - otherwise).\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Adding Templates For New SPARQL Queries\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Templates can be added to sparql_queries.json file, which is a dictionary, where keys are template types and values are templates with additional information. 
An example of a template:\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"```\\n\",\n    \"{\\n\",\n    \"    \\\"query_template\\\": \\\"SELECT ?obj WHERE { wd:E1 p:R1 ?s . ?s ps:R1 ?obj . ?s ?p ?x filter(contains(?x, N)) }\\\",\\n\",\n    \"    \\\"rank_rels\\\": [\\\"wiki\\\", \\\"do_not_rank\\\", \\\"do_not_rank\\\"],\\n\",\n    \"    \\\"rel_types\\\": [\\\"no_type\\\", \\\"statement\\\", \\\"qualifier\\\"],\\n\",\n    \"    \\\"query_sequence\\\": [1, 2, 3],\\n\",\n    \"    \\\"return_if_found\\\": true,\\n\",\n    \"    \\\"template_num\\\": \\\"0\\\",\\n\",\n    \"    \\\"alternative_templates\\\": []\\n\",\n    \" }\\n\",\n    \"```\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"* `query_template` is the template of the SPARQL query;\\n\",\n    \"* `rank_rels` is a list which defines whether to rank relations, in this example **p:R1** relations we extract from Wikidata for **wd:E1** entities and rank with RelRanker, **ps:R1** and **?p** relations we do not extract or rank;\\n\",\n    \"* `rel_types` - direct, statement or qualifier relations;\\n\",\n    \"* `query_sequence` - the sequence in which the triplets will be extracted from the Wikidata hdt file;\\n\",\n    \"* `return_if_found` - the parameter which iterates over all possible combinations of entities, relations and types, if true - return the first valid combination found, if false - consider all combinations;\\n\",\n    \"* `template_num` - the type of a template;\\n\",\n    \"* `alternative_templates` - type numbers of alternative templates to use if the answer was not found using the current template.\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 1\n}\n"
  },
  {
    "path": "docs/features/models/NER.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Named Entity Recognition (NER)\\n\",\n    \"\\n\",\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/NER.ipynb)\\n\",\n    \"\\n\",\n    \"# Table of contents \\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1. [Predict using Python](#4.1-Predict-using-Python)\\n\",\n    \"    \\n\",\n    \"    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\\n\",\n    \"    \\n\",\n    \"5. [Evaluate](#5.-Evaluate)\\n\",\n    \"    \\n\",\n    \"    5.1. [Evaluate from Python](#5.1-Evaluate-from-Python)\\n\",\n    \"    \\n\",\n    \"    5.2. [Evaluate from CLI](#5.2-Evaluate-from-CLI)\\n\",\n    \"\\n\",\n    \"6. [Customize the model](#6.-Customize-the-model)\\n\",\n    \"    \\n\",\n    \"    6.1. [Train your model from Python](#6.1-Train-your-model-from-Python)\\n\",\n    \"    \\n\",\n    \"    6.2. [Train your model from CLI](#6.2-Train-your-model-from-CLI)\\n\",\n    \"\\n\",\n    \"7. [NER-tags list](#7.-NER-tags-list)\\n\",\n    \"\\n\",\n    \"# 1. Introduction to the task\\n\",\n    \"\\n\",\n    \"**Named Entity Recognition (NER)** is a task of assigning a tag (from a predefined set of tags) to each token in a given sequence. In other words, NER-task consists of identifying named entities in the text and classifying them into types (e.g. person name, organization, location etc). \\n\",\n    \"\\n\",\n    \"**BIO encoding schema** is usually used in NER task. 
It uses 3 tags: B for the beginning of the entity, I for the inside of the entity, and O for non-entity tokens. The second part of the tag stands for the entity type.\\n\",\n    \"\\n\",\n    \"Here is an example of a tagged sequence:\\n\",\n    \"\\n\",\n    \"| Elon | Musk | founded | Tesla| in | 2003 | . |\\n\",\n    \"| --- | --- | --- | --- | --- | --- | --- |\\n\",\n    \"| B-PER | I-PER | O | B-ORG | O | B-DATE | O |\\n\",\n    \"\\n\",\n    \"Here we can see three extracted named entities: *Elon Musk* (which is a person's name), *Tesla* (which is a name of an organization) and *2003* (which is a date). To see more examples try out our [Demo](https://demo.deeppavlov.ai/#/en/ner).\\n\",\n    \"\\n\",\n    \"The list of possible types of NER entities may vary depending on your dataset domain. The list of tags used in DeepPavlov's models can be found in the [table](#7.-NER-tags-list).\\n\",\n    \"\\n\",\n    \"# 2. Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -q deeppavlov\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then make sure that all the required packages for the model are installed.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov install ner_ontonotes_bert\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`ner_ontonotes_bert` is the name of the model's *config_file*. 
[What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \\n\",\n    \"\\n\",\n    \"Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\\n\",\n    \"The full list of NER models with their config names can be found in the [table](#3.-Models-list).\\n\",\n    \"\\n\",\n    \"There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\\n\",\n    \"\\n\",\n    \"# 3. Models list\\n\",\n    \"\\n\",\n    \"The table presents a list of all of the NER-models available in the DeepPavlov Library.\\n\",\n    \"\\n\",\n    \"| Config name | Dataset | Language | Model Size | F1 score (ner_f1) | F1 score (ner_token_f1) |\\n\",\n    \"| :--- | --- | --- | --- | --- | ---: |\\n\",\n    \"| ner_case_agnostic_mdistilbert| [CoNLL-2003](https://paperswithcode.com/dataset/conll-2003)   | En | 1.6 GB | 89.9 | 91.6 |\\n\",\n    \"| ner_conll2003_bert | [CoNLL-2003](https://paperswithcode.com/dataset/conll-2003) | En | 1.3 GB | **91.9** | **93.4** |\\n\",\n    \"| ner_ontonotes_bert | [OntoNotes](https://paperswithcode.com/dataset/ontonotes-5-0) | En | 1.3 GB | 89.2 | 92.7 |\\n\",\n    \"| ner_collection3_bert | [Collection3](https://www.researchgate.net/publication/313808701_Combining_Knowledge_and_CRF-Based_Approach_to_Named_Entity_Recognition_in_Russian) | Ru | 2.1 GB | **98.5** | **98.9** |\\n\",\n    \"| ner_rus_bert | [Collection3](https://www.researchgate.net/publication/313808701_Combining_Knowledge_and_CRF-Based_Approach_to_Named_Entity_Recognition_in_Russian) | Ru | 2.1 GB | 97.6 | 98.5 |\\n\",\n    \"| ner_rus_convers_distilrubert_2L | [Collection-rus](https://www.researchgate.net/publication/313808701_Combining_Knowledge_and_CRF-Based_Approach_to_Named_Entity_Recognition_in_Russian) | Ru | 1.3 GB | 92.9 | 96.6 |\\n\",\n    \"| 
ner_rus_convers_distilrubert_6L | [Collection-rus](https://www.researchgate.net/publication/313808701_Combining_Knowledge_and_CRF-Based_Approach_to_Named_Entity_Recognition_in_Russian) | Ru | 1.6 GB | 96.7 | 98.5 |\\n\",\n    \"| ner_rus_bert_probas | [Wiki-NER-rus](https://aclanthology.org/I17-1042/) | Ru | 2.1 GB | 72.6 | 79.5 |\\n\",\n    \"| ner_ontonotes_bert_mult | [OntoNotes](https://paperswithcode.com/dataset/ontonotes-5-0) | Multi | 2.1 GB | 88.9 | 92.0 |\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# 4. Use the model for prediction\\n\",\n    \"\\n\",\n    \"## 4.1 Predict using Python\\n\",\n    \"\\n\",\n    \"After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model\\n\",\n    \"\\n\",\n    \"ner_model = build_model('ner_ontonotes_bert', download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The `download` argument defines whether it is necessary to download the files defined in the `download` section of the config: usually it provides the links to the train and test data, to the pretrained models, or to the embeddings.\\n\",\n    \"\\n\",\n    \"Setting the `install` argument to `True` is equivalent to executing the command line `install` command. 
If set to `True`, it will first install all the required packages.\\n\",\n    \"\\n\",\n    \"**Input**: List[sentences]\\n\",\n    \"\\n\",\n    \"**Output**: List[tokenized sentences, corresponding NER-tags]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"[[['Bob', 'Ross', 'lived', 'in', 'Florida'],\\n\",\n       \"  ['Elon', 'Musk', 'founded', 'Tesla']],\\n\",\n       \" [['B-PERSON', 'I-PERSON', 'O', 'O', 'B-GPE'],\\n\",\n       \"  ['B-PERSON', 'I-PERSON', 'O', 'B-ORG']]]\"\n      ]\n     },\n     \"execution_count\": null,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"ner_model(['Bob Ross lived in Florida', 'Elon Musk founded Tesla'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.2 Predict using CLI\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI (Command Line Interface).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov interact ner_ontonotes_bert -d\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\\n\",\n    \"\\n\",\n    \"Or make predictions for samples from *stdin*.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov predict ner_ontonotes_bert -f <file-name>\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 5. 
Evaluate\\n\",\n    \"\\n\",\n    \"There are two metrics that are used to evaluate a NER model in DeepPavlov:\\n\",\n    \"\\n\",\n    \"`ner_f1` is measured on the entity-level (actual text spans should match exactly)\\n\",\n    \"\\n\",\n    \"`ner_token_f1` is measured on a token level (correct tokens from not fully extracted entities will still be counted as TPs (true positives))\\n\",\n    \"\\n\",\n    \"## 5.1 Evaluate from Python\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import evaluate_model\\n\",\n    \"\\n\",\n    \"model = evaluate_model('ner_ontonotes_bert', download=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 5.2 Evaluate from CLI\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov evaluate ner_ontonotes_bert\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 6. 
Customize the model\\n\",\n    \"\\n\",\n    \"## 6.1 Train your model from Python\\n\",\n    \"\\n\",\n    \"### Provide your data path\\n\",\n    \"\\n\",\n    \"To train the model on your data, you need to change the path to the training data in the *config_file*.\\n\",\n    \" \\n\",\n    \"Parse the *config_file* and change the path to your data from Python.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"~/.deeppavlov/downloads/ontonotes/\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from deeppavlov import train_model\\n\",\n    \"from deeppavlov.core.commands.utils import parse_config\\n\",\n    \"\\n\",\n    \"model_config = parse_config('ner_ontonotes_bert')\\n\",\n    \"\\n\",\n    \"# dataset that the model was trained on\\n\",\n    \"print(model_config['dataset_reader']['data_path'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Provide a *data_path* to your own dataset. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# download and unzip a new example dataset\\n\",\n    \"!wget http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz\\n\",\n    \"!tar -xzvf \\\"conll2003_v2.tar.gz\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# provide a path to the train file\\n\",\n    \"model_config['dataset_reader']['data_path'] = 'contents/train.txt'\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"\\n\",\n    \"### Train dataset format\\n\",\n    \"\\n\",\n    \"To train the model, you need to have a txt-file with a dataset in the following format:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"EU B-ORG\\n\",\n    \"rejects O\\n\",\n    \"the O\\n\",\n    \"call O\\n\",\n    \"of O\\n\",\n    \"Germany B-LOC\\n\",\n    \"to O\\n\",\n    \"boycott O\\n\",\n    \"lamb O\\n\",\n    \"from O\\n\",\n    \"Great B-LOC\\n\",\n    \"Britain I-LOC\\n\",\n    \". O\\n\",\n    \"\\n\",\n    \"China B-LOC\\n\",\n    \"says O\\n\",\n    \"time O\\n\",\n    \"right O\\n\",\n    \"for O\\n\",\n    \"Taiwan B-LOC\\n\",\n    \"talks O\\n\",\n    \". O\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"The source text is **tokenized** and **tagged**. For each token, there is a tag with **BIO** markup. Tags are separated from tokens with **whitespaces**. 
Sentences are separated with **empty lines**.\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"### Train the model using new config\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ner_model = train_model(model_config)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Use your model for prediction.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"[[['Bob', 'Ross', 'lived', 'in', 'Florida'],\\n\",\n       \"  ['Elon', 'Musk', 'founded', 'Tesla']],\\n\",\n       \" [['B-PERSON', 'I-PERSON', 'O', 'O', 'B-GPE'],\\n\",\n       \"  ['B-PERSON', 'I-PERSON', 'O', 'B-ORG']]]\"\n      ]\n     },\n     \"execution_count\": null,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"ner_model(['Bob Ross lived in Florida', 'Elon Musk founded Tesla'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 6.2 Train your model from CLI\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov train ner_ontonotes_bert\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 7. 
NER-tags list\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The table presents a list of all of the NER entity tags used in DeepPavlov's NER-models.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"|              |                                                        |\\n\",\n    \"| ------------ | ------------------------------------------------------ |\\n\",\n    \"| **PERSON**       | People including fictional                             |\\n\",\n    \"| **NORP**         | Nationalities or religious or political groups         |\\n\",\n    \"| **FACILITY**     | Buildings, airports, highways, bridges, etc.           |\\n\",\n    \"| **ORGANIZATION** | Companies, agencies, institutions, etc.                |\\n\",\n    \"| **GPE**          | Countries, cities, states                              |\\n\",\n    \"| **LOCATION**     | Non-GPE locations, mountain ranges, bodies of water    |\\n\",\n    \"| **PRODUCT**      | Vehicles, weapons, foods, etc. (Not services)          |\\n\",\n    \"| **EVENT**        | Named hurricanes, battles, wars, sports events, etc.   |\\n\",\n    \"| **WORK OF ART**  | Titles of books, songs, etc.                           |\\n\",\n    \"| **LAW**          | Named documents made into laws                         |\\n\",\n    \"| **LANGUAGE**     | Any named language                                     |\\n\",\n    \"| **DATE**         | Absolute or relative dates or periods                  |\\n\",\n    \"| **TIME**         | Times smaller than a day                               |\\n\",\n    \"| **PERCENT**      | Percentage (including “%”)                             |\\n\",\n    \"| **MONEY**        | Monetary values, including unit                        |\\n\",\n    \"| **QUANTITY**     | Measurements such as weight or distance                |\\n\",\n    \"| **ORDINAL**      | “first”, “second”, etc.                         
       |\\n\",\n    \"| **CARDINAL**     | Numerals that do not fall under another type           |\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "docs/features/models/ODQA.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Open Domain Question Answering (ODQA)\\n\",\n    \"\\n\",\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/ODQA.ipynb)\\n\",\n    \"\\n\",\n    \"# Table of contents \\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1 [Predict using Python](#4.1-Predict-using-Python)\\n\",\n    \"\\n\",\n    \"    4.2 [Predict using CLI](#4.2-Predict-using-CLI)\\n\",\n    \"\\n\",\n    \"5. [Customize the model](#5.-Customize-the-model)\\n\",\n    \"\\n\",\n    \"    5.1 [Description of config parameters](#5.1-Description-of-config-parameters)\\n\",\n    \"    \\n\",\n    \"    5.2 [Building the index and training the reader model](#5.2-Building-the-index-and-training-the-reader-model)\\n\",\n    \"\\n\",\n    \"# 1. Introduction to the task\\n\",\n    \"\\n\",\n    \"**Open Domain Question Answering (ODQA)** is a task to find an exact answer\\n\",\n    \"to any question in **Wikipedia** articles. 
Thus, given only a question, the system outputs\\n\",\n    \"the best answer it can find.\\n\",\n    \"The default ODQA implementation takes a batch of queries as input and returns the best answer.\\n\",\n    \"\\n\",\n    \"English ODQA version consists of the following components:\\n\",\n    \"\\n\",\n    \"- TF-IDF ranker, which defines top-N most relevant paragraphs in TF-IDF index;\\n\",\n    \"- Binary Passage Retrieval (BPR) ranker, which defines top-K most relevant paragraphs in binary index;\\n\",\n    \"- a database of paragraphs (by default, from Wikipedia) which finds N + K most relevant paragraph text by IDs, defined by TF-IDF and BPR ranker;\\n\",\n    \"- Reading Comprehension component, which finds answers in paragraphs and defines answer confidences.\\n\",\n    \"\\n\",\n    \"Russian ODQA version performs retrieval only with TF-IDF index.\\n\",\n    \"\\n\",\n    \"Binary Passage Retrieval is a resource-efficient method of building a dense passage index. The dual encoder (with BERT or other Transformer as backbone) is trained on question answering dataset (Natural Questions in our case) to maximize dot product of question and passage with answer embeddings and minimize otherwise. The question or passage embeddings are obtained the following way: vector of BERT CLS-token is fed into a dense layer followed by a hash function which turns dense vector into binary one.\\n\",\n    \"\\n\",\n    \"# 2. 
Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -q deeppavlov\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The example below is given for basic ODQA config [en_odqa_infer_wiki](https://github.com/deeppavlov/DeepPavlov/blob/1.1.1/deeppavlov/configs/odqa/en_odqa_infer_wiki.json).\\n\",\n    \"Check what [other ODQA configs](#3.-Models-list) are available and simply replace `en_odqa_infer_wiki`\\n\",\n    \"with the config name of your preference. [What is a Config File?](https://docs.deeppavlov.ai/en/master/intro/configuration.html)\\n\",\n    \"\\n\",\n    \"Before using the model make sure that all required packages are installed by running the command:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov install en_odqa_infer_wiki\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\\n\",\n    \"\\n\",\n    \"# 3. Models list\\n\",\n    \"\\n\",\n    \"The table presents a list of all of the ODQA models available in the DeepPavlov Library.\\n\",\n    \"\\n\",\n    \"| Config | Description |\\n\",\n    \"| :--- | :--- |\\n\",\n    \"| odqa/en_odqa_infer_wiki.json | Basic config for **English** language. Consists of Binary Passage Retrieval, TF-IDF retrieval and reader. 
|\\n\",\n    \"| odqa/en_odqa_pop_infer_wiki.json | Extended config for **English** language. Consists of Binary Passage Retrieval, TF-IDF retrieval, popularity ranker and reader. |\\n\",\n    \"| odqa/ru_odqa_infer_wiki.json | Basic config for **Russian** language. Consists of TF-IDF ranker and reader. |\\n\",\n    \"\\n\",\n    \"The table presents the scores on Natural Questions and SberQuAD datasets and memory consumption.\\n\",\n    \"\\n\",\n    \"| Config | Number of<br>paragraphs | Dataset | F1 | EM | RAM | GPU | Time for <br> 1 query |\\n\",\n    \"| :--- | :---: | :--- | :---: | :---: | :---: | :---: | :---: |\\n\",\n    \"| odqa/en_odqa_infer_wiki.json | 200 | Natural Questions | 45.2 | 37.0 | 10.4 | 2.4 | 4.9 s |\\n\",\n    \"| odqa/ru_odqa_infer_wiki.json | 100 | SberQuAD | 59.2 | 49.0 | 13.1 | 5.3 | 2.0 s |\\n\",\n    \"\\n\",\n    \"# 4. Use the model for prediction\\n\",\n    \"\\n\",\n    \"## 4.1 Predict using Python\\n\",\n    \"\\n\",\n    \"### English\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model\\n\",\n    \"\\n\",\n    \"odqa_en = build_model('en_odqa_infer_wiki', download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Input**: List[questions]\\n\",\n    \"\\n\",\n    \"**Output**: Tuple[List[answers], List[answer scores], List[answer places in paragraph]]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"[['Luke Skywalker'], [4.196979999542236]]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"odqa_en([\\\"What is the name of Darth Vader's son?\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Russian\"\n   
]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model\\n\",\n    \"\\n\",\n    \"odqa_ru = build_model('ru_odqa_infer_wiki', download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"[['на востоке и юге Австралии'], [0.9999760985374451]]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"odqa_ru([\\\"Где живут кенгуру?\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.2 Predict using CLI\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI (Command Line Interface).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov interact en_odqa_infer_wiki -d\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\\n\",\n    \"\\n\",\n    \"# 5. 
Customize the model\\n\",\n    \"\\n\",\n    \"## 5.1 Description of config parameters\\n\",\n    \"\\n\",\n    \"Parameters of ``bpr`` component:\\n\",\n    \"    \\n\",\n    \"- ``load_path`` - path with checkpoint of query encoder and bpr index;\\n\",\n    \"- ``query_encoder_file`` - filename of query encoder (Transformer-based model which takes a question as input and obtains its binary embedding);\\n\",\n    \"- ``bpr_index`` - filename with BPR index (matrix of paragraph binary vectors);\\n\",\n    \"- ``pretrained_model`` - Transformer model, used in query encoder;\\n\",\n    \"- ``max_query_length`` - maximal length (in sub-tokens) of the input to the query encoder;\\n\",\n    \"- ``top_n`` - how many paragraph IDs to return per a question.\\n\",\n    \"\\n\",\n    \"Parameters of ``tfidf_ranker`` component:\\n\",\n    \"\\n\",\n    \"- ``top_n`` - how many paragraph IDs to return per a question.\\n\",\n    \"\\n\",\n    \"Parameters of ``logit_ranker`` component:\\n\",\n    \"\\n\",\n    \"- ``batch_size`` - the paragraphs from the database (some of which contain the answer to the question, others - do not contain) will be split into batches with the size ``batch_size`` for extraction of candidate answer in each paragraph;\\n\",\n    \"- ``squad_model`` - the model which finds spans of an answer in a paragraph;\\n\",\n    \"- ``sort_noans`` - whether to put paragraphs with no answer in the end of paragraph list, sorted by confidences;\\n\",\n    \"- ``top_n`` - the number of possible answers for a question;\\n\",\n    \"- ``return_answer_sentence`` - whether to return the sentence from the paragraph with the answer.\\n\",\n    \"\\n\",\n    \"## 5.2 Building the index and training the reader model\\n\",\n    \"\\n\",\n    \"There are two customizable components in ODQA configs:\\n\",\n    \"\\n\",\n    \"- TF-IDF ranker;\\n\",\n    \"- Reading comprehension model.\\n\",\n    \"\\n\",\n    \"If you would like to build the TF-IDF index for your own text 
database, read [here](https://docs.deeppavlov.ai/en/master/features/models/tfidf_ranking.html#ranker-training). \\n\",\n    \"\\n\",\n    \"In addition, to train the Reader on your data, read [here](https://docs.deeppavlov.ai/en/master/features/models/SQuAD.html#4.1-Train-your-model-from-Python).\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "docs/features/models/SQuAD.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Context Question Answering\\n\",\n    \"\\n\",\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/SQuAD.ipynb)\\n\",\n    \"\\n\",\n    \"[![Medium](https://img.shields.io/badge/Medium-12100E?style=for-the-badge&logo=medium&logoColor=white)](https://medium.com/deeppavlov/developing-qa-systems-for-any-language-with-deeppavlov-a9033d5231a8)\\n\",\n    \"\\n\",\n    \"# Table of contents \\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1. [Predict using Python](#4.1-Predict-using-Python)\\n\",\n    \"    \\n\",\n    \"    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\\n\",\n    \"     \\n\",\n    \"5. [Train the model on your data](#5.-Train-the-model-on-your-data)\\n\",\n    \"    \\n\",\n    \"    5.1. [from Python](#5.1-Train-your-model-from-Python)\\n\",\n    \"    \\n\",\n    \"    5.2. [from CLI](#5.2-Train-your-model-from-CLI)\\n\",\n    \"    \\n\",\n    \"6. [Evaluate](#6.-Evaluate)\\n\",\n    \"    \\n\",\n    \"    6.1. [from Python](#6.1-Evaluate-from-Python)\\n\",\n    \"    \\n\",\n    \"    6.2. [from CLI](#6.2-Evaluate-from-CLI)\\n\",\n    \"\\n\",\n    \"# 1. 
Introduction to the task\\n\",\n    \"\\n\",\n    \"Context Question Answering is a task of finding a fragment with an answer to a question in a given segment of context.\\n\",\n    \"\\n\",\n    \"**Context**:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"In meteorology, precipitation is any product of the condensation \\n\",\n    \"of atmospheric water vapor that falls under gravity. The main forms \\n\",\n    \"of precipitation include drizzle, rain, sleet, snow, graupel and hail… \\n\",\n    \"Precipitation forms as smaller droplets coalesce via collision with \\n\",\n    \"other rain drops or ice crystals within a cloud. Short, intense periods \\n\",\n    \"of rain in scattered locations are called “showers”.\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"**Question**:\\n\",\n    \"```\\n\",\n    \"Where do water droplets collide with ice crystals to form precipitation?\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"**Answer**: \\n\",\n    \"```\\n\",\n    \"within a cloud\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"Datasets that follow this task format:\\n\",\n    \"\\n\",\n    \"- [Stanford Question Answering Dataset (SQuAD) (EN)](https://rajpurkar.github.io/SQuAD-explorer/)\\n\",\n    \"\\n\",\n    \"- [SberQuAD (RU)](https://paperswithcode.com/dataset/sberquad)\\n\",\n    \"\\n\",\n    \"# 2. 
Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -q deeppavlov\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then make sure that all the required packages for the model are installed.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov install squad_bert\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`squad_bert` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \\n\",\n    \"\\n\",\n    \"Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\\n\",\n    \"The full list of the models with their config names can be found in the [table](#3.-Models-list).\\n\",\n    \"\\n\",\n    \"# 3. 
Models list\\n\",\n    \"\\n\",\n    \"The table presents a list of all of the Context Question Answering models available in DeepPavlov Library.\\n\",\n    \"\\n\",\n    \"| Config name  | Dataset | Language | Model Size | F1 score | EM  |\\n\",\n    \"| :--- | --- | --- | --- | --- | ---: |\\n\",\n    \"| squad_bert | SQuAD v1.1 | En | 1.3 GB | 88.86 | 81.49 |\\n\",\n    \"| qa_squad2_bert | SQuAD v2.0 | En | 1.3 GB | 83.56 | 75.54 |\\n\",\n    \"| qa_multisberquad_bert | MultiSQuAD | Multi | 2 GB | 80.76 | 63.81 |\\n\",\n    \"| squad_ru_bert | SberQuAD | Ru | 2.0 GB | 84.71 | 66.21 |\\n\",\n    \"| squad_ru_convers_distilrubert_2L | SberQuAD | Ru | 1.2 GB | 65.20 | 44.52 |\\n\",\n    \"| squad_ru_convers_distilrubert_6L | SberQuAD | Ru | 1.6 GB | 80.57 | 61.54 |\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# 4. Use the model for prediction\\n\",\n    \"\\n\",\n    \"## 4.1 Predict using Python\\n\",\n    \"\\n\",\n    \"After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model\\n\",\n    \"\\n\",\n    \"model = build_model('squad_bert', download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Input**: List[context], List[question]\\n\",\n    \"\\n\",\n    \"**Output**: List[answer, start_character, logit]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"[['a library for NLP and dialog systems'], [14], [200928.390625]]\"\n      ]\n     },\n     \"execution_count\": null,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"model(['DeepPavlov is a library for NLP and dialog systems.'], ['What is 
DeepPavlov?'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.2 Predict using CLI\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI (Command Line Interface).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov interact squad_bert -d\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\\n\",\n    \"\\n\",\n    \"Or make predictions for samples from *stdin*.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov predict squad_bert -f <file-name>\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 5. 
Train the model on your data\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"## 5.1 Train your model from Python\\n\",\n    \"\\n\",\n    \"### Provide your data path\\n\",\n    \"\\n\",\n    \"To train the model on your data, you need to change the path to the training data in the *config_file*.\\n\",\n    \"\\n\",\n    \"Parse the *config_file* and change the path to your data from Python.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"~/.deeppavlov/downloads/squad/\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from deeppavlov import train_model\\n\",\n    \"from deeppavlov.core.commands.utils import parse_config\\n\",\n    \"\\n\",\n    \"model_config = parse_config('squad_bert')\\n\",\n    \"\\n\",\n    \"#  dataset that the model was trained on\\n\",\n    \"print(model_config['dataset_reader']['data_path'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Provide a *data_path* to your own dataset. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# download and unzip a new example dataset\\n\",\n    \"!wget http://files.deeppavlov.ai/datasets/squad-v1.1.tar.gz\\n\",\n    \"!tar -xzvf \\\"squad-v1.1.tar.gz\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Note that if you want to provide your own dataset, it should have the same format as the SQuAD dataset downloaded in this cell.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# provide a path to the train file\\n\",\n    \"model_config['dataset_reader']['data_path'] = '/contents/train-v1.1.json'\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### SQuAD dataset info\\n\",\n    \"\\n\",\n    \"There are *two* versions of the SQuAD dataset available for training at the moment: \\n\",\n    \"\\n\",\n    \"- [SQuAD 1.1](https://arxiv.org/abs/1606.05250) contains 107,785 question-answer pairs on 536 articles. Dataset size: `33.52 MiB`.\\n\",\n    \"\\n\",\n    \"- [SQuAD 2.0](https://arxiv.org/abs/1806.03822) combines all of the questions from SQuAD 1.1 with over 50,000 un-answerable questions written adversarially by crowdworkers. 
Dataset size: `44.34 MiB`.\\n\",\n    \"\\n\",\n    \"### Train the model using new config\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"model = train_model(model_config)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Use your model for prediction.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"[['a library for NLP and dialog systems'], [14], [200928.390625]]\"\n      ]\n     },\n     \"execution_count\": null,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"model(['DeepPavlov is a library for NLP and dialog systems.'], ['What is DeepPavlov?'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 5.2 Train your model from CLI\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov train squad_bert\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 6. Evaluate\\n\",\n    \"\\n\",\n    \"## 6.1 Evaluate from Python\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import evaluate_model\\n\",\n    \"\\n\",\n    \"model = evaluate_model('squad_bert', download=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 6.2 Evaluate from CLI\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! 
python -m deeppavlov evaluate squad_bert -d\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "docs/features/models/bert.rst",
    "content": "BERT in DeepPavlov\n==================\nBERT (Bidirectional Encoder Representations from Transformers) is a Transformer pre-trained on masked language model\nand next sentence prediction tasks. This approach showed state-of-the-art results on a wide range of NLP tasks in\nEnglish.\n\n| BERT paper: https://arxiv.org/abs/1810.04805\n| Google Research BERT repository: https://github.com/google-research/bert\n\nThere are several pre-trained BERT models released by Google Research, more details about these pre-trained models could be found here: https://github.com/google-research/bert#pre-trained-models\n\n-  BERT-base, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] <https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip>`__,\n   `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/cased_L-12_H-768_A-12.zip>`__\n-  BERT-base, English, uncased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] <https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip>`__,\n   `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/uncased_L-12_H-768_A-12.zip>`__\n-  BERT-large, English, cased, 24-layer, 1024-hidden, 16-heads, 340M parameters: download from `[google] <https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip>`__\n-  BERT-base, multilingual, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: download from `[google] <https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip>`__,\n   `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12.zip>`__, `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12_pt.tar.gz>`__\n-  BERT-base, Chinese, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] 
<https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip>`__,\n   `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/chinese_L-12_H-768_A-12.zip>`__, `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/chinese_L-12_H-768_A-12_pt.tar.gz>`__\n\nWe have trained BERT-base model for other languages and domains:\n\n-  RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_v2.tar.gz>`__,\n   `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__\n-  SlavicBERT, Slavic (bg, cs, pl, ru), cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/bg_cs_pl_ru_cased_L-12_H-768_A-12_v1.tar.gz>`__,\n   `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__\n-  Conversational BERT, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12_v1.tar.gz>`__,\n   `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__\n-  Conversational RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12.tar.gz>`__,\n   `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__\n-  Conversational DistilRuBERT, Russian, cased, 6-layer, 768-hidden, 12-heads, 135.4M parameters: `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/distil_ru_conversational_cased_L-6_H-768_A-12_pt.tar.gz>`__\n-  Conversational DistilRuBERT-tiny, Russian, cased, 2-layer, 768-hidden, 12-heads, 107M 
parameters: `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/distil_ru_conversational_cased_L-2_H-768_A-12_pt.tar.gz>`__\n-  Sentence Multilingual BERT, 101 languages, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12.tar.gz>`__,\n   `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__\n-  Sentence RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_ru_cased_L-12_H-768_A-12.tar.gz>`__,\n   `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_ru_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__\n\nThe ``deeppavlov_pytorch`` models are designed to be run with the `HuggingFace's Transformers <https://huggingface.co/transformers/>`__ library.\n\nRuBERT was trained on the Russian part of Wikipedia and news data. We used this training data to build vocabulary of Russian subtokens and took\nmultilingual version of BERT-base as initialization for RuBERT [1]_.\n\nSlavicBERT was trained on Russian News and four Wikipedias: Bulgarian, Czech, Polish, and Russian.\nSubtoken vocabulary was built using this data. 
Multilingual BERT was used as an initialization for SlavicBERT.\nThe model is described in our ACL paper [2]_.\n\nConversational BERT was trained on the English part of Twitter, Reddit, DailyDialogues [4]_, OpenSubtitles [5]_, Debates [6]_, Blogs [7]_, Facebook News Comments.\nWe used this training data to build the vocabulary of English subtokens and took\nEnglish cased version of BERT-base as initialization for English Conversational BERT.\n\nConversational RuBERT was trained on OpenSubtitles [5]_, Dirty, Pikabu, and Social Media segment of Taiga corpus [8]_.\nWe assembled new vocabulary for Conversational RuBERT model on this data and initialized model with RuBERT.\n\nConversational DistilRuBERT (6 transformer layers) and DistilRuBERT-tiny (2 transformer layers) were trained on the same data as Conversational RuBERT and highly inspired by DistilBERT [3]_. Namely, Distil* models (students) used pretrained Conversational RuBERT as teacher and linear combination of the following losses:\n\n1. Masked language modeling loss (between student output logits for tokens and its true labels)\n2. Kullback-Leibler divergence (between student and teacher output logits)\n3. Cosine embedding loss (between averaged hidden states of the teacher and hidden states of the student)\n4. 
Mean squared error loss (between averaged attention maps of the teacher and attention maps of the student)\n\nSentence Multilingual BERT is a representation-based sentence encoder for 101 languages of Multilingual BERT.\nIt is initialized with Multilingual BERT and then fine-tuned on English MultiNLI [9]_ and on dev set of multilingual XNLI [10]_.\nSentence representations are mean pooled token embeddings in the same manner as in Sentence-BERT [12]_.\n\nSentence RuBERT is a representation-based sentence encoder for Russian.\nIt is initialized with RuBERT and fine-tuned on SNLI [11]_ google-translated to Russian and on Russian part of XNLI dev set [10]_.\nSentence representations are mean pooled token embeddings in the same manner as in Sentence-BERT [12]_.\n\nHere, in DeepPavlov, we made it easy to use pre-trained BERT for downstream tasks like classification, tagging, question answering and\nranking. We also provide pre-trained models and examples on how to use BERT with DeepPavlov.\n\nBERT as Embedder\n----------------\n\n:class:`~deeppavlov.models.embedders.transformers_embedder.TransformersBertEmbedder` allows for using BERT\nmodel outputs as token, subtoken and sentence level embeddings.\n\nAdditionally, the embeddings can be easily used in DeepPavlov. To get text level, token level and subtoken level representations,\nyou can use or modify a :config:`BERT embedder configuration <embedder/bert_embedder.json>`:\n\n.. 
code:: python\n    \n    from deeppavlov.core.common.file import read_json\n    from deeppavlov import build_model, configs\n    \n    bert_config = read_json(configs.embedder.bert_embedder)\n    bert_config['metadata']['variables']['BERT_PATH'] = 'path/to/bert/directory'\n\n    m = build_model(bert_config)\n\n    texts = ['Hi, i want my embedding.', 'And mine too, please!']\n    tokens, token_embs, subtokens, subtoken_embs, sent_max_embs, sent_mean_embs, bert_pooler_outputs = m(texts)\n\n\nBERT for Classification\n-----------------------\n\n:class:`~deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel`\nprovides solution for classification problem using pre-trained BERT on PyTorch.\nOne can use several pre-trained English, multi-lingual and Russian BERT models that are\nlisted above. :class:`~deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel`\nalso supports any Transformer-based model of `Transformers <https://github.com/huggingface/transformers>`__.\n\nTwo main components of BERT classifier pipeline in DeepPavlov are\n:class:`~deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersPreprocessor` and\n:class:`~deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel`.\nNon-processed texts should be given to ``torch_transformers_preprocessor`` for tokenization on subtokens,\nencoding subtokens with their indices and creating tokens and segment masks.\n\n``torch_transformers_classifier`` has a dense layer of number of classes size upon pooled outputs of Transformer encoder,\nit is followed by ``softmax`` activation (``sigmoid`` if ``multilabel`` parameter is set to ``true`` in config).\n\n\nBERT for Named Entity Recognition (Sequence Tagging)\n----------------------------------------------------\n\nPre-trained BERT model can be used for sequence tagging. 
Examples of BERT application to sequence tagging\ncan be found :doc:`here </features/models/NER>`. The module used for tagging\nis :class:`~deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger`.\nThe tags are obtained by applying a dense layer to the representation of\nthe first subtoken of each word. There is also an optional CRF layer on the top.\nYou can choose among different Transformers architectures by modifying the TRANSFORMER variable in the corresponding configuration files.\nThe possible choices are DistilBert, Albert, Camembert, XLMRoberta, Bart, Roberta, Bert, XLNet, Flaubert, XLM.\n\n..\n    TODO: fix Zero-Shot NER reference\n\nMultilingual BERT model allows to perform zero-shot transfer across languages. To use our 19 tags NER for over a\nhundred languages see ner_multi_bert.\n\n\nBERT for Context Question Answering (SQuAD)\n-------------------------------------------\nContext Question Answering on `SQuAD <https://rajpurkar.github.io/SQuAD-explorer/>`__ dataset is a task\nof looking for an answer on a question in a given context. This task could be formalized as predicting answer start\nand end position in a given context. :class:`~deeppavlov.models.torch_bert.torch_transformers_squad.TorchTransformersSquad` on PyTorch uses two linear\ntransformations to predict probability that current subtoken is start/end position of an answer. 
For details check\n:doc:`Context Question Answering documentation page </features/models/SQuAD>`.\n\nUsing custom BERT in DeepPavlov\n-------------------------------\n\nThe previous sections describe the BERT based models implemented in DeepPavlov.\nTo change the BERT model used for initialization in any downstream task mentioned above the following parameters of\nthe :doc:`config </intro/configuration>` file must be changed to match new BERT path:\n\n* download URL in the ``metadata.download.url`` part of the config\n* ``bert_config_file``, ``pretrained_bert`` in the BERT based Component. In case of PyTorch BERT, ``pretrained_bert`` can be assigned to\n    string name of any Transformer-based model (e.g. ``\"bert-base-uncased\"``, ``\"distilbert-base-uncased\"``) and then ``bert_config_file`` is set to ``None``.\n* ``vocab_file`` in the ``torch_transformers_preprocessor``. ``vocab_file`` can be assigned to\n    string name of used pre-trained BERT (e.g. ``\"bert-base-uncased\"``).\n\n.. [1] Kuratov, Y., Arkhipov, M. (2019). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language. arXiv preprint arXiv:1905.07213.\n.. [2] Arkhipov M., Trofimova M., Kuratov Y., Sorokin A. (2019). `Tuning Multilingual Transformers for Language-Specific Named Entity Recognition <https://www.aclweb.org/anthology/W19-3712/>`__ . ACL anthology W19-3712.\n.. [3] Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108.\n.. [4] Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. IJCNLP 2017.\n.. [5] P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th International Conference on Language Resources and Evaluation (LREC 2016)\n.. 
[6] Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Proceedings of NAACL, 2016.\n.. [7] J. Schler, M. Koppel, S. Argamon and J. Pennebaker (2006). Effects of Age and Gender on Blogging in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs.\n.. [8] Shavrina T., Shapovalova O. (2017) TO THE METHODOLOGY OF CORPUS CONSTRUCTION FOR MACHINE LEARNING: «TAIGA» SYNTAX TREE CORPUS AND PARSER. in proc. of “CORPORA2017”, international conference , Saint-Petersbourg, 2017.\n.. [9] Williams A., Nangia N. & Bowman S. (2017) A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference. arXiv preprint arXiv:1704.05426\n.. [10] Williams A., Bowman S. (2018) XNLI: Evaluating Cross-lingual Sentence Representations. arXiv preprint arXiv:1809.05053\n.. [11] S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. (2015) A large annotated corpus for learning natural language inference. arXiv preprint arXiv:1508.05326\n.. [12] N. Reimers, I. Gurevych (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint arXiv:1908.10084\n"
  },
  {
    "path": "docs/features/models/classification.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Classification\\n\",\n    \"\\n\",\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/classification.ipynb)\\n\",\n    \"\\n\",\n    \"[![Medium](https://img.shields.io/badge/Medium-12100E?style=for-the-badge&logo=medium&logoColor=white)](https://medium.com/deeppavlov/text-classification-using-deeppavlov-library-with-pytorch-and-transformers-f14db5528821)\\n\",\n    \"\\n\",\n    \"# Table of contents\\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1. [Predict using Python](#4.1-Predict-using-Python)\\n\",\n    \"\\n\",\n    \"    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\\n\",\n    \"\\n\",\n    \"5. [Evaluation](#5.-Evaluation)\\n\",\n    \"\\n\",\n    \"    5.1. [from Python](#5.1-Evaluate-from-Python)\\n\",\n    \"\\n\",\n    \"    5.2. [from CLI](#5.2-Evaluate-from-CLI)\\n\",\n    \"\\n\",\n    \"6. [Train the model on your data](#6.-Train-the-model-on-your-data)\\n\",\n    \"\\n\",\n    \"    6.1. [from Python](#6.1-Train-your-model-from-Python)\\n\",\n    \"\\n\",\n    \"    6.2. [from CLI](#6.2-Train-your-model-from-CLI)\\n\",\n    \"\\n\",\n    \"7. [Simple few-shot classifiers](#7.-Simple-few-shot-classifiers)\\n\",\n    \"\\n\",\n    \"    7.1. [Few-shot setting](#7.1-Few-shot-setting)\\n\",\n    \"\\n\",\n    \"    7.2. [Multiple languages support](#7.2-Multiple-languages-support)\\n\",\n    \"\\n\",\n    \"    7.3. 
[Dataset and Scores](#7.3-Dataset-and-Scores)\\n\",\n    \"\\n\",\n    \"# 1. Introduction to the task\\n\",\n    \"This section describes a family of BERT-based models that solve a variety of different classification tasks.\\n\",\n    \"\\n\",\n    \"**Insults detection** is a binary classification task of identifying whether a given sequence is an insult of another participant of communication.\\n\",\n    \"\\n\",\n    \"**Sentiment analysis** is a task of classifying the polarity of the given sequence. The number of classes may vary depending on the data: positive/negative binary classification, multiclass classification with a neutral class added or with a number of different emotions.\\n\",\n    \"\\n\",\n    \"The models trained for the **paraphrase detection** task identify whether two sentences expressed with different words convey the same meaning.\\n\",\n    \"\\n\",\n    \"**Topic classification** refers to the task of classifying an utterance by the topic which belongs to the conversational domain.\\n\",\n    \"\\n\",\n    \"# 2. Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -q deeppavlov\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then make sure that all the required packages for the model are installed.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov install insults_kaggle_bert\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`insults_kaggle_bert` is the name of the model's *config_file*. 
[What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\\n\",\n    \"\\n\",\n    \"Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\\n\",\n    \"The full list of classification models with their config names can be found in the [table](#3.-Models-list).\\n\",\n    \"\\n\",\n    \"# 3. Models list\\n\",\n    \"\\n\",\n    \"The table presents a list of all of the classification models available in DeepPavlov Library.\\n\",\n    \"\\n\",\n    \"| Config name  | Language | Task | Dataset | Model Size | Metric | Score |\\n\",\n    \"| :--- | --- | --- | --- | --- | --- | ---: |\\n\",\n    \"| insults_kaggle_bert | En | Insults | [Insults](https://www.kaggle.com/c/detecting-insults-in-social-commentary) | 1.1 GB | ROC-AUC | 0.8770 |\\n\",\n    \"| paraphraser_rubert | Ru | Paraphrase | [Paraphrase Corpus](http://paraphraser.ru/download/) | 2.0 GB | F1 | 0.8738 |\\n\",\n    \"| paraphraser_convers_distilrubert_2L | Ru | Paraphrase | [Paraphrase Corpus](http://paraphraser.ru/download/) | 1.2 GB | F1 | 0.7396 |\\n\",\n    \"| paraphraser_convers_distilrubert_6L | Ru | Paraphrase | [Paraphrase Corpus](http://paraphraser.ru/download/) | 1.6 GB | F1 | 0.8354 |\\n\",\n    \"| sentiment_sst_conv_bert | En | Sentiment | [SST](https://paperswithcode.com/dataset/sst) | 1.1 GB | Accuracy | 0.6626 |\\n\",\n    \"| sentiment_twitter | Ru | Sentiment | [Twitter Mokoron](https://github.com/mokoron/sentirueval) | 6.2 GB | F1-macro | 0.9961 |\\n\",\n    \"| rusentiment_bert | Ru | Sentiment | [RuSentiment](https://text-machine.cs.uml.edu/projects/rusentiment/) | 1.3 GB | F1-weighted | 0.7005 |\\n\",\n    \"| rusentiment_convers_bert | Ru | Sentiment | [RuSentiment](https://text-machine.cs.uml.edu/projects/rusentiment/) | 1.5 GB | F1-weighted | 0.7724  |\\n\",\n    \"| topics_distilbert_base_uncased | En | Topics | [DeepPavlov 
Topics](https://deeppavlov.ai/datasets/topics) | 6.2 GB | F1-macro | 0.9961 |\\n\",\n    \"\\n\",\n    \"# 4. Use the model for prediction\\n\",\n    \"\\n\",\n    \"## 4.1 Predict using Python\\n\",\n    \"\\n\",\n    \"After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model\\n\",\n    \"\\n\",\n    \"model = build_model('insults_kaggle_bert', download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Input format**: List[sentences]\\n\",\n    \"\\n\",\n    \"**Output format**: List[labels]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"['Insult', 'Not Insult']\"\n      ]\n     },\n     \"execution_count\": null,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"model(['You are kind of stupid', 'You are a wonderful person!'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.2 Predict using CLI\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI (Command Line Interface).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov interact insults_kaggle_bert -d\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`-d` is an optional download key (alternative to `download=True` in Python code). 
The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\\n\",\n    \"\\n\",\n    \"Or make predictions for samples from *stdin*.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov predict insults_kaggle_bert -f <file-name>\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 5. Evaluation\\n\",\n    \"\\n\",\n    \"## 5.1 Evaluate from Python\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import evaluate_model\\n\",\n    \"\\n\",\n    \"model = evaluate_model('insults_kaggle_bert', download=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 5.2 Evaluate from CLI\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov evaluate insults_kaggle_bert -d\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 6. 
Train the model on your data\\n\",\n    \"\\n\",\n    \"## 6.1 Train your model from Python\\n\",\n    \"\\n\",\n    \"### Provide your data path\\n\",\n    \"\\n\",\n    \"To train the model on your data, you need to change the path to the training data in the *config_file*.\\n\",\n    \"\\n\",\n    \"Parse the *config_file* and change the path to your data from Python.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"~/.deeppavlov/downloads/insults_data\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from deeppavlov import train_model\\n\",\n    \"from deeppavlov.core.commands.utils import parse_config\\n\",\n    \"\\n\",\n    \"model_config = parse_config('insults_kaggle_bert')\\n\",\n    \"\\n\",\n    \"# dataset that the model was trained on\\n\",\n    \"print(model_config['dataset_reader']['data_path'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Provide a *data_path* to your own dataset. 
You can also change any of the hyperparameters of the model.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# download and unzip a new example dataset\\n\",\n    \"!wget http://files.deeppavlov.ai/datasets/insults_data.tar.gz\\n\",\n    \"!tar -xzvf \\\"insults_data.tar.gz\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# provide a path to the directory with your train, valid and test files\\n\",\n    \"model_config['dataset_reader']['data_path'] = \\\"./contents/\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"\\n\",\n    \"### Train dataset format\\n\",\n    \"\\n\",\n    \"### Train the model using new config\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"model = train_model(model_config)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Use your model for prediction.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"['Insult', 'Not Insult']\"\n      ]\n     },\n     \"execution_count\": null,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"model(['You are kind of stupid', 'You are a wonderful person!'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 6.2 Train your model from CLI\\n\",\n    \"\\n\",\n    \"To train the model on your data, create a copy of a config file and change the *data_path* variable in it. After that, train the model using your new *config_file*. 
You can also change any of the hyperparameters of the model.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov train model_config.json\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 7. Simple few-shot classifiers\\n\",\n    \"\\n\",\n    \"Additionally, in the [faq](https://github.com/deeppavlov/DeepPavlov/tree/master/deeppavlov/configs/faq) section you can find a config for a fast and simple pre-BERT model, which consists of a fasttext vectorizer and a simple logistic regression classifier.\\n\",\n    \"\\n\",\n    \"## 7.1 Few-shot setting\\n\",\n    \"\\n\",\n    \"In the current setting the config can be used for few-shot classification - a task, in which only a few training examples are available for each class (usually from 5 to 10). Note that the config takes the full version of the dataset as the input and samples N examples for each class of the train data in the iterator.\\n\",\n    \"\\n\",\n    \"The sampling is done within the `basic_classification_iterator` component of the pipeline and the `shot` parameter defines the number of examples to be sampled. By default the `shot` parameter is set to `None` (no sampling applied).\\n\",\n    \"\\n\",\n    \"## 7.2 Multiple languages support\\n\",\n    \"\\n\",\n    \"By default `fasttext_logreg` supports classification in English, but can be modified for classification in Russian.\\n\",\n    \"\\n\",\n    \"In order to change `fasttext_logreg` language to Russian, change `LANGUAGE` variable in the `metadata.variables` section from `en` to `ru` and change the Spacy model by changing `SPACY_MODEL` variable from `en_core_web_sm` to `ru_core_news_sm`.\\n\",\n    \"\\n\",\n    \"You can do that by directly editing the config file through an editor or change it through Python (example below). N.B. 
`read_json` and `find_config` combination is intentionally used instead of `parse_config` to read config in the example, because `parse_config` will replace all `LANGUAGE` and `SPACY_MODEL` usages in the config with the default values from `metadata.variables`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model\\n\",\n    \"from deeppavlov.core.common.file import read_json, find_config\\n\",\n    \"\\n\",\n    \"model_config = read_json(find_config('fasttext_logreg'))\\n\",\n    \"model_config['metadata']['variables']['LANGUAGE'] = 'ru'\\n\",\n    \"model_config['metadata']['variables']['SPACY_MODEL'] = 'ru_core_news_sm'\\n\",\n    \"model = build_model(model_config, install=True, download=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 7.3 Dataset and Scores\\n\",\n    \"\\n\",\n    \"To demonstrate the performance of the model in two languages, we use the English and Russian subsets of [the MASSIVE dataset](https://github.com/alexa/massive).\\n\",\n    \"\\n\",\n    \"MASSIVE is a parallel dataset of utterances in 52 languages with annotations for the Natural Language Understanding tasks of intent prediction and slot annotation. We only employ the intent classification data. You can see the results of the given configs in 5-shot classification setting in the table below.\\n\",\n    \"\\n\",\n    \"| Config name | Language | Train accuracy | Validation accuracy | Test accuracy |\\n\",\n    \"| :--- | --- | --- | --- | ---: |\\n\",\n    \"| fasttext_logreg | en | 0.9632 | 0.5239 | 0.5155 |\\n\",\n    \"| fasttext_logreg | ru | 0.9231 | 0.4565 | 0.4304 |\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "docs/features/models/entity_extraction.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Entity Extraction\\n\",\n    \"\\n\",\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/entity_extraction.ipynb)\\n\",\n    \"\\n\",\n    \"# Table of contents \\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1 [Predict using Python](#4.1-Predict-using-Python)\\n\",\n    \"    \\n\",\n    \"    4.2 [Predict using CLI](#4.2-Predict-using-CLI)\\n\",\n    \"\\n\",\n    \"5. [Customize the model](#5.-Customize-the-model)\\n\",\n    \"    \\n\",\n    \"    5.1 [Description of config parameters](#5.1-Description-of-config-parameters)\\n\",\n    \"    \\n\",\n    \"    5.2 [Training entity detection model](#5.2-Training-entity-detection-model)\\n\",\n    \"    \\n\",\n    \"    5.3 [Using custom knowledge base](#5.3-Using-custom-knowledge-base)\\n\",\n    \"\\n\",\n    \"# 1. Introduction to the task\\n\",\n    \"\\n\",\n    \"**Entity Detection** is the task of identifying entity mentions in text with corresponding entity types. Entity Detection models in DeepPavlov split the input text into fragments of the lengths less than 512 tokens and find entities with BERT-based models.\\n\",\n    \"\\n\",\n    \"**Entity Linking** is the task of finding knowledge base entity ids for entity mentions in text. Entity Linking in DeepPavlov supports Wikidata and Wikipedia. 
Entity Linking component performs the following steps:\\n\",\n    \"\\n\",\n    \"* extraction of candidate entities from SQLite database;\\n\",\n    \"* candidate entities sorting by entity tags (if entity tags are provided);\\n\",\n    \"* ranking of candidate entities by connections in Wikidata knowledge graph of candidate entities for different mentions;\\n\",\n    \"* candidate entities ranking by context and descriptions using Transformer model [bert-small](https://huggingface.co/prajjwal1/bert-small) in English config and [distilrubert-tiny](https://huggingface.co/DeepPavlov/distilrubert-tiny-cased-conversational-v1) in Russian config.\\n\",\n    \"\\n\",\n    \"**Entity Extraction** configs perform subsequent Entity Detection and Entity Linking of extracted entity mentions.\\n\",\n    \"\\n\",\n    \"# 2. Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -q deeppavlov\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then make sure that all the required packages for the model are installed.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov install entity_extraction_en\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`entity_extraction_en` is the name of the model's *config_file*. 
[What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\\n\",\n    \"\\n\",\n    \"There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\\n\",\n    \"The full list of models for entity detection, linking and extraction with their config names can be found in the [table](#3.-Models-list).\\n\",\n    \"\\n\",\n    \"# 3. Models list\\n\",\n    \"\\n\",\n    \"The table presents a list of all of the models for entity detection, linking and extraction available in the DeepPavlov Library.\\n\",\n    \"\\n\",\n    \"| Config name | Language | RAM | GPU |\\n\",\n    \"| :--- | --- | --- | --- |\\n\",\n    \"| entity_detection_en | En | 2.5 Gb | 3.7 Gb |\\n\",\n    \"| entity_detection_ru | Ru | 2.5 Gb | 5.3 Gb |\\n\",\n    \"| entity_linking_en | En | 2.4 Gb | 1.2 Gb |\\n\",\n    \"| entity_linking_ru | Ru | 2.2 Gb | 1.1 Gb |\\n\",\n    \"| entity_extraction_en | En | 2.5 Gb | 3.7 Gb |\\n\",\n    \"| entity_extraction_ru | Ru | 2.5 Gb | 5.3 Gb |\\n\",\n    \"\\n\",\n    \"# 4. 
Use the model for prediction\\n\",\n    \"\\n\",\n    \"## 4.1 Predict using Python\\n\",\n    \"\\n\",\n    \"After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict.\\n\",\n    \"\\n\",\n    \"### Entity Detection\\n\",\n    \"\\n\",\n    \"**For English:**\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import warnings\\n\",\n    \"warnings.filterwarnings('ignore')\\n\",\n    \"\\n\",\n    \"from deeppavlov import build_model\\n\",\n    \"\\n\",\n    \"ed_en = build_model('entity_detection_en', download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**The output elements:**\\n\",\n    \"\\n\",\n    \"* entity substrings\\n\",\n    \"* entity offsets (indices of start and end symbols of entities in text)\\n\",\n    \"* entity positions (indices of entity tokens in text)\\n\",\n    \"* entity tags\\n\",\n    \"* sentences offsets\\n\",\n    \"* list of sentences in text\\n\",\n    \"* confidences of detected entities\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ed_en(['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**For Russian:**\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ed_ru = build_model('entity_detection_ru', download=True, install=True)\\n\",\n    \"ed_ru(['Москва — столица России, центр Центрального федерального округа и центр Московской области.'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Entity Linking\\n\",\n    \"\\n\",\n    \"**For 
English:**\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"el_en = build_model('entity_linking_en', download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**The input elements:**\\n\",\n    \"\\n\",\n    \"* entity substrings\\n\",\n    \"* entity tags (optional argument)\\n\",\n    \"* confidences of entity substrings (optional argument)\\n\",\n    \"* sentences (context) of the entities (optional argument)\\n\",\n    \"* entity offsets (optional argument)\\n\",\n    \"* sentences offsets (optional argument)\\n\",\n    \"\\n\",\n    \"**The output elements:**\\n\",\n    \"\\n\",\n    \"* entity ids\\n\",\n    \"* entity confidences (for each entity - the list with three confidences: substring matching confidence, popularity ranking confidence and context ranking confidence)\\n\",\n    \"* entity pages in Wikipedia\\n\",\n    \"* entity labels in Wikidata\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"el_en([['forrest gump', 'robert zemeckis', 'eric roth']],\\n\",\n    \"      [['WORK_OF_ART', 'PERSON', 'PERSON']],\\n\",\n    \"      [[1.0, 1.0, 1.0]],\\n\",\n    \"      [['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.']],\\n\",\n    \"      [[(0, 12), (48, 63), (79, 88)]],\\n\",\n    \"      [[(0, 89)]])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**For Russian:**\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"el_ru = build_model('entity_linking_ru', download=True, install=True)\\n\",\n    \"\\n\",\n    \"el_ru([['москва', 'россии', 'центрального федерального округа', 'московской области']],\\n\",\n    
\"      [['CITY', 'COUNTRY', 'LOC', 'LOC']],\\n\",\n    \"      [[1.0, 1.0, 1.0, 1.0]],\\n\",\n    \"      [['Москва — столица России, центр Центрального федерального округа и центр Московской области.']],\\n\",\n    \"      [[(0, 6), (17, 23), (31, 63), (72, 90)]],\\n\",\n    \"      [[(0, 91)]])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Entity Extraction\\n\",\n    \"\\n\",\n    \"**For English:**\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ex_en = build_model('entity_extraction_en', download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**The output elements:**\\n\",\n    \"\\n\",\n    \"* entity substrings\\n\",\n    \"* entity tags\\n\",\n    \"* entity offsets\\n\",\n    \"* entity ids in the knowledge base\\n\",\n    \"* entity linking confidences\\n\",\n    \"* entity pages\\n\",\n    \"* entity labels\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ex_en(['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**For Russian:**\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ex_ru = build_model('entity_extraction_ru', download=True, install=True)\\n\",\n    \"\\n\",\n    \"ex_ru(['Москва — столица России, центр Центрального федерального округа и центр Московской области.'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.2 Predict using CLI\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI (Сommand 
Line Interface).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov interact entity_extraction_en -d\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 5. Customize the model\\n\",\n    \"\\n\",\n    \"## 5.1 Description of config parameters\\n\",\n    \"\\n\",\n    \"Parameters of ``ner_chunker`` component:\\n\",\n    \"\\n\",\n    \"- ``batch_size: int`` - each text from the input text batch is split into chunks with the length lower than the threshold (because Transformer-based models for entity detection work with limited lengths of the input sequences), then all chunks are concatenated into one list and the list is split into batches of the size ``batch_size``;\\n\",\n    \"- ``max_seq_len: int`` - maximum length of chunk (in wordpiece tokens);\\n\",\n    \"- ``vocab_file: str`` - vocab file of Transformer tokenizer, which is used to tokenize the text for further splitting into chunks.\\n\",\n    \"\\n\",\n    \"Parameters of ``entity_detection_parser`` component:\\n\",\n    \"    \\n\",\n    \"- ``thres_proba: float`` - the NER models return tag confidences for each token; if the probability of \\\"O\\\" tag (which is used for tokens not related to entities) for the token is lower than the ``thres_proba``, the tag with the maximum probability from entity tags list is chosen;\\n\",\n    \"- ``o_tag: str`` - tag for non-entity tokens (by default is \\\"O\\\" tag);\\n\",\n    \"- ``tags_file: str`` - the filename with the list of tags used in the NER model.\\n\",\n    \"\\n\",\n    \"Parameters of ``ner_chunk_model`` component:\\n\",\n    \"\\n\",\n    \"- ``ner: deeppavlov.core.common.chainer:Chainer`` - the config for entity recognition, which defines entity tags (or \\\"O\\\" tag) and tag probabilities for each token in the input text;\\n\",\n    \"- ``ner_parser: 
deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser`` - the component which processes the tags and tag probabilities returned by the entity recognition model and defines entity substrings;\\n\",\n    \"- ``ner2: deeppavlov.core.common.chainer:Chainer`` - (optional) an additional entity recognition config, which can improve the quality of entity recognition in the case of joint usage with ``ner`` config;\\n\",\n    \"- ``ner_parser2: deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser`` - (optional) an additional config for processing entity recognition output.\\n\",\n    \"\\n\",\n    \"Parameters of ``entity_linker`` component:\\n\",\n    \"\\n\",\n    \"- ``load_path: str`` - the path to the folder with the inverted index;\\n\",\n    \"- ``entity_ranker`` - the component for ranking of candidate entities by descriptions;\\n\",\n    \"- ``entities_database_filename: str`` - file with the inverted index (the mapping between entity titles and entity IDs);\\n\",\n    \"- ``words_dict_filename: str`` - file with mapping of entity titles to the tags of entity detection model;\\n\",\n    \"- ``ngrams_matrix_filename: str`` - matrix of char ngrams of words from entity titles from the knowledge base;\\n\",\n    \"- ``num_entities_for_bert_ranking: int`` - number of candidate entities which are re-ranked by context and description using Transformer-based model;\\n\",\n    \"- ``num_entities_for_conn_ranking: int`` - number of candidate entities which are re-ranked by connections in the knowledge graph between entities for different mentions in the text;\\n\",\n    \"- ``num_entities_to_return: int`` - the number of entity IDs, returned for each entity mention in text; \\n\",\n    \"- ``max_paragraph_len: int`` - maximum length of context used for ranking of entities by description;\\n\",\n    \"- ``lang: str`` - language of the entity linking model (Russian or English);\\n\",\n    \"- ``use_descriptions: bool`` - 
whether to perform ranking of candidate entities by similarity of their descriptions to the context;\\n\",\n    \"- ``alias_coef: float`` - the coefficient which is multiplied by the substring matching score of the entity if the entity mention in the text matches with the entity title;\\n\",\n    \"- ``use_tags: bool`` - whether to search only those entity IDs in the inverted index, which have the same tag as the entity mention;\\n\",\n    \"- ``lemmatize: bool`` - whether to lemmatize entity mentions before searching candidate entity IDs in the inverted index;\\n\",\n    \"- ``full_paragraph: bool`` - whether to use full context for ranking of entities by descriptions or cut the paragraph to one sentence with entity mention;\\n\",\n    \"- ``use_connections: bool`` - whether to use connections between candidate entities for different mentions for ranking;\\n\",\n    \"- ``kb_filename: str`` - file with the knowledge base in .hdt format;\\n\",\n    \"- ``prefixes: Dict[str, Any]`` - prefixes in the knowledge base for entities and relations.\\n\",\n    \"\\n\",\n    \"## 5.2 Training entity detection model\\n\",\n    \"\\n\",\n    \"The configs `entity_detection_en` and `entity_extraction_en` use `ner_ontonotes_bert` model for detection of entity mentions, the configs `entity_detection_ru` and `entity_extraction_ru` use `ner_rus_bert_probas` model. [How to train a NER model](http://docs.deeppavlov.ai/en/master/features/models/NER.html#6.-Customize-the-model).\\n\",\n    \"\\n\",\n    \"## 5.3 Using custom knowledge base\\n\",\n    \"\\n\",\n    \"The database filename is defined with the **entities_database_filename** in entity linking configs. The file is in SQLite format with FTS5 extensions for full-text search of entities by entity mention. 
The database file should contain the **inverted_index** table with the following columns:\\n\",\n    \"\\n\",\n    \"* ``title`` - entity title (name or alias) in the knowledge base;\\n\",\n    \"* ``entity_id`` - entity ID in the knowledge base;\\n\",\n    \"* ``num_rels`` - number of relations of the entity with other entities in the knowledge graph;\\n\",\n    \"* ``ent_tag`` - entity tag of the entity detection model (for example, CITY, PERSON, WORK_OF_ART, etc.);\\n\",\n    \"* ``page`` - page title of the entity (for Wikidata entities - the Wikipedia page);\\n\",\n    \"* ``label`` - entity label in the knowledge base;\\n\",\n    \"* ``descr`` - entity description in the knowledge base.\\n\",\n    \"\\n\",\n    \"Tags of entities in the knowledge base should correspond with the tags of the custom NER model or default `ner_ontonotes_bert` or `ner_rus_bert_probas` models. The list of `ner_ontonotes_bert` tags is listed in tags.dict file in ~/.deeppavlov/models/ner_ontonotes_bert_torch_crf directory, the list of `ner_rus_bert_probas` tags - in tags.dict file in ~/.deeppavlov/models/wiki_ner_rus_bert directory.\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "docs/features/models/few_shot_classification.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Few-shot Text Classification\\n\",\n    \"\\n\",\n    \"# Table of contents \\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1 [Dataset format](#4.1-Dataset-format)\\n\",\n    \"\\n\",\n    \"    4.2. [Predict using Python](#4.2-Predict-using-Python)\\n\",\n    \"    \\n\",\n    \"    4.3. [Predict using CLI](#4.3-Predict-using-CLI)\\n\",\n    \"\\n\",\n    \"5. [Customize the model](#5.-Customize-the-model)\\n\",\n    \"\\n\",\n    \"# 1. Introduction to the task\\n\",\n    \"\\n\",\n    \"__Text classification__ is a task of identifying one of the pre-defined labels given an utterance, where the label is one of N classes or \\\"OOS\\\" (out-of-scope examples - utterances that do not belong to any of the predefined classes). We consider the few-shot setting, where only a few examples (5 or 10) per intent class are given as a training set.\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# 2. 
Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -q deeppavlov\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then make sure that all the required packages are installed.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov install few_shot_roberta\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`few_shot_roberta` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \\n\",\n    \"\\n\",\n    \"Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\\n\",\n    \"Some of few-shot classification models with their config names can be found in the [table](#3.-Models-list).\\n\",\n    \"\\n\",\n    \"# 3. 
Models list\\n\",\n    \"\\n\",\n    \"At the moment, only `few_shot_roberta` config supports out-of-scope detection.\\n\",\n    \"\\n\",\n    \"| Config name  | Dataset | Shot | Model Size | In-domain accuracy | Out-of-scope recall | Out-of-scope precision |\\n\",\n    \"| :--- | --- | --- | --- | --- |  --- | ---: |\\n\",\n    \"| few_shot_roberta| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 1.4 GB | 84.1±1.9 | 93.2±0.8 | 97.8±0.3 |\\n\",\n    \"| few_shot_roberta| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 1.4 GB | 59.4±1.4 | 87.9±1.2 | 40.3±0.7 |\\n\",\n    \"| few_shot_roberta| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent)  | 5 | 1.4 GB | 51.4±2.1 | 93.7±0.7 | 82.7±1.4 |\\n\",\n    \"| fasttext_logreg*| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 37 KB |24.8±2.2 | 98.2±0.4 | 74.8±0.6 |\\n\",\n    \"| fasttext_logreg*| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 37 KB | 13.4±0.5 | 98.6±0.2 | 20.5±0.1 |\\n\",\n    \"| fasttext_logreg*| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent)  | 5 | 37 KB |10.7±0.8 | 99.0±0.3 | 36.4±0.2 |\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"With zero threshold we can get a classification accuracy without OOS detection:\\n\",\n    \"\\n\",\n    \"| Config name  | Dataset | Shot | Model Size | Accuracy |\\n\",\n    \"| :--- | --- | --- | --- | ---: |\\n\",\n    \"| few_shot_roberta| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 1.4 GB | 89.6 |\\n\",\n    \"| few_shot_roberta| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 1.4 GB | 79.6 |\\n\",\n    \"| few_shot_roberta| 
[BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent)  | 5 | 1.4 GB | 55.1 |\\n\",\n    \"| fasttext_logreg*| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 37 KB | 86.3 |\\n\",\n    \"| fasttext_logreg*| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 37 KB | 73.6 |\\n\",\n    \"| fasttext_logreg*| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent)  | 5 | 37 KB | 51.6 |\\n\",\n    \"\\n\",\n    \"\\\\* \\\\- config file was modified to predict OOS examples\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# 4. Use the model for prediction\\n\",\n    \"\\n\",\n    \"Base model `few_shot_roberta` was already pre-trained to recognize similar utterances, so you can use off-the-shelf model to make predictions and evaluation. No additional training needed.\\n\",\n    \"\\n\",\n    \"## 4.1 Dataset format\\n\",\n    \"\\n\",\n    \"DNNC model compares input text to every example in dataset to determine, which class the input example belongs to. 
The dataset based on which classification is performed has the following format:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"[\\n\",\n    \"    [\\\"text_1\\\",  \\\"label_1\\\"],\\n\",\n    \"    [\\\"text_2\\\",  \\\"label_2\\\"],\\n\",\n    \"             ...\\n\",\n    \"    [\\\"text_n\\\",  \\\"label_n\\\"]\\n\",\n    \"]\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"## 4.2 Predict using Python\\n\",\n    \"\\n\",\n    \"After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model\\n\",\n    \"\\n\",\n    \"model = build_model(\\\"few_shot_roberta\\\", download=True)\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"If you set `download` flag to `True`, then existing model weights will be overwritten.\\n\",\n    \"\\n\",\n    \"Setting the `install` argument to `True` is equivalent to executing the command line `install` command. 
If set to `True`, it will first install all the required packages.\\n\",\n    \"\\n\",\n    \"**Input**: List[texts, dataset]\\n\",\n    \"\\n\",\n    \"**Output**: List[labels]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"['translate', 'exchange_rate', 'car_rental']\"\n      ]\n     },\n     \"execution_count\": 2,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"texts = [\\n\",\n    \"    \\\"what expression would i use to say i love you if i were an italian\\\",\\n\",\n    \"    \\\"what's the currency conversion between krones and yen\\\",\\n\",\n    \"    \\\"i'd like to reserve a high-end car\\\"\\n\",\n    \"]\\n\",\n    \"\\n\",\n    \"dataset = [\\n\",\n    \"    [\\\"please help me book a rental car for nashville\\\",                       \\\"car_rental\\\"],\\n\",\n    \"    [\\\"how can i rent a car in boston\\\",                                       \\\"car_rental\\\"],\\n\",\n    \"    [\\\"help me get a rental car for march 2 to 6th\\\",                          \\\"car_rental\\\"],\\n\",\n    \"    \\n\",\n    \"    [\\\"how many pesos can i get for one dollar\\\",                              \\\"exchange_rate\\\"],\\n\",\n    \"    [\\\"tell me the exchange rate between rubles and dollars\\\",                 \\\"exchange_rate\\\"],\\n\",\n    \"    [\\\"what is the exchange rate in pesos for 100 dollars\\\",                   \\\"exchange_rate\\\"],\\n\",\n    \"    \\n\",\n    \"    [\\\"can you tell me how to say 'i do not speak much spanish', in spanish\\\", \\\"translate\\\"],\\n\",\n    \"    [\\\"please tell me how to ask for a taxi in french\\\",                       \\\"translate\\\"],\\n\",\n    \"    [\\\"how would i say thank you if i were russian\\\",                          \\\"translate\\\"]\\n\",\n    \"]\\n\",\n    \"\\n\",\n    
\"model(texts, dataset)\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.3 Predict using CLI\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI (Command Line Interface).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov interact few_shot_roberta -d\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with all other files needed to run the model.\\n\",\n    \"\\n\",\n    \"Or make predictions for samples from *stdin*.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov predict few_shot_roberta -f <file-name>\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 5. Customize the model\\n\",\n    \"\\n\",\n    \"Out-of-scope (OOS) examples are determined via confidence with *confidence_threshold* parameter. For each input text, if the confidence of the model is lower than the *confidence_threshold*, then the input example is considered out-of-scope. The higher the threshold, the more often the model predicts \\\"oos\\\" class. 
By default it is set to 0, but you can change it to your preferences in configuration file.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"0.0\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from deeppavlov import build_model\\n\",\n    \"from deeppavlov.core.commands.utils import parse_config\\n\",\n    \"\\n\",\n    \"model_config = parse_config('few_shot_roberta')\\n\",\n    \"model_config['chainer']['pipe'][-1]['confidence_threshold'] = 0.1\\n\",\n    \"model = build_model(model_config)\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "docs/features/models/morpho_tagger.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Morphotagger\\n\",\n    \"\\n\",\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/morpho_tagger.ipynb)\\n\",\n    \"\\n\",\n    \"# Table of contents \\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1. [Predict using Python](#4.1-Predict-using-Python)\\n\",\n    \"\\n\",\n    \"    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\\n\",\n    \"\\n\",\n    \"5. [Customize the model](#5.-Customize-the-model)\\n\",\n    \"\\n\",\n    \"# 1. Introduction to the task\\n\",\n    \"\\n\",\n    \"Morphological tagging is the task of determining morphological tags, such as case, number, gender, aspect etc. 
for text tokens.\\n\",\n    \"\\n\",\n    \"An example:\\n\",\n    \"```\\n\",\n    \"Я шёл домой по незнакомой улице.\\n\",\n    \"```\\n\",\n    \"```\\n\",\n    \"1\\tЯ\\tя\\tPRON\\t_\\tCase=Nom|Number=Sing|Person=1\\t_\\t_\\t_\\t_\\n\",\n    \"2\\tшёл\\tидти\\tVERB\\t_\\tAspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\\t_\\t_\\t_\\t_\\n\",\n    \"3\\tдомой\\tдомой\\tADV\\t_\\tDegree=Pos\\t_\\t_\\t_\\t_\\n\",\n    \"4\\tпо\\tпо\\tADP\\t_\\t_\\t_\\t_\\t_\\t_\\n\",\n    \"5\\tнезнакомой\\tнезнакомый\\tADJ\\t_\\tCase=Dat|Degree=Pos|Gender=Fem|Number=Sing\\t_\\t_\\t_\\t_\\n\",\n    \"6\\tулице\\tулица\\tNOUN\\t_\\tAnimacy=Inan|Case=Dat|Gender=Fem|Number=Sing\\t_\\t_\\t_\\t_\\n\",\n    \"7\\t.\\t.\\tPUNCT\\t_\\t_\\t_\\t_\\t_\\t_\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"The model is based on [BERT for token classification](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForTokenClassification).\\n\",\n    \"The model is trained on [Universal Dependencies corpora](https://universaldependencies.org/) (version 2.3).\\n\",\n    \"\\n\",\n    \"# 2. 
Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -q deeppavlov\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Before using the model make sure that all required packages are installed running the command:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov install morpho_ru_syntagrus_bert\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 3. Models list\\n\",\n    \"\\n\",\n    \"The table presents comparison of ``morpho_ru_syntagrus_bert`` config with other models on UD2.3 dataset.\\n\",\n    \"\\n\",\n    \"| Model | Accuracy |\\n\",\n    \"| :--- | :---: |\\n\",\n    \"| UDPipe | 93.5 |\\n\",\n    \"| morpho_ru_syntagrus_bert | 97.6 |\\n\",\n    \"\\n\",\n    \"# 4. 
Use the model for prediction\\n\",\n    \"\\n\",\n    \"## 4.1 Predict using Python\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model\\n\",\n    \"\\n\",\n    \"model = build_model(\\\"morpho_ru_syntagrus_bert\\\", download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"1\\tЯ\\tя\\tPRON\\t_\\tCase=Nom|Number=Sing|Person=1\\t_\\t_\\t_\\t_\\n\",\n      \"2\\tшёл\\tшёл\\tVERB\\t_\\tAspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\\t_\\t_\\t_\\t_\\n\",\n      \"3\\tдомой\\tдомой\\tADV\\t_\\tDegree=Pos\\t_\\t_\\t_\\t_\\n\",\n      \"4\\tпо\\tпо\\tADP\\t_\\t_\\t_\\t_\\t_\\t_\\n\",\n      \"5\\tнезнакомой\\tнезнакомый\\tADJ\\t_\\tCase=Dat|Degree=Pos|Gender=Fem|Number=Sing\\t_\\t_\\t_\\t_\\n\",\n      \"6\\tулице\\tулица\\tNOUN\\t_\\tAnimacy=Inan|Case=Dat|Gender=Fem|Number=Sing\\t_\\t_\\t_\\t_\\n\",\n      \"7\\t.\\t.\\tPUNCT\\t_\\t_\\t_\\t_\\t_\\t_\\n\",\n      \"\\n\",\n      \"1\\tДевушка\\tдевушка\\tNOUN\\t_\\tAnimacy=Anim|Case=Nom|Gender=Fem|Number=Sing\\t_\\t_\\t_\\t_\\n\",\n      \"2\\tпела\\tпеть\\tVERB\\t_\\tAspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\\t_\\t_\\t_\\t_\\n\",\n      \"3\\tв\\tв\\tADP\\t_\\t_\\t_\\t_\\t_\\t_\\n\",\n      \"4\\tцерковном\\tцерковном\\tADJ\\t_\\tCase=Loc|Degree=Pos|Gender=Masc|Number=Sing\\t_\\t_\\t_\\t_\\n\",\n      \"5\\tхоре\\tхор\\tNOUN\\t_\\tAnimacy=Inan|Case=Loc|Gender=Masc|Number=Sing\\t_\\t_\\t_\\t_\\n\",\n      \"6\\tо\\tо\\tADP\\t_\\t_\\t_\\t_\\t_\\t_\\n\",\n      \"7\\tвсех\\tвесь\\tDET\\t_\\tCase=Loc|Number=Plur\\t_\\t_\\t_\\t_\\n\",\n      
\"8\\tуставших\\tустать\\tVERB\\t_\\tAspect=Perf|Case=Loc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act\\t_\\t_\\t_\\t_\\n\",\n      \"9\\tв\\tв\\tADP\\t_\\t_\\t_\\t_\\t_\\t_\\n\",\n      \"10\\tчужом\\tчужом\\tADJ\\t_\\tCase=Loc|Degree=Pos|Gender=Masc|Number=Sing\\t_\\t_\\t_\\t_\\n\",\n      \"11\\tкраю\\tкрай\\tNOUN\\t_\\tAnimacy=Inan|Case=Loc|Gender=Masc|Number=Sing\\t_\\t_\\t_\\t_\\n\",\n      \"12\\t.\\t.\\tPUNCT\\t_\\t_\\t_\\t_\\t_\\t_\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"sentences = [\\\"Я шёл домой по незнакомой улице.\\\", \\\"Девушка пела в церковном хоре о всех уставших в чужом краю.\\\"]\\n\",\n    \"for parse in model(sentences):\\n\",\n    \"    print(parse)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.2 Predict using CLI\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI (Сommand Line Interface).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov interact morpho_ru_syntagrus_bert -d\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\\n\",\n    \"\\n\",\n    \"# 5. Customize the model\\n\",\n    \"\\n\",\n    \"To train **morphotagger** on your own data, you should prepare a dataset in **CoNLL-U format**. 
The description of **CoNLL-U format** can be found [here](https://universaldependencies.org/format.html#conll-u-format).\\n\",\n    \"\\n\",\n    \"Then you should place files for training, validation and testing into the ``\\\"data_path\\\"`` directory of ``morphotagger_dataset_reader``, change file names in ``morphotagger_dataset_reader`` to your filenames and launch the training:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import train_model\\n\",\n    \"\\n\",\n    \"train_model(\\\"<your_morphotagging_config_name>\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"or **using CLI**:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov train <your_morphotagging_config_name>\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "docs/features/models/multitask_bert.rst",
    "content": "Multi-task BERT in DeepPavlov\n=============================\n\nMulti-task BERT in DeepPavlov is an implementation of BERT training algorithm published in the paper\n`Knowledge Transfer Between Tasks and Languages in the Multi-task\nEncoder-agnostic Transformer-based Models <https://www.dialog-21.ru/media/5902/karpovdpluskonovalovv002.pdf>`_.\n\nThe idea is to share BERT body between several tasks. This is necessary if a model pipe has several\ncomponents using BERT and the amount of GPU memory is limited. Each task has its own 'head' part attached to the\noutput of the BERT encoder. If multi-task BERT has :math:`T` heads, one training iteration consists of\n\n- composing :math:`T` lists of examples, one for each task,\n\n- :math:`T` gradient steps, one gradient step for each task.\n\nBy default, on every training step the lists of examples for all but one task are empty, as in the original MT-DNN repository. \n\nWhen one of BERT heads is being trained, other heads' parameters do not change. On each training step both BERT head\nand body parameters are modified.\n\nCurrently multitask BERT heads support classification, regression, NER and multiple choice tasks. \n\nOn this page, multi-task BERT usage is explained on a toy configuration file of a model that is trained for the\nsingle-sentence classification, sentence pair classification, regression, multiple choice and NER.\nThe config for this model is :config:`multitask_example <configs/multitask/multitask_example.json>`.\n\nOther examples of using multitask models can be found in :config:`mt_glue <configs/multitask/mt_glue.json>`.\n\nTrain config\n------------\n\nWhen using ``multitask_transformer`` component, you can use the same inference file as the train file.\n\nData reading and iteration is performed by :class:`~deeppavlov.dataset_readers.multitask_reader.MultiTaskReader`\nand :class:`~deeppavlov.dataset_iterators.multitask_iterator.MultiTaskIterator`. 
These classes are composed\nof task readers and iterators and generate batches that contain data from heterogeneous datasets. The example below\ndemonstrates the usage of the multitask dataset reader:\n\n.. code:: json\n\n  \"dataset_reader\": {\n    \"class_name\": \"multitask_reader\",\n    \"task_defaults\": {\n      \"class_name\": \"huggingface_dataset_reader\",\n      \"path\": \"glue\",\n      \"train\": \"train\",\n      \"valid\": \"validation\",\n      \"test\": \"test\"\n    },\n    \"tasks\": {\n      \"cola\": {\"name\": \"cola\"},\n      \"copa\": {\n        \"path\": \"super_glue\",\n        \"name\": \"copa\"\n      },\n      \"conll\": {\n        \"class_name\": \"conll2003_reader\",\n        \"use_task_defaults\": false,\n        \"data_path\": \"{DOWNLOADS_PATH}/conll2003/\",\n        \"dataset_name\": \"conll2003\",\n        \"provide_pos\": false\n      }\n    }\n  }\n\nNested dataset readers are listed in the ``tasks`` section. By default, nested reader parameters are taken from the\n``task_defaults`` section. Values from the ``tasks`` section can complement these parameters, like the ``name`` parameter in\n``dataset_reader.tasks.cola``, and can overwrite default parameter values, like the ``path`` parameter in\n``dataset_reader.tasks.copa``. In ``dataset_reader.tasks.conll``, ``use_task_defaults`` is ``False``. This is a special\nparameter that forces ``multitask_reader`` to ignore ``task_defaults`` while creating the nested reader, which means that\nthe dataset reader for the ``conll`` task will use only parameters from ``dataset_reader.tasks.conll``.\n\nThe same principle with default values applies to ``multitask_iterator``.\n\nBatches generated by ``multitask_iterator`` are tuples of two elements: inputs of the model and labels. \nBoth inputs and labels are lists of tuples. 
The inputs have following format:\n``[(first_task_inputs[0], second_task_inputs[0],...), (first_task_inputs[1], second_task_inputs[1], ...), ...]``\nwhere ``first_task_inputs``, ``second_task_inputs``, and so on are x values of batches from task dataset iterators.\nThe labels in the second element have the similar format.\n\nIf task datasets have different sizes, then for smaller datasets the lists are padded with ``None`` values. For example,\nif the first task dataset inputs are ``[0, 1, 2, 3, 4, 5, 6]``, the second task dataset inputs are ``[7, 8, 9]``,\nand the batch size is ``2``, then multi-task input mini-batches will be ``[(0, 7), (1, 8)]``, ``[(2, 9), (3, None)]``,\n``[(4, None), (5, None)]``, ``[(6, None)]``.\n\nIn this tutorial, there are 5 datasets. Considering the batch structure, ``chainer`` inputs in\n:config:`multitask_example <configs/multitask/multitask_example.json>` are:\n\n.. code:: json\n\n  \"in\": [\"x_cola\", \"x_rte\", \"x_stsb\", \"x_copa\", \"x_conll\"],\n  \"in_y\": [\"y_cola\", \"y_rte\", \"y_stsb\", \"y_copa\", \"y_conll\"]\n\nSometimes a task dataset iterator returns inputs or labels consisting of more than one element. For example, in the\nmodel input element could consist of two strings. If there is a necessity to split such a variable, ``InputSplitter``\ncomponent can be used. Data preparation in the multitask setting can be similar to the preparation in singletask setting\nexcept for the names of the variables.\n\nFor streamlining the code, however, ``input_splitter`` and ``tokenizer`` can be unified into the\n``multitask_pipeline_preprocessor``. This preprocessor gets as a parameter ``preprocessor`` the one preprocessor class\nname for all tasks, or gets the preprocessor name list as a parameter ``preprocessors``. 
After splitting input by\n``possible_keys_to_extract``, every preprocessor (being initialized by the input beforehand) processes the input.\nNote that if the ``strict`` parameter (default: ``False``) is set to ``True``, we always try to split data. Here is the definition of\n``multitask_pipeline_preprocessor`` from the :config:`multitask_example <configs/multitask/multitask_example.json>`:\n\n.. code:: json\n\n  \"class_name\": \"multitask_pipeline_preprocessor\",\n  \"possible_keys_to_extract\": [0, 1],\n  \"preprocessors\": [\n    \"TorchTransformersPreprocessor\",\n    \"TorchTransformersPreprocessor\",\n    \"TorchTransformersPreprocessor\",\n    \"TorchTransformersMultiplechoicePreprocessor\",\n    \"TorchTransformersNerPreprocessor\"\n  ],\n  \"do_lower_case\": true,\n  \"n_task\": 5,\n  \"vocab_file\": \"{BACKBONE}\",\n  \"max_seq_length\": 200,\n  \"max_subword_length\": 15,\n  \"token_masking_prob\": 0.0,\n  \"return_features\": true,\n  \"in\": [\"x_cola\", \"x_rte\", \"x_stsb\", \"x_copa\", \"x_conll\"],\n  \"out\": [\n    \"bert_features_cola\",\n    \"bert_features_rte\",\n    \"bert_features_stsb\",\n    \"bert_features_copa\",\n    \"bert_features_conll\"\n  ]\n\nThe ``multitask_transformer`` component has common and task-specific parameters. Task-specific parameters are provided inside\nthe ``tasks`` parameter. ``tasks`` is a dictionary whose keys are task names and whose values are task-specific parameters (type,\noptions). Common parameters are ``backbone_model`` (the same parameter as in the tokenizer) and all parameters from ``torch_bert``.\n**The order of tasks MATTERS.**\n\nHere is the definition of ``multitask_transformer`` from the :config:`multitask_example <configs/multitask/multitask_example.json>`:\n\n.. 
code:: json\n\n  \"id\": \"multitask_transformer\",\n  \"class_name\": \"multitask_transformer\",\n  \"optimizer_parameters\": {\"lr\": 2e-5},\n  \"gradient_accumulation_steps\": \"{GRADIENT_ACC_STEPS}\",\n  \"learning_rate_drop_patience\": 2,\n  \"learning_rate_drop_div\": 2.0,\n  \"return_probas\": true,\n  \"backbone_model\": \"{BACKBONE}\",\n  \"save_path\": \"{MODEL_PATH}\",\n  \"load_path\": \"{MODEL_PATH}\",\n  \"tasks\": {\n    \"cola\": {\n      \"type\": \"classification\",\n      \"options\": 2\n    },\n    \"rte\": {\n      \"type\": \"classification\",\n      \"options\": 2\n    },\n    \"stsb\": {\n      \"type\": \"regression\",\n      \"options\": 1\n    },\n    \"copa\": {\n      \"type\": \"multiple_choice\",\n      \"options\": 2\n    },\n    \"conll\": {\n      \"type\": \"sequence_labeling\",\n      \"options\": \"#vocab_conll.len\"\n    }\n  },\n  \"in\": [\n    \"bert_features_cola\",\n    \"bert_features_rte\",\n    \"bert_features_stsb\",\n    \"bert_features_copa\",\n    \"bert_features_conll\"\n  ],\n  \"in_y\": [\"y_cola\", \"y_rte\", \"y_stsb\", \"y_copa\", \"y_ids_conll\"],\n  \"out\": [\n    \"y_cola_pred_probas\",\n    \"y_rte_pred_probas\",\n    \"y_stsb_pred\",\n    \"y_copa_pred_probas\",\n    \"y_conll_pred_ids\"\n  ]\n         \nNote that ``proba2labels`` can now take several arguments.\n\n.. code:: json\n\n  {\n    \"in\":[\"y_cola_pred_probas\", \"y_rte_pred_probas\", \"y_copa_pred_probas\"],\n    \"out\":[\"y_cola_pred_ids\", \"y_rte_pred_ids\", \"y_copa_pred_ids\"],\n    \"class_name\":\"proba2labels\",\n    \"max_proba\":true\n  }\n\nYou may need to create your own metric for early stopping. In this example, the target metric is an average of AUC ROC\nfor insults and sentiment tasks and F1 for NER task:\n\n.. 
code:: python\n\n    from deeppavlov.metrics.roc_auc_score import roc_auc_score\n\n    def roc_auc__roc_auc__ner_f1(true_onehot1, pred_probas1, true_onehot2, pred_probas2, ner_true3, ner_pred3):\n        roc_auc1 = roc_auc_score(true_onehot1, pred_probas1)\n        roc_auc2 = roc_auc_score(true_onehot2, pred_probas2)\n        ner_f1_3 = ner_f1(ner_true3, ner_pred3) / 100\n        return (roc_auc1 + roc_auc2 + ner_f1_3) / 3\n\nIf the code above is saved as ``custom_metric.py``, the metric can be used in the config as\n``custom_metric:roc_auc__roc_auc__ner_f1`` (``module.submodules:function_name`` reference format).\n\nYou can make an inference-only config. In this config, there is no need for a dataset reader and dataset iterator.\nA ``train`` field and components preparing ``in_y`` are removed. In ``multitask_transformer`` component configuration\nall training parameters (learning rate, optimizer, etc.) are omitted.\n\nHere are the results of ``deeppavlov/configs/multitask/mt_glue.json`` compared to the analogous single-task configs,\naccording to the test server.\n\n+-------------------+-------------+----------------+----------+---------------+-----------------------+---------------+------------+----------+----------+----------------+\n| Task              | Score       | CoLA           | SST-2    | MRPC          | STS-B                 | QQP           | MNLI(m/mm) | QNLI     | RTE      | AX             |\n+-------------------+-------------+----------------+----------+---------------+-----------------------+---------------+------------+----------+----------+----------------+\n| Metric            | from server | Matthew's Corr | Accuracy | F1 / Accuracy | Pearson/Spearman Corr | F1 / Accuracy | Accuracy   | Accuracy | Accuracy | Matthew's Corr |\n+===================+=============+================+==========+===============+=======================+===============+============+==========+==========+================+\n| Multitask config  | 77.8        | 43.6           
| 93.2     | 88.6/84.2     | 84.3/84.0             | 70.1/87.9     | 83.0/82.6  | 90.6     | 75.4     | 35.4           |\n+-------------------+-------------+----------------+----------+---------------+-----------------------+---------------+------------+----------+----------+----------------+\n| Singletask config | 77.6        | 53.6           | 92.7     | 87.7/83.6     | 84.4/83.1             | 70.5/88.9     | 84.4/83.2  | 90.3     | 63.4     | 36.3           |\n+-------------------+-------------+----------------+----------+---------------+-----------------------+---------------+------------+----------+----------+----------------+\n"
  },
  {
    "path": "docs/features/models/neural_ranking.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Neural Ranking\\n\",\n    \"\\n\",\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/neural_ranking.ipynb)\\n\",\n    \"\\n\",\n    \"# Table of contents \\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1. [Predict using Python](#4.1-Predict-using-Python)\\n\",\n    \"    \\n\",\n    \"    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\\n\",\n    \"\\n\",\n    \"5. [Customize the model](#5.-Customize-the-model)\\n\",\n    \"\\n\",\n    \"# 1. Introduction to the task\\n\",\n    \"\\n\",\n    \"This model solves the tasks of ranking and paraphrase identification based on semantic similarity which is trained with siamese neural networks. The trained network can retrieve the response closest semantically to a given context from some database or answer whether two sentences are paraphrases or not. It is possible to build automatic semantic FAQ systems with such neural architectures.\\n\",\n    \"\\n\",\n    \"# 2. 
Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -q deeppavlov\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then make sure that all the required packages for the model are installed.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov install ranking_ubuntu_v2_torch_bert_uncased\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`ranking_ubuntu_v2_torch_bert_uncased` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\\n\",\n    \"\\n\",\n    \"There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\\n\",\n    \"The full list of models for neural ranking with their config names can be found in the [table](#3.-Models-list).\\n\",\n    \"\\n\",\n    \"# 3. 
Models list\\n\",\n    \"\\n\",\n    \"| Config | Language | Dataset | Transformer model |\\n\",\n    \"| :--- | :---: | :--- | :--- |\\n\",\n    \"| ranking/ranking_ubuntu_v2_torch_bert_uncased.json | En | [Ubuntu v2](https://github.com/rkadlec/ubuntu-ranking-dataset-creator) | bert-base-uncased |\\n\",\n    \"| classifiers/paraphraser_rubert.json | Ru | [paraphraser.ru](https://paraphraser.ru) | DeepPavlov/rubert-base-cased |\\n\",\n    \"| classifiers/paraphraser_convers_distilrubert_2L.json | Ru | [paraphraser.ru](https://paraphraser.ru) | DeepPavlov/distilrubert-tiny-cased-conversational |\\n\",\n    \"| classifiers/paraphraser_convers_distilrubert_6L.json | Ru | [paraphraser.ru](https://paraphraser.ru) | DeepPavlov/distilrubert-base-cased-conversational |\\n\",\n    \"\\n\",\n    \"# 4. Use the model for prediction\\n\",\n    \"\\n\",\n    \"## 4.1 Predict using Python\\n\",\n    \"\\n\",\n    \"### English\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import configs, build_model\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"ranking = build_model(\\\"ranking_ubuntu_v2_torch_bert_uncased\\\", download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ranking([[\\\"Forrest Gump is a 1994 American epic comedy-drama film directed by Robert Zemeckis.\\\",\\n\",\n    \"          \\\"Robert Zemeckis directed Forrest Gump.\\\",\\n\",\n    \"          \\\"Robert Lee Zemeckis was born on May 14, 1952, in Chicago.\\\"]])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Input:** List[List[sentence1, sentence2, ...]], where the sentences from the second to the last will be ranked by similarity with the first sentence.\\n\",\n    \"\\n\",\n    \"**Output:** List[List[scores]] - similarity scores 
to the first sentence of the sentences from the second to the last.\\n\",\n    \"\\n\",\n    \"### Russian\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import configs, build_model\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"ranking = build_model(\\\"paraphraser_rubert\\\", download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ranking([\\\"Форрест Гамп - комедийная драма, девятый полнометражный фильм режиссёра Роберта Земекиса.\\\"],\\n\",\n    \"        [\\\"Роберт Земекис был режиссером фильма «Форрест Гамп».\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Input:** Tuple[List[sentences1], List[sentences2]], where each element of the sentences1 list will be compared with the corresponding element of the sentences2 list.\\n\",\n    \"\\n\",\n    \"**Output:** List[labels] - each label is 1 or 0, 1 - if the sentence from the first list is a paraphrase of the corresponding sentence from the second list, 0 - otherwise.\\n\",\n    \"\\n\",\n    \"## 4.2 Predict using CLI\\n\",\n    \"\\n\",\n    \"### English\\n\",\n    \"\\n\",\n    \"It is not intended to use the class ``deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel`` in the interact mode, so it is better to launch the config ranking/ranking_ubuntu_v2_torch_bert_uncased.json [using Python](#4.1-Predict-using-Python).\\n\",\n    \"\\n\",\n    \"### Russian\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI (Command Line Interface).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! 
python -m deeppavlov interact paraphraser_rubert -d\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 5. Customize the model\\n\",\n    \"\\n\",\n    \"## English\\n\",\n    \"\\n\",\n    \"To train the ranking model on your own data, you should make a dataset in the following format:\\n\",\n    \"\\n\",\n    \"- the dataset should have **train.csv**, **valid.csv** and **test.csv** files.\\n\",\n    \"\\n\",\n    \"- **train.csv** file should contain the following columns: Context, Utterance, Label. Context and utterance are two texts and label (0 or 1) shows the relevance of the utterance to the context.\\n\",\n    \"\\n\",\n    \"- **valid.csv** and **test.csv** files should contain the following columns: Context, Ground Truth Utterance, Distractor_0, Distractor_1, ..., Distractor_N. Distractor utterances are negative samples (utterances, irrelevant to the context).\\n\",\n    \"\\n\",\n    \"Then you should put train.csv, valid.csv and test.csv files into the directory ``\\\"data_path\\\"`` in the dataset reader from the config and launch training of the model:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov train ranking_ubuntu_v2_torch_bert_uncased\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Russian\\n\",\n    \"\\n\",\n    \"To train the ranking model on your own data, you should make a dataset with two files: **paraphrases.xml** (for training) and **paraphrases_gold.xml** (for testing).\\n\",\n    \"\\n\",\n    \"The xml files should have the following format:\\n\",\n    \"\\n\",\n    \"    <?xml version='1.0' encoding='UTF8'?>\\n\",\n    \"    <data>\\n\",\n    \"      <head>\\n\",\n    \"        <title>Russian Paraphrase Corpus</title>\\n\",\n    \"        <description>This file contains a collection of sentence pairs with 
crowdsourced annotation. Paraphrase classes: -1: non-paraphrases, 0: loose paraphrases, 1: strict paraphrases.</description>\\n\",\n    \"        <reference>http://paraphraser.ru</reference>\\n\",\n    \"        <version>1.0 beta</version>\\n\",\n    \"        <date>2015-11-28</date>\\n\",\n    \"      </head>\\n\",\n    \"      <corpus>\\n\",\n    \"        <paraphrase>\\n\",\n    \"          <value name=\\\"id\\\">1</value>\\n\",\n    \"          <value name=\\\"id_1\\\">201</value>\\n\",\n    \"          <value name=\\\"id_2\\\">8159</value>\\n\",\n    \"          <value name=\\\"text_1\\\">text 1</value>\\n\",\n    \"          <value name=\\\"text_2\\\">text 2</value>\\n\",\n    \"          <value name=\\\"jaccard\\\">0.65</value>\\n\",\n    \"          <value name=\\\"class\\\">0</value>\\n\",\n    \"        </paraphrase>\\n\",\n    \"        <paraphrase>\\n\",\n    \"          ...\\n\",\n    \"        </paraphrase>\\n\",\n    \"      </corpus>\\n\",\n    \"    </data>\\n\",\n    \"\\n\",\n    \"Place **paraphrases.xml** and **paraphrases_gold.xml** files into the directory ``\\\"data_path\\\"`` in the dataset reader from the config and launch training of the model:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov train paraphraser_rubert\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "docs/features/models/popularity_ranking.rst",
    "content": "=================\nPopularity Ranker\n=================\n\nPopularity Ranker re-ranks results obtained via :doc:`TF-IDF Ranker <tfidf_ranking>` using information about\nthe number of article views. The number of Wikipedia articles views is an open piece of information which can be\nobtained via `Wikimedia REST API <https://wikimedia.org/api/rest_v1/>`_.\nWe assigned a mean number of views for the period since 2017/11/05 to 2018/11/05 to each article in our\nEnglish Wikipedia database `enwiki20180211 <http://files.deeppavlov.ai/datasets/wikipedia/enwiki.tar.gz>`_.\n\nThe inner algorithm of Popularity Ranker is a Logistic Regression classifier based on 3 features:\n\n- tfidf score of the article\n- popularity of the article\n- multiplication of two above features\n\nThe classifier is trained on `SQuAD-v1.1`_ train set.\n\nQuick Start\n===========\n\nBefore using the model make sure that all required packages are installed running the command:\n\n.. code:: bash\n\n    python -m deeppavlov install en_ranker_pop_wiki\n\nBuilding the model\n\n.. code:: python\n\n    from deeppavlov import build_model\n\n    ranker = build_model('en_ranker_pop_wiki', download=True)\n\nInference\n\n.. code:: python\n\n    result = ranker(['Who is Ivan Pavlov?'])\n    print(result[:5])\n\nOutput\n\n::\n\n    >> ['Ivan Pavlov', 'Vladimir Bekhterev', 'Classical conditioning', 'Valentin Pavlov', 'Psychology']\n\nText for the output titles can be further extracted with :class:`~deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab` class.\n\n\nConfiguration\n=============\n\nDefault ranker config is\n:config:`doc_retrieval/en_ranker_pop_wiki.json <doc_retrieval/en_ranker_pop_wiki.json>`\n\nRunning the Ranker\n==================\n\n.. note::\n\n    About **17 GB of RAM** required.\n\nInteracting\n-----------\n\nWhen interacting, the ranker returns document titles of the relevant\ndocuments.\n\nRun the following to interact with the ranker:\n\n.. 
code:: bash\n\n    python -m deeppavlov interact en_ranker_pop_wiki -d\n\n\nAvailable Data and Pretrained Models\n====================================\n\nAvailable information about Wikipedia articles popularity is downloaded to ``~/.deeppavlov/downloads/odqa/popularities.json``\nand pre-trained logistic regression classifier is downloaded to ``~/.deeppavlov/models/odqa/logreg_3features.joblib`` by default.\n\n\nReferences\n==========\n\n.. target-notes::\n\n.. _`SQuAD-v1.1`: https://arxiv.org/abs/1606.05250\n"
  },
  {
    "path": "docs/features/models/relation_extraction.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Relation Extraction\\n\",\n    \"\\n\",\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/relation_extraction.ipynb)\\n\",\n    \"\\n\",\n    \"# Table of contents \\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1 [Predict using Python](#4.1-Predict-using-Python)\\n\",\n    \"    \\n\",\n    \"    4.2 [Predict using CLI](#4.2-Predict-using-CLI)\\n\",\n    \"\\n\",\n    \"5. [Customize the model](#5.-Customize-the-model)\\n\",\n    \"    \\n\",\n    \"    5.1 [Description of config parameters](#5.1-Description-of-config-parameters)\\n\",\n    \"    \\n\",\n    \"    5.2 [Train Relation Extraction on custom data](#5.2-Train-Relation-Extraction-on-custom-data)\\n\",\n    \"\\n\",\n    \"6. [Relations list](#6.-Relations-list)\\n\",\n    \"\\n\",\n    \"    6.1 [Relations used in English model](#6.1-Relations-used-in-English-model)\\n\",\n    \"    \\n\",\n    \"    6.2 [Relations used in Russian model](#6.2-Relations-used-in-Russian-model)\\n\",\n    \"\\n\",\n    \"# 1. 
Introduction to the task\\n\",\n    \"\\n\",\n    \"Relation extraction is the task of detecting and classifying the relationship between two entities in text.\\n\",\n    \"DeepPavlov provides the document-level relation extraction meaning that the relation can be detected between the entities that are not in one sentence.\\n\",\n    \"\\n\",\n    \"**RE Model Architecture**\\n\",\n    \"\\n\",\n    \"We based our model on the [Adaptive Thresholding and Localized Context Pooling](https://arxiv.org/pdf/2010.11304.pdf) model and used NER entity tags as additional input. Two core ideas of this model are:\\n\",\n    \"\\n\",\n    \"- Adaptive Threshold\\n\",\n    \"\\n\",\n    \"The usual global threshold for converting the RE classifier output probability to relation label is replaced with a learnable one. A new threshold class that learns an entities-dependent threshold value is introduced and learnt as all other classes. During prediction the positive classes (= relations that indeed hold in the sample) are taken to be the classes with higher logits than the TH class, while all others are negative ones.\\n\",\n    \"\\n\",\n    \"- Localised Context Pooling\\n\",\n    \"\\n\",\n    \"The embedding of each entity pair is enhanced with an additional local context embedding related to both entities. Such representation, which is attended to the relevant context in the document, is useful to decide the relation for exactly this entity pair. For incorporating the context information the attention heads are directly used.\\n\",\n    \"\\n\",\n    \"# 2. 
Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -q deeppavlov\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Before using the model make sure that all required packages are installed by running the command:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov install re_docred\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 3. Models list\\n\",\n    \"\\n\",\n    \"The table presents a list of all of the relation extraction models available in the DeepPavlov Library.\\n\",\n    \"\\n\",\n    \"| Config | Language | Dataset |\\n\",\n    \"| :--- | :---: | :--- |\\n\",\n    \"| relation_extraction/re_docred.json | En | [DocRED](https://www.aclweb.org/anthology/P19-1074/) |\\n\",\n    \"| relation_extraction/re_rured.json | Ru | [RuRED](http://www.dialog-21.ru/media/5093/gordeevdiplusetal-031.pdf) |\\n\",\n    \"\\n\",\n    \"## Some details on DocRED corpus English RE model was trained on\\n\",\n    \"\\n\",\n    \"The English RE model was trained on DocRED English corpus. It was constructed from Wikipedia and Wikidata and is now the largest human-annotated dataset for document-level RE from plain text.\\n\",\n    \"\\n\",\n    \"As the original DocRED test dataset contains only unlabeled data, while we need labeled data in order to perform evaluation, we decided to:\\n\",\n    \"1. merge train and dev data (= labeled data)\\n\",\n    \"2. 
split them into new train, dev and test dataset\\n\",\n    \"\\n\",\n    \"Currently, there are two types of possible splittings provided:\\n\",\n    \"\\n\",\n    \"- user can set the relative size of dev and test data (e.g. 1/7)\\n\",\n    \"- user can set the absolute size of dev and test data (e.g. 2000 samples)\\n\",\n    \"\\n\",\n    \"In our experiment, we set the absolute size of dev and test data == 150 initial documents. It resulted in approximately 3500 samples.\\n\",\n    \"\\n\",\n    \"We additionally generate negative samples if it was necessary to have the following proportions:\\n\",\n    \"- for train set: negative samples are twice as many as positive ones\\n\",\n    \"- for dev & test set: negative samples are the same amount as positive ones\\n\",\n    \"\\n\",\n    \"| Train | Dev | Test |\\n\",\n    \"| :---: | :---: | :---: |\\n\",\n    \"| 130650 | 3406 | 3545 |\\n\",\n    \"\\n\",\n    \"| Train Positive | Train Negative | Dev Positive   | Dev Negative   | Test Positive  | Test Negative  |\\n\",\n    \"| :---: | :---: | :---: | :---: | :---: | :---: |\\n\",\n    \"| 44823          | 89214          | 1239           | 1229           | 1043           | 1036           |\\n\",\n    \"\\n\",\n    \"## Some details on RuRED corpus Russian RE model was trained on\\n\",\n    \"\\n\",\n    \"In case of RuRED we used the train, dev and test sets from the original RuRED setting. 
We additionally generate negative samples if it was necessary to have the following proportions:\\n\",\n    \"\\n\",\n    \"- for train set: negative samples are twice as many as positive ones\\n\",\n    \"- for dev & test set: negative samples are the same amount as positive ones\\n\",\n    \"\\n\",\n    \"| Train         | Dev           | Test           |\\n\",\n    \"| :---: | :---: | :---: |\\n\",\n    \"| 12855         | 1076          |1072            |\\n\",\n    \"\\n\",\n    \"| Train Positive | Train Negative | Dev Positive | Dev Negative | Test Positive | Test Negative |\\n\",\n    \"| :---: | :---: | :---: | :---: | :---: | :---: |\\n\",\n    \"| 4285           | 8570           | 538          | 538          | 536           | 536           |\\n\",\n    \"\\n\",\n    \"# 4. Use the model for prediction\\n\",\n    \"\\n\",\n    \"## 4.1 Predict using Python\\n\",\n    \"\\n\",\n    \"### English\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import configs, build_model\\n\",\n    \"\\n\",\n    \"re_model = build_model(configs.relation_extraction.re_docred, download=False)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"[['P26'], ['spouse']]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"sentence_tokens = [[\\\"Barack\\\", \\\"Obama\\\", \\\"is\\\", \\\"married\\\", \\\"to\\\", \\\"Michelle\\\", \\\"Obama\\\", \\\",\\\", \\\"born\\\", \\\"Michelle\\\", \\\"Robinson\\\", \\\".\\\"]]\\n\",\n    \"entity_pos = [[[(0, 2)], [(5, 7), (9, 11)]]]\\n\",\n    \"entity_tags = [[\\\"PER\\\", \\\"PER\\\"]]\\n\",\n    \"pred = re_model(sentence_tokens, entity_pos, entity_tags)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Model 
Input**:\\n\",\n    \"\\n\",\n    \"- list of tokens of a text document\\n\",\n    \"- list of entities positions (i.e. all start and end positions of both entities' mentions)\\n\",\n    \"- list of NER tags of both entities.\\n\",\n    \"\\n\",\n    \"As NER tags, we adopted the ones used in the DocRED corpus, which are, in turn, inherited from [Tjong Kim Sang and De Meulder(2003)](https://aclanthology.org/W03-0419/)\\n\",\n    \"\\n\",\n    \"**The whole list of 6 English NER tags**\\n\",\n    \"\\n\",\n    \"| Tag | Description |\\n\",\n    \"| :--- | :--- |\\n\",\n    \"|PER | People, including fictional |\\n\",\n    \"|ORG    | Companies, universities, institutions, political or religious groups, etc.                     |\\n\",\n    \"|LOC    | Geographically defined locations, including mountains, waters, etc. <br> Politically defined locations, including countries, cities, states, streets, etc. <br> Facilities, including buildings, museums, stadiums, hospitals, factories, airports, etc.       |\\n\",\n    \"|TIME   | Absolute or relative dates or periods.                                                         |\\n\",\n    \"|NUM    | Percents, money, quantities                                                                    |\\n\",\n    \"|MISC   | Products, including vehicles, weapons, etc. <br> Events, including elections, battles, sporting events, etc. Laws, cases, languages, etc.   |\\n\",\n    \"\\n\",\n    \"**Model Output**: one or several of the [97 relations](#6.1-Relations-used-in-English-model) found between the given entities; relation id in [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) (e.g. 
'P26') and relation name ('spouse').\\n\",\n    \"\\n\",\n    \"### Russian\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import configs, build_model\\n\",\n    \"\\n\",\n    \"re_model = build_model(configs.relation_extraction.re_rured)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"[['P495'], ['страна происхождения']]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"sentence_tokens = [[\\\"Илон\\\", \\\"Маск\\\", \\\"живет\\\", \\\"в\\\", \\\"Сиэттле\\\", \\\".\\\"]]\\n\",\n    \"entity_pos = [[[(0, 2)], [(4, 5)]]]\\n\",\n    \"entity_tags = [[\\\"PERSON\\\", \\\"CITY\\\"]]\\n\",\n    \"pred = re_model(sentence_tokens, entity_pos, entity_tags)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**Model Input**:\\n\",\n    \"\\n\",\n    \"- list of tokens of a text document\\n\",\n    \"- list of entities positions (i.e. all start and end positions of both entities' mentions)\\n\",\n    \"- list of NER tags of both entities.\\n\",\n    \"\\n\",\n    \"**Model Output**: one or several of the [30 relations](#6.2-Relations-used-in-Russian-model) found between the given entities; a Russian relation name (e.g. \\\"участник\\\") or an English one, if Russian one is unavailable, and, if applicable, its id in [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) (e.g. 'P710').\\n\",\n    \"\\n\",\n    \"## 4.2 Predict using CLI\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov interact re_docred [-d]\\n\",\n    \"! 
python -m deeppavlov interact re_rured [-d]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`-d` is an optional download key (alternative to `download=True` in Python code). It is used to download the pre-trained model along with embeddings and all other files needed to run the model.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 5. Customize the model\\n\",\n    \"\\n\",\n    \"## 5.1 Description of config parameters\\n\",\n    \"\\n\",\n    \"Parameters of ``re_preprocessor`` component:\\n\",\n    \"\\n\",\n    \"- ``ner_tags: List[str]`` - ner tags of the entities, which are one-hot encoded and concatenated to entity embeddings in the output of the Transformer;\\n\",\n    \"- ``special_token: str`` - the token which is added before and after the entities (subject and object in the triplet) mentions;\\n\",\n    \"- ``default_tag: str`` - default ner tags, if no tags are provided;\\n\",\n    \"- ``do_lower_case: bool`` - set True if lowercasing is needed.\\n\",\n    \"\\n\",\n    \"Parameters of ``re_classifier`` component:\\n\",\n    \"\\n\",\n    \"- ``n_classes: int`` - number of relations which the model supports;\\n\",\n    \"- ``num_ner_tags: int`` - number of ner tags;\\n\",\n    \"- ``return_probas: bool`` - whether to return confidences of predicted relations.\\n\",\n    \"\\n\",\n    \"Parameters of ``re_postprocessor`` component:\\n\",\n    \"    \\n\",\n    \"- ``rel2id_path: str`` - the file with mapping of relation IDs in the knowledge base to relation number (for example, \\\"P19\\\": 24);\\n\",\n    \"- ``rel2label_path: str`` - the file with mapping of relation IDs to relation labels.\\n\",\n    \"\\n\",\n    \"## 5.2 Train Relation Extraction on custom data\\n\",\n    \"\\n\",\n    \"There are two kinds of dataset readers for relation extraction in DeepPavlov library:\\n\",\n    \"\\n\",\n    \"- ``docred_reader``, which takes into account 
partition of the text into sentences and several mentions in the text for one entity;\\n\",\n    \"- ``rured_reader``, a simplified dataset reader.\\n\",\n    \"\\n\",\n    \"### Train with ``docred_reader``\\n\",\n    \"\\n\",\n    \"You should prepare **train_annotated.json**, **dev.json**, **test.json** in the following format:\\n\",\n    \"\\n\",\n    \"    {\\n\",\n    \"      \\\"vertexSet\\\": [\\n\",\n    \"        [\\n\",\n    \"          {\\n\",\n    \"            \\\"name\\\": entity1_mention1,\\n\",\n    \"            \\\"pos\\\": [mention1 start token index, mention1 end token index],\\n\",\n    \"            \\\"sent_id\\\": ID of the sentence with the entity1 mention1,\\n\",\n    \"            \\\"type\\\": ner tag\\n\",\n    \"          },\\n\",\n    \"          {\\n\",\n    \"            \\\"name\\\": entity1_mention2,\\n\",\n    \"            ...\\n\",\n    \"          },\\n\",\n    \"          ...\\n\",\n    \"        ],\\n\",\n    \"        [ ... ]\\n\",\n    \"      ],\\n\",\n    \"      \\\"labels\\\": [\\n\",\n    \"        {\\n\",\n    \"          \\\"r\\\": relation ID,\\n\",\n    \"          \\\"h\\\": index of head entity of the triplet in the vertexSet list,\\n\",\n    \"          \\\"t\\\": index of tail entity of the triplet in the vertexSet list,\\n\",\n    \"          \\\"evidence\\\": [\\n\",\n    \"            indices of the sentences with the triplet\\n\",\n    \"          ]\\n\",\n    \"        },\\n\",\n    \"        ...\\n\",\n    \"      ],\\n\",\n    \"      \\\"title\\\": doc title,\\n\",\n    \"      \\\"sentences\\\": [\\n\",\n    \"        list of tokens of sentence 1,\\n\",\n    \"        list of tokens of sentence 2,\\n\",\n    \"        ...\\n\",\n    \"      ],\\n\",\n    \"      ...\\n\",\n    \"    }\\n\",\n    \"\\n\",\n    \"For example,\\n\",\n    \"\\n\",\n    \"    {\\n\",\n    \"      \\\"vertexSet\\\": [\\n\",\n    \"        [\\n\",\n    \"          {\\n\",\n    \"            \\\"name\\\": \\\"Elon 
Musk\\\",\\n\",\n    \"            \\\"pos\\\": [0, 2],\\n\",\n    \"            \\\"sent_id\\\": 0,\\n\",\n    \"            \\\"type\\\": \\\"PER\\\"\\n\",\n    \"          }\\n\",\n    \"        ],\\n\",\n    \"        [\\n\",\n    \"          {\\n\",\n    \"            \\\"name\\\": \\\"Seattle\\\",\\n\",\n    \"            \\\"pos\\\": [4, 5],\\n\",\n    \"            \\\"sent_id\\\": 0,\\n\",\n    \"            \\\"type\\\": \\\"CITY\\\"\\n\",\n    \"          }\\n\",\n    \"        ]\\n\",\n    \"      ],\\n\",\n    \"      \\\"labels\\\": [\\n\",\n    \"        {\\n\",\n    \"          \\\"r\\\": \\\"P551\\\",\\n\",\n    \"          \\\"h\\\": 0,\\n\",\n    \"          \\\"t\\\": 1,\\n\",\n    \"          \\\"evidence\\\": [0]\\n\",\n    \"        }\\n\",\n    \"      ],\\n\",\n    \"      \\\"title\\\": \\\"title1\\\",\\n\",\n    \"      \\\"sentences\\\": [\\n\",\n    \"        [\\\"Elon\\\", \\\"Musk\\\", \\\"lives\\\", \\\"in\\\", \\\"Seattle\\\", \\\".\\\"]\\n\",\n    \"      ]\\n\",\n    \"    }\\n\",\n    \"\\n\",\n    \"### Train with  ``rured_reader``\\n\",\n    \"\\n\",\n    \"You should prepare **train.json**, **dev.json**, **test.json** in the following format:\\n\",\n    \"\\n\",\n    \"    {\\n\",\n    \"        \\\"token\\\": list of text tokens,\\n\",\n    \"        \\\"relation\\\": relation ID,\\n\",\n    \"        \\\"subj_start\\\": index of the token of the subject start in the list,\\n\",\n    \"        \\\"subj_end\\\": index of the token of the subject end in the list,\\n\",\n    \"        \\\"obj_start\\\": index of the token of the object start in the list,\\n\",\n    \"        \\\"obj_end\\\": index of the token of the object end in the list,\\n\",\n    \"        \\\"subj_type\\\": ner tag of the subject entity,\\n\",\n    \"        \\\"obj_type\\\": ner tag of the object entity,\\n\",\n    \"    },\\n\",\n    \"\\n\",\n    \"for example:\\n\",\n    \"\\n\",\n    \"    {\\n\",\n    \"        \\\"token\\\": [\\\"Илон\\\", 
\\\"Маск\\\", \\\"живет\\\", \\\"в\\\", \\\"Сиэттле\\\", \\\".\\\"],\\n\",\n    \"        \\\"relation\\\": \\\"P551\\\",\\n\",\n    \"        \\\"subj_start\\\": 0,\\n\",\n    \"        \\\"subj_end\\\": 2,\\n\",\n    \"        \\\"obj_start\\\": 4,\\n\",\n    \"        \\\"obj_end\\\": 5,\\n\",\n    \"        \\\"subj_type\\\": \\\"PERSON\\\",\\n\",\n    \"        \\\"obj_type\\\": \\\"CITY\\\"\\n\",\n    \"    }\\n\",\n    \"\\n\",\n    \"#### Train the model using Python:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import train_model\\n\",\n    \"\\n\",\n    \"train_model(\\\"re_docred\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"**or using CLI:**\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov train re_docred\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 6. 
Relations list\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 6.1 Relations used in English model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"|Relation id     |  Relation                                           |\\n\",\n    \"| :--- | :--- |\\n\",\n    \"|P6              |  head of government                                 |\\n\",\n    \"|P17             |  country                                            |\\n\",\n    \"|P19             |  place of birth                                     |\\n\",\n    \"|P20             |  place of death                                     |\\n\",\n    \"|P22             |  father                                             |\\n\",\n    \"|P25             |  mother                                             |\\n\",\n    \"|P26             |  spouse                                             |\\n\",\n    \"|P27             |  country of citizenship                             |\\n\",\n    \"|P30             |  continent                                          |\\n\",\n    \"|P31             |  instance of                                        |\\n\",\n    \"|P35             |  head of state                                      |\\n\",\n    \"|P36             |  capital                                            |\\n\",\n    \"|P37             |  official language                                  |\\n\",\n    \"|P39             |  position held                                      |\\n\",\n    \"|P40             |  child                                              |\\n\",\n    \"|P50             |  author                                             |\\n\",\n    \"|P54             |  member of sports team                              |\\n\",\n    \"|P57             |  director                                           |\\n\",\n    \"|P58             |  screenwriter                                       |\\n\",\n    
\"|P69             |  educated at                                        |\\n\",\n    \"|P86             |  composer                                           |\\n\",\n    \"|P102            |  member of political party                          |\\n\",\n    \"|P108            |  employer                                           |\\n\",\n    \"|P112            |  founded by                                         |\\n\",\n    \"|P118            |  league                                             |\\n\",\n    \"|P123            |  publisher                                          |\\n\",\n    \"|P127            |  owned by                                           |\\n\",\n    \"|P131            |  located in the administrative territorial entity   |\\n\",\n    \"|P136            |  genre                                              |\\n\",\n    \"|P137            |  operator                                           |\\n\",\n    \"|P140            |  religion                                           |\\n\",\n    \"|P150            |  contains administrative territorial entity         |\\n\",\n    \"|P155            |  follows                                            |\\n\",\n    \"|P156            |  followed by                                        |\\n\",\n    \"|P159            |  headquarters location                              |\\n\",\n    \"|P161            |  cast member                                        |\\n\",\n    \"|P162            |  producer                                           |\\n\",\n    \"|P166            |  award received                                     |\\n\",\n    \"|P170            |  creator                                            |\\n\",\n    \"|P171            |  parent taxon                                       |\\n\",\n    \"|P172            |  ethnic group                                       |\\n\",\n    \"|P175            |  performer                                          |\\n\",\n    \"|P176            |  
manufacturer                                       |\\n\",\n    \"|P178            |  developer                                          |\\n\",\n    \"|P179            |  series                                             |\\n\",\n    \"|P190            |  sister city                                        |\\n\",\n    \"|P194            |  legislative body                                   |\\n\",\n    \"|P205            |  basin country                                      |\\n\",\n    \"|P206            |  located in or next to body of water                |\\n\",\n    \"|P241            |  military branch                                    |\\n\",\n    \"|P264            |  record label                                       |\\n\",\n    \"|P272            |  production company                                 |\\n\",\n    \"|P276            |  location                                           |\\n\",\n    \"|P279            |  subclass of                                        |\\n\",\n    \"|P355            |  subsidiary                                         |\\n\",\n    \"|P361            |  part of                                            |\\n\",\n    \"|P364            |  original language of work                          |\\n\",\n    \"|P400            |  platform                                           |\\n\",\n    \"|P403            |  mouth of the watercourse                           |\\n\",\n    \"|P449            |  original network                                   |\\n\",\n    \"|P463            |  member of                                          |\\n\",\n    \"|P488            |  chairperson                                        |\\n\",\n    \"|P495            |  country of origin                                  |\\n\",\n    \"|P527            |  has part                                           |\\n\",\n    \"|P551            |  residence                                          |\\n\",\n    \"|P569            |  date of birth         
                             |\\n\",\n    \"|P570            |  date of death                                      |\\n\",\n    \"|P571            |  inception                                          |\\n\",\n    \"|P576            |  dissolved, abolished or demolished                 |\\n\",\n    \"|P577            |  publication date                                   |\\n\",\n    \"|P580            |  start time                                         |\\n\",\n    \"|P582            |  end time                                           |\\n\",\n    \"|P585            |  point in time                                      |\\n\",\n    \"|P607            |  conflict                                           |\\n\",\n    \"|P674            |  characters                                         |\\n\",\n    \"|P676            |  lyrics by                                          |\\n\",\n    \"|P706            |  located on terrain feature                         |\\n\",\n    \"|P710            |  participant                                        |\\n\",\n    \"|P737            |  influenced by                                      |\\n\",\n    \"|P740            |  location of formation                              |\\n\",\n    \"|P749            |  parent organization                                |\\n\",\n    \"|P800            |  notable work                                       |\\n\",\n    \"|P807            |  separated from                                     |\\n\",\n    \"|P840            |  narrative location                                 |\\n\",\n    \"|P937            |  work location                                      |\\n\",\n    \"|P1001           |  applies to jurisdiction                            |\\n\",\n    \"|P1056           |  product or material produced                       |\\n\",\n    \"|P1198           |  unemployment rate                                  |\\n\",\n    \"|P1336           |  territory claimed by                        
       |\\n\",\n    \"|P1344           |  participant of                                     |\\n\",\n    \"|P1365           |  replaces                                           |\\n\",\n    \"|P1366           |  replaced by                                        |\\n\",\n    \"|P1376           |  capital of                                         |\\n\",\n    \"|P1412           |  languages spoken, written or signed                |\\n\",\n    \"|P1441           |  present in work                                    |\\n\",\n    \"|P3373           |  sibling                                            |\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 6.2 Relations used in Russian model\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"| Relation                   | Relation id       | Russian relation                |\\n\",\n    \"| :--- | :--- | :--- |\\n\",\n    \"| MEMBER                     | P710              | участник                        |\\n\",\n    \"| WORKS_AS                   | P106              | род занятий                     |\\n\",\n    \"| WORKPLACE                  | --                | --                              |\\n\",\n    \"| OWNERSHIP                  | P1830             | владеет                         |\\n\",\n    \"| SUBORDINATE_OF             | --                | --                              |\\n\",\n    \"| TAKES_PLACE_IN             | P276              | местонахождение                 |\\n\",\n    \"| EVENT_TAKES_PART_IN        | P1344             | участвовал в                    |\\n\",\n    \"| SELLS_TO                   | --                | --                              |\\n\",\n    \"| ALTERNATIVE_NAME           | --                | --                              |\\n\",\n    \"| HEADQUARTERED_IN           | P159              | расположение штаб-квартиры      |\\n\",\n    \"| PRODUCES                   | P1056  
           | продукция                       |\\n\",\n    \"| ABBREVIATION               | --                | --                              |\\n\",\n    \"| DATE_DEFUNCT_IN            | P576              | дата прекращения существования  |\\n\",\n    \"| SUBEVENT_OF                | P361              | часть от                        |\\n\",\n    \"| DATE_FOUNDED_IN            | P571              | дата основания/создания/возн-я  |\\n\",\n    \"| DATE_TAKES_PLACE_ON        | P585              | момент времени                  |\\n\",\n    \"| NUMBER_OF_EMPLOYEES_FIRED  | --                | --                              |\\n\",\n    \"| ORIGINS_FROM               | P495              | страна происхождения            |\\n\",\n    \"| ACQUINTANCE_OF             | --                | --                              |\\n\",\n    \"| PARENT_OF                  | P40               | дети                            |\\n\",\n    \"| ORGANIZES                  | P664              | организатор                     |\\n\",\n    \"| FOUNDED_BY                 | P112              | основатель                      |\\n\",\n    \"| PLACE_RESIDES_IN           | P551              | место жительства                |\\n\",\n    \"| BORN_IN                    | P19               | место рождения                  |\\n\",\n    \"| AGE_IS                     | --                | --                              |\\n\",\n    \"| RELATIVE                   | --                | --                              |\\n\",\n    \"| NUMBER_OF_EMPLOYEES        | P1128             | число сотрудников               |\\n\",\n    \"| SIBLING                    | P3373             | брат/сестра                     |\\n\",\n    \"| DATE_OF_BIRTH              | P569              | дата рождения                   |\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "docs/features/models/spelling_correction.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Spelling correction\\n\",\n    \"\\n\",\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/spelling_correction.ipynb)\\n\",\n    \"\\n\",\n    \"# Table of contents \\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1. [Predict using Python](#4.1-Predict-using-Python)\\n\",\n    \"\\n\",\n    \"    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\\n\",\n    \"\\n\",\n    \"5. [Customize the model](#5.-Customize-the-model)\\n\",\n    \"\\n\",\n    \"    5.1. [Training configuration](#5.1-Training-configuration)\\n\",\n    \"\\n\",\n    \"    5.2. [Language model](#5.2-Language-model)\\n\",\n    \"\\n\",\n    \"6. [Comparison](#6.-Comparison)\\n\",\n    \"\\n\",\n    \"# 1. Introduction to the task\\n\",\n    \"\\n\",\n    \"Spelling correction is detection of words in the text with spelling errors and replacement them with correct ones.\\n\",\n    \"\\n\",\n    \"For example, the sentence\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"The platypus lives in eastern Astralia, inkluding Tasmania.\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"with spelling mistakes ('Astralia', 'inkluding') will be corrected as\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"The platypus lives in eastern Australia, including Tasmania.\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"# 2. 
Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -q deeppavlov\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then make sure that all the required packages for the model are installed.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov install brillmoore_wikitypos_en\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`brillmoore_wikitypos_en` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\\n\",\n    \"\\n\",\n    \"There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\\n\",\n    \"The full list of models for spelling correction with their config names can be found in the [table](#3.-Models-list).\\n\",\n    \"\\n\",\n    \"# 3. 
Models list\\n\",\n    \"\\n\",\n    \"The table presents a list of all of the models for spelling correction available in the DeepPavlov Library.\\n\",\n    \"\\n\",\n    \"| Config name | Language | RAM |\\n\",\n    \"| :--- | --- | --- |\\n\",\n    \"| brillmoore_wikitypos_en | En | 6.7 Gb |\\n\",\n    \"| levenshtein_corrector_ru | Ru | 8.7 Gb |\\n\",\n    \"\\n\",\n    \"We provide two types of pipelines for spelling correction:\\n\",\n    \"\\n\",\n    \"* [levenshtein_corrector](#4.1.1-Levenshtein-corrector) uses a simple Damerau-Levenshtein distance to find correction candidates\\n\",\n    \"\\n\",\n    \"* [brillmoore](#4.1.2-Brillmoore) uses a statistics-based error model for it.\\n\",\n    \"\\n\",\n    \"In both cases correction candidates are chosen based on context with the help of a [kenlm language model](https://docs.deeppavlov.ai/en/master/features/models/spelling_correction.html#language-model).\\n\",\n    \"\\n\",\n    \"You can find [the comparison](#6.-Comparison) of these and other approaches near the end of this page.\\n\",\n    \"\\n\",\n    \"# 4. Use the model for prediction\\n\",\n    \"\\n\",\n    \"## 4.1 Predict using Python\\n\",\n    \"\\n\",\n    \"### 4.1.1 Levenshtein corrector\\n\",\n    \"\\n\",\n    \"[This component](https://docs.deeppavlov.ai/en/master/apiref/models/spelling_correction.html#deeppavlov.models.spelling_correction.levenshtein.LevenshteinSearcherComponent) finds all the candidates in a static dictionary within a set Damerau-Levenshtein distance. 
It can separate one token into two but it will not work the other way around.\\n\",\n    \"\\n\",\n    \"**Component config parameters**:\\n\",\n    \"\\n\",\n    \"-  ``in`` — list with one element: name of this component's input in\\n\",\n    \"   chainer's shared memory\\n\",\n    \"-  ``out`` — list with one element: name for this component's output in\\n\",\n    \"   chainer's shared memory\\n\",\n    \"-  ``class_name`` always equals to ``\\\"spelling_levenshtein\\\"`` or ``deeppavlov.models.spelling_correction.levenshtein.searcher_component:LevenshteinSearcherComponent``.\\n\",\n    \"-  ``words`` — list of all correct words (should be a reference)\\n\",\n    \"-  ``max_distance`` — maximum allowed Damerau-Levenshtein distance\\n\",\n    \"   between source words and candidates\\n\",\n    \"-  ``error_probability`` — assigned probability for every edit\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model, configs\\n\",\n    \"\\n\",\n    \"model = build_model('levenshtein_corrector_ru', download=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"['утконос живет в восточной австралии на обширном ареале от холодных плато тасмании и австралийских альп до дождевых лесов прибрежного квинсленда.']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"model(['Утканос живет в Васточной Австралии на обширном ареале от холодных плато Тасмании и Австралийских Альп до дождевых лесов прибрежного Квинсленда.'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 4.1.2 Brillmoore\\n\",\n    \"\\n\",\n    \"[This 
component](https://docs.deeppavlov.ai/en/master/apiref/models/spelling_correction.html#deeppavlov.models.spelling_correction.brillmoore.ErrorModel) is based on [An Improved Error Model for Noisy Channel Spelling Correction](http://www.aclweb.org/anthology/P00-1037) by Eric Brill and Robert C. Moore and uses statistics based error model to find best candidates in a static dictionary.\\n\",\n    \"\\n\",\n    \"**Component config parameters:**\\n\",\n    \"\\n\",\n    \"-  ``in`` — list with one element: name of this component's input in\\n\",\n    \"   chainer's shared memory\\n\",\n    \"-  ``out`` — list with one element: name for this component's output in\\n\",\n    \"   chainer's shared memory\\n\",\n    \"-  ``class_name`` always equals to ``\\\"spelling_error_model\\\"`` or ``deeppavlov.models.spelling_correction.brillmoore.error_model:ErrorModel``.\\n\",\n    \"-  ``save_path`` — path where the model will be saved at after a\\n\",\n    \"   training session\\n\",\n    \"-  ``load_path`` — path to the pretrained model\\n\",\n    \"-  ``window`` — window size for the error model from ``0`` to ``4``,\\n\",\n    \"   defaults to ``1``\\n\",\n    \"-  ``candidates_count`` — maximum allowed count of candidates for every\\n\",\n    \"   source token\\n\",\n    \"-  ``dictionary`` — description of a static dictionary model, instance\\n\",\n    \"   of (or inherited from)\\n\",\n    \"   ``deeppavlov.vocabs.static_dictionary.StaticDictionary``\\n\",\n    \"\\n\",\n    \"   -  ``class_name`` — ``\\\"static_dictionary\\\"`` for a custom dictionary or one\\n\",\n    \"      of two provided:\\n\",\n    \"\\n\",\n    \"      -  ``\\\"russian_words_vocab\\\"`` to automatically download and use a\\n\",\n    \"         list of russian words from\\n\",\n    \"         `https://github.com/danakt/russian-words/ <https://github.com/danakt/russian-words/>`__\\n\",\n    \"      -  ``\\\"wikitionary_100K_vocab\\\"`` to automatically download a list\\n\",\n    \"         of most 
common words from Project Gutenberg from\\n\",\n    \"         `Wiktionary <https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#Project_Gutenberg>`__\\n\",\n    \"\\n\",\n    \"   -  ``dictionary_name`` — name of a directory where a dictionary will\\n\",\n    \"      be built to and loaded from, defaults to ``\\\"dictionary\\\"`` for\\n\",\n    \"      static\\\\_dictionary\\n\",\n    \"   -  ``raw_dictionary_path`` — path to a file with a line-separated\\n\",\n    \"      list of dictionary words, required for static\\\\_dictionary\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model, configs\\n\",\n    \"\\n\",\n    \"model = build_model('brillmoore_wikitypos_en', download=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"['the platypus lives in australia.']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"model(['The platypus lives in Astralia.'])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.2 Predict using CLI\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI (Сommand Line Interface).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov interact brillmoore_wikitypos_en -d\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 5. 
Customize the model\\n\",\n    \"\\n\",\n    \"## 5.1 Training configuration\\n\",\n    \"\\n\",\n    \"For the training phase, the config file also needs to include these\\n\",\n    \"parameters:\\n\",\n    \"\\n\",\n    \"-  ``dataset_iterator`` — it should always be set like\\n\",\n    \"   ``\\\"dataset_iterator\\\": {\\\"class_name\\\": \\\"typos_iterator\\\"}``\\n\",\n    \"\\n\",\n    \"   -  ``class_name`` is always ``typos_iterator``\\n\",\n    \"   -  ``test_ratio`` — ratio of test data to train, from ``0.`` to\\n\",\n    \"      ``1.``, defaults to ``0.``\\n\",\n    \"\\n\",\n    \"-  ``dataset_reader``\\n\",\n    \"\\n\",\n    \"   -  ``class_name`` — ``typos_custom_reader`` for a custom dataset or one of\\n\",\n    \"      two provided:\\n\",\n    \"\\n\",\n    \"      -  ``typos_kartaslov_reader`` to automatically download and\\n\",\n    \"         process a misspellings dataset for the Russian language from\\n\",\n    \"         https://github.com/dkulagin/kartaslov/tree/master/dataset/orfo_and_typos\\n\",\n    \"      -  ``typos_wikipedia_reader`` to automatically download and\\n\",\n    \"         process a list of common misspellings from English\\n\",\n    \"         Wikipedia - https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines\\n\",\n    \"\\n\",\n    \"   -  ``data_path`` — required for typos\\\\_custom\\\\_reader as a path to\\n\",\n    \"      a dataset file,\\n\",\n    \"      where each line contains a misspelling and a correct spelling\\n\",\n    \"      of a word separated by a tab symbol\\n\",\n    \"\\n\",\n    \"Component's configuration for ``spelling_error_model`` also has to\\n\",\n    \"have a ``fit_on`` parameter — a list of two elements:\\n\",\n    \"names of component's input and true output in chainer's shared\\n\",\n    \"memory.\\n\",\n    \"\\n\",\n    \"## 5.2 Language model\\n\",\n    \"\\n\",\n    \"Provided pipelines use [KenLM](http://kheafield.com/code/kenlm/) to process language models, so 
if you want to build your own, we suggest you consult its website. We do also provide our own language models for\\n\",\n    \"[english](http://files.deeppavlov.ai/lang_models/en_wiki_no_punkt.arpa.binary.gz) (5.5GB) and\\n\",\n    \"[russian](http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz) (3.1GB) languages.\\n\",\n    \"\\n\",\n    \"# 6. Comparison\\n\",\n    \"\\n\",\n    \"We compared our pipelines with\\n\",\n    \"[Yandex.Speller](http://api.yandex.ru/speller/),\\n\",\n    \"[JamSpell](https://github.com/bakwc/JamSpell) and\\n\",\n    \"[PyHunSpell](https://github.com/blatinier/pyhunspell)\\n\",\n    \"on the [test set](http://www.dialog-21.ru/media/3838/test_sample_testset.txt) for the [SpellRuEval\\n\",\n    \"competition](http://www.dialog-21.ru/en/evaluation/2016/spelling_correction/)\\n\",\n    \"on Automatic Spelling Correction for Russian:\\n\",\n    \"\\n\",\n    \"| Correction method | Precision | Recall | F-measure | Speed (sentences/s) |\\n\",\n    \"| :---------------- | --------- | ------ | --------- | ------------------- |\\n\",\n    \"| Yandex.Speller | 83.09 | 59.86 | 69.59 | 5. |\\n\",\n    \"| DeepPavlov levenshtein_corrector_ru | 59.38 | 53.44 | 56.25 | 39.3 |\\n\",\n    \"| Hunspell + lm | 41.03 | 48.89 | 44.61 | 2.1 |\\n\",\n    \"| JamSpell | 44.57 | 35.69 | 39.64 | 136.2 |\\n\",\n    \"| Hunspell | 30.30 | 34.02 | 32.06 | 20.3 |\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "docs/features/models/superglue.rst",
    "content": "Russian SuperGLUE Submission\n==========================================\nThe DeepPavlov library provides a way to train your Russian SuperGLUE models and submit the results to the leaderboard in a couple of easy steps.\n\nTask definition\n---------------\n`Russian SuperGLUE <https://russiansuperglue.com/>`__ is a benchmark that contains a set of tasks in Russian developed for evaluating general language understanding.\n\nThere are 9 tasks in the Russian SuperGLUE set:\n\n**DaNetQA (Yes/no Question Answering Dataset for Russian)** is a binary classification task of question answering, in which the model is asked to answer a yes/no question based on a given context fragment.\n\n**PARus (Choice of Plausible Alternatives for Russian language)** is a causal reasoning task. The model is asked to choose the most plausible alternative that has causal relation with the given premise.\n\n**RCB (Russian Commitment Bank)** is a classification task in which the model is asked to define the type of textual entailment (Entailment, Contradiction, Neutral) between two sentences.\n\nIn the **MuSeRC (Russian Multi-Sentence Reading Comprehension)** task the model needs to process information from multiple sentences at once and identify the correct answers for the\nquestion from the given list.\n\nIn the **RuCoS (Russian reading comprehension with Commonsense reasoning)** task the model has to choose the answer to each query from a list of text spans from a fragment.\n\n**RUSSE (Russian Word-in-Context)** is a reading comprehension task in which the model has to identify whether a given word is used in the same\nmeaning in two different sentences.\n\nIn **RWSD (The Russian Winograd Schema Challenge)** the data is a set of sentences that differ by one or two words\nin which syntactic ambiguity is resolved differently. 
The model is trained to predict whether it is resolved correctly.\n\n**LiDiRus** is a diagnostic task in which the model has to identify whether there is entailment between two sentences.\n\n**TERRa (Textual Entailment Recognition for Russian)** is a binary classification task of identifying whether there is entailment between two sentences.\n\n\nFor a more detailed description of each task, see `this <https://russiansuperglue.com/tasks/>`__.\n\nTrain your model\n----------------\nModify the configuration file you need and train your own model for the task (see :doc:`here </intro/quick_start>` \nfor more detailed instructions). The full list of models designed for each task can be found in the table below.\n\nCreate your submission files\n----------------------------\nTo do that, use the ``submit`` command with the name of the configuration file that defines the path to your model.\nNote that the name of the Russian SuperGLUE task should be defined in the ``[\"metadata\"][\"variables\"][\"TASK\"]`` variable in the config file.\n\n.. code:: bash\n\n    python -m deeppavlov.utils.benchmarks.superglue <config_name> [-d] [-o <output_file_name.jsonl>]\n\n* ``-d``: downloads model specific data before starting submission generation.\n* ``-o <output_file_name.jsonl>``: set output file name. By default, output filenames for Russian SuperGLUE models\n  comply with benchmark requirements.\n\nFor example, ``russian_superglue_danetqa_rubert`` solves the **Yes/no Question Answering Dataset for Russian** task.\nThe following command will generate ``DaNetQA.jsonl`` ready for submission:\n\n.. code:: bash\n\n    python -m deeppavlov.utils.benchmarks.superglue russian_superglue_danetqa_rubert -d\n\nThe prediction results will be saved in the correct format and the file will be automatically named with the name required by the system and saved to the current directory. 
All you have to do next \nis to zip the files you want into one archive and `submit them to leaderboard <https://russiansuperglue.com/guide/>`__.\n\nScores\n------\nThe scores for DeepPavlov's pretrained models on the tasks are presented in the table.\n    \n+--------------------------------------------------------------------------------------------------------+----------------+-----------------+\n| Model                                                                                                  |     Metric     |      Score      |\n+========================================================================================================+================+=================+\n|  :config:`russian_superglue_danetqa_rubert <russian_super_glue/russian_superglue_danetqa_rubert.json>` |    Accuracy    |      0.647      |\n+--------------------------------------------------------------------------------------------------------+----------------+-----------------+\n|  :config:`russian_superglue_parus_rubert <russian_super_glue/russian_superglue_parus_rubert.json>`     |    Accuracy    |      0.588      |\n+--------------------------------------------------------------------------------------------------------+----------------+-----------------+\n|  :config:`russian_superglue_russe_rubert <russian_super_glue/russian_superglue_russe_rubert.json>`     |    Accuracy    |      0.641      |\n+--------------------------------------------------------------------------------------------------------+----------------+-----------------+\n|  :config:`russian_superglue_lidirus_rubert <russian_super_glue/russian_superglue_lidirus_rubert.json>` | Matthew's Corr |      0.251      |\n+--------------------------------------------------------------------------------------------------------+----------------+-----------------+\n|  :config:`russian_superglue_rcb_rubert <russian_super_glue/russian_superglue_rcb_rubert.json>`         |     F1/Acc     |  0.336 / 0.486  
|\n+--------------------------------------------------------------------------------------------------------+----------------+-----------------+\n|  :config:`russian_superglue_rwsd_rubert <russian_super_glue/russian_superglue_rwsd_rubert.json>`       |    Accuracy    |      0.669      |\n+--------------------------------------------------------------------------------------------------------+----------------+-----------------+\n|  :config:`russian_superglue_muserc_rubert <russian_super_glue/russian_superglue_muserc_rubert.json>`   |     F1a/Em     |  0.689 / 0.298  |\n+--------------------------------------------------------------------------------------------------------+----------------+-----------------+\n|  :config:`russian_superglue_rucos_rubert <russian_super_glue/russian_superglue_rucos_rubert.json>`     |      F1/EM     |   0.77 / 0.768  |\n+--------------------------------------------------------------------------------------------------------+----------------+-----------------+\n|  :config:`russian_superglue_terra_rubert <russian_super_glue/russian_superglue_terra_rubert.json>`     |    Accuracy    |      0.65       |\n+--------------------------------------------------------------------------------------------------------+----------------+-----------------+\n"
  },
  {
    "path": "docs/features/models/syntax_parser.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Syntax Parser\\n\",\n    \"\\n\",\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/syntax_parser.ipynb)\\n\",\n    \"\\n\",\n    \"# Table of contents \\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1. [Predict using Python](#4.1-Predict-using-Python)\\n\",\n    \"\\n\",\n    \"    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\\n\",\n    \"\\n\",\n    \"5. [Customize the model](#5.-Customize-the-model)\\n\",\n    \"\\n\",\n    \"# 1. Introduction to the task\\n\",\n    \"\\n\",\n    \"Syntactic parsing is the task of prediction of the syntactic tree given the tokenized (or raw) sentence.\\n\",\n    \"\\n\",\n    \"To define a tree, for each word one should know its syntactic head and the dependency label for the edge between them.\\n\",\n    \"For example, the tree above can be restored from the data\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"    1\\tJohn    2\\tnsubj\\t\\n\",\n    \"    2\\tbought  0\\troot\\t\\n\",\n    \"    3\\ta       6\\tdet\\t\\n\",\n    \"    4\\tvery    5\\tadvmod\\t\\n\",\n    \"    5\\ttasty   6\\tamod\\t\\n\",\n    \"    6\\tcake    2\\tobj\\n\",\n    \"    7\\t.       
2\\tpunct\\n\",\n    \"```\\n\",\n    \"Here the third column contains the positions of syntactic heads and the last one -- the dependency labels.\\n\",\n    \"The words are enumerated from 1 since 0 is the index of the artificial root of the tree, whose only\\n\",\n    \"dependent is the actual syntactic head of the sentence (usually a verb).\\n\",\n    \"\\n\",\n    \"Syntactic trees can be used in many information extraction tasks. For example, to detect who is the winner\\n\",\n    \"and who is the loser in the sentence *Manchester defeated Liverpool* one relies on the word order. However,\\n\",\n    \"many languages, such as Russian, Spanish and German, have relatively free word order, which means we need\\n\",\n    \"other cues. Note also that syntactic relations (`nsubj`, `obj` and so on) have clear semantic counterparts,\\n\",\n    \"which makes syntactic parsing an appealing preprocessing step for the semantic-oriented tasks.\\n\",\n    \"\\n\",\n    \"We use BERT as the lowest layer of our model (the embedder). To extract syntactic information we apply\\n\",\n    \"the biaffine network of [Dozat, Manning, 2017](https://arxiv.org/pdf/1611.01734.pdf).\\n\",\n    \"For each sentence of length `K` this network produces two outputs: the first is an array of shape ``K*(K+1)``,\\n\",\n    \"where `i`-th row is the probability distribution of the head of `i`-th word over the sentence elements.\\n\",\n    \"The 0-th element of this distribution is the probability of the word to be a root of the sentence.\\n\",\n    \"The second output of the network is of shape `K*D`, where `D` is the number of possible dependency labels.\\n\",\n    \"\\n\",\n    \"The easiest way to obtain a tree is simply to return the head with the highest probability\\n\",\n    \"for each word in the sentence. 
However, the graph obtained in such a way may fail to be a valid tree:\\n\",\n    \"it may either contain a cycle or have multiple nodes with head at position 0.\\n\",\n    \"Therefore we apply the well-known Chu-Liu-Edmonds algorithm for minimal spanning tree\\n\",\n    \"to return the optimal tree, using the open-source modification from [dependency_decoding package](https://pypi.org/project/ufal.chu-liu-edmonds/).\\n\",\n    \"\\n\",\n    \"# 2. Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -q deeppavlov\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Before using the model make sure that all required packages are installed running the command:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov install syntax_ru_syntagrus_bert\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 3. Models list\\n\",\n    \"\\n\",\n    \"The table presents a list of all of the syntax parsing models available in the DeepPavlov Library.\\n\",\n    \"\\n\",\n    \"| Config | Description |\\n\",\n    \"| :--- | :--- |\\n\",\n    \"| morpho_syntax_parser/syntax_ru_syntagrus_bert.json | Config with the model which defines for each token in the sentence <br> its head and dependency type in the syntactic tree. |\\n\",\n    \"| morpho_syntax_parser/ru_syntagrus_joint_parsing | Config which unifies syntax parsing and morphological tagging. 
|\\n\",\n    \"\\n\",\n    \"The table presents comparison of syntax_ru_syntagrus_bert config with other models on UD2.3 dataset.\\n\",\n    \"\\n\",\n    \"| Model | UAS | LAS |\\n\",\n    \"| :--- | :---: | :---: |\\n\",\n    \"| [UD Pipe 2.3](http://ufal.mff.cuni.cz/udpipe) (Straka et al., 2017)  | 90.3 | 89.0 |\\n\",\n    \"| [UD Pipe Future](https://github.com/CoNLL-UD-2018/UDPipe-Future) (Straka, 2018) | 93.0 | 91.5 |\\n\",\n    \"| [UDify (multilingual BERT)](https://github.com/hyperparticle/udify) (Kondratyuk, 2018) | 94.8 | 93.1 |\\n\",\n    \"| Our BERT model (morpho_syntax_parser/syntax_ru_syntagrus_bert.json) | 94.9 | 93.4 |\\n\",\n    \"\\n\",\n    \"So our model is the state-of-the-art system for Russian syntactic parsing.\\n\",\n    \"\\n\",\n    \"# 4. Use the model for prediction\\n\",\n    \"\\n\",\n    \"## 4.1 Predict using Python\\n\",\n    \"\\n\",\n    \"### Syntax Parser\\n\",\n    \"\\n\",\n    \"Our model produces the output in [CONLL-U format](http://universaldependencies.org/format.html)\\n\",\n    \"and is trained on Universal Dependency corpora, available on http://universaldependencies.org/format.html .\\n\",\n    \"The example usage for inference is\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model\\n\",\n    \"\\n\",\n    \"model = build_model(\\\"syntax_ru_syntagrus_bert\\\", download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"1\\tЯ\\t_\\t_\\t_\\t_\\t2\\tnsubj\\t_\\t_\\n\",\n      \"2\\tшёл\\t_\\t_\\t_\\t_\\t0\\troot\\t_\\t_\\n\",\n      \"3\\tдомой\\t_\\t_\\t_\\t_\\t2\\tadvmod\\t_\\t_\\n\",\n      \"4\\tпо\\t_\\t_\\t_\\t_\\t6\\tcase\\t_\\t_\\n\",\n      
\"5\\tнезнакомой\\t_\\t_\\t_\\t_\\t6\\tamod\\t_\\t_\\n\",\n      \"6\\tулице\\t_\\t_\\t_\\t_\\t2\\tobl\\t_\\t_\\n\",\n      \"7\\t.\\t_\\t_\\t_\\t_\\t2\\tpunct\\t_\\t_\\n\",\n      \"\\n\",\n      \"1\\tДевушка\\t_\\t_\\t_\\t_\\t2\\tnsubj\\t_\\t_\\n\",\n      \"2\\tпела\\t_\\t_\\t_\\t_\\t0\\troot\\t_\\t_\\n\",\n      \"3\\tв\\t_\\t_\\t_\\t_\\t5\\tcase\\t_\\t_\\n\",\n      \"4\\tцерковном\\t_\\t_\\t_\\t_\\t5\\tamod\\t_\\t_\\n\",\n      \"5\\tхоре\\t_\\t_\\t_\\t_\\t2\\tobl\\t_\\t_\\n\",\n      \"6\\t.\\t_\\t_\\t_\\t_\\t2\\tpunct\\t_\\t_\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"sentences = [\\\"Я шёл домой по незнакомой улице.\\\", \\\"Девушка пела в церковном хоре.\\\"]\\n\",\n    \"for parse in model(sentences):\\n\",\n    \"    print(parse, end=\\\"\\\\n\\\\n\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"As prescribed by UD standards, our model writes the head information to the 7th column and the dependency\\n\",\n    \"information -- to the 8th. Our parser does not return morphological tags and even does not use them in\\n\",\n    \"training.\\n\",\n    \"\\n\",\n    \"### Joint Syntax Parser and Morphological tagger\\n\",\n    \"\\n\",\n    \"Our model in principle supports joint prediction of morphological tags and syntactic information, however, the quality of the joint model is slightly inferior to the separate ones. Therefore we release a special component that can combine the outputs of tagger and parser: `deeppavlov.models.syntax_parser.joint.JointTaggerParser`. 
Its sample output for the Russian language with default settings (see the configuration file `morpho_syntax_parser/ru_syntagrus_joint_parsing.json` for exact options) looks like\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model\\n\",\n    \"\\n\",\n    \"model = build_model(\\\"ru_syntagrus_joint_parsing\\\", download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"1\\tЯ\\tя\\tPRON\\t_\\tCase=Nom|Number=Sing|Person=1\\t2\\tnsubj\\t_\\t_\\n\",\n      \"2\\tшёл\\tшёл\\tVERB\\t_\\tAspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\\t0\\troot\\t_\\t_\\n\",\n      \"3\\tдомой\\tдомой\\tADV\\t_\\tDegree=Pos\\t2\\tadvmod\\t_\\t_\\n\",\n      \"4\\tпо\\tпо\\tADP\\t_\\t_\\t6\\tcase\\t_\\t_\\n\",\n      \"5\\tнезнакомой\\tнезнакомый\\tADJ\\t_\\tCase=Dat|Degree=Pos|Gender=Fem|Number=Sing\\t6\\tamod\\t_\\t_\\n\",\n      \"6\\tулице\\tулица\\tNOUN\\t_\\tAnimacy=Inan|Case=Dat|Gender=Fem|Number=Sing\\t2\\tobl\\t_\\t_\\n\",\n      \"7\\t.\\t.\\tPUNCT\\t_\\t_\\t2\\tpunct\\t_\\t_\\n\",\n      \"1\\tДевушка\\tдевушка\\tNOUN\\t_\\tAnimacy=Anim|Case=Nom|Gender=Fem|Number=Sing\\t2\\tnsubj\\t_\\t_\\n\",\n      \"2\\tпела\\tпеть\\tVERB\\t_\\tAspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\\t0\\troot\\t_\\t_\\n\",\n      \"3\\tв\\tв\\tADP\\t_\\t_\\t5\\tcase\\t_\\t_\\n\",\n      \"4\\tцерковном\\tцерковном\\tADJ\\t_\\tCase=Loc|Degree=Pos|Gender=Masc|Number=Sing\\t5\\tamod\\t_\\t_\\n\",\n      \"5\\tхоре\\tхор\\tNOUN\\t_\\tAnimacy=Inan|Case=Loc|Gender=Masc|Number=Sing\\t2\\tobl\\t_\\t_\\n\",\n      \"6\\t.\\t.\\tPUNCT\\t_\\t_\\t2\\tpunct\\t_\\t_\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"sentences = [\\\"Я шёл 
домой по незнакомой улице.\\\", \\\"Девушка пела в церковном хоре.\\\"]\\n\",\n    \"for parse in model(sentences):\\n\",\n    \"    print(parse, end=\\\"\\\\n\\\\n\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"In the basic case the model outputs a human-readable string with parse data for each sentence. If you need\\n\",\n    \"to use the output in Python, consult the `deeppavlov.models.syntax_parser.joint.JointTaggerParser` documentation and source code.\\n\",\n    \"\\n\",\n    \"## 4.2 Predict using CLI\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI (Command Line Interface).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov interact syntax_ru_syntagrus_bert -d\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\\n\",\n    \"\\n\",\n    \"# 5. Customize the model\\n\",\n    \"\\n\",\n    \"To train **syntax parser** on your own data, you should prepare a dataset in **CoNLL-U format**. 
The description of **CoNLL-U format** can be found [here](https://universaldependencies.org/format.html#conll-u-format).\\n\",\n    \"\\n\",\n    \"Then you should place files for training, validation and testing into the ``\\\"data_path\\\"`` directory of ``morphotagger_dataset_reader``, change file names in ``morphotagger_dataset_reader`` to your filenames and launch the training:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import train_model\\n\",\n    \"\\n\",\n    \"train_model(\\\"<your_syntax_parsing_config_name>\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"or **using CLI**:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov train <your_syntax_parser_config_name>\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "docs/features/models/tfidf_ranking.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Tfidf Ranking\\n\",\n    \"\\n\",\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/tfidf_ranking.ipynb)\\n\",\n    \"\\n\",\n    \"# Table of contents \\n\",\n    \"\\n\",\n    \"1. [Introduction to the task](#1.-Introduction-to-the-task)\\n\",\n    \"\\n\",\n    \"2. [Get started with the model](#2.-Get-started-with-the-model)\\n\",\n    \"\\n\",\n    \"3. [Models list](#3.-Models-list)\\n\",\n    \"\\n\",\n    \"4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\\n\",\n    \"\\n\",\n    \"    4.1. [Predict using Python](#4.1-Predict-using-Python)\\n\",\n    \"    \\n\",\n    \"    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\\n\",\n    \"\\n\",\n    \"5. [Customize the model](#5.-Customize-the-model)\\n\",\n    \"    \\n\",\n    \"    5.1. [Fit on Wikipedia](#5.1-Fit-on-Wikipedia)\\n\",\n    \"    \\n\",\n    \"    5.2. [Download, parse new Wikipedia dump, build database and index](#5.2-Download,-parse-new-Wikipedia-dump,-build-database-and-index)\\n\",\n    \"\\n\",\n    \"# 1. Introduction to the task\\n\",\n    \"\\n\",\n    \"This is an implementation of a passage ranker based on tf-idf vectorization.\\n\",\n    \"The ranker implementation is based on [DrQA](https://github.com/facebookresearch/DrQA/) project.\\n\",\n    \"The default ranker implementation takes a batch of queries as input and returns 100 passage titles sorted via relevance.\\n\",\n    \"\\n\",\n    \"# 2. 
Get started with the model\\n\",\n    \"\\n\",\n    \"First make sure you have the DeepPavlov Library installed.\\n\",\n    \"[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -q deeppavlov\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then make sure that all the required packages for the model are installed.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!python -m deeppavlov install en_ranker_tfidf_wiki\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"`en_ranker_tfidf_wiki` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\\n\",\n    \"\\n\",\n    \"There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\\n\",\n    \"The full list of models for tfidf ranking with their config names can be found in the [table](#3.-Models-list).\\n\",\n    \"\\n\",\n    \"# 3. Models list\\n\",\n    \"\\n\",\n    \"| Config | Language | Description | RAM |\\n\",\n    \"| :--- | :---: | :--- | :---: |\\n\",\n    \"| doc_retrieval/en_ranker_tfidf_wiki.json | En | Config for TF-IDF ranking over Wikipedia | 2.9 Gb |\\n\",\n    \"| doc_retrieval/en_ranker_pop_wiki.json | En | Config for TF-IDF ranking, followed by <br> popularity ranking, over Wikipedia | 8.1 Gb |\\n\",\n    \"| doc_retrieval/ru_ranker_tfidf_wiki.json | Ru | TF-IDF ranking config over Wikipedia | 8.4 Gb |\\n\",\n    \"\\n\",\n    \"# 4. 
Use the model for prediction\\n\",\n    \"\\n\",\n    \"## 4.1 Predict using Python\\n\",\n    \"\\n\",\n    \"### English\\n\",\n    \"\\n\",\n    \"Building (if you don't have your own data)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model, configs\\n\",\n    \"\\n\",\n    \"ranker = build_model(configs.doc_retrieval.en_ranker_tfidf_wiki, download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Inference\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"[18155097, 628663, 17123727, 628662, 19097375]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"result = ranker(['Who is Ivan Pavlov?'])\\n\",\n    \"print(result[0][:5])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Russian\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import build_model, configs\\n\",\n    \"\\n\",\n    \"ranker = build_model(configs.doc_retrieval.ru_ranker_tfidf_wiki, download=True, install=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"[4902620, 1900377, 11129584, 1720563, 1720658]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"result = ranker(['Когда произошла Куликовская битва?'])\\n\",\n    \"print(result[0][:5])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Text for the output titles can be further extracted with 
[deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab](https://docs.deeppavlov.ai/en/master/apiref/vocabs.html#deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab) class.\\n\",\n    \"\\n\",\n    \"## 4.2 Predict using CLI\\n\",\n    \"\\n\",\n    \"You can also get predictions in an interactive mode through CLI (Command Line Interface).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov interact en_ranker_tfidf_wiki -d\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 5. Customize the model\\n\",\n    \"\\n\",\n    \"## 5.1 Fit on Wikipedia\\n\",\n    \"\\n\",\n    \"Run the following to fit the ranker on **English** Wikipedia:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov train en_ranker_tfidf_wiki\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Run the following to fit the ranker on **Russian** Wikipedia:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"! python -m deeppavlov train ru_ranker_tfidf_wiki\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"As a result of ranker training, a SQLite database and tf-idf matrix are created.\\n\",\n    \"\\n\",\n    \"## 5.2 Download, parse new Wikipedia dump, build database and index\\n\",\n    \"\\n\",\n    \"**enwiki.db** SQLite database consists of ~21 M Wikipedia articles and is built by the following steps:\\n\",\n    \"\\n\",\n    \"- Download a Wikipedia dump file. 
We took the latest\\n\",\n    \"   [enwiki dump](https://dumps.wikimedia.org/enwiki/20230501/)\\n\",\n    \"\\n\",\n    \"- Unpack and extract the articles with [WikiExtractor](https://github.com/attardi/wikiextractor)\\n\",\n    \"   (with ``--json``, ``--no-templates``, ``--filter_disambig_pages``\\n\",\n    \"   options)\\n\",\n    \"\\n\",\n    \"- [Build a database](#5.1-Fit-on-Wikipedia).\\n\",\n    \"\\n\",\n    \"**enwiki_tfidf_matrix.npz** is a full Wikipedia tf-idf matrix of size **hash_size x number of documents** which is\\n\",\n    \"$2^{24}$ x 21 M. This matrix is built with [deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer](https://docs.deeppavlov.ai/en/master/apiref/models/vectorizers.html#deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer) class.\\n\",\n    \"\\n\",\n    \"**ruwiki.db** SQLite database consists of ~12 M Wikipedia articles and is built by the following steps:\\n\",\n    \"\\n\",\n    \"- Download a Wikipedia dump file. We took the latest [ruwiki dump](https://dumps.wikimedia.org/ruwiki/20230501/)\\n\",\n    \"\\n\",\n    \"- Unpack and extract the articles with [WikiExtractor](https://github.com/attardi/wikiextractor)\\n\",\n    \"   (with ``--json``, ``--no-templates``, ``--filter_disambig_pages``\\n\",\n    \"   options)\\n\",\n    \"\\n\",\n    \"- [Build a database](#5.1-Fit-on-Wikipedia).\\n\",\n    \"\\n\",\n    \"**ruwiki_tfidf_matrix.npz** is a full Wikipedia tf-idf matrix of size **hash_size x number of documents** which is\\n\",\n    \"$2^{24}$ x 12 M. This matrix is built with\\n\",\n    \"[deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer](https://docs.deeppavlov.ai/en/master/apiref/models/vectorizers.html#deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer) class.\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "docs/features/overview.rst",
    "content": "Features\n========\n\n.. contents:: :local:\n\nModels\n------\n\nNER model :doc:`[docs] </features/models/NER>`\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nNamed Entity Recognition task in DeepPavlov is solved with BERT-based model.\nThe models predict tags (in BIO format) for tokens in input.\n\nBERT-based model is described in  `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding\n<https://arxiv.org/abs/1810.04805>`__.\n\n+---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+\n| Dataset                                                 | Lang  | Model                                                                                      |   Test F1   |\n+=========================================================+=======+============================================================================================+=============+\n| Persons-1000 dataset with additional LOC and ORG markup | Ru    | :config:`ner_rus_bert.json <ner/ner_rus_bert.json>`                                        |    97.9     |\n+                                                         +       +--------------------------------------------------------------------------------------------+-------------+\n| (Collection 3)                                          |       | :config:`ner_rus_convers_distilrubert_2L.json  <ner/ner_rus_convers_distilrubert_2L.json>` |  88.4 ± 0.5 |\n+                                                         +       +--------------------------------------------------------------------------------------------+-------------+\n|                                                         |       | :config:`ner_rus_convers_distilrubert_6L.json  <ner/ner_rus_convers_distilrubert_6L.json>` |  93.3 ± 0.3 
|\n+---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+\n| Ontonotes                                               | Multi | :config:`ner_ontonotes_bert_mult.json <ner/ner_ontonotes_bert_mult.json>`                  |    88.9     |\n+                                                         +-------+--------------------------------------------------------------------------------------------+-------------+\n|                                                         | En    | :config:`ner_ontonotes_bert.json <ner/ner_ontonotes_bert.json>`                            |    89.2     |\n+---------------------------------------------------------+       +--------------------------------------------------------------------------------------------+-------------+\n| ConLL-2003                                              |       | :config:`ner_conll2003_bert.json <ner/ner_conll2003_bert.json>`                            |    91.7     |\n+---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+\n\nClassification model :doc:`[docs] </features/models/classification>`\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nModel for classification tasks (intents, sentiment, etc) on word-level. Shallow-and-wide CNN, Deep CNN, BiLSTM,\nBiLSTM with self-attention and other models are presented. 
The model also allows multilabel classification of texts.\nSeveral pre-trained models are available and presented in Table below.\n\n\n+------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+\n| Task             | Dataset             | Lang | Model                                                                                              | Metric      | Valid            | Test            | Downloads |\n+==================+=====================+======+====================================================================================================+=============+==================+=================+===========+\n| Insult detection | `Insults`_          | En   | :config:`English BERT<classifiers/insults_kaggle_bert.json>`                                       | ROC-AUC     | 0.9327           | 0.8602          |  1.1 Gb   |\n+------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+\n| Sentiment        | `SST`_              |      | :config:`5-classes SST on conversational BERT <classifiers/sentiment_sst_conv_bert.json>`          | Accuracy    | 0.6293           | 0.6626          |  1.1 Gb   |\n+------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+\n| Sentiment        | `Twitter mokoron`_  | Ru   | :config:`RuWiki+Lenta emb w/o preprocessing <classifiers/sentiment_twitter.json>`                  | Accuracy    | 0.9918           | 0.9923          |  5.8 Gb   |\n+                  +---------------------+      
+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+\n|                  | `RuSentiment`_      |      | :config:`Multi-language BERT <classifiers/rusentiment_bert.json>`                                  | F1-weighted | 0.6787           | 0.7005          |  1.3 Gb   |\n+                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+\n|                  |                     |      | :config:`Conversational RuBERT <classifiers/rusentiment_convers_bert.json>`                        |             | 0.739            | 0.7724          |  1.5 Gb   |\n+                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+\n|                  |                     |      | :config:`Conversational DistilRuBERT-tiny <classifiers/rusentiment_convers_distilrubert_2L.json>`  |             |  0.703 ± 0.0031  | 0.7348 ± 0.0028 |  690 Mb   |\n+                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+\n|                  |                     |      | :config:`Conversational DistilRuBERT-base <classifiers/rusentiment_convers_distilrubert_6L.json>`  |             |  0.7376 ± 0.0045 | 0.7645 ± 0.035  |  1.0 Gb   |\n+------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+\n\n.. _`DSTC 2`: http://camdial.org/~mh521/dstc/\n.. 
_`SNIPS-2017`: https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines\n.. _`Insults`: https://www.kaggle.com/c/detecting-insults-in-social-commentary\n.. _`AG News`: https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html\n.. _`Twitter mokoron`: http://study.mokoron.com/\n.. _`RuSentiment`: http://text-machine.cs.uml.edu/projects/rusentiment/\n.. _`Yahoo-L31`: https://webscope.sandbox.yahoo.com/catalog.php?datatype=l\n.. _`Yahoo-L6`: https://webscope.sandbox.yahoo.com/catalog.php?datatype=l\n.. _`SST`: https://nlp.stanford.edu/sentiment/index.html\n\nAs no one had published intent recognition for DSTC-2 data, the\ncomparison of the presented model is given on **SNIPS** dataset. The\nevaluation of model scores was conducted in the same way as in [3]_ to\ncompare with the results from the report of the authors of the dataset.\nThe results were achieved with tuning of parameters and embeddings\ntrained on Reddit dataset.\n\n+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+\n| Model                  | AddToPlaylist   | BookRestaurant   | GetWheather   | PlayMusic    | RateBook     | SearchCreativeWork   | SearchScreeningEvent   |\n+========================+=================+==================+===============+==============+==============+======================+========================+\n| api.ai                 | 0.9931          | 0.9949           | 0.9935        | 0.9811       | 0.9992       | 0.9659               | 0.9801                 |\n+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+\n| ibm.watson             | 0.9931          | 0.9950           | 0.9950        | 0.9822       | 0.9996       | 0.9643               | 0.9750                 
|\n+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+\n| microsoft.luis         | 0.9943          | 0.9935           | 0.9925        | 0.9815       | 0.9988       | 0.9620               | 0.9749                 |\n+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+\n| wit.ai                 | 0.9877          | 0.9913           | 0.9921        | 0.9766       | 0.9977       | 0.9458               | 0.9673                 |\n+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+\n| snips.ai               | 0.9873          | 0.9921           | 0.9939        | 0.9729       | 0.9985       | 0.9455               | 0.9613                 |\n+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+\n| recast.ai              | 0.9894          | 0.9943           | 0.9910        | 0.9660       | 0.9981       | 0.9424               | 0.9539                 |\n+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+\n| amazon.lex             | 0.9930          | 0.9862           | 0.9825        | 0.9709       | 0.9981       | 0.9427               | 0.9581                 |\n+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+\n+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+\n| Shallow-and-wide CNN   | **0.9956**      | **0.9973**       | **0.9968**    | 
**0.9871**   | **0.9998**   | **0.9752**           | **0.9854**             |\n+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+\n\n.. [3] https://www.slideshare.net/KonstantinSavenkov/nlu-intent-detection-benchmark-by-intento-august-2017\n\n\nAutomatic spelling correction model :doc:`[docs] </features/models/spelling_correction>`\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nPipelines that use candidate search in a static dictionary and an ARPA language model to correct spelling errors.\n\n.. note::\n\n    About 4.4 GB of disk space is required for the Russian language model and about 7 GB for the English one.\n\nComparison on the `test set <http://www.dialog-21.ru/media/3838/test_sample_testset.txt>`__ for the `SpellRuEval\ncompetition <http://www.dialog-21.ru/en/evaluation/2016/spelling_correction/>`__\non Automatic Spelling Correction for Russian:\n\n+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+\n| Correction method                                                                       | Precision | Recall | F-measure | Speed (sentences/s) |\n+=========================================================================================+===========+========+===========+=====================+\n| Yandex.Speller                                                                          | 83.09     | 59.86  | 69.59     | 5.                  
|\n+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+\n| :config:`Damerau Levenshtein 1 + lm<spelling_correction/levenshtein_corrector_ru.json>` | 53.26     | 53.74  | 53.50     | 29.3                |\n+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+\n| Hunspell + lm                                                                           | 41.03     | 48.89  | 44.61     | 2.1                 |\n+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+\n| JamSpell                                                                                | 44.57     | 35.69  | 39.64     | 136.2               |\n+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+\n| Hunspell                                                                                | 30.30     | 34.02  | 32.06     | 20.3                |\n+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+\n\n\n\nRanking model :doc:`[docs] </features/models/neural_ranking>`\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nAvailable pre-trained models for paraphrase identification:\n\n.. 
table::\n   :widths: auto\n\n   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+\n   |    Dataset             | Model config                                                                                         | Val (accuracy) | Test (accuracy) | Val (F1)   | Test (F1)  | Val (log_loss) | Test (log_loss) | Downloads |\n   +========================+======================================================================================================+================+=================+============+============+================+=================+===========+\n   | `paraphraser.ru`_      | :config:`paraphrase_rubert <classifiers/paraphraser_rubert.json>`                                    |   89.8         |   84.2          |   92.2     |  87.4      |   --           |   --            | 1325M     |\n   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+\n   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_2L <classifiers/paraphraser_convers_distilrubert_2L.json>` |  76.1 ± 0.2    |  64.5 ± 0.5     | 81.8 ± 0.2 | 73.9 ± 0.8 |   --           |   --            | 618M      |\n   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+\n   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_6L <classifiers/paraphraser_convers_distilrubert_6L.json>` |  86.5 ± 0.5    |  78.9 ± 0.4     | 89.6 ± 0.3 | 83.2 ± 0.5 |   --           |   --            | 930M      |\n   
+------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+\n\n.. _`paraphraser.ru`: https://paraphraser.ru/\n\n\nReferences:\n\n* Yu Wu, Wei Wu, Ming Zhou, and Zhoujun Li. 2017. Sequential match network: A new architecture for multi-turn response selection in retrieval-based chatbots. In ACL, pages 372–381. https://www.aclweb.org/anthology/P17-1046\n\n* Xiangyang Zhou, Lu Li, Daxiang Dong, Yi Liu, Ying Chen, Wayne Xin Zhao, Dianhai Yu and Hua Wu. 2018. Multi-Turn Response Selection for Chatbots with Deep Attention Matching Network. Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1118-1127, ACL. http://aclweb.org/anthology/P18-1103\n\n* Chongyang Tao, Wei Wu, Can Xu, Wenpeng Hu, Dongyan Zhao, and Rui Yan. Multi-Representation Fusion Network for Multi-turn Response Selection in Retrieval-based Chatbots. In WSDM'19. https://dl.acm.org/citation.cfm?id=3290985\n\n* Gu, Jia-Chen & Ling, Zhen-Hua & Liu, Quan. (2019). Interactive Matching Network for Multi-Turn Response Selection in Retrieval-Based Chatbots. https://arxiv.org/abs/1901.01824\n\n\n\nTF-IDF Ranker model :doc:`[docs] </features/models/tfidf_ranking>`\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nBased on `Reading Wikipedia to Answer Open-Domain Questions <https://github.com/facebookresearch/DrQA/>`__. 
The model solves the task of document retrieval for a given query.\n\n+---------------+-------------------------------------------------------------------+----------------------+-----------------+-----------+\n| Dataset       | Model                                                             |  Wiki dump           |  Recall@5       | Downloads |\n+===============+===================================================================+======================+=================+===========+\n| `SQuAD-v1.1`_ | :config:`doc_retrieval <doc_retrieval/en_ranker_tfidf_wiki.json>` |  enwiki (2018-02-11) |   75.6          | 33 GB     |\n+---------------+-------------------------------------------------------------------+----------------------+-----------------+-----------+\n\n\nQuestion Answering model :doc:`[docs] </features/models/SQuAD>`\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nModels in this section solve the task of looking for an answer to a\nquestion in a given context (`SQuAD <https://rajpurkar.github.io/SQuAD-explorer/>`__ task format).\nThere are two models for this task in DeepPavlov: BERT-based and R-Net. 
Both models predict answer start and end\nposition in a given context.\n\nBERT-based model is described in  `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding\n<https://arxiv.org/abs/1810.04805>`__.\n\nRuBERT-based model is described in  `Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language\n<https://arxiv.org/abs/1905.07213>`__.\n\n+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+\n|    Dataset     | Model config                                                                                                  | lang  |    EM (dev)    |    F-1 (dev)    |    Downloads    |\n+================+===============================================================================================================+=======+================+=================+=================+\n| `SQuAD-v1.1`_  | :config:`DeepPavlov BERT <squad/squad_bert.json>`                                                             |  en   |     81.49      |     88.86       |     1.2 Gb      |\n+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+\n| `SQuAD-v2.0`_  | :config:`DeepPavlov BERT <squad/qa_squad2_bert.json>`                                                         |  en   |     75.71      |     80.72       |     1.2 Gb      |\n+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+\n| `SDSJ Task B`_ | :config:`DeepPavlov RuBERT <squad/squad_ru_bert.json>`                                                        |  ru   |     66.21      |     84.71       |     1.7 Mb      
|\n+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+\n| `SDSJ Task B`_ | :config:`DeepPavlov RuBERT, trained with tfidf-retrieved negative samples <squad/qa_sberquad2_bert.json>`     |  ru   |     66.24      |     84.71       |     1.6 Gb      |\n+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+\n| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-tiny <squad/squad_ru_convers_distilrubert_2L.json>`                          |  ru   |  44.2 ± 0.46   |  65.1 ± 0.36    |     867Mb       |\n+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+\n| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-base <squad/squad_ru_convers_distilrubert_6L.json>`                          |  ru   |  61.23 ± 0.42  |  80.36 ± 0.28   |     1.18Gb      |\n+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+\n\nIn the case when the answer is not necessarily present in the given context, we have the :config:`qa_squad2_bert <squad/qa_squad2_bert.json>`\nmodel. This model outputs an empty string if there is no answer in the context.\n\n\n\nODQA :doc:`[docs] </features/models/ODQA>`\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nAn open domain question answering model. 
The model accepts free-form questions about the world and outputs an answer\nbased on its Wikipedia knowledge.\n\n\n+----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+\n| Dataset        | Model config                                                       |  Wiki dump            |   F1   | Downloads |\n+================+====================================================================+=======================+========+===========+\n| `SQuAD-v1.1`_  | :config:`ODQA <odqa/en_odqa_infer_wiki.json>`                      | enwiki (2018-02-11)   |  46.24 | 9.7Gb     |\n+----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+\n| `SDSJ Task B`_ | :config:`ODQA with RuBERT <odqa/ru_odqa_infer_wiki.json>`          | ruwiki (2018-04-01)   |  37.83 | 4.3Gb     |\n+----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+\n\n\nAutoML\n--------------------\n\nHyperparameters optimization :doc:`[docs] </features/hypersearch>`\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nHyperparameters optimization by cross-validation for DeepPavlov models\nthat requires only some small changes in a config file.\n\n\nEmbeddings\n----------\n\nPre-trained embeddings :doc:`[docs] </features/pretrained_vectors>`\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nWord vectors for the Russian language trained on joint `Russian Wikipedia <https://ru.wikipedia.org/>`__ and `Lenta.ru\n<https://lenta.ru/>`__ corpora.\n\n\nExamples of some models\n---------------------------\n\n-  Run insults detection model with console interface:\n\n   .. code-block:: bash\n\n      python -m deeppavlov interact insults_kaggle_bert -d\n\n-  Run insults detection model with REST API:\n\n   .. 
code-block:: bash\n\n      python -m deeppavlov riseapi insults_kaggle_bert -d\n\n-  Predict whether it is an insult on every line in a file:\n\n   .. code-block:: bash\n\n      python -m deeppavlov predict insults_kaggle_bert -d --batch-size 15 < /data/in.txt > /data/out.txt\n\n\n.. _`SQuAD-v1.1`: https://arxiv.org/abs/1606.05250\n.. _`SQuAD-v2.0`: https://arxiv.org/abs/1806.03822\n.. _`SDSJ Task B`: https://arxiv.org/abs/1912.09723\n"
  },
  {
    "path": "docs/features/pretrained_vectors.rst",
    "content": "Pre-trained embeddings\n======================\n\nBERT\n----\n\nWe are publishing several pre-trained BERT models:\n\n* RuBERT for Russian language\n* Slavic BERT for Bulgarian, Czech, Polish, and Russian\n* Conversational BERT for informal English\n* Conversational BERT for informal Russian\n* Sentence Multilingual BERT for encoding sentences in 101 languages\n* Sentence RuBERT for encoding sentences in Russian\n\nDescription of these models is available in the :doc:`BERT section </features/models/bert>` of the docs.\n\nLicense\n~~~~~~~\n\nThe pre-trained models are distributed under the `License Apache\n2.0 <https://www.apache.org/licenses/LICENSE-2.0>`__.\n\nDownloads\n~~~~~~~~~\n\nThe ``TensorFlow`` models can be run with the original `BERT repo <https://github.com/google-research/bert>`_ code\nwhile the ``PyTorch`` models can be run with the `HuggingFace's Transformers <https://github.com/huggingface/transformers>`__ library.\nThe download links are:\n\n+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+\n| Description                | Model parameters                      | Download links                                                                                                       |\n+============================+=======================================+======================================================================================================================+\n| RuBERT                     | vocab size = 120K, parameters = 180M, | `[pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__,           |\n|                            | size = 632MB                          | `[tensorflow] <http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_v2.tar.gz>`__            
|\n+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+\n| Slavic BERT                | vocab size = 120K, parameters = 180M, | `[pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__,      |\n|                            | size = 632MB                          | `[tensorflow] <http://files.deeppavlov.ai/deeppavlov_data/bert/bg_cs_pl_ru_cased_L-12_H-768_A-12_v1.tar.gz>`__       |\n+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+\n| Conversational BERT        | vocab size = 30K, parameters = 110M,  | `[pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__,   |\n|                            | size = 385MB                          | `[tensorflow] <http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12_v1.tar.gz>`__    |\n+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+\n| Conversational RuBERT      | vocab size = 120K, parameters = 180M, | `[pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__,|\n|                            | size = 630MB                          | `[tensorflow] <http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12.tar.gz>`__    |\n+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+\n| Sentence Multilingual BERT | vocab size = 120K, parameters = 180M, | `[pytorch] 
<http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__,   |\n|                            | size = 630MB                          | `[tensorflow] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12.tar.gz>`__       |\n+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+\n| Sentence RuBERT            | vocab size = 120K, parameters = 180M, | `[pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_ru_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__,      |\n|                            | size = 630MB                          | `[tensorflow] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_ru_cased_L-12_H-768_A-12.tar.gz>`__          |\n+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+\n\n\nELMo\n----\n\nELMo can be used via Python code as follows:\n\n.. code:: python\n\n   import tensorflow as tf\n   import tensorflow_hub as hub\n   elmo = hub.Module(\"http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz\", trainable=True)\n   sess = tf.Session()\n   sess.run(tf.global_variables_initializer())\n   embeddings = elmo([\"это предложение\", \"word\"], signature=\"default\", as_dict=True)[\"elmo\"]\n   sess.run(embeddings)\n\n\nThe TensorFlow Hub module also supports tokenized sentences in the following format.\n\n.. 
code:: python\n\n   tokens_input = [[\"мама\", \"мыла\", \"раму\"], [\"рама\", \"\", \"\"]]\n   tokens_length = [3, 1]\n   embeddings = elmo(inputs={\"tokens\": tokens_input,\"sequence_len\": tokens_length},signature=\"tokens\",as_dict=True)[\"elmo\"]\n   sess.run(embeddings)\n\n\nDownloads\n~~~~~~~~~\n\nThe models can be downloaded and run by tensorflow hub module from:\n\n\n+--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| Description                                                        | Dataset parameters                          | Perplexity       | Tensorflow hub module                                                                                                                                                                                                                 |\n+====================================================================+=============================================+==================+=======================================================================================================================================================================================================================================+\n| ELMo on  `Russian Wikipedia <https://ru.wikipedia.org/>`__         | lines = 1M, tokens = 386M, size = 5GB       | 43.692           | `module_spec <http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-wiki_600k_steps.tar.gz>`__                                                                                                                                           
|\n+--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ELMo on  `Russian WMT News <http://www.statmt.org/>`__             | lines = 63M, tokens = 946M, size = 12GB     | 49.876           | `module_spec <http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz>`__                                                                                                                                  |\n+--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ELMo on  `Russian Twitter <https://twitter.com/>`__                | lines = 104M, tokens = 810M, size = 8.5GB   | 94.145           | `module_spec <http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz>`__                                                                                                                        |\n+--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n\n\nfastText\n--------\n\nWe are publishing pre-trained word vectors for Russian language.\nSeveral models were trained on joint `Russian\nWikipedia 
<https://ru.wikipedia.org/>`__\nand `Lenta.ru <https://lenta.ru/>`__ corpora.\nWe also introduce one model for Russian conversational language that\nwas trained on `Russian Twitter <https://twitter.com/>`__ corpus.\n\nAll vectors are 300-dimensional. We used fastText skip-gram (see\n`Bojanowski et al. (2016) <https://arxiv.org/abs/1607.04606>`__) for\nvectors training as well as various preprocessing options (see below).\n\nYou can get vectors either in binary or in text (vec) formats for FastText.\n\nLicense\n~~~~~~~\n\nThe pre-trained word vectors are distributed under the `License Apache\n2.0 <https://www.apache.org/licenses/LICENSE-2.0>`__.\n\nDownloads\n~~~~~~~~~\n\nThe pre-trained **fastText skipgram** models can be downloaded from:\n\n+-----------------------+---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| Domain                | Preprocessing                                           | Vectors                                                                                                                                                                                                                                                                                                                            |\n+=======================+=========================================================+====================================================================================================================================================================================================================================================================================================================================+\n| Wiki+Lenta      
      | tokenize (nltk word\\_tokenize), lemmatize (pymorphy2)   | `bin <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lemmatize/ft_native_300_ru_wiki_lenta_lemmatize.bin>`__, `vec <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lemmatize/ft_native_300_ru_wiki_lenta_lemmatize.vec>`__                                                                   |\n+                       +---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n|                       | tokenize (nltk word\\_tokenize), lowercasing             | `bin <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lower_case/ft_native_300_ru_wiki_lenta_lower_case.bin>`__, `vec <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lower_case/ft_native_300_ru_wiki_lenta_lower_case.vec>`__                                                               |\n+                       +---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n|                       | tokenize (nltk wordpunсt\\_tokenize)                     | `bin <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin>`__, `vec 
<http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec>`__           |\n+                       +---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n|                       | tokenize (nltk word\\_tokenize)                          | `bin <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_word_tokenize/ft_native_300_ru_wiki_lenta_nltk_word_tokenize.bin>`__, `vec <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_word_tokenize/ft_native_300_ru_wiki_lenta_nltk_word_tokenize.vec>`__                               |\n+                       +---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n|                       | tokenize (nltk word\\_tokenize), remove stopwords        | `bin <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_remstopwords/ft_native_300_ru_wiki_lenta_remstopwords.bin>`__, `vec <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_remstopwords/ft_native_300_ru_wiki_lenta_remstopwords.vec>`__                                                       
|\n+-----------------------+---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| Twitter               | tokenize (nltk word\\_tokenize)                          | `bin <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin>`__, `vec <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.vec>`__                                                                                                                                   |\n+-----------------------+---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n\nWord vectors training parameters\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nThese word vectors were trained with following parameters ([...] is for\ndefault value):\n\nfastText (skipgram)\n                   \n\n-  lr [0.1]\n-  lrUpdateRate [100]\n-  dim 300\n-  ws [5]\n-  epoch [5]\n-  neg [5]\n-  loss [softmax]\n-  pretrainedVectors []\n-  saveOutput [0]\n\n"
  },
  {
    "path": "docs/index.rst",
    "content": "Welcome to DeepPavlov's documentation!\n======================================\n\n.. toctree::\n   :glob:\n   :maxdepth: 1\n\n   Installation <intro/installation>\n   QuickStart <intro/quick_start>\n   General concepts <intro/overview>\n   Configuration file <intro/configuration>\n   Python pipelines <intro/python.ipynb>\n   Models overview <features/overview>\n\n\n.. toctree::\n   :glob:\n   :maxdepth: 2\n   :caption: Features\n\n   Pre-trained embeddings <features/pretrained_vectors>\n   AutoML <features/hypersearch>\n\n\n.. toctree::\n   :glob:\n   :maxdepth: 1\n   :caption: Models\n\n   Multitask BERT <features/models/multitask_bert>\n   Context Question Answering <features/models/SQuAD.ipynb>\n   Classification <features/models/classification.ipynb>\n   Few-shot Classification <features/models/few_shot_classification>\n   Named Entity Recognition <features/models/NER.ipynb>\n   Entity Extraction <features/models/entity_extraction.ipynb>\n   BERT-based models <features/models/bert>\n   Morphological Tagging <features/models/morpho_tagger.ipynb>\n   Neural Ranking <features/models/neural_ranking.ipynb>\n   Spelling Correction <features/models/spelling_correction.ipynb>\n   Syntactic Parsing <features/models/syntax_parser.ipynb>\n   TF-IDF Ranking <features/models/tfidf_ranking.ipynb>\n   Popularity Ranking <features/models/popularity_ranking.ipynb>\n   Knowledge Base Question answering <features/models/KBQA.ipynb>\n   Relation Extraction <features/models/relation_extraction.ipynb>\n   SuperGLUE Submission <features/models/superglue>\n   Open-Domain Question Answering <features/models/ODQA.ipynb>\n\n\n.. toctree::\n   :glob:\n   :maxdepth: 3\n   :caption: Integrations\n\n   REST API <integrations/rest_api>\n   Socket API <integrations/socket_api>\n   Amazon AWS deployment <integrations/aws_ec2>\n   DeepPavlov settings <integrations/settings>\n\n\n.. 
toctree::\n   :glob:\n   :maxdepth: 3\n   :caption: Developer Guides\n\n   Contribution guide <devguides/contribution_guide>\n   Register your model <devguides/registry>\n\n\n.. toctree::\n   :glob:\n   :maxdepth: 3\n   :caption: Internships\n\n   Internships <internships/internships>\n\n\n.. toctree::\n   :glob:\n   :maxdepth: 3\n   :caption: Package Reference\n\n   apiref/*\n\n\nIndices and tables\n==================\n\n* :ref:`genindex`\n* :ref:`modindex`\n"
  },
  {
    "path": "docs/integrations/aws_ec2.rst",
    "content": "Amazon AWS deployment\n=====================\n\nHere is a manual for deploying DeepPavlov (with ODQA as an example) on Amazon Web Services using an EC2 virtual machine.\n\nDeployment process consists of two main stages:\n\n1. AWS EC2 machine launch\n2. DeepPavlov ODQA deployment\n\n1. AWS EC2 machine launch\n-------------------------\n\n1.  Login to your AWS console and proceed to the EC2 services dashboard.\n\n.. image:: ../_static/aws_ec2/01_login_to_aws.png\n   :width: 800\n\n2.  Choose Ubuntu Server 18.04 LTS 64-bit x86 machine.\n\n.. image:: ../_static/aws_ec2/02_choose_ubuntu.png\n   :width: 800\n\n3.  You should select appropriate instance type because of high memory consumption by ODQA.\n    32 GiB memory is a minimum. Then press *\"Next: ...\"*\n\n.. image:: ../_static/aws_ec2/03_select_instance_type.png\n   :width: 800\n\n4.  Proceed to Step 4. Your instance storage size should be no less than 50 GiB to\n    store ODQA models.\n\n.. image:: ../_static/aws_ec2/04_add_storage.png\n   :width: 800\n\n5.  Proceed to Step 7. Check your instance parameters and press *\"Launch\"* button.\n    You will be prompted to create and save security key pair for further access to your instance.\n\n.. image:: ../_static/aws_ec2/05_review_instance.png\n   :width: 800\n\n6.  Return to your EC2 services dashboard and navigate to your running instances list.\n\n.. image:: ../_static/aws_ec2/06_go_to_running_instances.png\n   :width: 800\n\n7.  Wait until instance initialization finishes (instance status becomes *\"running\"*).\n\n.. image:: ../_static/aws_ec2/07_wait_init.png\n   :width: 800\n\n8.  To make DeepPavlov ODQA model REST API accessible from the Internet you should set\n    corresponding inbound security rules:\n\n    8.1 Navigate to your instance security group dashboard\n    (in this example security group has name *\"launch-wizard-2\"*).\n\n    .. 
image:: ../_static/aws_ec2/08_01_set_sec_group.png\n       :width: 800\n\n    8.2 Select *\"Inbound\"* rules tab, click *\"Edit\"*, then click *\"Add Rule\"*.\n    For your new rule select *\"Custom TCP Rule\"* type, *\"Anywhere\"* source and input\n    port for your ODQA API. Click *\"Save\"*.\n\n    .. image:: ../_static/aws_ec2/08_02_set_inbound.png\n       :width: 800\n\n9.  Connecting to your instance by SSH:\n\n    9.1 Navigate to your instance dashboard, right-click your instance, select *\"Connect\"*.\n\n    .. image:: ../_static/aws_ec2/09_01_select_connect.png\n       :width: 800\n\n    You will be redirected to connection instructions screen for your dashboard.\n    Follow instructions for standalone SSH client. SSH connection bash command example will\n    already contain valid user and host name. To connect to your Amazon instance just run\n    the example with valid path to your saved key pair (instead of *\"dp_key_pair.pem\"*\n    in this example).\n\n    .. image:: ../_static/aws_ec2/09_02_connection_info.png\n       :width: 800\n\n2. DeepPavlov ODQA deployment\n-----------------------------\n\n1.  Login to your AWS EC2 instance.\n\n2.  For now DeepPavlov requires Python 3.6 to run. Below are instructions for DeepPavlov ODQA\n    deployment under Ubuntu 18.04 (which has pre-installed Python 3.6) and virtualenv.\n\n3.  Install pip3:\n\n    ``sudo apt update``\n\n    ``sudo apt install python3-pip``\n\n4.  Install virtualenv:\n\n    ``sudo pip3 install virtualenv``\n\n5.  Create and activate Python 3.6 virtual environment:\n\n    ``virtualenv env -p python3.6``\n\n    ``source env/bin/activate``\n\n6.  Install DeepPavlov:\n\n    ``pip install deeppavlov``\n\n7.  Install ODQA dependencies:\n\n    ``python -m deeppavlov install en_odqa_infer_wiki``\n\n8.  Download ODQA models (it will take quite some time):\n\n    ``python -m deeppavlov download en_odqa_infer_wiki``\n\n9.  
Run ODQA REST API service, where <port> is the port you defined in TCP\n    inbound rules for your AWS instance:\n\n    ``python -m deeppavlov riseapi en_odqa_infer_wiki -p <port>``\n\n3. Accessing your ODQA API\n--------------------------\n\n1.  Get your AWS instance public DNS from the instance dashboard.\n\n2.  Get full info about your ODQA API from its Swagger by navigating to\n    the following URL in your browser:\n\n    ``http://<your_aws_instance_public_dns>:<your_odqa_service_port>``\n"
  },
  {
    "path": "docs/integrations/rest_api.rst",
    "content": "REST API\n========\n\nEach DeepPavlov model can be easily made available for\ninference as a REST web service. The general method is:\n\n.. code:: bash\n\n    python -m deeppavlov riseapi <config_path> [-d] [-p <port>] [--https] [--key <SSL key file path>] \\\n    [--cert <SSL certificate file path>]\n\n\n* ``-d``: downloads model specific data before starting the service.\n* ``-p <port>``: sets the port to ``<port>``. Overrides default\n  value from ``deeppavlov/utils/settings/server_config.json``.\n* ``--https``: use https instead of http. Overrides default\n  value from ``deeppavlov/utils/settings/server_config.json``.\n* ``--key <SSL key file path>``: path to SSL key file. Overrides default\n  value from ``deeppavlov/utils/settings/server_config.json``.\n* ``--cert <SSL certificate file path>``: path to SSL certificate file. Overrides default\n  value from ``deeppavlov/utils/settings/server_config.json``.\n\nThe command will print the used host and port. Default web service properties\n(host, port, POST request arguments) can be modified via changing\n``deeppavlov/utils/settings/server_config.json`` file.\n\n.. warning::\n\n    Starting from the 1.0.0rc2 model response format in riseapi mode matches :class:`~deeppavlov.core.common.chainer.Chainer`\n    response format. To start model with the old format, give the ``COMPATIBILITY_MODE`` environment variable any\n    non-empty value (e.g. ``COMPATIBILITY_MODE=true python -m deeppavlov riseapi ...``).\n    ``COMPATIBILITY_MODE`` will be removed in DeepPavlov 1.2.0.\n\nAPI routes\n----------\n\n/model\n\"\"\"\"\"\"\nSend POST request to ``<host>:<port>/model`` to infer model. See details at\n:ref:`rest_api_docs`.\n\n/probe\n\"\"\"\"\"\"\nSend POST request to ``<host>:<port>/probe`` to check if API is working. The\nserver will send a response ``[\"Test passed\"]`` if it is working. 
Requests to\n``/probe`` are not logged.\n\n/api\n\"\"\"\"\nTo get model argument and response names send GET request to ``<host>:<port>/api``. Server\nwill return dict with model input and output names.\n\n.. _rest_api_docs:\n\n/docs\n\"\"\"\"\"\nTo interact with the REST API via graphical interface open\n``<host>:<port>/docs`` in a browser (Swagger UI).\n\n/metrics\n\"\"\"\"\"\"\"\"\nEndpoint to monitor a running service using Prometheus. Metrics:\n\n* ``http_requests_count``: Counter, tracks number of processed requests. Labels: ``endpoint``, ``status_code``.\n* ``http_requests_latency_seconds``: Histogram, tracks responses latency (only with 200 status code). Labels:\n  ``endpoint``.\n* ``http_requests_in_progress``: Gauge, tracks inprogress requests. Labels: ``endpoint``.\n\nAdvanced configuration\n----------------------\n\nBy modifying ``deeppavlov/utils/settings/server_config.json`` you can change\nhost, port, POST request arguments and other properties of the API service.\n\nProperties from ``common_defaults`` section are used by default unless\nthey are overridden by model-specific properties, provided in ``model_defaults``\nsection of the ``server_config.json``. Model-specific properties are bound\nto the model by ``server_utils`` label in ``metadata`` section of the model\nconfig. Value of ``server_utils`` label from model config should match with\nproperties key from ``model_defaults`` section of ``server_config.json``.\n\nFor example, adding ``metadata/server_utils`` key to ``kbqa/kbqa_cq.json``\nwith value *KBQA* will initiate the search of *KBQA* tag\nat ``model_defaults`` section of ``server_config.json``. Therefore, if this\nsection is present, all parameters with non empty (i.e. not ``\"\"``,\nnot ``[]`` etc.) 
values stored by this tag will overwrite the parameter values\nin ``common_defaults``.\n\nIf ``model_args_names`` parameter of ``server_config.json`` is empty string,\nthen model argument names are provided as list from ``chainer/in`` section of\nthe model config file, where arguments order corresponds to model API.\nWhen inferencing model via REST api, JSON payload keys should match\nmodel arguments names from ``chainer/in`` section.\nIf ``model_args_names`` parameter of ``server_config.json`` is list, its values\nare used as model argument names instead of the list from model config's\n``chainer/in`` section.\nHere are POST request payload examples for some of the library models:\n\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| Model                                   | POST request JSON payload example                                                                                                                   |\n+=========================================+=====================================================================================================================================================+\n| **One argument models**                                                                                                                                                                       |\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| NER model                               | {\"x\":[\"Elon Musk launched his cherry Tesla roadster to the Mars orbit\"]}                                                                            
|\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| Intent classification model             | {\"x\":[\"I would like to go to a restaurant with Asian cuisine this evening\"]}                                                                        |\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| Automatic spelling correction model     | {\"x\":[\"errror\"]}                                                                                                                                    |\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| Ranking model                           | {\"x\":[\"What is the average cost of life insurance services?\"]}                                                                                      |\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| Goal-oriented bot                       | {\"x\":[\"Hello, can you help me to find and book a restaurant this evening?\"]}                                                                        |\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| **Multiple arguments models**                                                                                                                                                                 
|\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| Question Answering model                | | {\"context_raw\":[\"After 1765, growing philosophical and political differences strained the relationship between Great Britain and its colonies.\"], |\n|                                         | |  \"question_raw\":[\"What strained the relationship between Great Britain and its colonies?\"]}                                                       |\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n\nREST API Usage Example\n======================\n\nTo start server with ``squad_bert`` model run:\n\n.. code:: bash\n\n    python -m deeppavlov riseapi squad_bert -id\n\nTo get response from this model on another terminal run:\n\n.. code:: bash\n\n    curl -X POST http://0.0.0.0:5000/model -H 'Content-Type: application/json' -d '{\n        \"context_raw\": [\n            \"All work and no play makes Jack a dull boy.\",\n            \"I used to be an adventurer like you, then I took an arrow in the knee.\"\n        ],\n        \"question_raw\": [\n            \"What makes Jack a dull boy?\",\n            \"Who I used to be?\"\n        ]\n    }'\n"
  },
  {
    "path": "docs/integrations/settings.rst",
    "content": "DeepPavlov settings\n===================\n\nDeepPavlov provides some tools to facilitate its usage (e.g. dialog logging, settings management). This document aims to guide you through them.\n\n1. Settings files access and management\n---------------------------------------\n\nMost of DeepPavlov settings are located in settings files, which in turn are located in a settings folder. Default settings folder location is ``deeppavlov/utils/settings``.\n\nYou can override a settings directory path by setting the ``DP_SETTINGS_PATH`` environment variable. Missing files will be added automatically when running any deeppavlov script.\n\nYou can get current full path to settings directory with ``python -m deeppavlov.settings``.\nTo reset settings in the current settings directory one can use ``python -m deeppavlov.settings -d``.\n\n2. Dialog logging\n-----------------\n\nDeepPavlov supports logging of inferred utterances and DeepPavlov model responses. You can manage dialog logging by\nediting ``dialog_logger_config.json`` file in a settings directory.\n\nThe following dialog logging settings are available:\n\n1. **enabled** (default: ``false``): turns on/off dialog logging for DeepPavlov instance;\n2. **log_path** (default: ``~/.deeppavlov/dialog_logs``): sets directory where dialog logs are stored;\n3. **logger_name** (default: ``default``): sets subdirectory name for storing dialog logs;\n4. **logfile_max_size_kb** (default: ``10240``): sets logfile maximum size in kilobytes. If exceeded, new log file is created;\n5. **ensure_ascii** (default: ``false``): If ``true``, converts all non-ASCII symbols in logged content to Unicode code points.\n\n3. Environment variables\n------------------------\n\n- **DP_SETTINGS_PATH** — custom path to a directory that contains settings files. 
It's automatically populated with missing files when running any deeppavlov script.\n- **DP_SKIP_NLTK_DOWNLOAD** — set to ``TRUE`` to prevent automatic downloading of **nltk** packages (``punkt``, ``stopwords``, ``perluniprops``, ``nonbreaking_prefixes``).\n"
  },
  {
    "path": "docs/integrations/socket_api.rst",
    "content": "Socket API\n==========\n\nEach DeepPavlov model can be made available as a socket server. The general\nmethod is:\n\n.. code:: bash\n\n    python -m deeppavlov risesocket <config_path> [-d] [--socket-type <address_family>] [-p <port>] \\\n    [--socket-file <unix_socket_file>]\n\n\n* ``-d``: downloads model specific data before starting the service.\n* ``--socket-type <address_family>``: sets socket address family to ``AF_INET``\n  if ``<address_family>`` is ``TCP`` or to ``AF_UNIX`` if ``<address_family>``\n  is ``UNIX``. Overrides default value from\n  ``deeppavlov/utils/settings/server_config.json``.\n* ``-p <port>``: sets the port to ``<port>`` if socket address family is\n  ``AF_INET``. Overrides default value from\n  ``deeppavlov/utils/settings/server_config.json``.\n* ``--socket-file <unix_socket_file>``: sets the file for socket binding to\n  ``<unix_socket_file>`` if socket address family is ``AF_UNIX``. Overrides\n  default value from ``deeppavlov/utils/settings/server_config.json``.\n\nThe command will print the binding address: host and port for ``AF_INET``\nsocket family and path to the UNIX socket file for ``AF_UNIX`` socket family.\nDefault service properties (socket address family, host, port, path to the UNIX\nsocket file, socket buffer size, binding message) can be modified via changing\n``deeppavlov/utils/settings/server_config.json`` file.\n\nAdvanced configuration\n~~~~~~~~~~~~~~~~~~~~~~\n\nBy modifying ``deeppavlov/utils/settings/server_config.json`` you can change\nsocket address family, host, port, path to the UNIX socket file and other\nproperties of the API service.\n\nProperties from ``common_defaults`` section are used by default unless\nthey are overridden by model-specific properties, provided in ``model_defaults``\nsection of the ``server_config.json``. Model-specific properties are bound\nto the model by ``server_utils`` label in ``metadata`` section of the model\nconfig. 
Value of ``server_utils`` label from model config should match with\nproperties key from ``model_defaults`` section of ``server_config.json``.\n\nFor example, adding ``metadata/server_utils`` key to ``kbqa/kbqa_cq.json``\nwith value *KBQA* will initiate the search of *KBQA* tag\nat ``model_defaults`` section of ``server_config.json``. Therefore, if this\nsection is present, all parameters with non empty (i.e. not ``\"\"``,\nnot ``[]`` etc.) values stored by this tag will overwrite the parameter values\nin ``common_defaults``.\n\nIf ``model_args_names`` parameter of ``server_config.json`` is empty string,\nthen model argument names are provided as list from ``chainer/in`` section of\nthe model config file, where arguments order corresponds to model API.\nWhen inferencing model via socket API, serialized JSON payload keys should match\nmodel arguments names from ``chainer/in`` section.\nIf ``model_args_names`` parameter of ``server_config.json`` is list, its values\nare used as model argument names instead of the list from model config's\n``chainer/in`` section.\n\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| Model                                   | POST request JSON payload example                                                                                                                   |\n+=========================================+=====================================================================================================================================================+\n| **One argument models**                                                                                                                                                                       
|\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| NER model                               | {\"x\":[\"Elon Musk launched his cherry Tesla roadster to the Mars orbit\"]}                                                                            |\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| Intent classification model             | {\"x\":[\"I would like to go to a restaurant with Asian cuisine this evening\"]}                                                                        |\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| Automatic spelling correction model     | {\"x\":[\"errror\"]}                                                                                                                                    |\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| Ranking model                           | {\"x\":[\"What is the average cost of life insurance services?\"]}                                                                                      |\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| Goal-oriented bot                       | {\"x\":[\"Hello, can you help me to find and book a restaurant this evening?\"]}                                                                        
|\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| **Multiple arguments models**                                                                                                                                                                 |\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n| Question Answering model                | | {\"context_raw\":[\"After 1765, growing philosophical and political differences strained the relationship between Great Britain and its colonies.\"], |\n|                                         | |  \"question_raw\":[\"What strained the relationship between Great Britain and its colonies?\"]}                                                       |\n+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+\n\nSocket client example (Python)\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nSocket client for :doc:`SQuAD </features/models/SQuAD>` model with a batch of\ntwo elements:\n\n.. 
code-block:: python\n\n    # squad-client.py\n\n    import json\n    import socket\n    from struct import unpack\n\n    from deeppavlov.utils.socket import encode\n\n    socket_payload = {\n        \"context_raw\": [\n            \"All work and no play makes Jack a dull boy\",\n            \"I used to be an adventurer like you, then I took an arrow in the knee\"\n        ],\n        \"question_raw\": [\n            \"What makes Jack a dull boy?\",\n            \"Who I used to be?\"\n        ]\n    }\n    serialized_socket_payload = encode(socket_payload)\n\n    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:\n        s.connect(('0.0.0.0', 5000))\n        s.sendall(serialized_socket_payload)\n        header = s.recv(4)\n        body_len = unpack('<I', header)[0]\n        serialized_response = s.recv(body_len)\n        json_payload = json.loads(serialized_response)\n\n    print(json_payload)\n\nTo start socket server with ``squad_bert`` model run:\n\n.. code:: bash\n\n    python -m deeppavlov risesocket -d squad_bert --socket-type TCP -p 5000\n\n\nTo start socket client on another terminal run:\n\n.. code:: bash\n\n    python squad-client.py\n"
  },
  {
    "path": "docs/internships/internships.rst",
    "content": "\nInternships\n===========\n\nDo you have ideas on how to improve dialog systems for everyone? Are you ready to make an impact across the world?\nGreat, then join us!\n\nLet’s shape the future of Conversational AI together. An internship is for aspiring graduate and undergraduate students\nwho are passionate about Conversational AI technology and offer diverse perspectives.\n\nAs an intern, you will work on some of the most ambitious technical problems, develop new ML solutions that will impact\nfuture DeepPavlov products and make the lives of DeepPavlov users easier.\n\nAll interns are paired with a mentor and will participate directly in DeepPavlov's groundbreaking work.\nThere are no restrictions on publications based on internships. International candidates are welcome to apply.\n\nEach of our research teams has specific test assignments for interested candidates, so please familiarize yourself\nwith our `projects <https://deeppavlov.ai/research>`_ that best match your skills and interests.\n\n`Apply now at our website <https://deeppavlov.ai/internships#application>`_.\n"
  },
  {
    "path": "docs/intro/configuration.rst",
    "content": "Configuration file\n==================\n\nAn NLP pipeline config is a JSON file that contains one required element ``chainer``:\n\n.. code:: python\n\n    {\n      \"chainer\": {\n        \"in\": [\"x\"],\n        \"in_y\": [\"y\"],\n        \"pipe\": [\n          ...\n        ],\n        \"out\": [\"y_predicted\"]\n      }\n    }\n\n:class:`~deeppavlov.core.common.chainer.Chainer` is a core concept of DeepPavlov library: chainer builds a pipeline from\nheterogeneous components (Rule-Based/ML/DL) and allows to train or infer from pipeline as a whole. Each component in the\npipeline specifies its inputs and outputs as arrays of names, for example: ``\"in\": [\"tokens\", \"features\"]`` and\n``\"out\": [\"token_embeddings\", \"features_embeddings\"]`` and you can chain outputs of one components with inputs of other\ncomponents:\n\n.. code:: python\n\n    {\n      \"class_name\": \"deeppavlov.models.preprocessors.str_lower:str_lower\",\n      \"in\": [\"x\"],\n      \"out\": [\"x_lower\"]\n    },\n    {\n      \"class_name\": \"nltk_tokenizer\",\n      \"in\": [\"x_lower\"],\n      \"out\": [\"x_tokens\"]\n    },\n\nPipeline elements could be child classes of :class:`~deeppavlov.core.models.component.Component` or functions.\n\nEach :class:`~deeppavlov.core.models.component.Component` in the pipeline must implement method :meth:`__call__` and has\n``class_name`` parameter, which is its registered codename, or full name of any python class in the form of\n``\"module_name:ClassName\"``. It can also have any other parameters which repeat its :meth:`__init__` method arguments.\nDefault values of :meth:`__init__` arguments will be overridden with the config values during the initialization of a\nclass instance.\n\nYou can reuse components in the pipeline to process different parts of data with the help of ``id`` and ``ref``\nparameters:\n\n.. 
code:: python\n\n    {\n      \"class_name\": \"nltk_tokenizer\",\n      \"id\": \"tokenizer\",\n      \"in\": [\"x_lower\"],\n      \"out\": [\"x_tokens\"]\n    },\n    {\n      \"ref\": \"tokenizer\",\n      \"in\": [\"y\"],\n      \"out\": [\"y_tokens\"]\n    },\n\n\nNested configuration files\n--------------------------\n\nAny configuration file could be used inside another configuration file as an element of the\n:class:`~deeppavlov.core.common.chainer.Chainer` or as a field of another component using ``config_path`` key.\nAny field of the nested configuration file could be overwritten using ``overwrite`` field:\n\n.. code::\n\n    \"chainer\": {\n      \"pipe\": [\n        ...\n        {\n          \"class_name\": \"ner_chunk_model\",\n          \"ner\": {\n            \"config_path\": \"{CONFIGS_PATH}/ner/ner_ontonotes_bert.json\",\n            \"overwrite\": {\n              \"chainer.out\": [\"x_tokens\", \"tokens_offsets\", \"y_pred\", \"probas\"]\n            }\n          },\n          ...\n        }\n      ]\n    }\n\nIn this example ``ner_ontonotes_bert.json`` is used as ``ner`` argument value in ``ner_chunk_model`` component.\n``chainer.out`` value is overwritten with new list. Overwritten fields names are defined using dot notation. In this\nnotation numeric fields are treated as indexes of lists. For example, to change ``class_name`` value of the second\nelement of the pipe to ``ner_chunker`` (1 is the index of the second element), use\n``\"chainer.pipe.1.class_name\": \"ner_chunker\"`` key-value pair.\n\n\nVariables\n---------\n\nAs of *version 0.1.0* every string value in a configuration file is interpreted\nas a `format string <https://docs.python.org/3.6/library/string.html#formatstrings>`__ where fields are evaluated\nfrom ``metadata.variables`` element:\n\n.. 
code:: python\n\n    {\n      \"chainer\": {\n        \"in\": [\"x\"],\n        \"pipe\": [\n          {\n            \"class_name\": \"my_component\",\n            \"in\": [\"x\"],\n            \"out\": [\"x\"],\n            \"load_path\": \"{MY_PATH}/file.obj\"\n          },\n          {\n            \"in\": [\"x\"],\n            \"out\": [\"y_predicted\"],\n            \"config_path\": \"{CONFIGS_PATH}/classifiers/insults_kaggle_bert.json\"\n          }\n        ],\n        \"out\": [\"y_predicted\"]\n      },\n      \"metadata\": {\n        \"variables\": {\n          \"MY_PATH\": \"/some/path\",\n          \"CONFIGS_PATH\": \"{DEEPPAVLOV_PATH}/configs\"\n        }\n      }\n    }\n\nVariable ``DEEPPAVLOV_PATH`` is always preset to be a path to the ``deeppavlov`` python module.\n\nOne can override configuration variables using environment variables with prefix ``DP_``. So environment variable\n``DP_VARIABLE_NAME`` will override ``VARIABLE_NAME`` inside a configuration file.\n\nFor example, adding ``DP_ROOT_PATH=/my_path/to/large_hard_drive`` will make most configs use this path for downloading and reading  embeddings/models/datasets.\n\nTraining\n--------\n\nThere are two abstract classes for trainable components: :class:`~deeppavlov.core.models.estimator.Estimator`\nand :class:`~deeppavlov.core.models.nn_model.NNModel`.\n\n:class:`~deeppavlov.core.models.estimator.Estimator` are fit once on any data with no batching or early stopping,\nso it can be safely done at the time of pipeline initialization. :meth:`fit` method has to be implemented for each\n:class:`~deeppavlov.core.models.estimator.Estimator`. One example is :class:`~deeppavlov.core.data.vocab.Vocab`.\n\n:class:`~deeppavlov.core.models.nn_model.NNModel` requires more complex training. It can only be trained in a supervised\nmode (as opposed to :class:`~deeppavlov.core.models.estimator.Estimator` which can be trained in both supervised and\nunsupervised settings). 
This process takes multiple epochs with periodic validation and logging.\n:meth:`~deeppavlov.core.models.nn_model.NNModel.train_on_batch` method has to be implemented for each\n:class:`~deeppavlov.core.models.nn_model.NNModel`.\n\nTraining is triggered by :func:`~deeppavlov.train_model` function.\n\n\nTrain config\n~~~~~~~~~~~~\n\n:class:`~deeppavlov.core.models.estimator.Estimator` s that are trained should also have ``fit_on`` parameter which\ncontains a list of input parameter names. An :class:`~deeppavlov.core.models.nn_model.NNModel` should have the ``in_y``\nparameter which contains a list of ground truth answer names. For example:\n\n.. code:: python\n\n    [\n      {\n        \"id\": \"classes_vocab\",\n        \"class_name\": \"default_vocab\",\n        \"fit_on\": [\"y\"],\n        \"level\": \"token\",\n        \"save_path\": \"vocabs/classes.dict\",\n        \"load_path\": \"vocabs/classes.dict\"\n      },\n      {\n        \"in\": [\"x\"],\n        \"in_y\": [\"y\"],\n        \"out\": [\"y_predicted\"],\n        \"class_name\": \"intent_model\",\n        \"save_path\": \"classifiers/intent_cnn\",\n        \"load_path\": \"classifiers/intent_cnn\",\n        \"classes_vocab\": {\n          \"ref\": \"classes_vocab\"\n        }\n      }\n    ]\n\nThe config for training the pipeline should have three additional elements: ``dataset_reader``, ``dataset_iterator``\nand ``train``:\n\n.. code:: python\n\n    {\n      \"dataset_reader\": {\n        \"class_name\": ...,\n        ...\n      },\n      \"dataset_iterator\": {\n        \"class_name\": ...,\n        ...\n      },\n      \"chainer\": {\n        ...\n      },\n      \"train\": {\n        ...\n      }\n    }\n\n\nSimplified version of training pipeline contains two elements: ``dataset`` and ``train``. 
The ``dataset`` element\ncurrently can be used for training from classification data in ``csv`` and ``json`` formats.\n\n\nTrain Parameters\n~~~~~~~~~~~~~~~~\n\n``train`` element can contain a ``class_name`` parameter that references a trainer class (default value is\n:class:`torch_trainer <deeppavlov.core.trainers.torch_trainer.TorchTrainer>`).\nAll other parameters will be passed as keyword arguments to the trainer class's constructor.\n\n\nMetrics\n_______\n\n.. code:: python\n\n    \"train\": {\n      \"class_name\": \"torch_trainer\",\n      \"metrics\": [\n        \"f1\",\n        {\n          \"name\": \"accuracy\",\n          \"inputs\": [\"y\", \"y_labels\"]\n        },\n        {\n          \"name\": \"sklearn.metrics:accuracy_score\",\n          \"alias\": \"unnormalized_accuracy\",\n          \"inputs\": [\"y\", \"y_labels\"],\n          \"normalize\": false\n        }\n      ],\n      ...\n    }\n\nThe first metric in the list is used for early stopping.\n\nEach metric can be described as a JSON object with ``name``, ``alias`` and ``inputs`` properties, where:\n\n  - ``name`` is either a registered name of a metric function or ``module.submodules:function_name``.\n  - ``alias`` is a metric name. Default value is ``name`` value.\n  - ``inputs`` is a list of parameter names from chainer's inner memory that will be passed to the metric function.\n    Default value is a concatenation of chainer's ``in_y`` and ``out`` parameters.\n\nAll other arguments are interpreted as kwargs when the metric is called.\nIf a metric is given as a string, this string is interpreted as a metric name, i.e. ``\"f1\"`` in the example\nabove is equivalent to ``{\"name\": \"f1\"}``.\n\n\nDatasetReader\n~~~~~~~~~~~~~\n\n:class:`~deeppavlov.core.data.dataset_reader.DatasetReader` class reads data and returns it in a specified format.\nA concrete :class:`DatasetReader` class should be inherited from this base class and registered with a codename:\n\n\n.. 
code:: python\n\n    from deeppavlov.core.common.registry import register\n    from deeppavlov.core.data.dataset_reader import DatasetReader\n\n    @register('conll2003_reader')\n    class Conll2003DatasetReader(DatasetReader):\n\n\nDataLearningIterator and DataFittingIterator\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n:class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` forms the sets of data ('train', 'valid',\n'test') needed for training/inference and divides them into batches. A concrete :class:`DataLearningIterator` class\nshould be registered and can be inherited from :class:`deeppavlov.core.data.data_learning_iterator.DataLearningIterator`\nclass. This is a base class and can be used as a :class:`DataLearningIterator` as well.\n\n:class:`~deeppavlov.core.data.data_fitting_iterator.DataFittingIterator` iterates over provided dataset without\ntrain/valid/test splitting and is useful for :class:`~deeppavlov.core.models.estimator.Estimator` s that do not require\ntraining.\n\n\nInference\n---------\n\nAll components inherited from :class:`~deeppavlov.core.models.component.Component` abstract class can be used for\ninference. The :meth:`__call__` method should return standard output of a component. For example, a `tokenizer`\nshould return `tokens`, a `NER recognizer` should return `recognized entities`, a `bot` should return an `utterance`.\nA particular format of returned data should be defined in :meth:`__call__`.\n\nInference is triggered by :func:`~deeppavlov.core.commands.infer.interact_model` function. There is no need for a\nseparate JSON for inference.\n\nModel Configuration\n-------------------\n\nEach DeepPavlov model is determined by its configuration file. You can use\nexisting config files or create yours. You can also choose a config file and \nmodify preprocessors/tokenizers/embedders/vectorizers there. 
The components\nbelow have the same interface and are responsible for the same functions,\ntherefore they can be used in the same parts of a config pipeline.\n\nHere is a list of useful\n:class:`~deeppavlov.core.models.component.Component`\\ s aimed to preprocess,\npostprocess and vectorize your data.\n\nPreprocessors\n~~~~~~~~~~~~~\n\nPreprocessor is a component that processes batch of samples.\n\n* Already implemented universal preprocessors of **tokenized texts** (each\n  sample is a list of tokens):\n\n    - :class:`~deeppavlov.models.preprocessors.mask.Mask` (registered as\n      ``mask``) returns binary mask of corresponding length (padding up to the\n      maximum length per batch).\n\n    - :class:`~deeppavlov.models.preprocessors.sanitizer.Sanitizer`\n      (registered as ``sanitizer``) removes all combining characters like\n      diacritical marks from tokens.\n\n* Already implemented universal preprocessors of **non-tokenized texts**\n  (each sample is a string):\n\n    - :class:`~deeppavlov.models.preprocessors.dirty_comments_preprocessor.DirtyCommentsPreprocessor`\n      (registered as ``dirty_comments_preprocessor``) preprocesses samples by\n      converting samples to lowercase, paraphrasing English combinations with\n      apostrophe ``'``, transforming more than three identical symbols into two\n      symbols.\n\n    - :meth:`~deeppavlov.models.preprocessors.str_lower.str_lower` converts samples to lowercase.\n\n* Already implemented universal preprocessors of another type of features:\n\n    - :class:`~deeppavlov.models.preprocessors.one_hotter.OneHotter`\n      (registered as ``one_hotter``) performs one-hotting operation for the\n      batch of samples where each sample is an integer label or a list of\n      integer labels (can be combined in one batch). 
If ``multi_label``\n      parameter is set to ``True``, returns one one-dimensional vector per\n      sample with several elements equal to ``1``.\n\n\nTokenizers\n~~~~~~~~~~\n\nTokenizer is a component that processes batch of samples (each sample is a text\nstring).\n\n    - :class:`~deeppavlov.models.tokenizers.nltk_tokenizer.NLTKTokenizer`\n      (registered as ``nltk_tokenizer``) tokenizes using tokenizers from\n      ``nltk.tokenize``, e.g. ``nltk.tokenize.wordpunct_tokenize``.\n\n    - :class:`~deeppavlov.models.tokenizers.nltk_moses_tokenizer.NLTKMosesTokenizer`\n      (registered as ``nltk_moses_tokenizer``) tokenizes and detokenizes using\n      ``nltk.tokenize.moses.MosesDetokenizer``,\n      ``nltk.tokenize.moses.MosesTokenizer``.\n\n    - :class:`~deeppavlov.models.tokenizers.spacy_tokenizer.StreamSpacyTokenizer`\n      (registered as ``stream_spacy_tokenizer``) tokenizes or lemmatizes texts\n      with spacy ``en_core_web_sm`` models by default.\n\n    - :class:`~deeppavlov.models.tokenizers.split_tokenizer.SplitTokenizer`\n      (registered as ``split_tokenizer``) tokenizes using string method\n      ``split``.\n\n\nEmbedders\n~~~~~~~~~\n\nEmbedder is a component that converts every token in a tokenized batch to a\nvector of a particular dimension (optionally, returns a single vector per\nsample).\n\n    - :class:`~deeppavlov.models.embedders.fasttext_embedder.FasttextEmbedder`\n      (registered as ``fasttext``) reads embedding file in fastText format.\n      If ``mean`` returns one vector per sample - mean of embedding vectors\n      of tokens.\n\n    - :class:`~deeppavlov.models.embedders.tfidf_weighted_embedder.TfidfWeightedEmbedder`\n      (registered as ``tfidf_weighted``) accepts embedder, tokenizer (for\n      detokenization, by default, detokenize with joining with space), TFIDF\n      vectorizer or counter vocabulary, optionally accepts tags vocabulary (to\n      assign additional multiplicative weights to particular tags). 
If ``mean``\n      returns one vector per sample - mean of embedding vectors of tokens.\n\nVectorizers\n~~~~~~~~~~~\n\nVectorizer is a component that converts batch of text samples to batch of\nvectors.\n\n    - :class:`~deeppavlov.models.sklearn.sklearn_component.SklearnComponent`\n      (registered as ``sklearn_component``) is a DeepPavlov wrapper for most\n      of sklearn estimators, vectorizers etc. For example, to get\n      TFIDF-vectorizer one should assign in config ``model_class`` to\n      ``sklearn.feature_extraction.text:TfidfVectorizer``, ``infer_method``\n      to ``transform``, pass ``load_path``, ``save_path`` and other sklearn\n      model parameters.\n\n    - :class:`~deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer`\n      (registered as ``hashing_tfidf_vectorizer``) implements hashing version\n      of usual TFIDF-vectorizer. It creates a TFIDF matrix from collection of\n      documents of size ``[n_documents X n_features(hash_size)]``.\n\n"
  },
  {
    "path": "docs/intro/installation.rst",
    "content": "Installation\n============\n\nDeepPavlov supports **Linux**, **Windows 10+** (through WSL/WSL2), **MacOS** (Big Sur+) platforms, **Python 3.6-3.11**.\nDepending on the model used, you may need from 4 to 16 GB RAM.\n\nInstall with pip\n~~~~~~~~~~~~~~~~\n\nYou should install DeepPavlov in a `virtual environment <https://docs.python.org/3/library/venv.html>`_. If you’re\nunfamiliar with Python virtual environments, take a look at this\n`guide <https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/>`_. A virtual\nenvironment makes it easier to manage different projects, and avoid compatibility issues between dependencies.\n\n#. Create a virtual environment:\n\n    .. code:: bash\n\n        python -m venv env\n\n#. Activate the virtual environment on Linux (`source` could be replaced with `.`):\n\n    .. code:: bash\n\n        source env/bin/activate\n\n#. Install DeepPavlov inside this virtual environment:\n\n    .. code:: bash\n\n        pip install deeppavlov\n\nInstall from source\n~~~~~~~~~~~~~~~~~~~\n\nInstall DeepPavlov **dev** branch from source with the following command:\n\n    .. code:: bash\n\n        pip install git+http://github.com/deeppavlov/DeepPavlov@dev\n\nThis command installs the bleeding edge dev version rather than the latest release version. The dev version is useful\nfor staying up-to-date with the latest developments. For instance, if a bug has been fixed since the last release but\na new release hasn’t been rolled out yet. However, this means the dev version may not always be stable.\n\nEditable install\n~~~~~~~~~~~~~~~~\n\nYou will need an editable install if you want to make changes in the DeepPavlov source code that immediately take place\nwithout requiring a new installation.\n\nClone the repository and install DeepPavlov with the following commands:\n\n    .. 
code:: bash\n\n        git clone http://github.com/deeppavlov/DeepPavlov.git\n        pip install -e DeepPavlov\n\nDocker Images\n~~~~~~~~~~~~~\n\nWe have built several DeepPavlov based Docker images, which include:\n\n    * DeepPavlov based Jupyter notebook Docker image;\n    * Docker images which serve some of our models and allow to access them\n      via REST API (:doc:`riseapi </integrations/rest_api>` mode).\n\nHere is our `DockerHub repository <https://hub.docker.com/u/deeppavlov/>`_ with\nimages and deployment instructions.\n"
  },
  {
    "path": "docs/intro/overview.rst",
    "content": "Conceptual overview\n===================\n\nOur goal is to enable AI-application developers and researchers with:\n\n-  A set of pre-trained NLP models, pre-defined dialog system components\n   (ML/DL/Rule-based), and pipeline templates;\n-  A framework for implementing and testing their own dialog models;\n-  Tools for application integration with adjacent infrastructure\n   (messengers, helpdesk software, etc.);\n-  Benchmarking environments for conversational models and uniform access\n   to relevant datasets.\n\n.. image:: ../_static/dp_agnt_diag.png\n\n\nKey Concepts\n------------\n\n-  A ``Model`` is any NLP model that doesn't necessarily communicate\n   with the user in natural language.\n-  A ``Component`` is a reusable functional part of a ``Model``.\n-  ``Rule-based Models`` cannot be trained.\n-  ``Machine Learning Models`` can be trained only stand alone.\n-  ``Deep Learning Models`` can be trained independently and in an\n   end-to-end mode being joined in a chain.\n-  A ``Chainer`` builds a model pipeline from heterogeneous\n   components (Rule-based/ML/DL). It allows one to train and infer models in\n   a pipeline as a whole.\n\nThe smallest building block of the library is a ``Component``.\nA ``Component`` stands for any kind of function in an NLP pipeline. It can\nbe implemented as a neural network, a non-neural ML model, or a\nrule-based system.\n\n``Component``\\ s can be joined into a ``Model``. A ``Model``\nsolves a larger NLP task than a ``Component``. However, in terms of\nimplementation, ``Model``\\ s are not different from ``Component``\\ s.\n\nMost of DeepPavlov models are built on top of `PyTorch <https://www.pytorch.org/>`__.\nOther external libraries can be used to build basic components.\n"
  },
  {
    "path": "docs/intro/python.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"6d5cd16b\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Python pipelines\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"da10fd80\",\n   \"metadata\": {},\n   \"source\": [\n    \"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/intro/python.ipynb)\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"d55ebe35\",\n   \"metadata\": {},\n   \"source\": [\n    \"Python models could be used without .json configuration files.\\n\",\n    \"\\n\",\n    \"The code below is an alternative to building [insults_kaggle_bert](https://github.com/deepmipt/DeepPavlov/blob/master/deeppavlov/configs/classifiers/insults_kaggle_bert.json) model and using it with\\n\",\n    \"\\n\",\n    \"```python\\n\",\n    \"from deeppavlov import build_model\\n\",\n    \"\\n\",\n    \"model = build_model('insults_kaggle_bert', download=True)\\n\",\n    \"```\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"fa1db63b\",\n   \"metadata\": {},\n   \"source\": [\n    \"At first, define variables for model components and download model data.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"9d6671e2\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov.core.commands.utils import expand_path\\n\",\n    \"from deeppavlov.download import download_resource\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"classifiers_path = expand_path('~/.deeppavlov/models/classifiers')\\n\",\n    \"model_path = classifiers_path / 'insults_kaggle_torch_bert'\\n\",\n    \"transformer_name = 'bert-base-uncased'\\n\",\n    \"\\n\",\n    \"download_resource(\\n\",\n    \"    'http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_torch_bert_v5.tar.gz',\\n\",\n    \"    {classifiers_path}\\n\",\n    
\")\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"332d644e\",\n   \"metadata\": {},\n   \"source\": [\n    \"Then, initialize model components.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"809c31ad\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov.core.data.simple_vocab import SimpleVocabulary\\n\",\n    \"from deeppavlov.models.classifiers.proba2labels import Proba2Labels\\n\",\n    \"from deeppavlov.models.preprocessors.torch_transformers_preprocessor import TorchTransformersPreprocessor\\n\",\n    \"from deeppavlov.models.torch_bert.torch_transformers_classifier import TorchTransformersClassifierModel\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"preprocessor = TorchTransformersPreprocessor(\\n\",\n    \"    vocab_file=transformer_name,\\n\",\n    \"    max_seq_length=64\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"classes_vocab = SimpleVocabulary(\\n\",\n    \"    load_path=model_path/'classes.dict',\\n\",\n    \"    save_path=model_path/'classes.dict'\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"classifier =  TorchTransformersClassifierModel(\\n\",\n    \"    n_classes=classes_vocab.len,\\n\",\n    \"    return_probas=True,\\n\",\n    \"    pretrained_bert=transformer_name,\\n\",\n    \"    save_path=model_path/'model',\\n\",\n    \"    optimizer_parameters={'lr': 1e-05}\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"proba2labels = Proba2Labels(max_proba=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"87e8ec20\",\n   \"metadata\": {},\n   \"source\": [\n    \"Finally, create model from components. ``Element`` is a wrapper for a component. ``Element`` receives the component and the names of the incoming and outgoing arguments. 
``Model`` combines ``Element``s into pipeline.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"acfe29de\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from deeppavlov import Element, Model\\n\",\n    \"\\n\",\n    \"model = Model(\\n\",\n    \"    x=['x'],\\n\",\n    \"    out=['y_pred_labels'],\\n\",\n    \"    pipe=[\\n\",\n    \"        Element(component=preprocessor, x=['x'], out=['bert_features']),\\n\",\n    \"        Element(component=classifier, x=['bert_features'], out=['y_pred_probas']),\\n\",\n    \"        Element(component=proba2labels, x=['y_pred_probas'], out=['y_pred_ids']),\\n\",\n    \"        Element(component=classes_vocab, x=['y_pred_ids'], out=['y_pred_labels'])\\n\",\n    \"    ]\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"model(['you are stupid', 'you are smart'])\"\n   ]\n  }\n ],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "docs/intro/quick_start.rst",
    "content": "QuickStart\n------------\n\nFirst, follow instructions on :doc:`Installation page </intro/installation>`\nto install ``deeppavlov`` package for Python 3.6-3.11.\n\nDeepPavlov contains a bunch of great pre-trained NLP models. Each model is\ndetermined by its config file. List of models is available on\n:doc:`the doc page </features/overview>` or in\nthe ``deeppavlov.configs``:\n\n    .. code:: python\n        \n        from deeppavlov import configs\n\nWhen you've decided on the model (+ config file), there are two ways to train,\nevaluate and infer it:\n\n* via `Command line interface (CLI)`_ and\n* via `Python`_.\n\nBefore making choice of an interface, install model's package requirements\n(CLI):\n\n    .. code:: bash\n        \n        python -m deeppavlov install <config_path>\n\n    * where ``<config_path>`` is model name without ``.json`` extension (e.g. ``insults_kaggle_bert``) or path to the\n      chosen model's config file (e.g. ``deeppavlov/configs/classifiers/insults_kaggle_bert.json``)\n\n\nCommand line interface (CLI)\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nTo get predictions from a model interactively through CLI, run\n\n    .. code:: bash\n        \n        python -m deeppavlov interact <config_path> [-d] [-i]\n\n    * ``-d`` downloads required data -- pretrained model files and embeddings (optional).\n    * ``-i`` installs model requirements (optional).\n\nYou can train it in the same simple way:\n\n    .. code:: bash\n        \n        python -m deeppavlov train <config_path> [-d] [-i]\n\n    Dataset will be downloaded regardless of whether there was ``-d`` flag or not.\n\n    To train on your own data, you need to modify dataset reader path in the\n    `train section doc <configuration.html#Train-config>`__. The data format is\n    specified in the corresponding model doc page. \n\nThere are even more actions you can perform with configs:\n\n    .. 
code:: bash\n        \n        python -m deeppavlov <action> <config_path> [-d] [-i]\n\n    * ``<action>`` can be\n        * ``install`` to install model requirements (same as ``-i``),\n        * ``download`` to download model's data (same as ``-d``),\n        * ``train`` to train the model on the data specified in the config file,\n        * ``evaluate`` to calculate metrics on the same dataset,\n        * ``interact`` to interact via CLI,\n        * ``riseapi`` to run a REST API server (see :doc:`docs\n          </integrations/rest_api>`),\n        * ``risesocket`` to run a socket API server (see :doc:`docs\n          </integrations/socket_api>`),\n        * ``predict`` to get prediction for samples from ``stdin`` or from\n          ``<file_path>`` if ``-f <file_path>`` is specified.\n    * ``<config_path>`` specifies path (or name) of model's config file\n    * ``-d`` downloads required data\n    * ``-i`` installs model requirements\n\n\nPython\n~~~~~~\n\nTo get predictions from a model interactively through Python, run\n\n    .. code:: python\n        \n        from deeppavlov import build_model\n\n        model = build_model(<config_path>, install=True, download=True)\n\n        # get predictions for 'input_text1', 'input_text2'\n        model(['input_text1', 'input_text2'])\n\nwhere\n\n    * ``install=True`` installs model requirements (optional),\n    * ``download=True`` downloads required data from web -- pretrained model files and embeddings (optional),\n    * ``<config_path>`` is path to the chosen model's config file (e.g.\n      ``\"deeppavlov/configs/ner/ner_ontonotes_bert_mult.json\"``) or\n      ``deeppavlov.configs`` attribute (e.g.\n      ``deeppavlov.configs.ner.ner_ontonotes_bert_mult`` without quotation marks).\n\nYou can train it in the same simple way:\n\n    .. 
code:: python\n        \n        from deeppavlov import train_model \n\n        model = train_model(<config_path>, install=True, download=True)\n\n    * ``download=True`` downloads pretrained model, therefore the pretrained\n      model will be, first, loaded and then trained (optional).\n\n    Dataset will be downloaded regardless of whether ``download=True`` was set or not.\n\n    To train on your own data, you need to modify dataset reader path in the\n    `train section doc <configuration.html#Train-config>`__. The data format is\n    specified in the corresponding model doc page. \n\nYou can also calculate metrics on the dataset specified in your config file:\n\n    .. code:: python\n        \n        from deeppavlov import evaluate_model \n\n        model = evaluate_model(<config_path>, install=True, download=True)\n\n\nUsing GPU\n~~~~~~~~~\n\nTo run or train **PyTorch**-based DeepPavlov models on GPU you should have `CUDA <https://developer.nvidia.com/cuda-toolkit>`__\ninstalled on your host machine, and install model's package requirements. CUDA version should be compatible with\nDeepPavlov :dp_file:`required PyTorch version <deeppavlov/requirements/pytorch.txt>`.\nGPU with Pascal or newer architecture and 4+ GB VRAM is recommended.\n\n.. warning::\n    If you use latest NVIDIA architecture, PyTorch installed from PyPI using DeepPavlov may not support your device\n    CUDA capability. You will receive incompatible device warning after model initialization. You can install compatible\n    package from `download.pytorch.org <https://download.pytorch.org/whl/torch_stable.html>`_. For example:\n\n    .. code:: bash\n\n        pip3 install torch==1.8.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html\n\nIf you want to run the code on GPU, just make the device visible for the script.\nIf you want to use a particular device, you may set it in command line:\n\n    .. 
code:: bash\n\n        export CUDA_VISIBLE_DEVICES=3; python -m deeppavlov train <config_path>\n\nor in Python script:\n\n    .. code:: python\n\n        import os\n\n        os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"3\"\n\nIn case you want to keep GPU visible but disable GPU acceleration for specific component, use ``device`` parameter\n(available for :class:`~deeppavlov.core.models.torch_model.TorchModel` child classes): ``\"device\": \"cpu\"``.\n\n\nPretrained models\n~~~~~~~~~~~~~~~~~\n\nDeepPavlov provides a wide range of pretrained models.\nSee :doc:`features overview </features/overview>` for more info. Please\nnote that most of our models are trained on specific datasets for\nspecific tasks and may require further training on your data.\nYou can find a list of our out-of-the-box models `below <#out-of-the-box-pretrained-models>`_.\n\n\nDocker images\n~~~~~~~~~~~~~\n\nYou can run DeepPavlov models in :doc:`riseapi </integrations/rest_api>` mode or start Jupyter server\nvia Docker without installing DeepPavlov. Both your CPU and GPU (we support NVIDIA graphic\nprocessors) can be utilised, please refer to our `Docker <https://hub.docker.com/r/deeppavlov/deeppavlov>`_\nimages run instructions.\n\n\nOut-of-the-box pretrained models\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nWhile the best way to solve most of the NLP tasks lies through collecting datasets\nand training models according to the domain and an actual task itself, DeepPavlov\noffers several pretrained models, which can be strong baselines for a wide range of tasks.\n\nYou can run these models `via Docker <#docker-images>`_ or in ``riseapi``/``risesocket`` mode to use in\nsolutions. 
See :doc:`riseapi </integrations/rest_api>` and :doc:`risesocket </integrations/socket_api>`\nmodes documentation for API details.\n\n\nText Question Answering\n=======================\n\nText Question Answering component answers a question based on a given context (e.g,\na paragraph of text), where the answer to the question is a segment of the context.\n\n.. code:: python\n\n    from deeppavlov import build_model\n\n    model = build_model('squad_bert', download=True, install=True)\n    contexts = ['DeepPavlov is a library for NLP and dialog systems.', 'All work and no play makes Jack a dull boy']\n    questions = ['What is DeepPavlov?', 'What makes Jack a dull boy?']\n    answer, answers_start_idx, score = model(contexts, questions)\n    print(answer)\n\n.. code:: bash\n\n    ['a library for NLP and dialog systems', 'All work and no play']\n\nTo get list of available models for Text Question Answering see :doc:`documentation </features/models/SQuAD>`.\n\nOpen-Domain Question Answering\n==============================\n\nOpen Domain Question Answering (ODQA) answers any question based on the document collection covering a wide range of\ntopics. The ODQA task combines two challenges of document retrieval (finding the relevant articles) with that of machine\ncomprehension of text (identifying the answer span from those articles). This component can be used to answer questions\nbased on the company knowledge base.\n\n.. code:: python\n\n    from deeppavlov import build_model\n\n    model = build_model('en_odqa_infer_wiki', download=True, install=True)\n    questions = [\"What is the name of Darth Vader's son?\", 'Who was the first president of France?']\n    answer, answer_score, answer_place = model(questions)\n    print(answer)\n\n.. 
code:: bash\n\n    ['Luke Skywalker', 'Louis-Napoleon Bonaparte']\n\nTo get list of available models for Open-Domain Question Answering see :doc:`documentation </features/models/ODQA>`.\n\nKnowledge Base Question Answering\n=================================\n\nKnowledge Base Question Answering (KBQA) answers any question based on Knowledge Base (Knowledge Graph) -\na comprehensive repository of information about a given domain or a number of domains that reflects the ways we model\nknowledge about a given subject or subjects, in terms of concepts, entities, properties, and relationships. KBQA models\nvalidate questions against a preconfigured list of question templates, disambiguate entities using Entity Linking,\nand answer questions asked in natural language.\n\n.. code:: python\n\n    from deeppavlov import build_model\n\n    model = build_model('kbqa_cq_en', download=True, install=True)\n    questions = ['What is the currency of Sweden?', 'When did the Korean War end?']\n    answers, answer_ids, query = model(questions)\n    print(answers)\n\n.. code:: bash\n\n    ['Swedish krona', '27 July 1953']\n\nTo get list of available models for Knowledge Base Question Answering see :doc:`documentation </features/models/KBQA>`.\n\nClassification (insult and paraphrase detection, sentiment analysis, topic classification)\n==========================================================================================\n\nInsult detection predicts whether a text (e.g., post or speech in some public discussion) is considered insulting to one\nof the persons it is related to.\n\nSentiment analysis is a task of classifying the polarity of the given sequence.\n\nThe models trained for the paraphrase detection task identify whether two sentences expressed with different words\nconvey the same meaning.\n\nTopic classification refers to the task of classifying an utterance by the topic which belongs to the conversational\ndomain.\n\n.. 
code:: python\n\n    from deeppavlov import build_model\n\n    model = build_model('insults_kaggle_bert', download=True, install=True)\n    phrases = ['You are kind of stupid', 'You are a wonderful person!']\n    labels = model(phrases)\n    print(labels)\n\n.. code:: bash\n\n    ['Insult', 'Not Insult']\n\nTo get list of available models for Classification see :doc:`documentation </features/models/classification>`.\n\nName Entity Recognition\n=======================\n\nNamed Entity Recognition (NER) classifies tokens in text into predefined categories\n(tags), such as person names, quantity expressions, percentage expressions, names\nof locations, organizations, as well as expression of time, currency and others.\n\n.. code:: python\n\n    from deeppavlov import build_model\n\n    model = build_model('ner_ontonotes_bert', download=True, install=True)\n    phrases = ['Bob Ross lived in Florida', 'Elon Musk founded Tesla']\n    tokens, tags = model(phrases)\n    print(tokens, tags, sep='\\n')\n\n.. code:: bash\n\n    [['Bob', 'Ross', 'lived', 'in', 'Florida'], ['Elon', 'Musk', 'founded', 'Tesla']]\n    [['B-PERSON', 'I-PERSON', 'O', 'O', 'B-GPE'], ['B-PERSON', 'I-PERSON', 'O', 'B-ORG']]\n\nTo get list of available models for Name Entity Recognition see :doc:`documentation </features/models/NER>`.\n\nEntity Extraction\n=================\n\nEntity Detection is the task of identifying entity mentions in text with corresponding entity types.\nEntity Linking is the task of finding knowledge base entity ids for entity mentions in text.\nEntity Extraction configs perform subsequent Entity Detection and Entity Linking of extracted entity mentions.\n\n.. 
code:: python\n\n    from deeppavlov import build_model\n\n    model = build_model('entity_extraction_en', download=True, install=True)\n    phrases = ['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.']\n    entity_substr, tags, entity_offsets, entity_ids, entity_conf, entity_pages, entity_labels = model(phrases)\n    print(entity_substr, tags, entity_ids, entity_labels, sep='\\n')\n\n.. code:: bash\n\n    [['forrest gump', 'robert zemeckis', 'eric roth']]\n    [['WORK_OF_ART', 'PERSON', 'PERSON']]\n    [[['Q134773', 'Q552213', 'Q12016774'], ['Q187364', 'Q36951156'], ['Q942932', 'Q89320386', 'Q89909683']]]\n    [[['Forrest Gump', 'Forrest Gump', 'Forrest Gump'], ['Robert Zemeckis', 'Welcome to Marwen'], ['Eric Roth', 'Eric Roth', 'Eric W Roth']]]\n\nTo get list of available models for Entity Extraction see :doc:`documentation </features/models/entity_extraction>`.\n\nSpelling Correction\n===================\n\nSpelling Correction models detect and correct spelling errors in texts.\n\n.. code:: python\n\n    from deeppavlov import build_model\n\n    model = build_model('brillmoore_wikitypos_en', download=True, install=True)\n    phrases_w_typos = ['I think this is the begining of a beautifull frendship.', \"I'll be bak\"]\n    correct_phrases = model(phrases_w_typos)\n    print(correct_phrases)\n\n.. code:: bash\n\n    ['i think this is the beginning of a beautiful friendship.', \"i'll be back\"]\n\nTo get list of available models for Spelling Correction see :doc:`documentation </features/models/spelling_correction>`.\n"
  },
  {
    "path": "requirements.txt",
    "content": "fastapi>=0.47.0,<=0.89.1\nfilelock>=3.0.0,<3.10.0\nnltk>=3.2.4,<3.10.0\nnumpy<1.24\npandas>=1.0.0,<1.6.0\nprometheus-client>=0.13.0,<=1.16.0\npydantic<2\npybind11==2.10.3\nrequests>=2.19.0,<3.0.0\nscikit-learn>=0.24,<1.1.0;python_version<=\"3.10\"\nscikit-learn==1.4.0;python_version==\"3.11.*\"\ntqdm>=4.42.0,<4.65.0\nuvicorn>=0.13.0,<0.19.0\nwheel\nscipy<1.10.0;python_version<\"3.8\"\nscipy==1.10.0;python_version>=\"3.8\"\n"
  },
  {
    "path": "setup.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#     http://www.apache.org/licenses/LICENSE-2.0\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport re\n\nfrom setuptools import setup, find_packages\n\n__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))\n\nmeta_path = os.path.join(__location__, 'deeppavlov', '_meta.py')\nwith open(meta_path) as meta:\n    exec(meta.read())\n\n\ndef read_requirements():\n    \"\"\"parses requirements from requirements.txt\"\"\"\n    reqs_path = os.path.join(__location__, 'requirements.txt')\n    with open(reqs_path, encoding='utf8') as f:\n        reqs = [line.strip() for line in f if not line.strip().startswith('#')]\n\n    names = []\n    links = []\n    for req in reqs:\n        if '://' in req:\n            links.append(req)\n        else:\n            names.append(req)\n    return {'install_requires': names, 'dependency_links': links}\n\n\ndef readme():\n    with open(os.path.join(__location__, 'README.md'), encoding='utf8') as f:\n        text = f.read()\n    text = re.sub(r']\\((?!https?://)', r'](https://github.com/deeppavlov/DeepPavlov/blob/master/', text)\n    text = re.sub(r'\\ssrc=\"(?!https?://)', r' src=\"https://raw.githubusercontent.com/deeppavlov/DeepPavlov/master/', text)\n    return text\n\n\nif __name__ == '__main__':\n    setup(\n        name='deeppavlov',\n        packages=find_packages(exclude=('tests', 'docs', 'utils')),\n        version=__version__,\n        
description=__description__,\n        long_description=readme(),\n        long_description_content_type='text/markdown',\n        author=__author__,\n        author_email=__email__,\n        license=__license__,\n        url='https://github.com/deeppavlov/DeepPavlov',\n        download_url=f'https://github.com/deeppavlov/DeepPavlov/archive/{__version__}.tar.gz',\n        keywords=__keywords__,\n        include_package_data=True,\n        extras_require={\n            'tests': [\n                'flake8',\n                'pytest',\n                'pytest-instafail',\n                'pexpect'\n            ],\n            'docs': [\n                'sphinx==3.5.4;python_version<=\"3.7\"',\n                'sphinx==5.0.0;python_version==\"3.8\"',\n                'sphinx==5.0.0;python_version==\"3.9\"',\n                'sphinx==5.0.0;python_version==\"3.10\"',\n                'sphinx==7.2.*;python_version==\"3.11.*\"',\n                'sphinx_rtd_theme==0.5.2;python_version<=\"3.10\"',\n                'sphinx_rtd_theme==2.0.0;python_version==\"3.11.*\"',\n                'docutils<0.17,>=0.12;python_version<=\"3.10\"',\n                'docutils==0.20.1;python_version==\"3.11.*\"',\n                'nbsphinx==0.8.4;python_version<=\"3.10\"',\n                'nbsphinx==0.9.3;python_version==\"3.11.*\"',\n                'ipykernel==5.5.4',\n                'jinja2<=3.0.3',\n                'sphinx-copybutton==0.5.0',\n                'pandoc==2.3',\n                'ipython_genutils==0.2.0'\n            ],\n            's3': [\n                'boto3'\n            ]\n        },\n        **read_requirements()\n    )\n"
  },
  {
    "path": "tests/__init__.py",
    "content": ""
  },
  {
    "path": "tests/test_configs/doc_retrieval/en_ranker_pop_wiki_test.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"odqa_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_test\",\n    \"save_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_test.db\",\n    \"dataset_format\": \"txt\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"sqlite_iterator\",\n    \"shuffle\": false,\n    \"load_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_test.db\"\n  },\n  \"chainer\": {\n    \"in\": [\n      \"docs\"\n    ],\n    \"in_y\": [\n      \"doc_ids\",\n      \"doc_nums\"\n    ],\n    \"out\": [\n      \"pop_doc_ids\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"hashing_tfidf_vectorizer\",\n        \"id\": \"vectorizer\",\n        \"fit_on\": [\n          \"docs\",\n          \"doc_ids\",\n          \"doc_nums\"\n        ],\n        \"save_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz\",\n        \"load_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz\",\n        \"tokenizer\": {\n          \"class_name\": \"stream_spacy_tokenizer\",\n          \"lemmas\": true,\n          \"ngram_range\": [\n            1,\n            2\n          ]\n        }\n      },\n      {\n        \"class_name\": \"tfidf_ranker\",\n        \"top_n\": 20,\n        \"in\": [\n          \"docs\"\n        ],\n        \"out\": [\n          \"tfidf_doc_ids\",\n          \"tfidf_doc_scores\"\n        ],\n        \"vectorizer\": \"#vectorizer\"\n      },\n      {\n        \"class_name\": \"pop_ranker\",\n        \"pop_dict_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_popularities.json\",\n        \"load_path\": \"{MODELS_PATH}/odqa/logreg_3features_v2.joblib\",\n        \"top_n\": 10,\n        \"in\": [\n          \"tfidf_doc_ids\",\n          \"tfidf_doc_scores\"\n        ],\n        \"out\": [\n          \"pop_doc_ids\",\n          \"pop_doc_scores\"\n        ]\n      }\n    ]\n  },\n  \"train\": {\n    \"batch_size\": 10000,\n    \"evaluation_targets\": [],\n    \"class_name\": \"fit_trainer\"\n  },\n  \"metadata\": {\n    
\"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/wikipedia/enwiki_test.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/odqa\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_popularities.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/odqa\"\n      },\n      {\n        \"url\": \"http://files.deeppavlov.ai/deeppavlov_data/ranking/logreg_3features_v2.joblib\",\n        \"subdir\": \"{MODELS_PATH}/odqa\"\n      }\n    ]\n  }\n}\n"
  },
  {
    "path": "tests/test_configs/doc_retrieval/en_ranker_tfidf_wiki_test.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"odqa_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_test\",\n    \"save_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_test.db\",\n    \"dataset_format\": \"txt\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"sqlite_iterator\",\n    \"shuffle\": false,\n    \"load_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_test.db\"\n  },\n  \"chainer\": {\n    \"in\": [\n      \"docs\"\n    ],\n    \"in_y\": [\n      \"doc_ids\",\n      \"doc_nums\"\n    ],\n    \"out\": [\n      \"tfidf_doc_ids\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"hashing_tfidf_vectorizer\",\n        \"id\": \"vectorizer\",\n        \"fit_on\": [\n          \"docs\",\n          \"doc_ids\",\n          \"doc_nums\"\n        ],\n        \"save_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz\",\n        \"load_path\": \"{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz\",\n        \"tokenizer\": {\n          \"class_name\": \"stream_spacy_tokenizer\",\n          \"lemmas\": true,\n          \"ngram_range\": [\n            1,\n            2\n          ]\n        }\n      },\n      {\n        \"class_name\": \"tfidf_ranker\",\n        \"top_n\": 20,\n        \"in\": [\n          \"docs\"\n        ],\n        \"out\": [\n          \"tfidf_doc_ids\",\n          \"tfidf_doc_scores\"\n        ],\n        \"vectorizer\": \"#vectorizer\"\n      }\n    ]\n  },\n  \"train\": {\n    \"batch_size\": 2,\n    \"evaluation_targets\": [],\n    \"class_name\": \"fit_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/wikipedia/enwiki_test.tar.gz\",\n        \"subdir\": \"{DOWNLOADS_PATH}/odqa\"\n      }\n    ]\n  }\n}"
  },
  {
    "path": "tests/test_configs/doc_retrieval/ru_ranker_tfidf_wiki_test.json",
    "content": "{\n  \"dataset_reader\": {\n    \"class_name\": \"odqa_reader\",\n    \"data_path\": \"{DOWNLOADS_PATH}/odqa/ruwiki_test\",\n    \"save_path\": \"{DOWNLOADS_PATH}/odqa/ruwiki_test.db\",\n    \"dataset_format\": \"txt\"\n  },\n  \"dataset_iterator\": {\n    \"class_name\": \"sqlite_iterator\",\n    \"shuffle\": false,\n    \"load_path\": \"{DOWNLOADS_PATH}/odqa/ruwiki_test.db\"\n  },\n  \"chainer\": {\n    \"in\": [\n      \"docs\"\n    ],\n    \"in_y\": [\n      \"doc_ids\",\n      \"doc_nums\"\n    ],\n    \"out\": [\n      \"tfidf_doc_ids\"\n    ],\n    \"pipe\": [\n      {\n        \"class_name\": \"hashing_tfidf_vectorizer\",\n        \"id\": \"vectorizer\",\n        \"fit_on\": [\n          \"docs\",\n          \"doc_ids\",\n          \"doc_nums\"\n        ],\n        \"save_path\": \"{DOWNLOADS_PATH}/odqa/ruwiki_test_tfidf.npz\",\n        \"load_path\": \"{DOWNLOADS_PATH}/odqa/ruwiki_test_tfidf.npz\",\n        \"tokenizer\": {\n          \"class_name\": \"stream_spacy_tokenizer\",\n          \"spacy_model\": \"ru_core_news_sm\",\n          \"lemmas\": true,\n          \"lowercase\": true,\n          \"filter_stopwords\": true,\n          \"ngram_range\": [\n            1,\n            2\n          ]\n        }\n      },\n      {\n        \"class_name\": \"tfidf_ranker\",\n        \"top_n\": 20,\n        \"in\": [\n          \"docs\"\n        ],\n        \"out\": [\n          \"tfidf_doc_ids\",\n          \"tfidf_doc_scores\"\n        ],\n        \"vectorizer\": \"#vectorizer\"\n      }\n    ]\n  },\n  \"train\": {\n    \"batch_size\": 2,\n    \"evaluation_targets\": [],\n    \"class_name\": \"fit_trainer\"\n  },\n  \"metadata\": {\n    \"variables\": {\n      \"ROOT_PATH\": \"~/.deeppavlov\",\n      \"DOWNLOADS_PATH\": \"{ROOT_PATH}/downloads\",\n      \"MODELS_PATH\": \"{ROOT_PATH}/models\"\n    },\n    \"download\": [\n      {\n        \"url\": \"http://files.deeppavlov.ai/datasets/wikipedia/ruwiki_test.tar.gz\",\n        \"subdir\": 
\"{DOWNLOADS_PATH}/odqa\"\n      }\n    ]\n  }\n}"
  },
  {
    "path": "tests/test_quick_start.py",
    "content": "import io\nimport json\nimport logging\nimport os\nimport shutil\nimport signal\nimport socket\nimport sys\nfrom concurrent.futures import ProcessPoolExecutor\nfrom pathlib import Path\nfrom struct import unpack\nfrom time import sleep\nfrom typing import Optional, Union\nfrom urllib.parse import urljoin\n\nimport pexpect\nimport pexpect.popen_spawn\nimport pytest\nimport requests\n\nimport deeppavlov\nfrom deeppavlov import build_model\nfrom deeppavlov.core.commands.utils import parse_config, parse_value_with_config\nfrom deeppavlov.core.common.aliases import ALIASES\nfrom deeppavlov.core.data.utils import get_all_elems_from_json\nfrom deeppavlov.download import deep_download\nfrom deeppavlov.utils.server import get_server_params\nfrom deeppavlov.utils.socket import encode\n\ntests_dir = Path(__file__).parent\ntest_configs_path = tests_dir / \"deeppavlov\" / \"configs\"\nsrc_dir = Path(deeppavlov.__path__[0]) / \"configs\"\ntest_src_dir = tests_dir / \"test_configs\"\ndownload_path = tests_dir / \"download\"\n\ncache_dir: Optional[Path] = None\nif not os.getenv('DP_PYTEST_NO_CACHE'):\n    cache_dir = tests_dir / 'download_cache'\n\nSKIP_TF = os.getenv('SKIP_TF', False)\n\napi_port = os.getenv('DP_PYTEST_API_PORT')\nif api_port is not None:\n    api_port = int(api_port)\n\nTEST_MODES = ['IP',  # test_inferring_pretrained_model\n              'TI',  # test_consecutive_training_and_inferring\n              ]\n\nALL_MODES = ('IP', 'TI')\n\nONE_ARGUMENT_INFER_CHECK = ('Dummy text', None)\nTWO_ARGUMENTS_INFER_CHECK = ('Dummy text', 'Dummy text', None)\nFOUR_ARGUMENTS_INFER_CHECK = ('Dummy text', 'Dummy text', 'Dummy text', 'Dummy_text', None)\n\nLIST_ARGUMENTS_INFER_CHECK = (['Dummy text', 'Dummy text'], ['Dummy text', 'Dummy text'], None)\n\nRECORD_ARGUMENTS_INFER_CHECK = (\"Index\", \"Dummy query text\", \"Dummy passage text\", \"Dummy entity\", 1, None)\n\n# Mapping from model name to config-model_dir-ispretrained and corresponding queries-response 
list.\nPARAMS = {\n    \"relation_extraction\": {\n        (\"relation_extraction/re_docred.json\", \"relation_extraction\", ('IP',)):\n            [\n                (\n                    [[\"Barack\", \"Obama\", \"is\", \"married\", \"to\", \"Michelle\", \"Obama\", \",\", \"born\", \"Michelle\",\n                      \"Robinson\", \".\"]],\n                    [[[(0, 2)], [(5, 7), (9, 11)]]],\n                    [[\"PER\", \"PER\"]],\n                    (\n                        'P26',\n                        'spouse'\n                    )\n                )\n            ],\n        (\"relation_extraction/re_rured.json\", \"relation_extraction\", ('IP',)):\n            [\n                (\n                    [[\"Илон\", \"Маск\", \"живет\", \"в\", \"Сиэттле\", \".\"]],\n                    [[[(0, 2)], [(4, 6)]]],\n                    [[\"PERSON\", \"CITY\"]],\n                    (\n                        'P495',\n                        'страна происхождения'\n                    )\n                ),\n            ]\n    },\n    \"faq\": {\n        (\"faq/fasttext_logreg.json\", \"fasttext_logreg\", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK],  # TODO: add ru test\n    },\n    \"spelling_correction\": {\n        (\"spelling_correction/brillmoore_wikitypos_en.json\", \"error_model\", ALL_MODES):\n            [\n                (\"helllo\", (\"hello\",)),\n                (\"datha\", (\"data\",))\n            ],\n        (\"spelling_correction/levenshtein_corrector_ru.json\", \"error_model\", ('IP',)):\n            [\n                (\"преветствую\", (\"приветствую\",)),\n                (\"Я джва года хочу такую игру\", (\"я два года хочу такую игру\",))\n            ]\n    },\n    \"classifiers\": {\n        (\"classifiers/paraphraser_rubert.json\", \"classifiers\", ('IP', 'TI')): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/insults_kaggle_bert.json\", \"classifiers\", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],\n        
(\"classifiers/rusentiment_bert.json\", \"classifiers\", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],\n        (\"classifiers/sentiment_twitter.json\", \"classifiers\", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK],\n        (\"classifiers/sentiment_sst_conv_bert.json\", \"classifiers\", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],\n        (\"classifiers/glue/glue_mrpc_roberta.json\", \"classifiers\", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/glue/glue_stsb_roberta.json\", \"classifiers\", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/glue/glue_mnli_roberta.json\", \"classifiers\", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/glue/glue_rte_roberta_mnli.json\", \"classifiers\", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/glue/glue_cola_roberta.json\", \"classifiers\", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],\n        (\"classifiers/glue/glue_qnli_roberta.json\", \"classifiers\", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/glue/glue_qqp_roberta.json\", \"classifiers\", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/glue/glue_sst2_roberta.json\", \"classifiers\", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],\n        (\"classifiers/glue/glue_wnli_roberta.json\", \"classifiers\", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/superglue/superglue_copa_roberta.json\", \"classifiers\", ('TI',)): [LIST_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/superglue/superglue_boolq_roberta_mnli.json\", \"classifiers\", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/superglue/superglue_record_roberta.json\", \"classifiers\", ('TI',)): [RECORD_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/superglue/superglue_wic_bert.json\", \"classifiers\", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/topics_distilbert_base_uncased.json\", \"classifiers\", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],\n        (\"classifiers/few_shot_roberta.json\", \"classifiers\", ('IP',)): 
[\n            ('Dummy text', ['Dummy text Dummy text', 'Dummy class'], ('Dummy class',))\n        ]\n    },\n    \"distil\": {\n        (\"classifiers/paraphraser_convers_distilrubert_2L.json\", \"distil\", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/paraphraser_convers_distilrubert_6L.json\", \"distil\", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"classifiers/rusentiment_convers_distilrubert_2L.json\", \"distil\", ('IP')): [ONE_ARGUMENT_INFER_CHECK],\n        (\"classifiers/rusentiment_convers_distilrubert_6L.json\", \"distil\", ('IP')): [ONE_ARGUMENT_INFER_CHECK],\n        (\"ner/ner_rus_convers_distilrubert_2L.json\", \"distil\", ('IP')): [ONE_ARGUMENT_INFER_CHECK],\n        (\"ner/ner_rus_convers_distilrubert_6L.json\", \"distil\", ('IP')): [ONE_ARGUMENT_INFER_CHECK],\n        (\"ner/ner_case_agnostic_mdistilbert.json\", \"distil\", ('IP')): [ONE_ARGUMENT_INFER_CHECK],\n        (\"squad/squad_ru_convers_distilrubert_2L.json\", \"distil\", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"squad/squad_ru_convers_distilrubert_6L.json\", \"distil\", ('IP')): [TWO_ARGUMENTS_INFER_CHECK]\n    },\n    \"russian_super_glue\": {\n        (\"russian_super_glue/russian_superglue_lidirus_rubert.json\", \"russian_super_glue\", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"russian_super_glue/russian_superglue_danetqa_rubert.json\", \"russian_super_glue\", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"russian_super_glue/russian_superglue_terra_rubert.json\", \"russian_super_glue\", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"russian_super_glue/russian_superglue_rcb_rubert.json\", \"russian_super_glue\", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"russian_super_glue/russian_superglue_russe_rubert.json\", \"russian_super_glue\", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"russian_super_glue/russian_superglue_rwsd_rubert.json\", \"russian_super_glue\", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],\n        
(\"russian_super_glue/russian_superglue_muserc_rubert.json\", \"russian_super_glue\", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"russian_super_glue/russian_superglue_parus_rubert.json\", \"russian_super_glue\", ('IP',)): [LIST_ARGUMENTS_INFER_CHECK],\n        (\"russian_super_glue/russian_superglue_rucos_rubert.json\", \"russian_super_glue\", ('IP',)): [RECORD_ARGUMENTS_INFER_CHECK]\n    },\n    \"multitask\":{\n        (\"multitask/multitask_example.json\", \"multitask\", ALL_MODES): [\n            ('Dummy text',) + (('Dummy text', 'Dummy text'),) * 3 + ('Dummy text',) + (None,)],\n        (\"multitask/mt_glue.json\", \"multitask\", ALL_MODES): [\n            ('Dummy text',) * 2 + (('Dummy text', 'Dummy text'),) * 6 + (None,)]\n    },\n    \"entity_extraction\": {\n        (\"entity_extraction/entity_detection_en.json\", \"entity_extraction\", ('IP',)):\n            [\n                (\"Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.\",\n                 (['forrest gump', 'robert zemeckis', 'eric roth'],\n                  [(0, 12), (48, 63), (79, 88)],\n                  [[0, 1], [10, 11], [15, 16]],\n                  ['WORK_OF_ART', 'PERSON', 'PERSON'],\n                  [(0, 89)],\n                  ['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.'],\n                  [0.8798, 0.9986, 0.9985]))\n            ],\n        (\"entity_extraction/entity_detection_ru.json\", \"entity_extraction\", ('IP',)):\n            [\n                (\"Москва — столица России, центр Центрального федерального округа и центр Московской области.\",\n                 (['москва', 'россии', 'центрального федерального округа', 'московской области'],\n                  [(0, 6), (17, 23), (31, 63), (72, 90)],\n                  [[0], [3], [6, 7, 8], [11, 12]],\n                  ['CITY', 'COUNTRY', 'LOC', 'LOC'],\n                  [(0, 91)],\n                  ['Москва — столица России, центр 
Центрального федерального округа и центр Московской области.'],\n                  [0.8359, 0.938, 0.9917, 0.9803]))\n            ],\n        (\"entity_extraction/entity_extraction_en.json\", \"entity_extraction\", ('IP',)):\n            [\n                (\"Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.\",\n                 (['forrest gump', 'robert zemeckis', 'eric roth'],\n                  ['WORK_OF_ART', 'PERSON', 'PERSON'],\n                  [(0, 12), (48, 63), (79, 88)],\n                  [['Q134773', 'Q552213', 'Q12016774'], ['Q187364', 'Q36951156'],\n                   ['Q942932', 'Q89320386', 'Q89909683']],\n                  [[[1.1, 110, 1.0], [1.1, 13, 0.73], [1.1, 8, 0.04]], [[1.1, 73, 1.0], [0.5, 52, 0.29]],\n                   [[1.1, 37, 0.95], [1.1, 2, 0.35], [0.67, 2, 0.35]]],\n                  [['Forrest Gump', 'Forrest Gump (novel)', ''], ['Robert Zemeckis', 'Welcome to Marwen'],\n                   ['Eric Roth', '', '']],\n                  [['Forrest Gump', 'Forrest Gump', 'Forrest Gump'], ['Robert Zemeckis', 'Welcome to Marwen'],\n                   ['Eric Roth', 'Eric Roth', 'Eric W Roth']]))\n            ],\n        (\"entity_extraction/entity_extraction_ru.json\", \"entity_extraction\", ('IP',)):\n            [\n                (\"Москва — столица России, центр Центрального федерального округа и центр Московской области.\",\n                 (['москва', 'россии', 'центрального федерального округа', 'московской области'],\n                  ['CITY', 'COUNTRY', 'LOC', 'LOC'],\n                  [(0, 6), (17, 23), (31, 63), (72, 90)],\n                  [['Q649', 'Q1023006', 'Q2380475'], ['Q159', 'Q2184', 'Q139319'], ['Q190778', 'Q4504288', 'Q27557290'],\n                   ['Q1697', 'Q4303932', 'Q24565285']],\n                  [[[1.1, 200, 1.0], [1.0, 20, 0.0], [1.0, 18, 0.0]],\n                   [[1.1, 200, 1.0], [1.0, 58, 1.0], [1.0, 29, 0.85]],\n                   [[1.1, 200, 1.0], [0.67, 3, 
0.92], [0.67, 3, 0.89]],\n                   [[0.9, 200, 1.0], [0.9, 6, 0.83], [0.61, 8, 0.03]]],\n                  [['Москва', 'Москоу (Канзас)', 'Москоу (Теннесси)'],\n                   ['Россия', 'Российская Советская Федеративная Социалистическая Республика',\n                    'Российская республика'],\n                   ['Центральный федеральный округ', 'Центральный округ (Краснодар)', ''],\n                   ['Московская область', 'Московская область (1917—1918)',\n                    'Мостовский (Волгоградская область)']],\n                  [['Москва', 'Москоу', 'Москоу'],\n                   ['Россия', 'Российская Советская Федеративная Социалистическая Республика',\n                    'Российская республика'],\n                   ['Центральный федеральный округ', 'Центральный округ (Краснодар)', 'Центральный округ (Братск)'],\n                   ['Московская область', 'Московская область', 'Мостовский']]))\n            ]\n    },\n    \"ner\": {\n        (\"ner/ner_bert_base.json\", \"ner_bert_base\", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],\n        (\"ner/ner_conll2003_bert.json\", \"ner_conll2003_bert\", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],\n        (\"ner/ner_ontonotes_bert.json\", \"ner_ontonotes_bert\", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],\n        (\"ner/ner_ontonotes_bert_mult.json\", \"ner_ontonotes_bert_mult\", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],\n        (\"ner/ner_rus_bert.json\", \"ner_rus_bert\", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],\n        (\"ner/ner_collection3_bert.json\", \"ner_collection3_bert\", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],\n        (\"ner/ner_conll2003_deberta_crf.json\", \"ner_conll2003_deberta_crf\", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],\n        (\"ner/ner_ontonotes_deberta_crf.json\", \"ner_ontonotes_deberta_crf\", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],\n    },\n    \"sentence_segmentation\": {\n        (\"sentence_segmentation/sentseg_dailydialog_bert.json\", 
\"sentseg_dailydialog_bert\", ('IP', 'TI')): [\n            ([\"hey\", \"alexa\", \"how\", \"are\", \"you\"], None)]\n    },\n    \"kbqa\": {\n        (\"kbqa/kbqa_cq_en.json\", \"kbqa\", ('IP',)):\n            [\n                (\"What is the currency of Sweden?\",\n                 (\"Swedish krona\", [\"Q122922\"], [\"SELECT ?answer WHERE { wd:Q34 wdt:P38 ?answer. }\"])),\n                (\"Where was Napoleon Bonaparte born?\",\n                 (\"Ajaccio\", [\"Q40104\"], [\"SELECT ?answer WHERE { wd:Q517 wdt:P19 ?answer. }\"])),\n                (\"When did the Korean War end?\",\n                 (\"27 July 1953\", [\"+1953-07-27^^T\"], [\"SELECT ?answer WHERE { wd:Q8663 wdt:P582 ?answer. }\"])),\n                (\"   \", (\"Not Found\", [], []))\n            ],            \n        (\"kbqa/kbqa_cq_ru.json\", \"kbqa\", ('IP',)):\n            [\n                (\"Кто такой Оксимирон?\",\n                 (\"российский рэп-исполнитель\", ['российский рэп-исполнитель\"@ru'],\n                  [\"SELECT ?answer WHERE { wd:Q4046107 wdt:P0 ?answer. }\"])),\n                (\"Кто написал «Евгений Онегин»?\",\n                 (\"Александр Сергеевич Пушкин\", [\"Q7200\"], [\"SELECT ?answer WHERE { wd:Q50948 wdt:P50 ?answer. 
}\"])),\n                (\"абв\", (\"Not Found\", [], []))\n            ]\n    },\n    \"ranking\": {\n        (\"ranking/ranking_ubuntu_v2_torch_bert_uncased.json\", \"ranking\", ('TI',)): [ONE_ARGUMENT_INFER_CHECK]\n    },\n    \"doc_retrieval\": {\n        (\"doc_retrieval/en_ranker_tfidf_wiki_test.json\", \"doc_retrieval\", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],\n        (\"doc_retrieval/ru_ranker_tfidf_wiki_test.json\", \"doc_retrieval\", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],\n        (\"doc_retrieval/en_ranker_pop_wiki_test.json\", \"doc_retrieval\", ('TI',)): [ONE_ARGUMENT_INFER_CHECK]\n    },\n    \"squad\": {\n        (\"squad/squad_ru_bert.json\", \"squad_ru_bert\", ('IP', 'TI')): [TWO_ARGUMENTS_INFER_CHECK],\n        (\"squad/squad_bert.json\", \"squad_bert\", ('IP', 'TI')): [TWO_ARGUMENTS_INFER_CHECK]\n    },\n    \"odqa\": {\n        (\"odqa/en_odqa_infer_wiki.json\", \"odqa\", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],\n        (\"odqa/ru_odqa_infer_wiki.json\", \"odqa\", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],\n        (\"odqa/en_odqa_pop_infer_wiki.json\", \"odqa\", ('IP',)): [ONE_ARGUMENT_INFER_CHECK]\n    },\n    \"morpho_tagger\": {\n        (\"morpho_syntax_parser/morpho_ru_syntagrus_bert.json\", \"morpho_tagger_bert\", ('IP', 'TI')):\n            [ONE_ARGUMENT_INFER_CHECK]\n    },\n    \"syntax_tagger\": {\n        (\"morpho_syntax_parser/syntax_ru_syntagrus_bert.json\", \"syntax_ru_bert\", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],\n        (\"morpho_syntax_parser/ru_syntagrus_joint_parsing.json\", \"syntax_ru_bert\", ('IP',)): [ONE_ARGUMENT_INFER_CHECK]\n    },\n}\n\nMARKS = {\"gpu_only\": [\"squad\"], \"slow\": [\"error_model\", \"squad\"]}  # marks defined in pytest.ini\n\nTEST_GRID = []\nfor model in PARAMS.keys():\n    for conf_file, model_dir, mode in PARAMS[model].keys():\n        marks = []\n        for mark in MARKS.keys():\n            if model in MARKS[mark]:\n                marks.append(eval(\"pytest.mark.\" + mark))\n        grid_unit = 
pytest.param(model, conf_file, model_dir, mode, marks=marks)\n        TEST_GRID.append(grid_unit)\n\n\ndef _override_with_test_values(item: Union[dict, list]) -> None:\n    if isinstance(item, dict):\n        keys = [k for k in item.keys() if k.startswith('pytest_')]\n        for k in keys:\n            item[k[len('pytest_'):]] = item.pop(k)\n        item = item.values()\n\n    for child in item:\n        if isinstance(child, (dict, list)):\n            _override_with_test_values(child)\n\n\ndef download_config(config_path):\n    src_file = src_dir / config_path\n    if not src_file.is_file():\n        src_file = test_src_dir / config_path\n\n    if not src_file.is_file():\n        raise RuntimeError('No config file {}'.format(config_path))\n\n    with src_file.open(encoding='utf8') as fin:\n        config: dict = json.load(fin)\n\n    # Download referenced config files\n    config_references = get_all_elems_from_json(parse_config(config), 'config_path')\n    for config_ref in config_references:\n        splitted = config_ref.split(\"/\")\n        first_subdir_index = splitted.index(\"configs\") + 1\n        m_name = config_ref.split('/')[first_subdir_index]\n        config_ref = '/'.join(config_ref.split('/')[first_subdir_index:])\n\n        test_configs_path.joinpath(m_name).mkdir(exist_ok=True)\n        if not test_configs_path.joinpath(config_ref).exists():\n            download_config(config_ref)\n\n    # Update config for testing\n    config.setdefault('train', {}).setdefault('pytest_epochs', 1)\n    config['train'].setdefault('pytest_max_batches', 2)\n    config['train'].setdefault('pytest_max_test_batches', 2)\n    _override_with_test_values(config)\n\n    config_path = test_configs_path / config_path\n    config_path.parent.mkdir(exist_ok=True, parents=True)\n    with config_path.open(\"w\", encoding='utf8') as fout:\n        json.dump(config, fout)\n\n\ndef install_config(config_path):\n    logfile = io.BytesIO(b'')\n    p = 
pexpect.popen_spawn.PopenSpawn(sys.executable + \" -m deeppavlov install \" + str(config_path), timeout=None,\n                                       logfile=logfile)\n    p.readlines()\n    if p.wait() != 0:\n        raise RuntimeError('Installing process of {} returned non-zero exit code: \\n{}'\n                           .format(config_path, logfile.getvalue().decode()))\n\n\ndef setup_module():\n    shutil.rmtree(str(test_configs_path), ignore_errors=True)\n    shutil.rmtree(str(download_path), ignore_errors=True)\n    test_configs_path.mkdir(parents=True)\n\n    for m_name, conf_dict in PARAMS.items():\n        test_configs_path.joinpath(m_name).mkdir(exist_ok=True, parents=True)\n        for (config_path, _, _), _ in conf_dict.items():\n            download_config(config_path)\n\n    os.environ['DP_ROOT_PATH'] = str(download_path)\n    os.environ['DP_CONFIGS_PATH'] = str(test_configs_path)\n\n    if cache_dir:\n        cache_dir.mkdir(parents=True, exist_ok=True)\n        os.environ['DP_CACHE_DIR'] = str(cache_dir.resolve())\n\n\ndef teardown_module():\n    shutil.rmtree(str(test_configs_path.parent), ignore_errors=True)\n    shutil.rmtree(str(download_path), ignore_errors=True)\n\n    if cache_dir:\n        shutil.rmtree(str(cache_dir), ignore_errors=True)\n\n\ndef _infer(config, inputs, download=False):\n    chainer = build_model(config, download=download)\n    if inputs:\n        prediction = chainer(*inputs)\n        if len(chainer.out_params) == 1:\n            prediction = [prediction]\n    else:\n        prediction = []\n    return prediction\n\n\n@pytest.mark.parametrize(\"model,conf_file,model_dir,mode\", TEST_GRID, scope='class')\nclass TestQuickStart(object):\n    @staticmethod\n    def infer(config_path, qr_list=None, check_outputs=True):\n\n        *inputs, expected_outputs = zip(*qr_list) if qr_list else ([],)\n        with ProcessPoolExecutor(max_workers=1) as executor:\n            f = executor.submit(_infer, config_path, inputs)\n        
outputs = list(zip(*f.result()))\n\n        if check_outputs:\n            errors = ';'.join([f'expected `{expected}` got `{output}`'\n                               for output, expected in zip(outputs, expected_outputs)\n                               if expected is not None and expected != output])\n            if errors:\n                raise RuntimeError(f'Unexpected results for {config_path}: {errors}')\n\n    @staticmethod\n    def infer_api(config_path, qr_list):\n        *inputs, expected_outputs = zip(*qr_list)\n        server_params = get_server_params(config_path)\n\n        url_base = 'http://{}:{}'.format(server_params['host'], api_port or server_params['port'])\n        url = urljoin(url_base.replace('http://0.0.0.0:', 'http://127.0.0.1:'), server_params['model_endpoint'])\n\n        post_headers = {'Accept': 'application/json'}\n\n        logfile = io.BytesIO(b'')\n        args = [sys.executable, \"-m\", \"deeppavlov\", \"riseapi\", str(config_path)]\n        if api_port:\n            args += ['-p', str(api_port)]\n        p = pexpect.popen_spawn.PopenSpawn(' '.join(args),\n                                           timeout=None, logfile=logfile)\n        try:\n            p.expect(url_base)\n\n            get_url = urljoin(url_base.replace('http://0.0.0.0:', 'http://127.0.0.1:'), '/api')\n            get_response = requests.get(get_url)\n            response_code = get_response.status_code\n            assert response_code == 200, f\"GET /api request returned error code {response_code} with {config_path}\"\n\n            model_args_names = get_response.json()['in']\n            post_payload = dict(zip(model_args_names, inputs))\n            # TODO: remove this if from here and socket\n            if 'docred' in str(config_path) or 'rured' in str(config_path):\n                post_payload = {k: v[0] for k, v in post_payload.items()}\n            post_response = requests.post(url, json=post_payload, headers=post_headers)\n            response_code = 
post_response.status_code\n            assert response_code == 200, f\"POST request returned error code {response_code} with {config_path}\"\n\n        except pexpect.exceptions.EOF:\n            raise RuntimeError('Got unexpected EOF: \\n{}'.format(logfile.getvalue().decode()))\n\n        finally:\n            p.kill(signal.SIGTERM)\n            p.wait()\n            # if p.wait() != 0:\n            #     raise RuntimeError('Error in shutting down API server: \\n{}'.format(logfile.getvalue().decode()))\n\n    @staticmethod\n    def infer_socket(config_path, socket_type):\n        socket_params = get_server_params(config_path)\n        model_args_names = socket_params['model_args_names']\n\n        host = socket_params['host']\n        host = host.replace('0.0.0.0', '127.0.0.1')\n        port = api_port or socket_params['port']\n\n        socket_payload = {}\n        for arg_name in model_args_names:\n            arg_value = ' '.join(['qwerty'] * 10)\n            socket_payload[arg_name] = [arg_value]\n\n        if 'parus' in str(config_path):\n            socket_payload = {k: [v] for k, v in socket_payload.items()}\n\n        logfile = io.BytesIO(b'')\n        args = [sys.executable, \"-m\", \"deeppavlov\", \"risesocket\", str(config_path), '--socket-type', socket_type]\n        if socket_type == 'TCP':\n            args += ['-p', str(port)]\n            address_family = socket.AF_INET\n            connect_arg = (host, port)\n        else:\n            address_family = socket.AF_UNIX\n            connect_arg = socket_params['unix_socket_file']\n        p = pexpect.popen_spawn.PopenSpawn(' '.join(args),\n                                           timeout=None, logfile=logfile)\n        try:\n            p.expect(socket_params['socket_launch_message'])\n            with socket.socket(address_family, socket.SOCK_STREAM) as s:\n                try:\n                    s.connect(connect_arg)\n                except ConnectionRefusedError:\n                    
sleep(1)\n                    s.connect(connect_arg)\n                s.sendall(encode(socket_payload))\n                s.settimeout(120)\n                header = s.recv(4)\n                body_len = unpack('<I', header)[0]\n                data = bytearray()\n                while len(data) < body_len:\n                    chunk = s.recv(body_len - len(data))\n                    if not chunk:\n                        raise ValueError(f'header does not match body\\nheader: {body_len}\\nbody length: {len(data)}'\n                                         f'data: {data}')\n                    data.extend(chunk)\n            try:\n                resp = json.loads(data)\n            except json.decoder.JSONDecodeError:\n                raise ValueError(f\"Can't decode model response {data}\")\n            assert resp['status'] == 'OK', f\"{socket_type} socket request returned status: {resp['status']}\" \\\n                                           f\" with {config_path}\\n{logfile.getvalue().decode()}\"\n\n        except pexpect.exceptions.EOF:\n            raise RuntimeError(f'Got unexpected EOF: \\n{logfile.getvalue().decode()}')\n\n        except json.JSONDecodeError:\n            raise ValueError(f'Got JSON not serializable response from model: \"{data}\"\\n{logfile.getvalue().decode()}')\n\n        finally:\n            p.kill(signal.SIGTERM)\n            p.wait()\n\n    def test_inferring_pretrained_model(self, model, conf_file, model_dir, mode):\n        if 'IP' in mode:\n            config_file_path = str(test_configs_path.joinpath(conf_file))\n            install_config(config_file_path)\n            deep_download(config_file_path)\n\n            self.infer(test_configs_path / conf_file, PARAMS[model][(conf_file, model_dir, mode)])\n        else:\n            pytest.skip(\"Unsupported mode: {}\".format(mode))\n\n    def test_inferring_pretrained_model_api(self, model, conf_file, model_dir, mode):\n        if 'IP' in mode:\n            
self.infer_api(test_configs_path / conf_file, PARAMS[model][(conf_file, model_dir, mode)])\n        else:\n            pytest.skip(\"Unsupported mode: {}\".format(mode))\n\n    def test_inferring_pretrained_model_socket(self, model, conf_file, model_dir, mode):\n        pytest.skip(f\"Disabled\")\n        if 'IP' in mode:\n            self.infer_socket(test_configs_path / conf_file, 'TCP')\n\n            if 'TI' not in mode:\n                shutil.rmtree(str(download_path), ignore_errors=True)\n        else:\n            pytest.skip(f\"Unsupported mode: {mode}\")\n\n\n    def test_consecutive_training_and_inferring(self, model, conf_file, model_dir, mode):\n        if 'TI' in mode:\n            c = test_configs_path / conf_file\n            model_path = download_path / model_dir\n\n            if 'IP' not in mode:\n                config_path = str(test_configs_path.joinpath(conf_file))\n                install_config(config_path)\n                deep_download(config_path)\n            shutil.rmtree(str(model_path), ignore_errors=True)\n\n            logfile = io.BytesIO(b'')\n            p = pexpect.popen_spawn.PopenSpawn(sys.executable + \" -m deeppavlov train \" + str(c), timeout=None,\n                                               logfile=logfile)\n            p.readlines()\n            if p.wait() != 0:\n                raise RuntimeError('Training process of {} returned non-zero exit code: \\n{}'\n                                   .format(model_dir, logfile.getvalue().decode()))\n            self.infer(c, PARAMS[model][(conf_file, model_dir, mode)], check_outputs=False)\n\n            shutil.rmtree(str(download_path), ignore_errors=True)\n        else:\n            pytest.skip(\"Unsupported mode: {}\".format(mode))\n\n\ndef test_crossvalidation():\n    model_dir = 'faq'\n    conf_file = 'faq/fasttext_logreg.json'\n\n    download_config(conf_file)\n\n    c = test_configs_path / conf_file\n    model_path = download_path / model_dir\n\n    
install_config(c)\n    deep_download(c)\n    shutil.rmtree(str(model_path), ignore_errors=True)\n\n    logfile = io.BytesIO(b'')\n    p = pexpect.popen_spawn.PopenSpawn(sys.executable + f\" -m deeppavlov crossval {c} --folds 2\",\n                                       timeout=None, logfile=logfile)\n    p.readlines()\n    if p.wait() != 0:\n        raise RuntimeError('Training process of {} returned non-zero exit code: \\n{}'\n                           .format(model_dir, logfile.getvalue().decode()))\n\n    shutil.rmtree(str(download_path), ignore_errors=True)\n\n\ndef test_hashes_existence():\n    all_configs = list(src_dir.glob('**/*.json')) + list(test_src_dir.glob('**/*.json'))\n    url_root = 'http://files.deeppavlov.ai/'\n    downloads_urls = set()\n    for config in all_configs:\n        config = json.loads(config.read_text(encoding='utf-8'))\n        # TODO: replace with get downloads from config\n        # TODO: download only headers\n        # TODO: make requests in async mode\n        config_urls = {d if isinstance(d, str) else d['url'] for d in config.get('metadata', {}).get('download', [])}\n        downloads_urls |= {parse_value_with_config(url, config) for url in config_urls}\n    downloads_urls = [url + '.md5' for url in downloads_urls if url.startswith(url_root)]\n    messages = []\n\n    logging.getLogger(\"urllib3\").setLevel(logging.WARNING)\n\n    for url in downloads_urls:\n        status = requests.get(url).status_code\n        if status != 200:\n            messages.append(f'got status_code {status} for {url}')\n    if messages:\n        raise RuntimeError('\\n'.join(messages))\n\n\ndef test_aliases():\n    configs = list(src_dir.glob('**/*.json'))\n    config_names = [c.stem for c in configs]\n\n    assert len(config_names) == len(set(config_names)), 'Some model names are duplicated'\n\n    aliases_in_configs = set(ALIASES.keys()) & set(config_names)\n    assert aliases_in_configs == set(), f'Following model(s) marked as deprecated but 
still present in configs list: ' \\\n                                        f'{\", \".join(aliases_in_configs)}.'\n\n    alias_targets_not_in_configs = set(ALIASES.values()) - set(config_names)\n    assert alias_targets_not_in_configs == set(), f'Following model(s) marked as alias targets but there is no such ' \\\n                                                  f'config in the library: {\", \".join(alias_targets_not_in_configs)}'\n"
  },
  {
    "path": "utils/Docker/Dockerfile",
    "content": "ARG BASE_IMAGE\n\nFROM $BASE_IMAGE\n\nSHELL [\"/bin/bash\", \"-c\"]\n\nENV DP_PYTEST_API_PORT=5000\nENV DP_PYTEST_NO_CACHE=True\nENV LANG='en_US.UTF-8'\n\nARG DEBIAN_FRONTEND=noninteractive\nARG PYTHON_VERSION\n\nRUN rm -f /etc/apt/sources.list.d/cuda*.list && \\\n    apt update && \\\n    apt install -y --no-install-recommends \\\n        build-essential \\\n        dpkg-dev \\\n        gcc \\\n        git\t\\\n        libbz2-dev \\\n        libc6-dev \\\n        libexpat1-dev \\\n        libffi-dev \\\n        libgdbm-dev \\\n        liblzma-dev \\\n        libncursesw5-dev \\\n        libreadline-dev \\\n        libsqlite3-dev \\\n        libssl-dev \\\n        libxslt-dev \\\n        locales \\\n        make \\\n        pandoc \\\n        tk-dev \\\n        wget \\\n        xz-utils \\\n        zlib1g-dev && \\\n    locale-gen en_US.UTF-8 && \\\n    wget --no-check-certificate -O python.tar.xz https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tar.xz && \\\n    mkdir -p /usr/src/python && \\\n\ttar -xC /usr/src/python --strip-components=1 -f python.tar.xz && \\\n    rm python.tar.xz && \\\n    cd /usr/src/python && \\\n\t./configure && \\\n\tmake -j \"$(nproc)\" altinstall && \\\n\tln -s /usr/local/bin/python${PYTHON_VERSION%.*} /usr/local/bin/python && \\\n    ln -s /usr/local/bin/pip${PYTHON_VERSION%.*} /usr/local/bin/pip && \\\n    pip install --upgrade pip && \\\n    pip install pybind11==2.2.4 && \\\n    rm -rf /usr/src/python /var/lib/apt/lists/*\n\nWORKDIR /app\n\n# two commands to prevent caching of the next layers\nARG EPOCH\nENV EPOCH=$EPOCH\n\nCOPY . .\n\nCMD utils/Docker/cmd.sh\n"
  },
  {
    "path": "utils/Docker/README.md",
    "content": ""
  },
  {
    "path": "utils/Docker/cmd.sh",
    "content": "#!/bin/bash\n\nset -e\n\npip install .[tests,docs]\n\nrm -rf `find . -mindepth 1 -maxdepth 1 ! -name tests ! -name Jenkinsfile ! -name docs`\n\ncd docs\nmake clean\nmake html\ncd ..\n\nflake8 `python -c 'import deeppavlov; print(deeppavlov.__path__[0])'` --count --select=E9,F63,F7,F82 --show-source --statistics\n\npytest -v --disable-warnings --instafail $PYTEST_ARGS\n"
  },
  {
    "path": "utils/Docker/docker-compose.yml",
    "content": "version: '3.7'\nservices:\n  py36:\n    build:\n      context: ../../\n      dockerfile: utils/Docker/Dockerfile\n      args:\n        - EPOCH=$EPOCH\n        - PYTHON_VERSION=3.6.15\n        - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04\n    user: '${UID}:${GID}'\n    environment:\n      - CUDA_VISIBLE_DEVICES=$TEST_GPU_0\n      - PYTEST_ARGS=$PYTEST_ARGS\n      - DP_PYTEST_NO_CACHE=True\n  py37:\n    build:\n      context: ../../\n      dockerfile: utils/Docker/Dockerfile\n      args:\n        - EPOCH=$EPOCH\n        - PYTHON_VERSION=3.7.16\n        - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04\n    user: '${UID}:${GID}'\n    environment:\n      - CUDA_VISIBLE_DEVICES=$TEST_GPU_1\n      - PYTEST_ARGS=$PYTEST_ARGS\n      - DP_PYTEST_NO_CACHE=True\n  py38:\n    build:\n      context: ../../\n      dockerfile: utils/Docker/Dockerfile\n      args:\n        - EPOCH=$EPOCH\n        - PYTHON_VERSION=3.8.16\n        - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04\n    user: '${UID}:${GID}'\n    environment:\n      - CUDA_VISIBLE_DEVICES=$TEST_GPU_0\n      - PYTEST_ARGS=$PYTEST_ARGS\n      - DP_PYTEST_NO_CACHE=True\n  py39:\n    build:\n      context: ../../\n      dockerfile: utils/Docker/Dockerfile\n      args:\n        - EPOCH=$EPOCH\n        - PYTHON_VERSION=3.9.16\n        - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04\n    user: '${UID}:${GID}'\n    environment:\n      - CUDA_VISIBLE_DEVICES=$TEST_GPU_1\n      - PYTEST_ARGS=$PYTEST_ARGS\n      - DP_PYTEST_NO_CACHE=True\n  py310:\n    build:\n      context: ../../\n      dockerfile: utils/Docker/Dockerfile\n      args:\n        - EPOCH=$EPOCH\n        - PYTHON_VERSION=3.10.9\n        - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04\n    user: '${UID}:${GID}'\n    environment:\n      - CUDA_VISIBLE_DEVICES=$TEST_GPU_0\n      - PYTEST_ARGS=$PYTEST_ARGS\n      - DP_PYTEST_NO_CACHE=True\n  py311:\n    build:\n      context: ../../\n      
dockerfile: utils/Docker/Dockerfile\n      args:\n        - EPOCH=$EPOCH\n        - PYTHON_VERSION=3.11.6\n        - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04\n    user: '${UID}:${GID}'\n    environment:\n      - CUDA_VISIBLE_DEVICES=$TEST_GPU_1\n      - PYTEST_ARGS=$PYTEST_ARGS\n      - DP_PYTEST_NO_CACHE=True\n"
  },
  {
    "path": "utils/__init__.py",
    "content": ""
  },
  {
    "path": "utils/prepare/__init__.py",
    "content": ""
  },
  {
    "path": "utils/prepare/hashes.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\nimport gzip\nimport sys\nimport tarfile\nfrom hashlib import md5\nfrom pathlib import Path\nfrom typing import Dict, Optional, Union\nfrom zipfile import ZipFile\n\nfrom deeppavlov.core.data.utils import file_md5\n\n\ndef tar_md5(fpath: Union[str, Path], chunk_size: int = 2 ** 16) -> Dict[str, str]:\n    tar = tarfile.open(fpath)\n    res = {}\n    while True:\n        item: tarfile.TarInfo = tar.next()\n        if item is None:\n            break\n        if not item.isfile():\n            continue\n        file_hash = md5()\n        with tar.extractfile(item) as f:\n            for chunk in iter(lambda: f.read(chunk_size), b\"\"):\n                file_hash.update(chunk)\n        res[item.name] = file_hash.hexdigest()\n    return res\n\n\ndef gzip_md5(fpath: Union[str, Path], chunk_size: int = 2 ** 16) -> str:\n    file_hash = md5()\n    with gzip.open(fpath, 'rb') as f:\n        for chunk in iter(lambda: f.read(chunk_size), b\"\"):\n            file_hash.update(chunk)\n    return file_hash.hexdigest()\n\n\ndef zip_md5(fpath: Union[str, Path], chunk_size: int = 2 ** 16) -> Dict[str, str]:\n    res = {}\n    with ZipFile(fpath) as zip_f:\n        for item in zip_f.infolist():\n            if item.is_dir():\n                continue\n            file_hash = md5()\n            with zip_f.open(item) as f:\n               
 for chunk in iter(lambda: f.read(chunk_size), b\"\"):\n                    file_hash.update(chunk)\n            res[item.filename] = file_hash.hexdigest()\n    return res\n\n\ndef compute_hashes(fpath: Union[str, Path]) -> Dict[str, str]:\n    p = Path(fpath).expanduser()\n    if not p.is_file():\n        raise RuntimeError(f'{p} is not a file')\n\n    if '.tar' in {s.lower() for s in p.suffixes}:\n        hashes = tar_md5(p)\n    elif p.suffix.lower() == '.gz':\n        hashes = {p.with_suffix('').name: gzip_md5(p)}\n    elif p.suffix.lower() == '.zip':\n        hashes = zip_md5(p)\n    else:\n        hashes = {p.name: file_md5(p)}\n    return hashes\n\n\ndef main(fname: str, outfile: Optional[str] = None) -> None:\n    p = Path(fname).expanduser()\n    hashes = compute_hashes(p)\n\n    if outfile is None:\n        outfile = p.with_suffix(p.suffix + '.md5').open('w', encoding='utf-8')\n    elif outfile == '-':\n        outfile = sys.stdout\n    else:\n        outfile = Path(outfile).expanduser().open('w', encoding='utf-8')\n\n    for fname, fhash in hashes.items():\n        print(f'{fhash} *{fname}', file=outfile, flush=True)\n\n    if outfile is not sys.stdout:\n        outfile.close()\n\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"fname\", help=\"path to a file to compute hash for\", type=str)\n    parser.add_argument('-o', '--outfile', help='where to write the hashes', default=None, type=str)\n\n    args = parser.parse_args()\n    main(args.fname, args.outfile)\n"
  },
  {
    "path": "utils/prepare/optimize_ipynb.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\nimport logging\nfrom pathlib import Path\n\ntry:\n    import nbformat as nbf\nexcept ModuleNotFoundError:\n    raise ModuleNotFoundError(f\"Please, run `pip install nbformat==5.8.0` before using this script.\")\n\nlogging.basicConfig(level=logging.INFO, format=\"%(message)s\")\n\n\ndef merge_markdown(nb: nbf.notebooknode.NotebookNode) -> None:\n    \"\"\"Merges consequent markdown cells into one.\"\"\"\n    start_idx = None\n    slices = []\n    for i, cell in enumerate(nb[\"cells\"]):\n        if cell[\"cell_type\"] == \"markdown\":\n            if start_idx is None:\n                start_idx = i\n        else:\n            if start_idx is not None:\n                if i - start_idx > 1:\n                    slices.append(slice(start_idx, i))\n                start_idx = None\n    for sl in slices[::-1]:\n        nb[\"cells\"][sl.start][\"source\"] = \"\\n\\n\".join([c[\"source\"].rstrip() for c in nb[\"cells\"][sl]])\n        del nb[\"cells\"][sl.start + 1: sl.stop]  # nb[\"cells\"][sl] does not work properly\n\n\ndef drop_metadata(nb: nbf.notebooknode.NotebookNode) -> None:\n    \"\"\"Replaces notebook and cells metadata with empty dicts.\"\"\"\n    nb[\"metadata\"] = dict()\n    for i in range(len(nb[\"cells\"])):\n        nb[\"cells\"][i][\"metadata\"] = dict()\n\n\ndef update_file(path: Path, update_ckpts: bool) 
-> None:\n    \"\"\"Optimizes ipynb files in order to reduce further git diffs.\n    Args:\n        path: File to update, if this is file. If this is dir - recursively searches and updates .ipynb files in it.\n        update_ckpts: If False and path is dir, will skip all found ipynb files from .ipynb_checkpoints.\n    \"\"\"\n    if path.is_dir():\n        logging.info(f\"Updating .ipynb files in {path} dir\"\n                     f\"{', excluding files from .ipynb_checkpoints subdirs' if update_ckpts is False else ''}.\")\n        for f in path.rglob('*.ipynb'):\n            if update_ckpts is False and '.ipynb_checkpoints' in f.parts:\n                continue\n            update_file(f, update_ckpts)\n    else:\n        logging.info(f\"Updating {path}.\")\n        nb = nbf.read(path, nbf.NO_CONVERT)\n        merge_markdown(nb)\n        drop_metadata(nb)\n        with open(path, \"w\") as fout:\n            nbf.write(nb, fout)\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"fname\", help=\"path to an ipynb file to optimize\", type=Path)\n    parser.add_argument(\"--update-ckpts\", help=\"update checkpoints in .ipynb_checkpoints subdirs\", action=\"store_true\")\n    args = parser.parse_args()\n    update_file(args.fname.resolve(), args.update_ckpts)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "utils/prepare/registry.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport json\nimport pkgutil\nfrom importlib import import_module, reload\n\nimport deeppavlov\nfrom deeppavlov.core.common.metrics_registry import _registry_path as m_registry_path, _REGISTRY as M_REGISTRY\nfrom deeppavlov.core.common.registry import _registry_path as c_registry_path, _REGISTRY as C_REGISTRY\n\nif __name__ == '__main__':\n    C_REGISTRY.clear()\n    M_REGISTRY.clear()\n\n    for _, pkg_name, _ in pkgutil.walk_packages(deeppavlov.__path__, deeppavlov.__name__ + '.'):\n        if pkg_name not in ('deeppavlov.core.common.registry', 'deeppavlov.core.common.metrics_registry'):\n            reload(import_module(pkg_name))\n\n    with c_registry_path.open('w', encoding='utf-8') as f:\n        json.dump(dict(sorted(C_REGISTRY.items())), f, indent=2)\n\n    with m_registry_path.open('w', encoding='utf-8') as f:\n        json.dump(dict(sorted(M_REGISTRY.items())), f, indent=2)\n"
  },
  {
    "path": "utils/prepare/upload.py",
    "content": "# Copyright 2017 Neural Networks and Deep Learning lab, MIPT\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\nimport pathlib\nimport tarfile\nfrom pathlib import Path\n\nfrom deeppavlov.core.commands.utils import parse_config\nfrom deeppavlov.core.common.file import find_config\nfrom hashes import main\n\n\ndef upload(config_in_file: str, tar_name: str, tar_output_dir: Path):\n    if not tar_output_dir.exists():\n        raise RuntimeError(f'A folder {tar_output_dir} does not exist')\n\n    print(f'Config: {config_in_file}')\n    if not Path(config_in_file).exists():\n        raise RuntimeError(f'A config {config_in_file} does not exist')\n\n    config_in = parse_config(config_in_file)\n    config_in_file = find_config(config_in_file)\n\n    model_path = Path(config_in['metadata']['variables']['MODEL_PATH']).expanduser()\n    model_name, class_name = config_in_file.stem, config_in_file.parent.name\n\n    if tar_name is None:\n        tar_name = f'{model_name}'\n        print(f'tar_name set to {tar_name}')\n\n    full_tar_name = tar_output_dir / f'{tar_name}.tar.gz'\n    if Path(full_tar_name).exists():\n        raise RuntimeError(f'An archive {Path(full_tar_name)} already exists')\n\n    print(f'model_path: {model_path}')\n    print(f'class_name: {class_name}')\n    print(f'model_name: {model_name}')\n    print(f'Start tarring to {full_tar_name}')\n    with tarfile.open(str(full_tar_name), \"w|gz\") as archive:\n        
archive.add(model_path, arcname=pathlib.os.sep)\n\n    print(\"Stop tarring\")\n    print(f'Tar archive: {Path(full_tar_name)} has been created')\n\n    print(\"Calculating hash\")\n    main(full_tar_name)\n\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser()\n    parser.add_argument('-c', '--config_in', help='path to a config', type=str)\n    parser.add_argument('-n', '--tar_name', help='name of the tar archive (without tar.gz extension)',\n                        default=None, required=False, type=str)\n    parser.add_argument('-o', '--tar_output_dir', help='dir to save a tar archive', default='./',\n                        required=False, type=Path)\n    args = parser.parse_args()\n    upload(args.config_in, args.tar_name, args.tar_output_dir)\n"
  }
]