Repository: deeppavlov/DeepPavlov
Branch: master
Commit: 5f9fbed0c719
Files: 411
Total size: 1.8 MB

Directory structure:
gitextract__x5jpadh/

├── .github/
│   └── ISSUE_TEMPLATE/
│       ├── bug_report.md
│       ├── config.yml
│       └── feature-request.md
├── .gitignore
├── .readthedocs.yml
├── CNAME
├── Jenkinsfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── _config.yml
├── _layouts/
│   └── default.html
├── deeppavlov/
│   ├── __init__.py
│   ├── __main__.py
│   ├── _meta.py
│   ├── configs/
│   │   ├── __init__.py
│   │   ├── classifiers/
│   │   │   ├── boolqa_rubert.json
│   │   │   ├── few_shot_roberta.json
│   │   │   ├── glue/
│   │   │   │   ├── glue_cola_roberta.json
│   │   │   │   ├── glue_mnli_cased_bert_torch.json
│   │   │   │   ├── glue_mnli_mm_cased_bert_torch.json
│   │   │   │   ├── glue_mnli_roberta.json
│   │   │   │   ├── glue_mrpc_roberta.json
│   │   │   │   ├── glue_qnli_roberta.json
│   │   │   │   ├── glue_qqp_roberta.json
│   │   │   │   ├── glue_rte_cased_bert_torch.json
│   │   │   │   ├── glue_rte_roberta_mnli.json
│   │   │   │   ├── glue_sst2_roberta.json
│   │   │   │   ├── glue_stsb_roberta.json
│   │   │   │   └── glue_wnli_roberta.json
│   │   │   ├── insults_kaggle_bert.json
│   │   │   ├── paraphraser_convers_distilrubert_2L.json
│   │   │   ├── paraphraser_convers_distilrubert_6L.json
│   │   │   ├── paraphraser_rubert.json
│   │   │   ├── query_pr.json
│   │   │   ├── rusentiment_bert.json
│   │   │   ├── rusentiment_convers_bert.json
│   │   │   ├── rusentiment_convers_distilrubert_2L.json
│   │   │   ├── rusentiment_convers_distilrubert_6L.json
│   │   │   ├── sentiment_sst_conv_bert.json
│   │   │   ├── sentiment_twitter.json
│   │   │   ├── superglue/
│   │   │   │   ├── superglue_boolq_roberta_mnli.json
│   │   │   │   ├── superglue_copa_roberta.json
│   │   │   │   ├── superglue_record_roberta.json
│   │   │   │   └── superglue_wic_bert.json
│   │   │   └── topics_distilbert_base_uncased.json
│   │   ├── doc_retrieval/
│   │   │   ├── en_ranker_pop_wiki.json
│   │   │   ├── en_ranker_tfidf_wiki.json
│   │   │   └── ru_ranker_tfidf_wiki.json
│   │   ├── embedder/
│   │   │   ├── bert_embedder.json
│   │   │   └── bert_sentence_embedder.json
│   │   ├── entity_extraction/
│   │   │   ├── entity_detection_en.json
│   │   │   ├── entity_detection_ru.json
│   │   │   ├── entity_extraction_en.json
│   │   │   ├── entity_extraction_ru.json
│   │   │   ├── entity_linking_en.json
│   │   │   └── entity_linking_ru.json
│   │   ├── faq/
│   │   │   └── fasttext_logreg.json
│   │   ├── kbqa/
│   │   │   ├── kbqa_cq_en.json
│   │   │   ├── kbqa_cq_ru.json
│   │   │   └── wiki_parser.json
│   │   ├── morpho_syntax_parser/
│   │   │   ├── morpho_ru_syntagrus_bert.json
│   │   │   ├── ru_syntagrus_joint_parsing.json
│   │   │   └── syntax_ru_syntagrus_bert.json
│   │   ├── multitask/
│   │   │   ├── mt_glue.json
│   │   │   └── multitask_example.json
│   │   ├── ner/
│   │   │   ├── ner_bert_base.json
│   │   │   ├── ner_case_agnostic_mdistilbert.json
│   │   │   ├── ner_collection3_bert.json
│   │   │   ├── ner_conll2003_bert.json
│   │   │   ├── ner_conll2003_deberta_crf.json
│   │   │   ├── ner_ontonotes_bert.json
│   │   │   ├── ner_ontonotes_bert_mult.json
│   │   │   ├── ner_ontonotes_deberta_crf.json
│   │   │   ├── ner_rus_bert.json
│   │   │   ├── ner_rus_bert_probas.json
│   │   │   ├── ner_rus_convers_distilrubert_2L.json
│   │   │   └── ner_rus_convers_distilrubert_6L.json
│   │   ├── odqa/
│   │   │   ├── en_odqa_infer_wiki.json
│   │   │   ├── en_odqa_pop_infer_wiki.json
│   │   │   └── ru_odqa_infer_wiki.json
│   │   ├── ranking/
│   │   │   ├── path_ranking_nll_roberta_en.json
│   │   │   ├── ranking_ubuntu_v2_torch_bert_uncased.json
│   │   │   ├── rel_ranking_nll_bert_ru.json
│   │   │   └── rel_ranking_roberta_en.json
│   │   ├── regressors/
│   │   │   └── translation_ranker.json
│   │   ├── relation_extraction/
│   │   │   ├── re_docred.json
│   │   │   └── re_rured.json
│   │   ├── russian_super_glue/
│   │   │   ├── russian_superglue_danetqa_rubert.json
│   │   │   ├── russian_superglue_lidirus_rubert.json
│   │   │   ├── russian_superglue_muserc_rubert.json
│   │   │   ├── russian_superglue_parus_rubert.json
│   │   │   ├── russian_superglue_rcb_rubert.json
│   │   │   ├── russian_superglue_rucos_rubert.json
│   │   │   ├── russian_superglue_russe_rubert.json
│   │   │   ├── russian_superglue_rwsd_rubert.json
│   │   │   └── russian_superglue_terra_rubert.json
│   │   ├── sentence_segmentation/
│   │   │   └── sentseg_dailydialog_bert.json
│   │   ├── spelling_correction/
│   │   │   ├── brillmoore_wikitypos_en.json
│   │   │   └── levenshtein_corrector_ru.json
│   │   └── squad/
│   │       ├── qa_multisberquad_bert.json
│   │       ├── qa_nq_psgcls_bert.json
│   │       ├── qa_squad2_bert.json
│   │       ├── squad_bert.json
│   │       ├── squad_ru_bert.json
│   │       ├── squad_ru_convers_distilrubert_2L.json
│   │       └── squad_ru_convers_distilrubert_6L.json
│   ├── core/
│   │   ├── __init__.py
│   │   ├── commands/
│   │   │   ├── __init__.py
│   │   │   ├── infer.py
│   │   │   ├── train.py
│   │   │   └── utils.py
│   │   ├── common/
│   │   │   ├── __init__.py
│   │   │   ├── aliases.py
│   │   │   ├── base.py
│   │   │   ├── chainer.py
│   │   │   ├── cross_validation.py
│   │   │   ├── errors.py
│   │   │   ├── file.py
│   │   │   ├── log.py
│   │   │   ├── log_events.py
│   │   │   ├── metrics_registry.json
│   │   │   ├── metrics_registry.py
│   │   │   ├── params.py
│   │   │   ├── params_search.py
│   │   │   ├── paths.py
│   │   │   ├── prints.py
│   │   │   ├── registry.json
│   │   │   ├── registry.py
│   │   │   └── requirements_registry.json
│   │   ├── data/
│   │   │   ├── __init__.py
│   │   │   ├── data_fitting_iterator.py
│   │   │   ├── data_learning_iterator.py
│   │   │   ├── dataset_reader.py
│   │   │   ├── simple_vocab.py
│   │   │   └── utils.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── component.py
│   │   │   ├── estimator.py
│   │   │   ├── nn_model.py
│   │   │   ├── serializable.py
│   │   │   └── torch_model.py
│   │   └── trainers/
│   │       ├── __init__.py
│   │       ├── fit_trainer.py
│   │       ├── nn_trainer.py
│   │       ├── torch_trainer.py
│   │       └── utils.py
│   ├── dataset_iterators/
│   │   ├── __init__.py
│   │   ├── basic_classification_iterator.py
│   │   ├── huggingface_dataset_iterator.py
│   │   ├── morphotagger_iterator.py
│   │   ├── multitask_iterator.py
│   │   ├── siamese_iterator.py
│   │   ├── sqlite_iterator.py
│   │   ├── squad_iterator.py
│   │   └── typos_iterator.py
│   ├── dataset_readers/
│   │   ├── __init__.py
│   │   ├── basic_classification_reader.py
│   │   ├── boolqa_reader.py
│   │   ├── conll2003_reader.py
│   │   ├── docred_reader.py
│   │   ├── faq_reader.py
│   │   ├── huggingface_dataset_reader.py
│   │   ├── imdb_reader.py
│   │   ├── line_reader.py
│   │   ├── morphotagging_dataset_reader.py
│   │   ├── multitask_reader.py
│   │   ├── odqa_reader.py
│   │   ├── paraphraser_reader.py
│   │   ├── rel_ranking_reader.py
│   │   ├── rured_reader.py
│   │   ├── sq_reader.py
│   │   ├── squad_dataset_reader.py
│   │   ├── typos_reader.py
│   │   └── ubuntu_v2_reader.py
│   ├── deep.py
│   ├── download.py
│   ├── metrics/
│   │   ├── __init__.py
│   │   ├── accuracy.py
│   │   ├── bleu.py
│   │   ├── correlation.py
│   │   ├── elmo_metrics.py
│   │   ├── fmeasure.py
│   │   ├── google_bleu.py
│   │   ├── log_loss.py
│   │   ├── mse.py
│   │   ├── recall_at_k.py
│   │   ├── record_metrics.py
│   │   ├── roc_auc_score.py
│   │   └── squad_metrics.py
│   ├── models/
│   │   ├── __init__.py
│   │   ├── api_requester/
│   │   │   ├── __init__.py
│   │   │   ├── api_requester.py
│   │   │   └── api_router.py
│   │   ├── classifiers/
│   │   │   ├── __init__.py
│   │   │   ├── cos_sim_classifier.py
│   │   │   ├── dnnc_proba2labels.py
│   │   │   ├── proba2labels.py
│   │   │   ├── re_bert.py
│   │   │   ├── torch_classification_model.py
│   │   │   ├── torch_nets.py
│   │   │   └── utils.py
│   │   ├── doc_retrieval/
│   │   │   ├── __init__.py
│   │   │   ├── bpr.py
│   │   │   ├── logit_ranker.py
│   │   │   ├── pop_ranker.py
│   │   │   ├── tfidf_ranker.py
│   │   │   └── utils.py
│   │   ├── embedders/
│   │   │   ├── __init__.py
│   │   │   ├── abstract_embedder.py
│   │   │   ├── fasttext_embedder.py
│   │   │   ├── tfidf_weighted_embedder.py
│   │   │   └── transformers_embedder.py
│   │   ├── entity_extraction/
│   │   │   ├── __init__.py
│   │   │   ├── entity_detection_parser.py
│   │   │   ├── entity_linking.py
│   │   │   ├── find_word.py
│   │   │   └── ner_chunker.py
│   │   ├── kbqa/
│   │   │   ├── __init__.py
│   │   │   ├── query_generator.py
│   │   │   ├── query_generator_base.py
│   │   │   ├── rel_ranking_infer.py
│   │   │   ├── ru_adj_to_noun.py
│   │   │   ├── sentence_answer.py
│   │   │   ├── template_matcher.py
│   │   │   ├── tree_to_sparql.py
│   │   │   ├── type_define.py
│   │   │   ├── utils.py
│   │   │   └── wiki_parser.py
│   │   ├── morpho_syntax_parser/
│   │   │   ├── __init__.py
│   │   │   ├── dependency_decoding.py
│   │   │   ├── joint.py
│   │   │   ├── spacy_lemmatizer.py
│   │   │   └── syntax_parsing.py
│   │   ├── preprocessors/
│   │   │   ├── __init__.py
│   │   │   ├── dirty_comments_preprocessor.py
│   │   │   ├── dnnc_preprocessor.py
│   │   │   ├── mask.py
│   │   │   ├── multitask_preprocessor.py
│   │   │   ├── ner_preprocessor.py
│   │   │   ├── odqa_preprocessors.py
│   │   │   ├── one_hotter.py
│   │   │   ├── re_preprocessor.py
│   │   │   ├── response_base_loader.py
│   │   │   ├── sanitizer.py
│   │   │   ├── sentseg_preprocessor.py
│   │   │   ├── squad_preprocessor.py
│   │   │   ├── str_lower.py
│   │   │   ├── str_token_reverser.py
│   │   │   ├── str_utf8_encoder.py
│   │   │   ├── torch_transformers_preprocessor.py
│   │   │   └── transformers_preprocessor.py
│   │   ├── ranking/
│   │   │   ├── __init__.py
│   │   │   └── metrics.py
│   │   ├── relation_extraction/
│   │   │   ├── __init__.py
│   │   │   ├── losses.py
│   │   │   └── relation_extraction_bert.py
│   │   ├── sklearn/
│   │   │   ├── __init__.py
│   │   │   └── sklearn_component.py
│   │   ├── spelling_correction/
│   │   │   ├── __init__.py
│   │   │   ├── brillmoore/
│   │   │   │   ├── __init__.py
│   │   │   │   └── error_model.py
│   │   │   ├── electors/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── kenlm_elector.py
│   │   │   │   └── top1_elector.py
│   │   │   └── levenshtein/
│   │   │       ├── __init__.py
│   │   │       ├── levenshtein_searcher.py
│   │   │       ├── searcher_component.py
│   │   │       └── tabled_trie.py
│   │   ├── tokenizers/
│   │   │   ├── __init__.py
│   │   │   ├── lazy_tokenizer.py
│   │   │   ├── nltk_moses_tokenizer.py
│   │   │   ├── nltk_tokenizer.py
│   │   │   ├── spacy_tokenizer.py
│   │   │   ├── split_tokenizer.py
│   │   │   └── utils.py
│   │   ├── torch_bert/
│   │   │   ├── __init__.py
│   │   │   ├── crf.py
│   │   │   ├── multitask_transformer.py
│   │   │   ├── torch_bert_ranker.py
│   │   │   ├── torch_transformers_classifier.py
│   │   │   ├── torch_transformers_el_ranker.py
│   │   │   ├── torch_transformers_multiplechoice.py
│   │   │   ├── torch_transformers_nll_ranking.py
│   │   │   ├── torch_transformers_sequence_tagger.py
│   │   │   ├── torch_transformers_squad.py
│   │   │   └── torch_transformers_syntax_parser.py
│   │   └── vectorizers/
│   │       ├── __init__.py
│   │       └── hashing_tfidf_vectorizer.py
│   ├── paramsearch.py
│   ├── requirements/
│   │   ├── datasets.txt
│   │   ├── dependency_decoding.txt
│   │   ├── en_core_web_sm.txt
│   │   ├── faiss.txt
│   │   ├── fasttext.txt
│   │   ├── hdt.txt
│   │   ├── kenlm.txt
│   │   ├── lxml.txt
│   │   ├── opt_einsum.txt
│   │   ├── protobuf.txt
│   │   ├── pytorch.txt
│   │   ├── rapidfuzz.txt
│   │   ├── razdel.txt
│   │   ├── ru_core_news_sm.txt
│   │   ├── sacremoses.txt
│   │   ├── sentencepiece.txt
│   │   ├── slovnet.txt
│   │   ├── sortedcontainers.txt
│   │   ├── torchcrf.txt
│   │   ├── transformers.txt
│   │   ├── udapi.txt
│   │   └── whapi.txt
│   ├── settings.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── benchmarks/
│   │   │   ├── __init__.py
│   │   │   └── benchmarks.py
│   │   ├── connector/
│   │   │   ├── __init__.py
│   │   │   └── dialog_logger.py
│   │   ├── pip_wrapper/
│   │   │   ├── __init__.py
│   │   │   └── pip_wrapper.py
│   │   ├── server/
│   │   │   ├── __init__.py
│   │   │   ├── metrics.py
│   │   │   └── server.py
│   │   ├── settings/
│   │   │   ├── __init__.py
│   │   │   ├── dialog_logger_config.json
│   │   │   ├── log_config.json
│   │   │   └── server_config.json
│   │   └── socket/
│   │       ├── __init__.py
│   │       └── socket.py
│   └── vocabs/
│       ├── __init__.py
│       ├── typos.py
│       └── wiki_sqlite.py
├── docs/
│   ├── Makefile
│   ├── _static/
│   │   ├── deeppavlov.css
│   │   └── my_blocks.css
│   ├── _templates/
│   │   └── footer.html
│   ├── apiref/
│   │   ├── core/
│   │   │   ├── commands.rst
│   │   │   ├── common.rst
│   │   │   ├── data.rst
│   │   │   ├── models.rst
│   │   │   └── trainers.rst
│   │   ├── core.rst
│   │   ├── dataset_iterators.rst
│   │   ├── dataset_readers.rst
│   │   ├── metrics.rst
│   │   ├── models/
│   │   │   ├── api_requester.rst
│   │   │   ├── classifiers.rst
│   │   │   ├── doc_retrieval.rst
│   │   │   ├── embedders.rst
│   │   │   ├── entity_extraction.rst
│   │   │   ├── kbqa.rst
│   │   │   ├── preprocessors.rst
│   │   │   ├── relation_extraction.rst
│   │   │   ├── sklearn.rst
│   │   │   ├── spelling_correction.rst
│   │   │   ├── tokenizers.rst
│   │   │   ├── torch_bert.rst
│   │   │   └── vectorizers.rst
│   │   ├── models.rst
│   │   └── vocabs.rst
│   ├── conf.py
│   ├── devguides/
│   │   ├── contribution_guide.rst
│   │   └── registry.rst
│   ├── features/
│   │   ├── hypersearch.rst
│   │   ├── models/
│   │   │   ├── KBQA.ipynb
│   │   │   ├── NER.ipynb
│   │   │   ├── ODQA.ipynb
│   │   │   ├── SQuAD.ipynb
│   │   │   ├── bert.rst
│   │   │   ├── classification.ipynb
│   │   │   ├── entity_extraction.ipynb
│   │   │   ├── few_shot_classification.ipynb
│   │   │   ├── morpho_tagger.ipynb
│   │   │   ├── multitask_bert.rst
│   │   │   ├── neural_ranking.ipynb
│   │   │   ├── popularity_ranking.rst
│   │   │   ├── relation_extraction.ipynb
│   │   │   ├── spelling_correction.ipynb
│   │   │   ├── superglue.rst
│   │   │   ├── syntax_parser.ipynb
│   │   │   └── tfidf_ranking.ipynb
│   │   ├── overview.rst
│   │   └── pretrained_vectors.rst
│   ├── index.rst
│   ├── integrations/
│   │   ├── aws_ec2.rst
│   │   ├── rest_api.rst
│   │   ├── settings.rst
│   │   └── socket_api.rst
│   ├── internships/
│   │   └── internships.rst
│   └── intro/
│       ├── configuration.rst
│       ├── installation.rst
│       ├── overview.rst
│       ├── python.ipynb
│       └── quick_start.rst
├── requirements.txt
├── setup.py
├── tests/
│   ├── __init__.py
│   ├── test_configs/
│   │   └── doc_retrieval/
│   │       ├── en_ranker_pop_wiki_test.json
│   │       ├── en_ranker_tfidf_wiki_test.json
│   │       └── ru_ranker_tfidf_wiki_test.json
│   └── test_quick_start.py
└── utils/
    ├── Docker/
    │   ├── Dockerfile
    │   ├── README.md
    │   ├── cmd.sh
    │   └── docker-compose.yml
    ├── __init__.py
    └── prepare/
        ├── __init__.py
        ├── hashes.py
        ├── optimize_ipynb.py
        ├── registry.py
        └── upload.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Report a bug you encountered
title: ''
labels: bug
assignees: ''

---

Want to contribute to DeepPavlov? Please read the [contributing guideline](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html) first.

Please provide all the information requested below; otherwise, your issue may be closed without warning.


**DeepPavlov version** (you can look it up by running `pip show deeppavlov`):

**Python version**:

**Operating system** (ubuntu linux, windows, ...):

**Issue**:


**Content or a name of a configuration file**:
```

```


**Command that led to error**:
```

```

**Error (including full traceback)**:
```

```


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
  - name: Ask a question
    url: https://forum.deeppavlov.ai/
    about: If you have a different question, please ask it on the forum at https://forum.deeppavlov.ai


================================================
FILE: .github/ISSUE_TEMPLATE/feature-request.md
================================================
---
name: Feature request
about: Suggest a feature to improve the DeepPavlov library
title: ''
labels: enhancement
assignees: ''

---

Want to contribute to DeepPavlov? Please read the [contributing guideline](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html) first.


**What problem are we trying to solve?**:
```

```

**How can we solve it?**:
```

```

**Are there other issues that block this solution?**:
```

```


================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

#IDEA
.idea/

#Atom IDE
.ftpconfig

#vscode IDE
.vscode

# Vim
*.vim
*.vimrc

#GIT
.git/

#Default usr dir
download/

#project test
/test/
.pytest_cache

# project data
/data/

# local dockerfiles
/Dockerfile
/entrypoint.sh
/.dockerignore


================================================
FILE: .readthedocs.yml
================================================
# .readthedocs.yml
version: 2

build:
  os: "ubuntu-20.04"
  tools:
    python: "3.10"
formats: []

python:
  install:
    - method: pip
      path: .
      extra_requirements:
        - docs


================================================
FILE: CNAME
================================================
deeppavlov.ai

================================================
FILE: Jenkinsfile
================================================
// CI pipeline. On an agent labelled 'cuda-module' it cleans the workspace, checks out
// the sources from the job's SCM, builds the docker-compose images and brings up the
// py36..py311 services in three batches, failing the build if any container reports a
// non-zero exit status. Containers and the compose network are removed and a
// notification e-mail is sent whether the build succeeded or not.
node('cuda-module') {
    timestamps {
        try {
            stage('Clean') {
                // Remove everything in the workspace. The two dot-globs cover hidden
                // entries ('.x' and '.xy...') without matching '.' or '..'.
                sh "rm -rf .[^.] .??* *"
            }
            stage('Checkout') {
                checkout scm
            }
            stage('Setup') {
                env.TFHUB_CACHE_DIR="tfhub_cache"
                // '\$' is escaped so that $(date +%s) is evaluated by the shell, while
                // $BUILD_TAG is interpolated by Groovy before the script runs.
                // EPOCH is exported to docker-compose for the build only; presumably the
                // compose file/Dockerfile uses it to invalidate cached layers -- TODO confirm.
                sh """
                    EPOCH=\$(date +%s) docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG build
                """
            }
            stage('Tests') {
                // The services are brought up two at a time. After each batch,
                // `ps | grep Exit | grep -v 'Exit 0' && exit 1` aborts the script with
                // status 1 when some container reports an exit status other than 0.
                // The trailing `|| exit 0` on the last line makes the script succeed when
                // the greps match nothing (grep itself returns non-zero on no match).
                sh """
                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py36 py37
                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1
                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py38 py39
                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1
                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py310 py311
                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 || exit 0
                """
                currentBuild.result = 'SUCCESS'
            }
        }
        catch(e) {
            // Record the failure for the notification below, then re-throw so the
            // build itself is still marked as failed.
            currentBuild.result = 'FAILURE'
            throw e
        }
        finally {
            // Cleanup runs on both success and failure: remove the project's containers
            // and its network, whose name is the lower-cased BUILD_TAG plus '_default'.
            // NOTE(review): if this cleanup script fails, the exception is raised from
            // `finally`, so the e-mail step below is skipped -- confirm this is acceptable.
            sh """
                docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG rm -f
                docker network rm \$(echo $BUILD_TAG | awk '{print tolower(\$0)}')_default
            """
            // `to` (escaped '\$') and `body` (single-quoted) reach emailext as literal
            // ${...} tokens, presumably expanded by the e-mail plugin itself -- TODO
            // confirm; `subject` is interpolated by Groovy here.
            emailext to: "\${DEFAULT_RECIPIENTS}",
                subject: "${env.JOB_NAME} - Build # ${currentBuild.number} - ${currentBuild.result}!",
                body: '${BRANCH_NAME} - ${BUILD_URL}',
                attachLog: true
        }
    }
}


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2018 Neural Systems and Deep Learning Laboratory
                  Moscow Institute of Physics and Technology

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: MANIFEST.in
================================================
include README.md
include LICENSE
include requirements.txt
include deeppavlov/requirements/*.txt
recursive-include deeppavlov *.json
recursive-include deeppavlov *.md


================================================
FILE: README.md
================================================
# DeepPavlov 1.0

[![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE)
![Python 3.6, 3.7, 3.8, 3.9, 3.10, 3.11](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-green.svg)
[![Downloads](https://pepy.tech/badge/deeppavlov)](https://pepy.tech/project/deeppavlov)
[![Static Badge](https://img.shields.io/badge/DeepPavlov%20Community-blue)](https://forum.deeppavlov.ai/)
[![Static Badge](https://img.shields.io/badge/DeepPavlov%20Demo-blue)](https://demo.deeppavlov.ai/)


DeepPavlov 1.0 is an open-source NLP framework built on [PyTorch](https://pytorch.org/) and [transformers](https://github.com/huggingface/transformers). DeepPavlov 1.0 is created for modular and configuration-driven development of state-of-the-art NLP models and supports a wide range of NLP model applications. DeepPavlov 1.0 is designed for practitioners with limited knowledge of NLP/ML.

## Quick Links

|Name|Description|
|--|--|
| ⭐️ [*Demo*](https://demo.deeppavlov.ai/)|Check out our NLP models in the online demo|
| 📚 [*Documentation*](http://docs.deeppavlov.ai/)|How to use DeepPavlov 1.0 and its features|
| 🚀 [*Model List*](http://docs.deeppavlov.ai/en/master/features/overview.html)|Find the NLP model you need in the list of available models|
| 🪐 [*Contribution Guide*](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html)|Please read the contribution guidelines before making a contribution|
| 🎛 [*Issues*](https://github.com/deeppavlov/DeepPavlov/issues)|If you have an issue with DeepPavlov, please let us know|
| ⏩ [*Forum*](https://forum.deeppavlov.ai/)|Please let us know if you have a problem with DeepPavlov|
| 📦 [*Blogs*](https://medium.com/deeppavlov)|Read about our current development|
| 🦙 [Extended colab tutorials](https://github.com/deeppavlov/dp_tutorials)|Check out the code tutorials for our models|
| 🌌 [*Docker Hub*](https://hub.docker.com/u/deeppavlov/)|Check out the Docker images for rapid deployment|
| 👩‍🏫 [*Feedback*](https://forms.gle/i64fowQmiVhMMC7f9)|Please leave us your feedback to make DeepPavlov better|


## Installation

0. DeepPavlov supports `Linux`, `Windows 10+` (through WSL/WSL2), `MacOS` (Big Sur+) platforms, `Python 3.6`, `3.7`, `3.8`, `3.9`, `3.10` and `3.11`.
    Depending on the model used, you may need from 4 to 16 GB RAM.

1. Create and activate a virtual environment:
    * `Linux`

    ```
    python -m venv env
    source ./env/bin/activate
    ```

2. Install the package inside the environment:

    ```
    pip install deeppavlov
    ```

## QuickStart

There is a bunch of great pre-trained NLP models in DeepPavlov. Each model is
determined by its config file.

The list of models is available on
[the doc page](http://docs.deeppavlov.ai/en/master/features/overview.html) or in
`deeppavlov.configs` (Python):

```python
from deeppavlov import configs
```

Once you have decided on a model (and its config file), there are two ways to train,
evaluate and run inference with it:

* via [Command line interface (CLI)](#command-line-interface-cli) and
* via [Python](#python).

#### GPU requirements

By default, DeepPavlov installs model requirements from PyPI. The PyTorch build from PyPI may not support your device's CUDA
capability. To run supported DeepPavlov models on a GPU, you need a [CUDA](https://developer.nvidia.com/cuda-toolkit) version
compatible with your GPU and the [PyTorch version](deeppavlov/requirements/pytorch.txt) required by DeepPavlov models.
See [docs](https://docs.deeppavlov.ai/en/master/intro/quick_start.html#using-gpu) for details.
GPU with Pascal or newer architecture and 4+ GB VRAM is recommended.

### Command line interface (CLI)

To get predictions from a model interactively through CLI, run

```bash
python -m deeppavlov interact <config_path> [-d] [-i]
```

* `-d` downloads required data - pretrained model files and embeddings (optional).
* `-i` installs model requirements (optional).

You can train it in the same simple way:

```bash
python -m deeppavlov train <config_path> [-d] [-i]
```

The dataset will be downloaded regardless of whether the `-d` flag is set.

To train on your own data, you need to modify the dataset reader path in the config file as described in the
[train config doc](http://docs.deeppavlov.ai/en/master/intro/config_description.html#train-config).
The data format is specified in the corresponding model doc page.

There are even more actions you can perform with configs:

```bash
python -m deeppavlov <action> <config_path> [-d] [-i]
```

* `<action>` can be
  * `install` to install model requirements (same as `-i`),
  * `download` to download model's data (same as `-d`),
  * `train` to train the model on the data specified in the config file,
  * `evaluate` to calculate metrics on the same dataset,
  * `interact` to interact via CLI,
  * `riseapi` to run a REST API server (see
    [doc](http://docs.deeppavlov.ai/en/master/integrations/rest_api.html)),
  * `predict` to get prediction for samples from *stdin* or from
      *<file_path>* if `-f <file_path>` is specified.
* `<config_path>` specifies path (or name) of model's config file
* `-d` downloads required data
* `-i` installs model requirements

### Python

To get predictions from a model interactively through Python, run

```python
from deeppavlov import build_model

model = build_model(<config_path>, install=True, download=True)

# get predictions for 'input_text1', 'input_text2'
model(['input_text1', 'input_text2'])
```

where

* `install=True` installs model requirements (optional),
* `download=True` downloads required data from web - pretrained model files and embeddings (optional),
* `<config_path>` is model name (e.g. `'ner_ontonotes_bert_mult'`), path to the chosen model's config file (e.g.
  `"deeppavlov/configs/ner/ner_ontonotes_bert_mult.json"`),  or `deeppavlov.configs` attribute (e.g.
  `deeppavlov.configs.ner.ner_ontonotes_bert_mult` without quotation marks).

You can train it in the same simple way:

```python
from deeppavlov import train_model 

model = train_model(<config_path>, install=True, download=True)
```

To train on your own data, you need to modify the dataset reader path in the config file as described in the
[train config doc](http://docs.deeppavlov.ai/en/master/intro/config_description.html#train-config).
The data format is specified in the corresponding model doc page.

You can also calculate metrics on the dataset specified in your config file:

```python
from deeppavlov import evaluate_model 

model = evaluate_model(<config_path>, install=True, download=True)
```

DeepPavlov also [allows](https://docs.deeppavlov.ai/en/master/intro/python.html) you to build a model from components for
inference using Python.

## License

DeepPavlov is licensed under Apache 2.0.

## Citation
```
@inproceedings{savkin-etal-2024-deeppavlov,
    title = "DeepPavlov 1.0: Your Gateway to Advanced NLP Models Backed by Transformers and Transfer Learning",
    author = "Savkin Maksim and Voznyuk Anastasia and Ignatov Fedor and Korzanova Anna and Karpov Dmitry and Popov Alexander and Konovalov Vasily",
    editor = "Hernandez Farias and Delia Irazu and Hope Tom and Li Manling",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.emnlp-demo.47",
    pages = "465--474",
    abstract = "We present DeepPavlov 1.0, an open-source framework for using Natural Language Processing (NLP) models by leveraging transfer learning techniques. DeepPavlov 1.0 is created for modular and configuration-driven development of state-of-the-art NLP models and supports a wide range of NLP model applications. DeepPavlov 1.0 is designed for practitioners with limited knowledge of NLP/ML. DeepPavlov is based on PyTorch and supports HuggingFace transformers. DeepPavlov is publicly released under the Apache 2.0 license and provides access to an online demo.",
}
```


================================================
FILE: _config.yml
================================================
theme: jekyll-theme-leap-day
google_analytics: UA-139843736-5
include:
  - _static


================================================
FILE: _layouts/default.html
================================================
<!doctype html>
<html lang="{{ site.lang | default: "en-US" }}">
  <head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">

{% seo %}
    <link rel="stylesheet" href="{{ '/assets/css/style.css?v=' | append: site.github.build_revision | relative_url }}">
    <script src="https://code.jquery.com/jquery-3.3.0.min.js" integrity="sha256-RTQy8VOmNlT6b2PIRur37p6JEBZUE7o8wPgMvu18MC4=" crossorigin="anonymous"></script>
    <script src="{{ '/assets/js/main.js' | relative_url }}"></script>
    <!--[if lt IE 9]>
      <script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv.min.js" integrity="sha256-3Jy/GbSLrg0o9y5Z5n1uw0qxZECH7C6OQpVBgNFYa0g=" crossorigin="anonymous"></script>
    <![endif]-->
    <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
    
    <link rel="stylesheet" type="text/css" href="//cdnjs.cloudflare.com/ajax/libs/cookieconsent2/3.1.0/cookieconsent.min.css" />
    <script src="//cdnjs.cloudflare.com/ajax/libs/cookieconsent2/3.1.0/cookieconsent.min.js"></script>
    <script>
        // Initialise the cookie-consent popup once the page has finished loading.
        window.addEventListener("load", function(){
            window.cookieconsent.initialise({
                // Popup and button colours.
                "palette": {
                    "popup": {
                        "background": "#237afc"
                    },
                    "button": {
                        "background": "#fff",
                        "text": "#237afc"
                    }
                },
                "showLink": false,
                "position": "bottom-right",
                "theme": "classic",
                // Texts shown in the popup and on its dismiss button.
                "content": {
                    "message": "This website uses cookies. By continuing to use this site, you accept our use of cookies.",
                    "dismiss": "ACCEPT &amp; CLOSE"
                }
            })});
    </script>

  </head>
  <body>

      <header>
        <h1>{{ site.title | default: site.github.repository_name }}</h1>
        <p>{{ site.description | default: site.github.project_tagline }}</p>
      </header>

      <div id="banner">
        <span id="logo"></span>

        <a href="{{ site.github.repository_url }}" class="button fork"><strong>View On GitHub</strong></a>
        {% if site.show_downloads %}
          <div class="downloads">
            <span>Downloads:</span>
            <ul>
              <li><a href="{{ site.github.zip_url }}" class="button">ZIP</a></li>
              <li><a href="{{ site.github.tar_url }}" class="button">TAR</a></li>
            </ul>
          </div>
        {% endif %}
      </div><!-- end banner -->

    <div class="wrapper">
      <nav>
        <ul></ul>
      </nav>
      <section>
        {{ content }}

      </section>
      <footer>
        {% if site.github.is_project_page %}
          <p>Project maintained by <a href="{{ site.github.owner_url }}">{{ site.github.owner_name }}</a></p>
        {% endif %}
        <p><small>Hosted on GitHub Pages &mdash; Theme by <a href="https://twitter.com/michigangraham">mattgraham</a></small></p>
      </footer>
    </div>

    {% if site.google_analytics %}
      <script>
        // Asynchronously injects the analytics.js script tag and registers the global `ga` command queue;
        // the tracker id is taken from the `google_analytics` site setting.
        (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
        (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
        m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
        })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
        ga('create', '{{ site.google_analytics }}', 'auto');
        ga('send', 'pageview');
      </script>
    {% endif %}
  </body>
</html>


================================================
FILE: deeppavlov/__init__.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from pathlib import Path
from typing import Union

from ._meta import __author__, __description__, __email__, __keywords__, __license__, __version__
from .configs import configs
from .core.commands.infer import build_model
from .core.commands.train import train_evaluate_model_from_config
from .core.common.base import Element, Model
from .core.common.chainer import Chainer
from .core.common.log import init_logger
from .download import deep_download


# TODO: make better
def train_model(config: Union[str, Path, dict], install: bool = False,
                download: bool = False, recursive: bool = False) -> Chainer:
    """Train the model described by ``config`` and return the trained pipeline.

    Training is delegated to ``train_evaluate_model_from_config``; afterwards the
    pipeline is rebuilt with ``build_model(config, load_trained=True)``.

    Args:
        config: model config given as a name/path string, a ``Path`` or an
            already loaded config dictionary.
        install: forwarded to ``train_evaluate_model_from_config``; when ``True``
            model requirements are installed first.
        download: forwarded to ``train_evaluate_model_from_config``; when ``True``
            required data is downloaded first.
        recursive: forwarded unchanged to ``train_evaluate_model_from_config``
            (NOTE(review): exact semantics are defined there — confirm).

    Returns:
        The pipeline returned by ``build_model`` for the freshly trained model.
    """
    train_evaluate_model_from_config(config, install=install, download=download, recursive=recursive)
    # install/download already happened during training, so they are not repeated here
    return build_model(config, load_trained=True)


def evaluate_model(config: Union[str, Path, dict], install: bool = False,
                   download: bool = False, recursive: bool = False) -> dict:
    """Evaluate the model described by ``config`` without training it.

    Thin wrapper around ``train_evaluate_model_from_config`` called with
    ``to_train=False``.

    Args:
        config: model config given as a name/path string, a ``Path`` or an
            already loaded config dictionary.
        install: when ``True`` model requirements are installed first.
        download: when ``True`` required data is downloaded first.
        recursive: forwarded unchanged to ``train_evaluate_model_from_config``
            (NOTE(review): exact semantics are defined there — confirm).

    Returns:
        Whatever ``train_evaluate_model_from_config`` returns for an
        evaluation-only run (annotated as a dictionary).
    """
    return train_evaluate_model_from_config(config, to_train=False, install=install,
                                            download=download, recursive=recursive)


# refuse to run on interpreters older than Python 3.6
assert sys.version_info >= (3, 6), 'Does not work in python3.5 or lower'

# installations versioned up to 0.0.9 kept a plain file at ~/.deeppavlov;
# drop it so that the same path can be used by the current version
dot_dp_path = Path('~/.deeppavlov').expanduser().resolve()
if dot_dp_path.is_file():
    dot_dp_path.unlink()

# set up logging for the whole package
init_logger()


================================================
FILE: deeppavlov/__main__.py
================================================
# Entry point used when the package is executed as ``python -m deeppavlov``.
if __name__ == '__main__':
    # imported here so that the ``deep`` module is loaded only when run as a script
    from .deep import main

    main()


================================================
FILE: deeppavlov/_meta.py
================================================
# Package metadata; these names are re-exported from ``deeppavlov/__init__.py``.
__version__ = '1.7.0'
__author__ = 'Neural Networks and Deep Learning lab, MIPT'
__description__ = 'An open source library for building end-to-end dialog systems and training chatbots.'
__keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot']
__license__ = 'Apache License, Version 2.0'
__email__ = 'info@deeppavlov.ai'


================================================
FILE: deeppavlov/configs/__init__.py
================================================
from pathlib import Path
from typing import Iterator, Dict, Union, Iterable


class Struct:
    """Attribute-style view over a nested ``dict`` tree.

    Each key of the source mapping becomes an attribute of the instance (dots
    in key names are replaced with underscores). Nested dictionaries are
    wrapped into nested ``Struct`` objects, any other value is stored as is.
    """

    def __init__(self, tree: Dict[str, Union[dict, Path]]) -> None:
        self._keys = set()
        for raw_name, node in tree.items():
            # dots are not allowed in attribute names
            attr_name = raw_name.replace('.', '_')
            self._keys.add(attr_name)
            if isinstance(node, dict):
                node = Struct(node)
            setattr(self, attr_name, node)
        # freeze the collected names once the tree has been consumed
        self._keys = frozenset(self._keys)

        self.keys = lambda: self._keys

    def __len__(self) -> int:
        return len(self._keys)

    def __iter__(self) -> Iterator[str]:
        return iter(self._keys)

    def __getitem__(self, key: str) -> Union[dict, Path]:
        if key not in self._keys:
            raise KeyError(key)

        found = getattr(self, key)
        # nested structures are handed out as plain dictionaries
        return found._asdict() if isinstance(found, Struct) else found

    def _asdict(self, *, to_string: bool=False) -> dict:
        """Convert the structure (recursively) into a plain dictionary.

        Args:
            to_string: if ``True``, every non-``Struct`` value is passed through ``str``
        """
        def _plain(item):
            if isinstance(item, Struct):
                return item._asdict(to_string=to_string)
            return str(item) if to_string else item

        return {name: _plain(getattr(self, name)) for name in self._keys}

    def __dir__(self) -> Iterable:
        return self._keys

    def _ipython_key_completions_(self) -> Iterable:
        return self._keys

    def __repr__(self) -> str:
        return f'Struct({self._asdict()!r})'

    def __str__(self) -> str:
        return str(self._asdict(to_string=True))

    def _repr_pretty_(self, p, cycle):
        """pretty printing hook used by iPython for ``Struct`` objects

        Args:
            p (IPython.lib.pretty.RepresentationPrinter): pretty printer object
            cycle (bool): is ``True`` if pretty detected a cycle
        """
        if cycle:
            p.text('Struct(...)')
            return

        with p.group(7, 'Struct(', ')'):
            p.pretty(self._asdict())


def _build_configs_tree() -> Struct:
    """Collect every ``*.json`` file located under this package's directory.

    The directory layout is mirrored as nested dictionaries: each directory
    becomes a nested level and each config is stored under its file name
    without the extension.

    Returns:
        ``Struct`` built from the collected tree
    """
    configs_root = Path(__file__).resolve().parent
    nested = {}

    for json_path in configs_root.glob('**/*.json'):
        node = nested
        # walk (creating levels on demand) down to the directory holding the file
        for folder in json_path.relative_to(configs_root).parent.parts:
            node = node.setdefault(folder, {})
        node[json_path.stem] = json_path

    return Struct(nested)


# Built once at import time; bundled configs become reachable as nested
# attributes, e.g. ``configs.classifiers.boolqa_rubert``.
configs = _build_configs_tree()


================================================
FILE: deeppavlov/configs/classifiers/boolqa_rubert.json
================================================
{
  "dataset_reader": {
    "class_name": "boolqa_reader",
    "data_path": "{DOWNLOADS_PATH}/boolqa_data",
    "language": "ru"
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator",
    "seed": 243
  },
  "chainer": {
    "in": ["text_a", "text_b"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 128,
        "in": ["text_a", "text_b"],
        "out": ["bert_features"]
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": 2,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODELS_PATH}/boolqa_rubert/model_rubert",
        "load_path": "{MODELS_PATH}/boolqa_rubert/model_rubert",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 2e-05},
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y"],
        "out": ["predictions"]
      }
    ],
    "out": ["predictions"]
  },
  "train": {
    "epochs": 50,
    "batch_size": 32,
    "train_metrics": ["f1", "acc"],
    "metrics": ["f1", "acc"],
    "validation_patience": 5,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "evaluation_targets": ["valid", "train"],
    "show_examples": false,
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "DeepPavlov/rubert-base-cased"
    }
  }
}


================================================
FILE: deeppavlov/configs/classifiers/few_shot_roberta.json
================================================
{
  "chainer": {
    "in": ["texts", "dataset"],
    "in_y": ["y_true"],
    "pipe": [
      {
        "class_name": "dnnc_pair_generator",
        "in": ["texts", "dataset"],
        "out": ["x", "x_support", "x_populated", "y_support"],
        "bidirectional": true
      },
      {
        "class_name": "torch_transformers_preprocessor",
        "in": ["x_populated", "x_support"],
        "out": ["bert_features"],
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": true,
        "max_seq_length": 128
      },
      {
        "class_name": "torch_transformers_classifier",
        "main": true,
        "in": ["bert_features"],
        "out": ["simmilarity_scores"],
        "n_classes": 2,
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "is_binary": "{BINARY_CLASSIFICATION}"
      },
      {
        "class_name": "dnnc_proba2labels",
        "is_binary": "{BINARY_CLASSIFICATION}",
        "in": ["simmilarity_scores", "x", "x_populated", "x_support", "y_support"],
        "out": ["y_pred"],
        "confidence_threshold": 0.0
      }
    ],
    "out": ["y_pred"]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli_mrpc_1_10",
      "BINARY_CLASSIFICATION": true,
      "BASE_MODEL": "roberta-base"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_mrpc_1_10.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/glue/glue_cola_roberta.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": "sentence",
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 64,
        "in": ["x"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 32,
    "metrics": ["matthews_correlation"],
    "validation_patience": 10,
    "val_every_n_batches": 250,
    "log_every_n_batches": 250,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "roberta-large",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "glue",
      "TASK": "cola",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/glue/glue_cola_roberta.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/glue/glue_mnli_cased_bert_torch.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "glue",
    "name": "mnli",
    "train": "train",
    "valid": "validation_matched",
    "test": "test_matched"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["hypothesis", "premise"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["hypothesis", "premise"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 128,
        "in": ["hypothesis", "premise"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 64,
    "metrics": ["accuracy"],
    "validation_patience": 10,
    "val_every_n_batches": 250,
    "log_every_n_batches": 250,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/glue_mnli_torch_cased_bert",
      "BASE_MODEL": "bert-base-cased"
    }
  }
}


================================================
FILE: deeppavlov/configs/classifiers/glue/glue_mnli_mm_cased_bert_torch.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "glue",
    "name": "mnli",
    "train": "train",
    "valid": "validation_mismatched",
    "test": "test_mismatched"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["hypothesis", "premise"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["hypothesis", "premise"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 128,
        "in": ["hypothesis", "premise"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 64,
    "metrics": ["accuracy"],
    "validation_patience": 10,
    "val_every_n_batches": 250,
    "log_every_n_batches": 250,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/glue_mnli_mm_torch_cased_bert",
      "BASE_MODEL": "bert-base-cased"
    }
  }
}


================================================
FILE: deeppavlov/configs/classifiers/glue/glue_mnli_roberta.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation_matched",
    "test": "test_matched"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["hypothesis", "premise"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["hypothesis", "premise"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 128,
        "in": ["hypothesis", "premise"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 1e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 4,
    "metrics": ["accuracy"],
    "validation_patience": 10,
    "val_every_n_batches": 250,
    "log_every_n_batches": 250,
    "show_examples": false,
    "evaluation_targets": ["valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "roberta-large",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "glue",
      "TASK": "mnli",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/0.16/classifiers/glue_mnli.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/glue/glue_mrpc_roberta.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["sentence1", "sentence2"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["sentence1", "sentence2"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 256,
        "in": ["sentence1", "sentence2"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 1e-06
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 4,
    "metrics": ["accuracy"],
    "epochs": 2,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "roberta-large",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "glue",
      "TASK": "mrpc",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/glue/glue_mrpc_roberta.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/glue/glue_qnli_roberta.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["question", "sentence"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["question", "sentence"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 128,
        "in": ["question", "sentence"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 16,
    "metrics": ["accuracy"],
    "validation_patience": 10,
    "val_every_n_batches": 250,
    "log_every_n_batches": 250,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "roberta-large",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "glue",
      "TASK": "qnli",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/glue/glue_qnli_roberta.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/glue/glue_qqp_roberta.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["question1", "question2"],
    "label": "label",
    "use_label_name": false,
    "seed": 42
  },
  "chainer": {
    "in": ["question1", "question2"],
    "in_y": ["y_ids"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 128,
        "in": ["question1", "question2"],
        "out": ["bert_features"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": 2,
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": 2,
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      }
    ],
    "out": ["y_pred_ids"]
  },
  "train": {
    "batch_size": 16,
    "metrics": [
      "f1",
      "accuracy"
    ],
    "validation_patience": 10,
    "val_every_n_batches": 250,
    "log_every_n_batches": 250,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "roberta-large",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "glue",
      "TASK": "qqp",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/glue/glue_qqp_roberta.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/glue/glue_rte_cased_bert_torch.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "glue",
    "name": "rte",
    "train": "train",
    "valid": "validation",
    "test": "test"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["sentence1", "sentence2"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["sentence1", "sentence2"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 256,
        "in": ["sentence1", "sentence2"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 32,
    "metrics": ["accuracy"],
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/glue_rte_torch_cased_bert",
      "BASE_MODEL": "bert-base-cased"
    }
  }
}


================================================
FILE: deeppavlov/configs/classifiers/glue/glue_rte_roberta_mnli.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["sentence1", "sentence2"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["sentence1", "sentence2"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 256,
        "in": ["sentence1", "sentence2"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 1e-06
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 4,
    "metrics": ["accuracy"],
    "epochs": 2,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "roberta-large-mnli",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "glue",
      "TASK": "rte",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/0.16/classifiers/glue_rte.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/glue/glue_sst2_roberta.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": "sentence",
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 64,
        "in": ["x"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 128,
    "metrics": ["accuracy"],
    "validation_patience": 10,
    "val_every_n_batches": 250,
    "log_every_n_batches": 250,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "roberta-large",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "glue",
      "TASK": "sst2",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/glue/glue_sst2_roberta.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/glue/glue_stsb_roberta.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["sentence1", "sentence2"],
    "label": "label",
    "use_label_name": false,
    "seed": 42
  },
  "chainer": {
    "in": ["sentence1", "sentence2"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 64,
        "in": ["sentence1", "sentence2"],
        "out": ["bert_features"]
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": 1,
        "return_probas": false,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y"],
        "out": ["y_pred"]
      }
    ],
    "out": ["y_pred"]
  },
  "train": {
    "batch_size": 32,
    "metrics": [
      "pearson_correlation",
      "spearman_correlation"
    ],
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "roberta-large",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "glue",
      "TASK": "stsb",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/glue/glue_stsb_roberta.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/glue/glue_wnli_roberta.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["sentence1", "sentence2"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["sentence1", "sentence2"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 192,
        "truncation": "longest_first",
        "padding": "longest",
        "in": ["sentence1", "sentence2"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 1e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 8,
    "metrics": ["accuracy"],
    "epochs": 1,
    "val_every_n_batches": 250,
    "log_every_n_batches": 250,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "roberta-large",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "glue",
      "TASK": "wnli",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/0.16/classifiers/glue_wnli_roberta.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/insults_kaggle_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "basic_classification_reader",
    "x": "Comment",
    "y": "Class",
    "data_path": "{DOWNLOADS_PATH}/insults_data"
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator",
    "seed": 42
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": true,
        "max_seq_length": 64,
        "in": [
          "x"
        ],
        "out": [
          "bert_features"
        ]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": [
          "y"
        ],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": [
          "y"
        ],
        "out": [
          "y_ids"
        ]
      },
      {
        "in": [
          "y_ids"
        ],
        "out": [
          "y_onehot"
        ],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 1e-05
        },
        "learning_rate_drop_patience": 5,
        "learning_rate_drop_div": 2.0,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "y_ids"
        ],
        "out": [
          "y_pred_probas"
        ]
      },
      {
        "in": [
          "y_pred_probas"
        ],
        "out": [
          "y_pred_ids"
        ],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": [
          "y_pred_ids"
        ],
        "out": [
          "y_pred_labels"
        ],
        "ref": "classes_vocab"
      }
    ],
    "out": [
      "y_pred_labels"
    ]
  },
  "train": {
    "epochs": 100,
    "batch_size": 64,
    "metrics": [
      {
        "name": "roc_auc",
        "inputs": [
          "y_onehot",
          "y_pred_probas"
        ]
      },
      "accuracy",
      "f1_macro"
    ],
    "validation_patience": 5,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": [
      "train",
      "valid",
      "test"
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "TRANSFORMER": "bert-base-uncased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/insults_kaggle_torch_bert"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/datasets/insults_data.tar.gz",
        "subdir": "{DOWNLOADS_PATH}"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_torch_bert_v5.tar.gz",
        "subdir": "{MODELS_PATH}/classifiers"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json
================================================
{
  "dataset_reader": {
    "class_name": "paraphraser_reader",
    "data_path": "{DOWNLOADS_PATH}/paraphraser_data",
    "do_lower_case": false
  },
  "dataset_iterator": {
    "class_name": "siamese_iterator",
    "seed": 243,
    "len_valid": 500
  },
  "chainer": {
    "in": ["text_a", "text_b"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 64,
        "in": ["text_a", "text_b"],
        "out": ["bert_features"]
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": 2,
        "return_probas": false,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "attention_probs_keep_prob": 0.11,
        "hidden_keep_prob": 1.0, 
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 1.89e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 1.5,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "y"
        ],
        "out": [
          "predictions"
        ]
      }
    ],
    "out": ["predictions"]
  },
  "train": {
    "epochs": 100,
    "batch_size": 64,
    "metrics": [
        "f1",
        "accuracy"
    ],
    "validation_patience": 7,
    "val_every_n_batches": 50,
    "log_every_n_batches": 50,
    "evaluation_targets": [
      "train",
      "valid",
      "test"
    ],
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_2L"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_2L.tar.gz",
        "subdir": "{MODELS_PATH}"
      },
      {
        "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip",
        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
      },
      {
        "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip",
        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json
================================================
{
  "dataset_reader": {
    "class_name": "paraphraser_reader",
    "data_path": "{DOWNLOADS_PATH}/paraphraser_data",
    "do_lower_case": false
  },
  "dataset_iterator": {
    "class_name": "siamese_iterator",
    "seed": 243,
    "len_valid": 500
  },
  "chainer": {
    "in": ["text_a", "text_b"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 64,
        "in": ["text_a", "text_b"],
        "out": ["bert_features"]
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": 2,
        "return_probas": false,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "attention_probs_keep_prob": 0.0,
        "hidden_keep_prob": 0.67, 
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 7.22e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 1.5,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "y"
        ],
        "out": [
          "predictions"
        ]
      }
    ],
    "out": ["predictions"]
  },
  "train": {
    "epochs": 100,
    "batch_size": 64,
    "metrics": [
        "f1",
        "accuracy"
    ],
    "validation_patience": 7,
    "val_every_n_batches": 50,
    "log_every_n_batches": 50,
    "evaluation_targets": [
      "train",
      "valid",
      "test"
    ],
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_6L"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_6L.tar.gz",
        "subdir": "{MODELS_PATH}"
      },
      {
        "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip",
        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
      },
      {
        "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip",
        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
      }
    ]
  }
} 


================================================
FILE: deeppavlov/configs/classifiers/paraphraser_rubert.json
================================================
{
  "dataset_reader": {
    "class_name": "paraphraser_reader",
    "data_path": "{DOWNLOADS_PATH}/paraphraser_data",
    "do_lower_case": false
  },
  "dataset_iterator": {
    "class_name": "siamese_iterator",
    "seed": 243,
    "len_valid": 500
  },
  "chainer": {
    "in": ["text_a", "text_b"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 64,
        "in": ["text_a", "text_b"],
        "out": ["bert_features"]
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": 2,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 2e-05},
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y"],
        "out": ["predictions"]
      }
    ],
    "out": ["predictions"]
  },
  "train": {
    "batch_size": 64,
    "pytest_max_batches": 2,
    "train_metrics": ["f1", "acc"],
    "metrics": ["f1", "acc"],
    "validation_patience": 7,
    "val_every_n_batches": 50,
    "log_every_n_batches": 50,
    "evaluation_targets": ["valid", "test"],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/paraphraser_rubert_torch",
      "TRANSFORMER": "DeepPavlov/rubert-base-cased"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip",
        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
      },
      {
        "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip",
        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
      },
      {
        "url": "http://files.deeppavlov.ai/v1/classifiers/paraphraser_rubert/paraphraser_rubert_v1.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/query_pr.json
================================================
{
  "dataset_reader": {
    "class_name": "sq_reader",
    "data_path": "{DOWNLOADS_PATH}/query_prediction/query_prediction_eng.pickle"
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator",
    "seed": 42
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 64,
        "in": ["x"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 1e-05},
        "learning_rate_drop_patience": 5,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "epochs": 100,
    "batch_size": 64,
    "metrics": [
      "f1_macro",
      "accuracy",
      {
        "name": "roc_auc",
        "inputs": ["y_onehot", "y_pred_probas"]
      }
    ],
    "validation_patience": 10,
    "val_every_n_batches": 100,
    "log_every_n_batches": 100,
    "show_examples": false,
    "evaluation_targets": ["train", "valid", "test"],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "TRANSFORMER": "haisongzhang/roberta-tiny-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/query_prediction_eng"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/kbqa/wikidata/query_prediction_eng.tar.gz",
        "subdir": "{MODELS_PATH}/classifiers/query_prediction_eng"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/wikidata/query_prediction_eng.pickle",
        "subdir": "{DOWNLOADS_PATH}/query_prediction"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/rusentiment_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "basic_classification_reader",
    "x": "text",
    "y": "label",
    "data_path": "{DOWNLOADS_PATH}/rusentiment/",
    "train": "rusentiment_random_posts.csv",
    "test": "rusentiment_test.csv"
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator",
    "seed": 42,
    "split_seed": 23,
    "field_to_split": "train",
    "split_fields": [
      "train",
      "valid"
    ],
    "split_proportions": [
      0.9,
      0.1
    ]
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 64,
        "in": [
          "x"
        ],
        "out": [
          "bert_features"
        ]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": [
          "y"
        ],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": "y",
        "out": "y_ids"
      },
      {
        "in": "y_ids",
        "out": "y_onehot",
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer_parameters": {"lr": 1e-05},
        "learning_rate_drop_patience": 5,
        "learning_rate_drop_div": 2.0,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "y_onehot"
        ],
        "out": [
          "y_pred_probas"
        ]
      },
      {
        "in": "y_pred_probas",
        "out": "y_pred_ids",
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": "y_pred_ids",
        "out": "y_pred_labels",
        "ref": "classes_vocab"
      }
    ],
    "out": [
      "y_pred_labels"
    ]
  },
  "train": {
    "batch_size": 64,
    "epochs": 100,
    "metrics": [
      "f1_weighted",
      "f1_macro",
      "accuracy",
      {
        "name": "roc_auc",
        "inputs": [
          "y_onehot",
          "y_pred_probas"
        ]
      }
    ],
    "show_examples": false,
    "pytest_max_batches": 2,
    "validation_patience": 5,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "evaluation_targets": [
      "train",
      "valid",
      "test"
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_bert_torch",
      "TRANSFORMER": "bert-base-multilingual-cased"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/classifiers/rusentiment_bert/rusentiment_bert_torch.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/rusentiment_convers_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "basic_classification_reader",
    "x": "text",
    "y": "label",
    "data_path": "{DOWNLOADS_PATH}/rusentiment/",
    "train": "rusentiment_random_posts.csv",
    "test": "rusentiment_test.csv"
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator",
    "seed": 42,
    "split_seed": 23,
    "field_to_split": "train",
    "split_fields": [
      "train",
      "valid"
    ],
    "split_proportions": [
      0.9,
      0.1
    ]
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 64,
        "in": [
          "x"
        ],
        "out": [
          "bert_features"
        ]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": [
          "y"
        ],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": "y",
        "out": "y_ids"
      },
      {
        "in": "y_ids",
        "out": "y_onehot",
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer_parameters": {"lr": 1e-05},
        "learning_rate_drop_patience": 5,
        "learning_rate_drop_div": 2.0,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "y_onehot"
        ],
        "out": [
          "y_pred_probas"
        ]
      },
      {
        "in": "y_pred_probas",
        "out": "y_pred_ids",
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": "y_pred_ids",
        "out": "y_pred_labels",
        "ref": "classes_vocab"
      }
    ],
    "out": [
      "y_pred_labels"
    ]
  },
  "train": {
    "batch_size": 64,
    "epochs": 100,
    "metrics": [
      "f1_weighted",
      "f1_macro",
      "accuracy",
      {
        "name": "roc_auc",
        "inputs": [
          "y_onehot",
          "y_pred_probas"
        ]
      }
    ],
    "show_examples": false,
    "pytest_max_batches": 2,
    "validation_patience": 5,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "evaluation_targets": [
      "train",
      "valid",
      "test"
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_bert_torch",
      "TRANSFORMER": "DeepPavlov/rubert-base-cased-conversational"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/classifiers/rusentiment_convers_bert/rusentiment_convers_bert_torch.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json
================================================
{
  "dataset_reader": {
    "class_name": "basic_classification_reader",
    "x": "text",
    "y": "label",
    "data_path": "{DOWNLOADS_PATH}/rusentiment/",
    "train": "rusentiment_random_posts.csv",
    "test": "rusentiment_test.csv"
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator",
    "seed": 42,
    "split_seed": 23,
    "field_to_split": "train",
    "split_fields": [
      "train",
      "valid"
    ],
    "split_proportions": [
      0.9,
      0.1
    ]
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": true,
        "max_seq_length": 64,
        "in": [
          "x"
        ],
        "out": [
          "bert_features"
        ]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": [
          "y"
        ],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": "y",
        "out": "y_ids"
      },
      {
        "in": "y_ids",
        "out": "y_onehot",
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "attention_probs_keep_prob": 0.78,
        "hidden_keep_prob": 0.89, 
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 7.22e-05
        },
        "learning_rate_drop_patience": 5,
        "learning_rate_drop_div": 1.5,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "y_ids"
        ],
        "out": [
          "y_pred_probas"
        ]
      },
      {
        "in": "y_pred_probas",
        "out": "y_pred_ids",
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": "y_pred_ids",
        "out": "y_pred_labels",
        "ref": "classes_vocab"
      }
    ],
    "out": [
      "y_pred_labels"
    ]
  },
  "train": {
    "epochs": 100,
    "batch_size": 64,
    "metrics": [
        "f1_weighted",
        "f1_macro",
        "accuracy",
        {
            "name": "roc_auc",
            "inputs": [
                "y_onehot",
                "y_pred_probas"
            ]
        }
    ],
    "validation_patience": 5,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": [
      "train",
      "valid",
      "test"
    ],
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_2L"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_2L.tar.gz",
        "subdir": "{MODELS_PATH}/classifiers/"
      }
    ]
  }
} 


================================================
FILE: deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json
================================================
{
  "dataset_reader": {
    "class_name": "basic_classification_reader",
    "x": "text",
    "y": "label",
    "data_path": "{DOWNLOADS_PATH}/rusentiment/",
    "train": "rusentiment_random_posts.csv",
    "test": "rusentiment_test.csv"
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator",
    "seed": 42,
    "split_seed": 23,
    "field_to_split": "train",
    "split_fields": [
      "train",
      "valid"
    ],
    "split_proportions": [
      0.9,
      0.1
    ]
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": true,
        "max_seq_length": 64,
        "in": [
          "x"
        ],
        "out": [
          "bert_features"
        ]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": [
          "y"
        ],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": "y",
        "out": "y_ids"
      },
      {
        "in": "y_ids",
        "out": "y_onehot",
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "attention_probs_keep_prob": 0.78,
        "hidden_keep_prob": 0, 
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 4.56e-05
        },
        "learning_rate_drop_patience": 5,
        "learning_rate_drop_div": 1.5,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "y_ids"
        ],
        "out": [
          "y_pred_probas"
        ]
      },
      {
        "in": "y_pred_probas",
        "out": "y_pred_ids",
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": "y_pred_ids",
        "out": "y_pred_labels",
        "ref": "classes_vocab"
      }
    ],
    "out": [
      "y_pred_labels"
    ]
  },
  "train": {
    "epochs": 100,
    "batch_size": 64,
    "metrics": [
        "f1_weighted",
        "f1_macro",
        "accuracy",
        {
            "name": "roc_auc",
            "inputs": [
                "y_onehot",
                "y_pred_probas"
            ]
        }
    ],
    "validation_patience": 5,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": [
      "train",
      "valid",
      "test"
    ],
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_6L"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_6L.tar.gz",
        "subdir": "{MODELS_PATH}/classifiers/"
      }
    ]
  }
} 


================================================
FILE: deeppavlov/configs/classifiers/sentiment_sst_conv_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "basic_classification_reader",
    "x": "text",
    "y": "fine_grained_label",
    "data_path": "{DOWNLOADS_PATH}/stanfordSentimentTreebank",
    "train": "train_fine_grained.csv",
    "valid": "valid_fine_grained.csv",
    "test": "test_fine_grained.csv"
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator",
    "seed": 42
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 64,
        "in": [
          "x"
        ],
        "out": [
          "bert_features"
        ]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": [
          "y"
        ],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": "y",
        "out": "y_ids"
      },
      {
        "in": "y_ids",
        "out": "y_onehot",
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer_parameters": {"lr": 1e-05},
        "learning_rate_drop_patience": 5,
        "learning_rate_drop_div": 2.0,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "y_onehot"
        ],
        "out": [
          "y_pred_probas"
        ]
      },
      {
        "in": "y_pred_probas",
        "out": "y_pred_ids",
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": "y_pred_ids",
        "out": "y_pred_labels",
        "ref": "classes_vocab"
      }
    ],
    "out": [
      "y_pred_labels"
    ]
  },
  "train": {
    "epochs": 100,
    "batch_size": 64,
    "metrics": [
      "accuracy",
      {
        "name": "roc_auc",
        "inputs": [
          "y_onehot",
          "y_pred_probas"
        ]
      },
      "f1_macro"
    ],
    "validation_patience": 5,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": [
      "train",
      "valid",
      "test"
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_sst_bert_torch",
      "TRANSFORMER": "DeepPavlov/bert-base-cased-conversational"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/datasets/stanfordSentimentTreebank.zip",
        "subdir": "{DOWNLOADS_PATH}"
      },
      {
        "url": "http://files.deeppavlov.ai/v1/classifiers/sentiment_sst_bert/sentiment_sst_bert_torch.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/sentiment_twitter.json
================================================
{
  "dataset_reader": {
    "class_name": "basic_classification_reader",
    "x": "Twit",
    "y": "Class",
    "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data"
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator",
    "seed": 42
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": [
          "y"
        ],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": "y",
        "out": "y_ids"
      },
      {
        "in": "x",
        "out": "x_tok",
        "id": "my_tokenizer",
        "class_name": "nltk_tokenizer",
        "tokenizer": "wordpunct_tokenize"
      },
      {
        "in": "x_tok",
        "out": "x_emb",
        "id": "my_embedder",
        "class_name": "fasttext",
        "load_path": "{DOWNLOADS_PATH}/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin",
        "pad_zero": true
      },
      {
        "in": "y_ids",
        "out": "y_onehot",
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "in": [
          "x_emb"
        ],
        "in_y": [
          "y_ids"
        ],
        "out": [
          "y_pred_probas"
        ],
        "main": true,
        "class_name": "torch_text_classification_model",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "embedding_size": "#my_embedder.dim",
        "n_classes": "#classes_vocab.len",
        "kernel_sizes_cnn": [
          3,
          5,
          7
        ],
        "filters_cnn": 256,
        "dropout_rate": 0.5,
        "dense_size": 64,
        "optimizer": "SGD",
        "optimizer_parameters": {
          "lr": 0.0001,
          "momentum": 0.9,
          "weight_decay": 0.0001
        }
      },
      {
        "in": "y_pred_probas",
        "out": "y_pred_ids",
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": "y_pred_ids",
        "out": "y_pred_labels",
        "ref": "classes_vocab"
      }
    ],
    "out": [
      "y_pred_labels"
    ]
  },
  "train": {
    "epochs": 100,
    "batch_size": 128,
    "metrics": [
      "accuracy",
      "f1_macro",
      {
        "name": "roc_auc",
        "inputs": ["y_onehot", "y_pred_probas"]
      }
    ],
    "validation_patience": 5,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": [
      "train",
      "valid",
      "test"
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_twitter_torch"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/datasets/sentiment_twitter_data.tar.gz",
        "subdir": "{DOWNLOADS_PATH}"
      },
      {
        "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin",
        "subdir": "{DOWNLOADS_PATH}/embeddings"
      },
      {
        "url": "http://files.deeppavlov.ai/v1/classifiers/sentiment_twitter/sentiment_twitter_torch.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/superglue/superglue_boolq_roberta_mnli.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test",
    "dev_percentage": 50
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["question", "passage"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["question", "passage"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 256,
        "in": ["question", "passage"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "is_binary": "{BINARY_CLASSIFICATION}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 0.1
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "is_binary": "{BINARY_CLASSIFICATION}",
        "confidence_threshold": 0.5
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 24,
    "metrics": ["accuracy"],
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2,
    "pytest_batch_size": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "roberta-large-mnli",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "super_glue",
      "TASK": "boolq",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}",
      "BINARY_CLASSIFICATION": true
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/superglue/superglue_boolq_roberta_mnli.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/superglue/superglue_copa_roberta.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["contexts", "choices"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["contexts_list", "choices_list"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_multiplechoice_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 64,
        "in": ["contexts_list", "choices_list"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_multiplechoice",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 16,
    "metrics": ["accuracy"],
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2,
    "pytest_batch_size": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "roberta-large",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "super_glue",
      "TASK": "copa",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/0.17/classifiers/superglue/superglue_copa_roberta.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/superglue/superglue_record_roberta.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test",
    "downsample_ratio": [1.8, 1.8, 1],
    "do_index_correction": false
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["idx", "query", "passage", "entities", "num_examples"],
    "label": "label",
    "seed": 42,
    "use_label_name": false
  },
  "chainer": {
    "in": ["idx", "query", "passage", "entities", "num_examples"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 320,
        "in": ["query", "passage"],
        "out": ["bert_features"]
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": 2,
        "return_probas": true,
        "is_binary": "{BINARY_CLASSIFICATION}",
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 0.1
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y"],
        "out": ["y_pred_probas"]
      },
      {
        "class_name": "proba2labels",
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "is_binary": "{BINARY_CLASSIFICATION}",
        "max_proba": true
      },
      {
        "class_name": "torch_record_postprocessor",
        "is_binary": "{BINARY_CLASSIFICATION}",
        "in": ["idx", "y", "y_pred_probas", "entities", "num_examples"],
        "out": ["record_examples"]
      }
    ],
    "out": ["y_pred_probas"]
  },
  "train": {
    "batch_size": 24,
    "train_metrics": [
      {
        "name": "accuracy",
        "inputs": ["y", "y_pred_ids"]
      }
    ],
    "metrics": [
      {
        "name": "record_em_score",
        "inputs": ["record_examples"]
      },
      {
        "name": "record_f1_score",
        "inputs": ["record_examples"]
      },
      {
        "name": "accuracy",
        "inputs": ["y", "y_pred_ids"]
      }
    ],
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "class_name": "torch_trainer",
    "evaluation_targets": ["valid"],
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2,
    "pytest_batch_size": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "roberta-large",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "super_glue",
      "TASK": "record",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}",
      "BINARY_CLASSIFICATION": false
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/0.17/classifiers/superglue/superglue_record_roberta.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/superglue/superglue_wic_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test"
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["sentence1", "sentence2"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["sentence1", "sentence2"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 256,
        "in": ["sentence1", "sentence2"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 2e-05},
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 16,
    "metrics": ["accuracy"],
    "epochs": 10,
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "val_every_n_batches": 1000,
    "show_examples": false,
    "evaluation_targets": ["valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "bert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "super_glue",
      "TASK": "wic",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/superglue/superglue_wic_bert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/classifiers/topics_distilbert_base_uncased.json
================================================
{
  "dataset_reader": {
    "class_name": "basic_classification_reader",
    "class_sep": ";",
    "x": "text",
    "y": "topic",
    "data_path": "{DOWNLOADS_PATH}/dp_topics_downsampled_data/",
    "train" : "train.csv",
    "valid" : "valid.csv"  
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator",
    "seed": 42
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": true,
        "max_seq_length": 128,
        "in": [
          "x"
        ],
        "out": [
          "bert_features"
        ]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": [
          "y"
        ],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": [
          "y"
        ],
        "out": [
          "y_ids"
        ]
      },
      {
        "in": [
          "y_ids"
        ],
        "out": [
          "y_onehot"
        ],
        "class_name": "one_hotter",
        "id": "my_one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "multilabel": true,
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 1e-05
        },
        "learning_rate_drop_patience": 5,
        "learning_rate_drop_div": 2.0,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "y_onehot"
        ],
        "out": [
          "y_pred_probas"
        ]
      },
      {
        "in": "y_pred_probas",
        "out": "y_pred_ids",
        "class_name": "proba2labels",
        "max_proba": false,
        "confidence_threshold": 0.5
      },
      {
        "in": "y_pred_ids",
        "out": "y_pred_labels",
        "ref": "classes_vocab"
      },
      {
        "ref": "my_one_hotter",
        "in": "y_pred_ids",
        "out": "y_pred_onehot"
      }
    ],
    "out": [
      "y_pred_labels"
    ]
  },
  "train": {
    "epochs": 100,
    "batch_size": 64,
    "metrics": [
      {
        "name": "f1_macro",
        "inputs": [
          "y_onehot",
          "y_pred_onehot"
        ]
      },
      {
        "name": "f1_weighted",
        "inputs": [
          "y_onehot",
          "y_pred_onehot"
        ]
      },
      {
        "name": "accuracy",
        "inputs": [
          "y",
          "y_pred_labels"
        ]
      },
      {
        "name": "roc_auc",
        "inputs": [
          "y_onehot",
          "y_pred_probas"
        ]
      }
    ],
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "log_every_n_batches": 100,
    "show_examples": false,
    "evaluation_targets": [
      "train",
      "valid",
      "test"
    ],
    "tensorboard_log_dir": "{MODEL_PATH}/logs",
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "TRANSFORMER": "distilbert-base-uncased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/topic_distilbert_base_v0"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/datasets/dp_topics_downsampled_dataset_v0.tar.gz",
        "subdir": "{DOWNLOADS_PATH}"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/topic_distilbert_base_v0.tar.gz",
        "subdir": "{MODELS_PATH}/classifiers"
      }
    ]
  }
}

================================================
FILE: deeppavlov/configs/doc_retrieval/en_ranker_pop_wiki.json
================================================
{
  "dataset_reader": {
    "class_name": "odqa_reader",
    "data_path": "{DOWNLOADS_PATH}/odqa/enwiki",
    "save_path": "{DOWNLOADS_PATH}/odqa/enwiki.db",
    "dataset_format": "wiki"
  },
  "dataset_iterator": {
    "class_name": "sqlite_iterator",
    "shuffle": false,
    "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db"
  },
  "chainer": {
    "in": ["docs"],
    "in_y": ["doc_ids", "doc_nums"],
    "out": ["pop_doc_ids"],
    "pipe": [
      {
        "class_name": "hashing_tfidf_vectorizer",
        "id": "vectorizer",
        "fit_on": ["docs", "doc_ids", "doc_nums"],
        "save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz",
        "load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz",
        "tokenizer": {
          "class_name": "stream_spacy_tokenizer",
          "lemmas": true,
          "lowercase": true,
          "filter_stopwords": true,
          "ngram_range": [1, 3]
        }
      },
      {
        "class_name": "tfidf_ranker",
        "top_n": 100,
        "in": ["docs"],
        "out": ["tfidf_doc_ids", "tfidf_doc_scores"],
        "vectorizer": "#vectorizer"
      },
      {
        "class_name": "pop_ranker",
        "pop_dict_path": "{DOWNLOADS_PATH}/odqa/enwiki_popularities.json",
        "load_path": "{MODELS_PATH}/odqa/logreg_3features_v2.joblib",
        "top_n": 100,
        "in": ["tfidf_doc_ids", "tfidf_doc_scores"],
        "out": ["pop_doc_ids", "pop_doc_scores"]
      }
    ]
  },
  "train": {
    "batch_size": 10000,
    "evaluation_targets": [],
    "class_name": "fit_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_l100.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/odqa"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_tfidf_matrix_par_lite.tar.gz",
        "subdir": "{MODELS_PATH}/odqa"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_popularities.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/odqa"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/ranking/logreg_3features_v2.joblib",
        "subdir": "{MODELS_PATH}/odqa"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/doc_retrieval/en_ranker_tfidf_wiki.json
================================================
{
  "dataset_reader": {
    "class_name": "odqa_reader",
    "data_path": "{DOWNLOADS_PATH}/odqa/enwiki",
    "save_path": "{DOWNLOADS_PATH}/odqa/enwiki.db",
    "dataset_format": "wiki"
  },
  "dataset_iterator": {
    "class_name": "sqlite_iterator",
    "shuffle": false,
    "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db"
  },
  "chainer": {
    "in": ["docs"],
    "in_y": ["doc_ids", "doc_nums"],
    "out": ["tfidf_doc_ids"],
    "pipe": [
      {
        "class_name": "hashing_tfidf_vectorizer",
        "id": "vectorizer",
        "fit_on": ["docs", "doc_ids", "doc_nums"],
        "save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz",
        "load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz",
        "tokenizer": {
          "class_name": "stream_spacy_tokenizer",
          "lemmas": true,
          "lowercase": true,
          "filter_stopwords": true,
          "ngram_range": [1, 3]
        }
      },
      {
        "class_name": "tfidf_ranker",
        "top_n": 100,
        "in": ["docs"],
        "out": ["tfidf_doc_ids", "tfidf_doc_scores"],
        "vectorizer": "#vectorizer"
      }
    ]
  },
  "train": {
    "batch_size": 10000,
    "evaluation_targets": [],
    "class_name": "fit_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_l100.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/odqa"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_tfidf_matrix_par_lite.tar.gz",
        "subdir": "{MODELS_PATH}/odqa"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/doc_retrieval/ru_ranker_tfidf_wiki.json
================================================
{
  "dataset_reader": {
    "class_name": "odqa_reader",
    "data_path": "{DOWNLOADS_PATH}/odqa/ruwiki",
    "save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db",
    "dataset_format": "wiki"
  },
  "dataset_iterator": {
    "class_name": "sqlite_iterator",
    "shuffle": false,
    "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db"
  },
  "chainer": {
    "in": ["docs"],
    "in_y": ["doc_ids", "doc_nums"],
    "out": ["tfidf_doc_ids"],
    "pipe": [
      {
        "class_name": "hashing_tfidf_vectorizer",
        "id": "vectorizer",
        "fit_on": ["docs", "doc_ids", "doc_nums"],
        "save_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix_compr.npz",
        "load_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix_compr.npz",
        "tokenizer": {
          "class_name": "stream_spacy_tokenizer",
          "spacy_model": "ru_core_news_sm",
          "lemmas": true,
          "lowercase": true,
          "filter_stopwords": true,
          "ngram_range": [1, 3]
        }
      },
      {
        "class_name": "tfidf_ranker",
        "top_n": 100,
        "in": ["docs"],
        "out": ["tfidf_doc_ids", "tfidf_doc_scores"],
        "vectorizer": "#vectorizer"
      }
    ]
  },
  "train": {
    "batch_size": 10000,
    "evaluation_targets": [],
    "class_name": "fit_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/ruwiki_par_page_compr.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/odqa"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/ruwiki_tfidf_matrix_compr.tar.gz",
        "subdir": "{MODELS_PATH}/odqa"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/embedder/bert_embedder.json
================================================
{
  "chainer": {
    "in": ["texts"],
    "pipe": [
      {
        "class_name": "transformers_bert_preprocessor",
        "vocab_file": "{BERT_PATH}/vocab.txt",
        "do_lower_case": false,
        "max_seq_length": 512,
        "in": ["texts"],
        "out": ["tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"]
      },
      {
        "class_name": "transformers_bert_embedder",
        "bert_config_path": "{BERT_PATH}/bert_config.json",
        "load_path": "{BERT_PATH}",
        "truncate": true,
        "in": ["subword_tok_ids", "startofword_markers", "attention_mask"],
        "out": ["word_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"]
      }
    ],
    "out": ["tokens", "word_emb", "subword_tokens", "subword_emb", "max_emb", "mean_emb", "pooler_output"]
  },
  "train": {},
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12_pt"
    },
    "labels": {},
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12_pt.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/bert_models"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/embedder/bert_sentence_embedder.json
================================================
{
  "chainer": {
    "in": ["texts"],
    "pipe": [
      {
        "class_name": "transformers_bert_preprocessor",
        "vocab_file": "{BERT_PATH}/vocab.txt",
        "do_lower_case": false,
        "max_seq_length": 512,
        "in": ["texts"],
        "out": ["tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"]
      },
      {
        "class_name": "transformers_bert_embedder",
        "bert_config_path": "{BERT_PATH}/config.json",
        "load_path": "{BERT_PATH}",
        "truncate": false,
        "in": ["subword_tok_ids", "startofword_markers", "attention_mask"],
        "out": ["word_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"]
      }
    ],
    "out": ["max_emb", "mean_emb", "pooler_output"]
  },
  "train": {},
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/sentence_multi_cased_L-12_H-768_A-12_pt_v1"
    },
    "labels": {},
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt_v1.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/bert_models"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/entity_extraction/entity_detection_en.json
================================================
{
  "chainer": {
    "in": ["x"],
    "pipe": [
      {
        "class_name": "ner_chunker",
        "batch_size": 16,
        "max_seq_len" : 300,
        "vocab_file": "{TRANSFORMER}",
        "in": ["x"],
        "out": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"]
      },
      {
        "thres_proba": 0.6,
        "o_tag": "O",
        "tags_file": "{NER_PATH}/tag.dict",
        "class_name": "entity_detection_parser",
        "id": "edp"
      },
      {
        "class_name": "ner_chunk_model",
        "ner": {
          "config_path": "{CONFIGS_PATH}/ner/ner_ontonotes_bert.json",
          "overwrite": {
            "chainer.out": ["x_tokens", "tokens_offsets", "y_pred", "probas"]
          }
        },
        "ner_parser": "#edp",
        "in": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"],
        "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
      }
    ],
    "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
      "TRANSFORMER": "bert-base-cased",
      "NER_PATH": "{MODELS_PATH}/ner_ontonotes_bert_torch_crf"
    }
  }
}


================================================
FILE: deeppavlov/configs/entity_extraction/entity_detection_ru.json
================================================
{
  "chainer": {
    "in": ["x"],
    "pipe": [
      {
        "class_name": "ner_chunker",
        "batch_size": 16,
        "max_seq_len" : 300,
        "vocab_file": "{TRANSFORMER}",
        "in": ["x"],
        "out": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"]
      },
      {
        "thres_proba": 0.05,
        "o_tag": "O",
        "tags_file": "{NER_PATH}/tag.dict",
        "class_name": "entity_detection_parser",
        "id": "edp"
      },
      {
        "class_name": "ner_chunk_model",
        "ner": {"config_path": "{CONFIGS_PATH}/ner/ner_rus_bert_probas.json"},
        "ner_parser": "#edp",
        "in": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"],
        "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
      }
    ],
    "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
      "TRANSFORMER": "DeepPavlov/rubert-base-cased",
      "NER_PATH": "{MODELS_PATH}/wiki_ner_rus_bert"
    }
  }
}


================================================
FILE: deeppavlov/configs/entity_extraction/entity_extraction_en.json
================================================
{
  "chainer": {
    "in": ["x"],
    "pipe": [
      {
        "config_path": "{CONFIGS_PATH}/entity_extraction/entity_detection_en.json",
        "in": ["x"],
        "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
      },
      {
        "config_path": "{CONFIGS_PATH}/entity_extraction/entity_linking_en.json",
        "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"],
        "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"]
      }
    ],
    "out": ["entity_substr", "tags", "entity_offsets", "entity_ids", "entity_conf", "entity_pages", "entity_labels"]
  },
  "metadata": {
    "variables": {
      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
    }
  }
}


================================================
FILE: deeppavlov/configs/entity_extraction/entity_extraction_ru.json
================================================
{
  "chainer": {
    "in": ["x"],
    "pipe": [
      {
        "config_path": "{CONFIGS_PATH}/entity_extraction/entity_detection_ru.json",
        "in": ["x"],
        "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
      },
      {
        "config_path": "{CONFIGS_PATH}/entity_extraction/entity_linking_ru.json",
        "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"],
        "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"]
      }
    ],
    "out": ["entity_substr", "tags", "entity_offsets", "entity_ids", "entity_conf", "entity_pages", "entity_labels"]
  },
  "metadata": {
    "variables": {
      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
    }
  }
}


================================================
FILE: deeppavlov/configs/entity_extraction/entity_linking_en.json
================================================
{
  "chainer": {
    "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"],
    "pipe": [
      {
        "class_name": "torch_transformers_entity_ranker_infer",
        "id": "entity_descr_ranking",
        "pretrained_bert": "{TRANSFORMER}",
        "encoder_weights_path": "{MODELS_PATH}/entity_linking_eng/encoder.pth.tar",
        "bilinear_weights_path": "{MODELS_PATH}/entity_linking_eng/bilinear.pth.tar",
        "special_token_id": 30522,
        "emb_size": 512,
        "block_size": 8
      },
      {
        "class_name": "entity_linker",
        "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"],
        "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"],
        "load_path": "{DOWNLOADS_PATH}/entity_linking_eng",
        "entities_database_filename": "el_eng_v2.db",
        "entity_ranker": "#entity_descr_ranking",
        "rank_in_runtime": true,
        "num_entities_for_bert_ranking": 20,
        "include_mention": false,
        "num_entities_to_return": 3,
        "lemmatize": true,
        "use_descriptions": true,
        "use_connections": true,
        "use_tags": true,
        "full_paragraph": true,
        "return_confidences": true,
        "lang": "en"
      }
    ],
    "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "prajjwal1/bert-small"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/kbqa/downloads/el_db_eng_v2.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/entity_linking_eng"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/entity_linking/el_ranker_eng.tar.gz",
        "subdir": "{MODELS_PATH}/entity_linking_eng"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/entity_extraction/entity_linking_ru.json
================================================
{
  "chainer": {
    "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"],
    "pipe": [
      {
        "class_name": "torch_transformers_entity_ranker_infer",
        "id": "entity_descr_ranking",
        "pretrained_bert": "{TRANSFORMER}",
        "encoder_weights_path": "{MODELS_PATH}/entity_linking_rus/encoder.pth.tar",
        "bilinear_weights_path": "{MODELS_PATH}/entity_linking_rus/bilinear.pth.tar",
        "special_token_id": 30522,
        "emb_size": 264,
        "block_size": 6
      },
      {
        "class_name": "entity_linker",
        "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"],
        "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"],
        "load_path": "{DOWNLOADS_PATH}/entity_linking_rus",
        "entities_database_filename": "el_rus_v2.db",
        "words_dict_filename": "{DOWNLOADS_PATH}/entity_linking_rus/words_dict.pickle",
        "ngrams_matrix_filename": "{DOWNLOADS_PATH}/entity_linking_rus/ngrams_matrix.npz",
        "entity_ranker": "#entity_descr_ranking",
        "rank_in_runtime": true,
        "num_entities_for_bert_ranking": 30,
        "use_gpu": false,
        "include_mention": false,
        "num_entities_to_return": 3,
        "lemmatize": true,
        "use_descriptions": true,
        "use_connections": true,
        "use_tags": true,
        "kb_filename": "{DOWNLOADS_PATH}/wikidata/wikidata_lite.hdt",
        "prefixes": {"entity": ["http://we"],
                     "rels": {"direct": "http://wpd",
                              "no_type": "http://wp",
                              "statement": "http://wps",
                              "qualifier": "http://wpq"
                              }
                     },
        "full_paragraph": true,
        "return_confidences": true,
        "lang": "ru"
      }
    ],
    "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational-v1"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/kbqa/downloads/el_files_rus_v2.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/entity_linking_rus"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/entity_linking/el_ranker_rus.tar.gz",
        "subdir": "{MODELS_PATH}/entity_linking_rus"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/wikidata/wikidata_lite.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/wikidata"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/faq/fasttext_logreg.json
================================================
{
  "dataset_reader": {
    "class_name": "basic_classification_reader",
    "format": "json",
    "orient": "split",
    "x": "text",
    "y": "category",
    "data_path": "{DOWNLOADS_PATH}/massive/{LANGUAGE}",
    "train": "train.json",
    "valid": "dev.json",
    "test": "test.json"
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator",
    "seed": 42,
    "shuffle": true,
    "shot": 5
  },
  "chainer": {
    "in": ["text"],
    "in_y": ["category"],
    "pipe": [
      {
        "class_name": "stream_spacy_tokenizer",
        "in": ["text"],
        "id": "my_tokenizer",
        "lemmas": false,
        "out": "token_lemmas",
        "spacy_model": "{SPACY_MODEL}"
      },
      {
        "ref": "my_tokenizer",
        "in": ["token_lemmas"],
        "out": ["text_lem"]
      },
      {
        "class_name": "fasttext",
        "in": ["token_lemmas"],
        "load_path": "{DOWNLOADS_PATH}/embeddings/fasttext/{LANGUAGE}.bin",
        "mean": true,
        "out": ["text_vector"]
      },
      {
        "id": "answers_vocab",
        "class_name": "simple_vocab",
        "fit_on": "category",
        "save_path": "{MODEL_PATH}/cat_answers.dict",
        "load_path": "{MODEL_PATH}/cat_answers.dict",
        "in": ["category"],
        "out": ["y_ids"]
      },
      {
        "in": ["text_vector"],
        "fit_on": ["text_vector", "y_ids"],
        "out": ["y_pred_proba"],
        "class_name": "sklearn_component",
        "main": true,
        "save_path": "{MODEL_PATH}/model.pkl",
        "load_path": "{MODEL_PATH}/model.pkl",
        "model_class": "sklearn.linear_model:LogisticRegression",
        "infer_method": "predict_proba",
        "C": 10,
        "penalty": "l2"
      },
      {
        "in": ["y_pred_proba"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_category"],
        "ref": "answers_vocab"
      }
    ],
    "out": ["y_pred_category"]
  },
  "train": {
    "evaluation_targets": ["train", "valid", "test"],
    "class_name": "fit_trainer",
    "metrics": [
      {
        "name": "accuracy",
        "inputs": ["category", "y_pred_category"]
      }
    ]
  },
  "metadata": {
    "variables": {
      "LANGUAGE": "en",
      "ROOT_PATH": "~/.deeppavlov",
      "SPACY_MODEL": "en_core_web_sm",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODEL_PATH": "{ROOT_PATH}/models/faq/{LANGUAGE}/fasttext_logreg"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/embeddings/fasttext/{LANGUAGE}.bin",
        "subdir": "{DOWNLOADS_PATH}/embeddings/fasttext"
      },
      {
        "url": "http://files.deeppavlov.ai/datasets/massive-{LANGUAGE}.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/massive/{LANGUAGE}"
      },
      {
        "url": "http://files.deeppavlov.ai/faq/fasttext_logreg_{LANGUAGE}.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/kbqa/kbqa_cq_en.json
================================================
{
  "dataset_reader": {
    "class_name": "lcquad_reader",
    "question_types": ["statement_property", "right-subgraph", "simple question left",
                      "simple question right", "left-subgraph", "rank"],
    "num_samples": 100,
    "data_path": "{DOWNLOADS_PATH}/lcquad/lcquad2.json"
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["gold_answer_ids", "gold_answer_labels", "gold_query"],
    "pipe": [
      {
        "class_name": "question_sign_checker",
        "in": ["x"],
        "out": ["x_punct"]
      },
      {
        "config_path": "{CONFIGS_PATH}/classifiers/query_pr.json",
        "in": ["x_punct"],
        "out": ["template_type"]
      },
      {
        "class_name": "query_formatter",
        "query_info": {"unk_var": "?answer", "mid_var": "?ent"},
        "in": ["gold_query"],
        "out": ["f_gold_query"]
      },
      {
        "config_path": "{CONFIGS_PATH}/entity_extraction/entity_detection_en.json",
        "overwrite": {
          "chainer.pipe.1.make_tags_from_probas": true,
          "chainer.pipe.2.ner": {
            "config_path": "{CONFIGS_PATH}/ner/ner_ontonotes_bert.json",
            "overwrite": {
              "chainer.out": ["x_tokens", "tokens_offsets", "y_pred", "probas"],
              "chainer.pipe.2.use_crf": false,
              "metadata.variables.TRANSFORMER": "distilbert-base-cased",
              "metadata.variables.MODEL_PATH": "{MODELS_PATH}/entity_type_detection_distilbert_lcquad2.0"
            }
          },
          "metadata.variables.NER_PATH": "{MODELS_PATH}/entity_type_detection_distilbert_lcquad2.0"
        },
        "in": ["x_punct", "template_type"],
        "out": ["entity_type_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
      },
      {
        "class_name": "entity_type_split",
        "in": ["entity_type_substr", "tags"],
        "out": ["entity_substr", "entity_tags", "type_substr"]
      },
      {
        "class_name": "answer_types_extractor",
        "lang": "@en",
        "types_filename": "{DOWNLOADS_PATH}/wikidata_eng/types_labels_dict_en.pickle",
        "types_sets_filename": "{DOWNLOADS_PATH}/wikidata_eng/answer_types.pickle",
        "in": ["x_punct", "entity_substr", "tags"],
        "out": ["answer_types", "f_entity_substr", "f_tags"]
      },
      {
        "class_name": "entity_linker",
        "load_path": "{DOWNLOADS_PATH}/entity_linking_eng",
        "entities_database_filename": "el_db_lcquad2.db",
        "num_entities_to_return": 7,
        "lemmatize": true,
        "use_descriptions": false,
        "use_connections": false,
        "use_tags": true,
        "alias_coef": 1.0,
        "prefixes": {"entity": ["http://we"],
                     "rels": {"direct": "http://wpd",
                              "no_type": "http://wp",
                              "statement": "http://wps",
                              "qualifier": "http://wpq"
                              }
                     },
        "return_confidences": true,
        "lang": "en",
        "id": "entity_linker"
      },
      {
        "class_name": "wiki_parser",
        "id": "wiki_p",
        "wiki_filename": "{DOWNLOADS_PATH}/wikidata/wikidata_full.hdt",
        "lang": "@en"
      },
      {
        "class_name": "template_matcher",
        "id": "template_m",
        "num_processors": 16,
        "load_path": "{DOWNLOADS_PATH}/wikidata_eng",
        "templates_filename": "templates_eng.json"
      },
      {
        "class_name": "rel_ranking_infer",
        "id": "rel_r_inf",
        "ranker": {"config_path": "{CONFIGS_PATH}/ranking/rel_ranking_roberta_en.json",
                   "overwrite": {"chainer.out": ["y_pred_probas"]}
        },
        "wiki_parser": "#wiki_p",
        "batch_size": 32,
        "rank_answers": true,
        "load_path": "{DOWNLOADS_PATH}/wikidata_eng",
        "rel_q2name_filename": "wiki_dict_properties_eng.pickle"
      },
      {
        "class_name": "query_generator",
        "id": "query_g",
        "entity_linker": "#entity_linker",
        "template_matcher": "#template_m",
        "rel_ranker": "#rel_r_inf",
        "wiki_parser": "#wiki_p",
        "load_path": "{DOWNLOADS_PATH}/wikidata",
        "rels_in_ranking_queries_fname": "rels_in_ranking_queries.json",
        "sparql_queries_filename": "{DOWNLOADS_PATH}/wikidata/sparql_queries_eng.json",
        "entities_to_leave": 5,
        "rels_to_leave": 10,
        "return_answers": false,
        "map_query_str_to_kb": [["P0", "http://wd"], ["P00", "http://wl"], ["wd:", "http://we/"], ["wdt:", "http://wpd/"],
                                [" p:", " http://wp/"], ["ps:", "http://wps/"], ["pq:", "http://wpq/"]],
        "kb_prefixes": {"entity": "wd:E", "rel": "wdt:R", "type": "wd:T", "type_rel": "wdt:P", "type_rels": ["P31", "P279"]},
        "gold_query_info": {"unk_var": "?answer", "mid_var": "?ent"},
        "in": ["x_punct", "x_punct", "template_type", "entity_substr", "type_substr", "entity_tags", "probas", "answer_types"],
        "out": ["cand_answers", "template_answers"]
      },
      {
        "class_name": "rel_ranking_infer",
        "ranker": {"config_path": "{CONFIGS_PATH}/ranking/path_ranking_nll_roberta_en.json"},
        "wiki_parser": "#wiki_p",
        "batch_size": 32,
        "nll_path_ranking": true,
        "return_elements": ["answer_ids", "queries"],
        "rank_answers": true,
        "load_path": "{DOWNLOADS_PATH}/wikidata_eng",
        "rel_q2name_filename": "wiki_dict_properties_eng.pickle",
        "in": ["x_punct", "template_type", "cand_answers", "entity_substr", "template_answers"],
        "out": ["answers", "answer_ids", "query"]
      }
    ],
    "out": ["answers", "answer_ids", "query"]
  },
  "train": {
    "evaluation_targets": ["test"],
    "batch_size": 1,
    "metrics": [
      {
        "name": "kbqa_accuracy",
        "inputs": ["x", "answers", "answer_ids", "query", "gold_answer_labels", "gold_answer_ids", "f_gold_query"]
      }
    ],
    "class_name": "nn_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/kbqa/datasets/lcquad2.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/lcquad"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/models/entity_type_detection_distilbert_lcquad2.0.tar.gz",
        "subdir": "{MODELS_PATH}/entity_type_detection_distilbert_lcquad2.0"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/wikidata/queries_and_rels_lcquad2_v2.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/wikidata"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/downloads/el_db_lcquad2.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/entity_linking_eng"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/wikidata/wikidata_full.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/wikidata"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/wikidata/kbqa_files_en.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/wikidata_eng"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/kbqa/kbqa_cq_ru.json
================================================
{
  "dataset_reader": {
    "class_name": "rubq_reader",
    "version": "2.0",
    "question_types": ["all"],
    "num_samples": 100,
    "data_path": "{DOWNLOADS_PATH}/rubq/rubq2.0.json"
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["gold_answer_ids", "gold_answer_labels", "gold_query"],
    "pipe": [
      {
        "class_name": "question_sign_checker",
        "delete_brackets": true,
        "in": ["x"],
        "out": ["x_punct"]
      },
      {
        "class_name": "query_formatter",
        "query_info": {"unk_var": "?answer", "mid_var": "?ent"},
        "in": ["gold_query"],
        "out": ["f_gold_query"]
      },
      {
        "class_name": "ner_chunker",
        "batch_size": 16,
        "max_seq_len": 300,
        "vocab_file": "distilbert-base-multilingual-cased",
        "in": ["x_punct"],
        "out": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"]
      },
      {
        "thres_proba": 0.05,
        "o_tag": "O",
        "tags_file": "{NER_PATH}/tag.dict",
        "class_name": "entity_detection_parser",
        "ignored_tags": ["DATE", "CARDINAL", "ORDINAL", "QUANTITY", "PERCENT", "NORP"],
        "lang": "ru",
        "id": "edp"
      },
      {
        "thres_proba": 0.05,
        "o_tag": "O",
        "tags_file": "{NER_PATH2}/tag.dict",
        "class_name": "entity_detection_parser",
        "ignored_tags": ["T"],
        "lang": "ru",
        "id": "edp2"
      },
      {
        "class_name": "ner_chunk_model",
        "ner": {
          "config_path": "{CONFIGS_PATH}/ner/ner_ontonotes_bert_mult.json",
          "overwrite": {
            "chainer.pipe.2.use_crf": false,
            "metadata.variables.TRANSFORMER": "distilbert-base-multilingual-cased",
            "chainer.out": ["x_tokens", "tokens_offsets", "y_pred", "probas"],
            "metadata.variables.MODEL_PATH": "{MODELS_PATH}/ner_ontonotes_torch_distilbert_mult"
          }
        },
        "ner_parser": "#edp",
        "ner2": {
          "config_path": "{CONFIGS_PATH}/ner/ner_ontonotes_bert_mult.json",
          "overwrite": {
            "chainer.pipe.2.use_crf": false,
            "metadata.variables.TRANSFORMER": "DeepPavlov/distilrubert-small-cased-conversational",
            "chainer.out": ["x_tokens", "tokens_offsets", "y_pred", "probas"],
            "metadata.variables.MODEL_PATH": "{MODELS_PATH}/entity_detection_rubq"
          }
        },
        "ner_parser2": "#edp2",
        "in": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"],
        "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
      },
      {
        "class_name": "answer_types_extractor",
        "lang": "@ru",
        "types_filename": "{DOWNLOADS_PATH}/wikidata_rus/types_labels_dict_ru.pickle",
        "types_sets_filename": "{DOWNLOADS_PATH}/wikidata_rus/answer_types.pickle",
        "in": ["x_punct", "entity_substr", "tags"],
        "out": ["answer_types", "f_entity_substr", "f_tags"]
      },
      {
        "class_name": "entity_linker",
        "load_path": "{DOWNLOADS_PATH}/entity_linking_rus",
        "entities_database_filename": "el_db_rus.db",
        "words_dict_filename": "{DOWNLOADS_PATH}/entity_linking_rus/words_dict.pickle",
        "ngrams_matrix_filename": "{DOWNLOADS_PATH}/entity_linking_rus/ngrams_matrix.npz",
        "include_mention": false,
        "num_entities_to_return": 7,
        "lemmatize": true,
        "use_descriptions": false,
        "use_connections": true,
        "use_tags": true,
        "kb_filename": "{DOWNLOADS_PATH}/wikidata/wikidata_full.hdt",
        "prefixes": {"entity": ["http://we"],
                     "rels": {"direct": "http://wpd",
                              "no_type": "http://wp",
                              "statement": "http://wps",
                              "qualifier": "http://wpq"
                              }
                     },
        "return_confidences": true,
        "lang": "ru",
        "id": "entity_linker"
      },
      {
        "class_name": "wiki_parser",
        "id": "wiki_p",
        "wiki_filename": "{DOWNLOADS_PATH}/wikidata/wikidata_full.hdt",
        "max_comb_num": 40000,
        "lang": "@ru"
      },
      {
        "class_name": "slovnet_syntax_parser",
        "load_path": "{MODELS_PATH}/slovnet_syntax_parser",
        "navec_filename": "{MODELS_PATH}/slovnet_syntax_parser/navec_news_v1_1B_250K_300d_100q.tar",
        "syntax_parser_filename": "{MODELS_PATH}/slovnet_syntax_parser/slovnet_syntax_news_v1.tar",
        "tree_patterns_filename": "{MODELS_PATH}/slovnet_syntax_parser/tree_patterns.json",
        "id": "slovnet_parser"
      },
      {
        "class_name": "ru_adj_to_noun",
        "freq_dict_filename": "{DOWNLOADS_PATH}/wikidata_rus/freqrnc2011.csv",
        "id": "adj2noun"
      },
      {
        "class_name": "tree_to_sparql",
        "sparql_queries_filename": "{DOWNLOADS_PATH}/wikidata/sparql_queries_rus.json",
        "adj_to_noun": "#adj2noun",
        "syntax_parser": "#slovnet_parser",
        "kb_prefixes": {"entity": "wd:E", "rel": "wdt:R", "type": "wd:T", "type_rel": "wdt:P", "type_rels": ["P31", "P279"]},
        "in": ["x_punct", "entity_substr", "tags", "entity_offsets", "entity_positions", "probas"],
        "out": ["x_sanitized", "query_nums", "s_entity_substr", "s_tags", "s_probas", "entities_to_link", "s_types_substr"]
      },
      {
        "class_name": "template_matcher",
        "id": "template_m",
        "num_processors": 8,
        "load_path": "{DOWNLOADS_PATH}/wikidata_rus",
        "templates_filename": "templates_rus.json"
      },
      {
        "class_name": "rel_ranking_infer",
        "id": "rel_r_inf",
        "ranker": {"config_path": "{CONFIGS_PATH}/ranking/rel_ranking_nll_bert_ru.json"},
        "wiki_parser": "#wiki_p",
        "batch_size": 32,
        "nll_rel_ranking": true,
        "return_elements": ["answer_ids", "queries"],
        "load_path": "{DOWNLOADS_PATH}/wikidata_rus",
        "rank": false,
        "rel_thres": -4.0,
        "type_rels": ["P31", "P279"],
        "rel_q2name_filename": "wiki_dict_properties_full_rus.pickle"
      },
      {
        "class_name": "query_generator",
        "id": "query_g",
        "entity_linker": "#entity_linker",
        "template_matcher": "#template_m",
        "rel_ranker": "#rel_r_inf",
        "wiki_parser": "#wiki_p",
        "load_path": "{DOWNLOADS_PATH}/wikidata",
        "rels_in_ranking_queries_fname": "rels_in_ranking_queries.json",
        "sparql_queries_filename": "{DOWNLOADS_PATH}/wikidata/sparql_queries_rus.json",
        "entities_to_leave": 9,
        "rels_to_leave": 10,
        "max_comb_num": 1000,
        "map_query_str_to_kb": [["P0", "http://wd"], ["P00", "http://wl"], ["wd:", "http://we/"], ["wdt:", "http://wpd/"],
                                [" p:", " http://wp/"], ["ps:", "http://wps/"], ["pq:", "http://wpq/"]],
        "kb_prefixes": {"entity": "wd:E", "rel": "wdt:R", "type": "wd:T", "type_rel": "wdt:P", "type_rels": ["P31", "P279"]},
        "gold_query_info": {"unk_var": "?answer", "mid_var": "?ent"},
        "syntax_structure_known": true,
        "in": ["x_punct", "x_sanitized", "query_nums", "s_entity_substr", "s_types_substr", "s_tags", "s_probas", "answer_types", "entities_to_link"],
        "out": ["answers", "answer_ids", "query"]
      }
    ],
    "out": ["answers", "answer_ids", "query"]
  },
  "train": {
    "evaluation_targets": ["test"],
    "batch_size": 1,
    "metrics": [
      {
        "name": "kbqa_accuracy",
        "inputs": ["x", "answers", "answer_ids", "query", "gold_answer_labels", "gold_answer_ids", "f_gold_query"]
      }
    ],
    "class_name": "nn_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
      "NER_PATH": "{MODELS_PATH}/ner_ontonotes_torch_distilbert_mult",
      "NER_PATH2": "{MODELS_PATH}/entity_detection_rubq"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/datasets/rubq2.0.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/rubq"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/downloads/el_files_rus.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/entity_linking_rus"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/models/ner_ontonotes_torch_distilbert_mult.tar.gz",
        "subdir": "{MODELS_PATH}/ner_ontonotes_torch_distilbert_mult"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/models/entity_detection_rubq.tar.gz",
        "subdir": "{MODELS_PATH}/entity_detection_rubq"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/wikidata/queries_and_rels_rus_v2.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/wikidata"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/wikidata/kbqa_files_ru.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/wikidata_rus"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/syntax_parser/slovnet_syntax_parser_v2.tar.gz",
        "subdir": "{MODELS_PATH}/slovnet_syntax_parser"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/wikidata/wikidata_full.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/wikidata"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/kbqa/wiki_parser.json
================================================
{
  "chainer": {
    "in": ["parser_info", "query"],
    "pipe": [
      {
        "class_name": "wiki_parser",
        "in": ["parser_info", "query"],
        "out": ["wiki_parser_output"],
        "wiki_filename": "{DOWNLOADS_PATH}/wikidata/wikidata_compr.pickle",
        "file_format": "pickle",
        "lang": "@en"
      }
    ],
    "out": ["wiki_parser_output"]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/kbqa/wikidata/wikidata_compr.pickle",
        "subdir": "{DOWNLOADS_PATH}/wikidata"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/morpho_syntax_parser/morpho_ru_syntagrus_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "morphotagger_dataset_reader",
    "data_path": "{DOWNLOADS_PATH}/UD2.3_source",
    "language": "ru_syntagrus",
    "data_types": ["train", "dev", "test"]
  },
  "dataset_iterator": {
    "class_name": "morphotagger_dataset_iterator"
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["y"],
    "pipe": [
      {
        "in": ["x"],
        "class_name": "lazy_tokenizer",
        "out": ["x_words"]
      },
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": ["x_words"],
        "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "min_freq": 3,
        "fit_on": ["y"],
        "in": ["y"],
        "out": ["y_ind"],
        "special_tokens": ["PAD", "BEGIN", "END"],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict"
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.5,
        "use_crf": false,
        "encoder_layer_ids": [-6, -5, -4, -3, -2, -1],
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 1e-06,
          "betas": [0.9, 0.999],
          "eps": 1e-06
        },
        "clip_norm": 1.0,
        "min_learning_rate": 1e-07,
        "learning_rate_drop_patience": 10,
        "learning_rate_drop_div": 1.5,
        "load_before_drop": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"],
        "in_y": ["y_ind"],
        "out": ["y_pred_ind", "probas"]
      },
      {
        "ref": "tag_vocab",
        "in": ["y_pred_ind"],
        "out": ["y_pred"]
      },
      {
        "in": ["x_words"],
        "out": ["y_lemmas"],
        "model": "ru_core_news_sm",
        "class_name": "spacy_lemmatizer"
      },
      {
        "in": ["x_words", "y_pred", "y_lemmas"],
        "out": ["y_prettified"],
        "id": "prettifier",
        "class_name": "lemmatized_output_prettifier"
      }
    ],
    "out": ["y_prettified"]
  },
  "train": {
    "epochs": 10,
    "batch_size": 32,
    "metrics": [
      {
        "name": "per_token_accuracy",
        "inputs": ["y", "y_pred"]
      },
      {
        "name": "accuracy",
        "inputs": ["y", "y_pred"]
      }
    ],
    "validation_patience": 15,
    "val_every_n_epochs": 1,
    "val_every_n_batches": 300,
    "show_examples": false,
    "pytest_max_batches": 2,
    "pytest_batch_size": 8,
    "evaluation_targets": ["valid", "test"],
    "class_name": "nn_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "DeepPavlov/rubert-base-cased",
      "MODEL_PATH": "{MODELS_PATH}/morpho_ru_syntagrus_torch_bert"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.3/ru_syntagrus.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/UD2.3_source/ru_syntagrus"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/syntax_parsing/morpho_ru_syntagrus_torch_bert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/morpho_syntax_parser/ru_syntagrus_joint_parsing.json
================================================
{
  "chainer": {
    "in": ["x_words"],
    "pipe": [
      {
        "id": "main",
        "class_name": "joint_tagger_parser",
        "tagger": {
          "config_path": "{CONFIGS_PATH}/morpho_syntax_parser/morpho_ru_syntagrus_bert.json",
          "overwrite": {"chainer.pipe.6.return_string": false}
        },
        "parser": {
          "config_path": "{CONFIGS_PATH}/morpho_syntax_parser/syntax_ru_syntagrus_bert.json",
          "overwrite": {"chainer.pipe.6.return_string": false}
        },
        "in": ["x_words"],
        "out": ["y_parsed"]
      }
    ],
    "out": ["y_parsed"]
  },
  "metadata": {
    "variables": {
      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
    }
  }
}


================================================
FILE: deeppavlov/configs/morpho_syntax_parser/syntax_ru_syntagrus_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "morphotagger_dataset_reader",
    "data_path": "{DOWNLOADS_PATH}/UD2.3_source",
    "language": "ru_syntagrus",
    "data_types": ["train", "dev", "test"],
    "read_syntax": true
  },
  "dataset_iterator": {
    "class_name": "morphotagger_dataset_iterator"
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["y_tags", "y_heads", "y_deps"],
    "pipe": [
      {
        "in": ["x"],
        "class_name": "lazy_tokenizer",
        "out": ["x_words"]
      },
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": ["x_words"],
        "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"]
      },
      {
        "id": "dep_vocab",
        "class_name": "simple_vocab",
        "min_freq": 3,
        "fit_on": ["y_deps"],
        "in": ["y_deps"],
        "out": ["y_deps_indexes"],
        "special_tokens": ["PAD"],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/deps.dict",
        "load_path": "{MODEL_PATH}/deps.dict"
      },
      {
        "class_name": "torch_transformers_syntax_parser",
        "n_deps": "#dep_vocab.len",
        "state_size": 384,
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.5,
        "return_probas": true,
        "encoder_layer_ids": [6, 7, 8, 9, 10, 11],
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 1e-06,
          "betas": [0.9, 0.999],
          "eps": 1e-06
        },
        "clip_norm": 1.0,
        "min_learning_rate": 1e-07,
        "use_birnn": true,
        "learning_rate_drop_patience": 10,
        "learning_rate_drop_div": 1.5,
        "load_before_drop": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"],
        "in_y": ["y_heads", "y_deps_indexes"],
        "out": ["y_predicted_heads_probs", "y_predicted_deps_indexes"]
      },
      {
        "class_name": "chu_liu_edmonds_transformer",
        "in": ["y_predicted_heads_probs"],
        "out": ["y_predicted_heads"]
      },
      {
        "ref": "dep_vocab",
        "in": ["y_predicted_deps_indexes"],
        "out": ["y_predicted_deps"]
      },
      {
        "in": ["x_words", "y_predicted_heads", "y_predicted_deps"],
        "out": ["y_prettified"],
        "id": "dependency_output_prettifier",
        "class_name": "dependency_output_prettifier"
      }
    ],
    "out": ["y_prettified"]
  },
  "train": {
    "epochs": 10,
    "batch_size": 32,
    "metrics": [
      {
        "name": "multitask_token_accuracy",
        "alias": "LAS",
        "inputs": ["y_deps", "y_heads", "y_predicted_deps", "y_predicted_heads"]
      },
      {
        "name": "per_token_accuracy",
        "alias": "UAS",
        "inputs": ["y_heads", "y_predicted_heads"]
      }
    ],
    "validation_patience": 15,
    "val_every_n_batches": 300,
    "show_examples": false,
    "pytest_max_batches": 2,
    "pytest_batch_size": 8,
    "evaluation_targets": ["valid", "test"],
    "class_name": "nn_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "DeepPavlov/rubert-base-cased",
      "MODEL_PATH": "{MODELS_PATH}/syntax_parsing/rus_6layers"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.3/ru_syntagrus.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/UD2.3_source/ru_syntagrus"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/syntax_parsing/rus_6layers.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/multitask/mt_glue.json
================================================
{
  "dataset_reader": {
    "class_name": "multitask_reader",
    "task_defaults": {
      "class_name": "huggingface_dataset_reader",
      "path": "glue",
      "train": "train",
      "valid": "validation"
    },
    "tasks": {
      "cola": {"name": "cola"},
      "sst2": {"name": "sst2"},
      "qqp": {"name": "qqp"},
      "mrpc": {"name": "mrpc"},
      "rte": {"name": "rte"},
      "mnli": {
        "name": "mnli",
        "valid": "validation_matched"
      },
      "qnli": {"name": "qnli"},
      "stsb": {"name": "stsb"}
    }
  },
  "dataset_iterator": {
    "class_name": "multitask_iterator",
    "num_train_epochs": "{NUM_TRAIN_EPOCHS}",
    "gradient_accumulation_steps": "{GRADIENT_ACC_STEPS}",
    "seed": 42,
    "task_defaults": {
      "class_name": "huggingface_dataset_iterator",
      "label": "label",
      "use_label_name": false,
      "seed": 42
    },
    "tasks": {
      "cola": {
        "features": ["sentence"]
      },
      "sst2": {
        "features": ["sentence"]
      },
      "qqp": {
        "features": ["question1", "question2"]
      },
      "mrpc": {
        "features": ["sentence1", "sentence2"]
      },
      "rte": {
        "features": ["sentence1", "sentence2"]
      },
      "mnli": {
        "features": ["premise", "hypothesis"]
      },
      "qnli": {
        "features": ["question", "sentence"]
      },
      "stsb": {
        "features": ["sentence1", "sentence2"]
      }
    }
  },
  "chainer": {
    "in": ["x_cola", "x_sst2", "x_qqp", "x_mrpc", "x_rte", "x_mnli", "x_qnli", "x_stsb"],
    "in_y": ["y_cola", "y_sst2", "y_qqp", "y_mrpc", "y_rte", "y_mnli", "y_qnli", "y_stsb"
    ],
    "pipe": [
      {
        "class_name": "multitask_pipeline_preprocessor",
        "possible_keys_to_extract": [0, 1],
        "preprocessor": "TorchTransformersPreprocessor",
        "vocab_file": "{BACKBONE}",
        "max_seq_length": 128,
        "do_lower_case": true,
        "n_task": 8,
        "in": ["x_cola", "x_sst2", "x_qqp", "x_mrpc", "x_rte", "x_mnli", "x_qnli", "x_stsb"],
        "out": [
          "bert_features_cola",
          "bert_features_sst2",
          "bert_features_qqp",
          "bert_features_mrpc",
          "bert_features_rte",
          "bert_features_mnli",
          "bert_features_qnli",
          "bert_features_stsb"
        ]
      },
      {
        "id": "multitask_transformer",
        "class_name": "multitask_transformer",
        "optimizer_parameters": {"lr": 2e-5},
        "gradient_accumulation_steps": "{GRADIENT_ACC_STEPS}",
        "learning_rate_drop_patience": 2,
        "learning_rate_drop_div": 2.0,
        "return_probas": true,
        "backbone_model": "{BACKBONE}",
        "save_path": "{MODEL_PATH}",
        "load_path": "{MODEL_PATH}",
        "tasks": {
          "cola": {
            "type": "classification",
            "options": 2
          },
          "sst2": {
            "type": "classification",
            "options": 2
          },
          "qqp": {
            "type": "classification",
            "options": 2
          },
          "mrpc": {
            "type": "classification",
            "options": 2
          },
          "rte": {
            "type": "classification",
            "options": 2
          },
          "mnli": {
            "type": "classification",
            "options": 3
          },
          "qnli": {
            "type": "classification",
            "options": 2
          },
          "stsb": {
            "type": "regression",
            "options": 1
          }
        },
        "in": [
          "bert_features_cola",
          "bert_features_sst2",
          "bert_features_qqp",
          "bert_features_mrpc",
          "bert_features_rte",
          "bert_features_mnli",
          "bert_features_qnli",
          "bert_features_stsb"
        ],
        "in_y": ["y_cola", "y_sst2", "y_qqp", "y_mrpc", "y_rte", "y_mnli", "y_qnli", "y_stsb"],
        "out": [
          "y_cola_pred_probas",
          "y_sst2_pred_probas",
          "y_qqp_pred_probas",
          "y_mrpc_pred_probas",
          "y_rte_pred_probas",
          "y_mnli_pred_probas",
          "y_qnli_pred_probas",
          "y_stsb_pred"
        ]
      },
      {
        "in": [
          "y_cola_pred_probas",
          "y_sst2_pred_probas",
          "y_qqp_pred_probas",
          "y_mrpc_pred_probas",
          "y_rte_pred_probas",
          "y_mnli_pred_probas",
          "y_qnli_pred_probas"
        ],
        "out": [
          "y_cola_pred_ids",
          "y_sst2_pred_ids",
          "y_qqp_pred_ids",
          "y_mrpc_pred_ids",
          "y_rte_pred_ids",
          "y_mnli_pred_ids",
          "y_qnli_pred_ids"
        ],
        "class_name": "proba2labels",
        "max_proba": true
      }
    ],
    "out": [
      "y_cola_pred_probas",
      "y_sst2_pred_probas",
      "y_qqp_pred_probas",
      "y_mrpc_pred_probas",
      "y_rte_pred_probas",
      "y_mnli_pred_probas",
      "y_qnli_pred_probas",
      "y_stsb_pred",
      "y_cola_pred_ids",
      "y_sst2_pred_ids",
      "y_qqp_pred_ids",
      "y_mrpc_pred_ids",
      "y_rte_pred_ids",
      "y_mnli_pred_ids",
      "y_qnli_pred_ids",
      "y_stsb_pred"
    ]
  },
  "train": {
    "epochs": "{NUM_TRAIN_EPOCHS}",
    "batch_size": 32,
    "metrics": [
      {
        "name": "multitask_accuracy",
        "inputs": [
          "y_rte",
          "y_mnli",
          "y_qnli",
          "y_mrpc",
          "y_cola",
          "y_sst2",
          "y_qqp",
          "y_rte_pred_ids",
          "y_mnli_pred_ids",
          "y_qnli_pred_ids",
          "y_mrpc_pred_ids",
          "y_cola_pred_ids",
          "y_sst2_pred_ids",
          "y_qqp_pred_ids"
        ]
      },
      {
        "name": "accuracy",
        "alias": "accuracy_mrpc",
        "inputs": ["y_mrpc", "y_mrpc_pred_ids"]
      },
      {
        "name": "accuracy",
        "alias": "accuracy_rte",
        "inputs": ["y_rte", "y_rte_pred_ids"]
      },
      {
        "name": "accuracy",
        "alias": "accuracy_mnli",
        "inputs": ["y_mnli", "y_mnli_pred_ids"]
      },
      {
        "name": "accuracy",
        "alias": "accuracy_qnli",
        "inputs": ["y_qnli", "y_qnli_pred_ids"]
      },
      {
        "name": "accuracy",
        "alias": "accuracy_sst",
        "inputs": ["y_sst2", "y_sst2_pred_ids"]
      },
      {
        "name": "accuracy",
        "alias": "accuracy_cola",
        "inputs": ["y_cola", "y_cola_pred_ids"]
      },
      {
        "name": "accuracy",
        "alias": "accuracy_qqp",
        "inputs": ["y_qqp", "y_qqp_pred_ids"]
      },
      {
        "name": "pearson_correlation",
        "alias": "pearson_correlation_stsb",
        "inputs": ["y_stsb", "y_stsb_pred"]
      },
      {
        "name": "spearman_correlation",
        "alias": "spearman_correlation_stsb",
        "inputs": ["y_stsb", "y_stsb_pred"]
      }
    ],
    "validation_patience": 3,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["valid"],
    "class_name": "torch_trainer",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BACKBONE": "bert-base-uncased",
      "MODELS_PATH": "~/.deeppavlov/models/glue",
      "MODEL_PATH": "{MODELS_PATH}/8task",
      "NUM_TRAIN_EPOCHS": 5,
      "GRADIENT_ACC_STEPS": 1
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/multitask/glue.tar.gz",
        "subdir": "{MODELS_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/multitask/multitask_example.json
================================================
{
  "dataset_reader": {
    "class_name": "multitask_reader",
    "task_defaults": {
      "class_name": "huggingface_dataset_reader",
      "path": "glue",
      "train": "train",
      "valid": "validation",
      "test": "test"
    },
    "tasks": {
      "cola": {"name": "cola"},
      "rte": {"name": "rte"},
      "stsb": {"name": "stsb"},
      "copa": {
        "path": "super_glue",
        "name": "copa"
      },
      "conll": {
        "class_name": "conll2003_reader",
        "use_task_defaults": false,
        "data_path": "{DOWNLOADS_PATH}/conll2003/",
        "dataset_name": "conll2003",
        "provide_pos": false
      }
    }
  },
  "dataset_iterator": {
    "class_name": "multitask_iterator",
    "num_train_epochs": "{NUM_TRAIN_EPOCHS}",
    "gradient_accumulation_steps": "{GRADIENT_ACC_STEPS}",
    "seed": 42,
    "task_defaults": {
      "class_name": "huggingface_dataset_iterator",
      "label": "label",
      "use_label_name": false,
      "seed": 42
    },
    "tasks": {
      "cola": {
        "features": ["sentence"]
      },
      "rte": {
        "features": ["sentence1", "sentence2"]
      },
      "stsb": {
        "features": ["sentence1", "sentence2"]
      },
      "copa": {
        "features": ["contexts", "choices"]
      },
      "conll": {
        "class_name": "basic_classification_iterator",
        "seed": 42,
        "use_task_defaults": false
      }
    }
  },
  "chainer": {
    "in": ["x_cola", "x_rte", "x_stsb", "x_copa", "x_conll"],
    "in_y": ["y_cola", "y_rte", "y_stsb", "y_copa", "y_conll"],
    "pipe": [
      {
        "class_name": "multitask_pipeline_preprocessor",
        "possible_keys_to_extract": [0, 1],
        "preprocessors": [
          "TorchTransformersPreprocessor",
          "TorchTransformersPreprocessor",
          "TorchTransformersPreprocessor",
          "TorchTransformersMultiplechoicePreprocessor",
          "TorchTransformersNerPreprocessor"
        ],
        "do_lower_case": true,
        "n_task": 5,
        "vocab_file": "{BACKBONE}",
        "max_seq_length": 200,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "return_features": true,
        "in": ["x_cola", "x_rte", "x_stsb", "x_copa", "x_conll"],
        "out": [
          "bert_features_cola",
          "bert_features_rte",
          "bert_features_stsb",
          "bert_features_copa",
          "bert_features_conll"
        ]
      },
      {
        "id": "vocab_conll",
        "class_name": "simple_vocab",
        "unk_token": ["O"],
        "pad_with_zeros": true,
        "save_path": "{MODELS_PATH}/tag.dict",
        "load_path": "{MODELS_PATH}/tag.dict",
        "fit_on": ["y_conll"],
        "in": ["y_conll"],
        "out": ["y_ids_conll"]
      },
      {
        "id": "multitask_transformer",
        "class_name": "multitask_transformer",
        "optimizer_parameters": {"lr": 2e-5},
        "gradient_accumulation_steps": "{GRADIENT_ACC_STEPS}",
        "learning_rate_drop_patience": 2,
        "learning_rate_drop_div": 2.0,
        "return_probas": true,
        "backbone_model": "{BACKBONE}",
        "save_path": "{MODEL_PATH}",
        "load_path": "{MODEL_PATH}",
        "tasks": {
          "cola": {
            "type": "classification",
            "options": 2
          },
          "rte": {
            "type": "classification",
            "options": 2
          },
          "stsb": {
            "type": "regression",
            "options": 1
          },
          "copa": {
            "type": "multiple_choice",
            "options": 2
          },
          "conll": {
            "type": "sequence_labeling",
            "options": "#vocab_conll.len"
          }
        },
        "in": [
          "bert_features_cola",
          "bert_features_rte",
          "bert_features_stsb",
          "bert_features_copa",
          "bert_features_conll"
        ],
        "in_y": ["y_cola", "y_rte", "y_stsb", "y_copa", "y_ids_conll"],
        "out": [
          "y_cola_pred_probas",
          "y_rte_pred_probas",
          "y_stsb_pred",
          "y_copa_pred_probas",
          "y_conll_pred_ids"
        ]
      },
      {
        "in": ["y_cola_pred_probas"],
        "out": ["y_cola_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_rte_pred_probas"],
        "out": ["y_rte_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_copa_pred_probas"],
        "out": ["y_copa_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_conll_pred_ids"],
        "out": ["y_conll_pred_labels"],
        "ref": "vocab_conll"
      }
    ],
    "out": ["y_cola_pred_ids", "y_rte_pred_ids", "y_stsb_pred", "y_copa_pred_ids", "y_conll_pred_labels"]
  },
  "train": {
    "epochs": "{NUM_TRAIN_EPOCHS}",
    "batch_size": 32,
    "metrics": [
      {
        "name": "multitask_accuracy",
        "inputs": ["y_rte", "y_cola", "y_copa", "y_rte_pred_ids", "y_cola_pred_ids", "y_copa_pred_ids"]
      },
      {
        "name": "ner_f1",
        "inputs": ["y_conll", "y_conll_pred_labels"]
      },
      {
        "name": "ner_token_f1",
        "inputs": ["y_conll", "y_conll_pred_labels"]
      },
      {
        "name": "accuracy",
        "alias": "accuracy_cola",
        "inputs": ["y_cola", "y_cola_pred_ids"]
      },
      {
        "name": "accuracy",
        "alias": "accuracy_rte",
        "inputs": ["y_rte", "y_rte_pred_ids"]
      },
      {
        "name": "accuracy",
        "alias": "accuracy_copa",
        "inputs": ["y_copa", "y_copa_pred_ids"]
      },
      {
        "name": "pearson_correlation",
        "alias": "pearson_stsb",
        "inputs": ["y_stsb", "y_stsb_pred"]
      },
      {
        "name": "spearman_correlation",
        "alias": "spearman_stsb",
        "inputs": ["y_stsb", "y_stsb_pred"]
      }
    ],
    "validation_patience": 3,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["valid"],
    "class_name": "torch_trainer",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "MODELS_PATH": "{ROOT_PATH}/models/multitask_example",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "BACKBONE": "distilbert-base-uncased",
      "MODEL_PATH": "{MODELS_PATH}/{BACKBONE}",
      "NUM_TRAIN_EPOCHS": 5,
      "GRADIENT_ACC_STEPS": 1
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/multitask/multitask_example.tar.gz",
        "subdir": "{MODELS_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ner/ner_bert_base.json
================================================
{
  "chainer": {
    "in": ["x"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "in": ["x"],
        "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": ["O"],
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": ["y"],
        "in": ["y"],
        "out": ["y_ind"]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"],
        "in_y": ["y_ind"],
        "out": ["y_pred_ind", "probas"]
      },
      {
        "ref": "tag_vocab",
        "in": ["y_pred_ind"],
        "out": ["y_pred"]
      }
    ],
    "out": ["x_tokens", "y_pred"]
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "bert-base-multilingual-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/ner/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/ner/ner_bert_base.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ner/ner_case_agnostic_mdistilbert.json
================================================
{
  "dataset_reader": {
    "class_name": "conll2003_reader",
    "data_path": "{DOWNLOADS_PATH}/conll2003/",
    "dataset_name": "conll2003",
    "provide_pos": false
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": ["x"],
        "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": ["O"],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": ["y"],
        "in": ["y"],
        "out": ["y_ind"]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.5,
        "use_crf": true,
        "encoder_layer_ids": [-1],
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 1e-06,
          "betas": [0.9, 0.999],
          "eps": 1e-06
        },
        "clip_norm": 1.0,
        "min_learning_rate": 1e-07,
        "learning_rate_drop_patience": 20,
        "learning_rate_drop_div": 1.5,
        "load_before_drop": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"],
        "in_y": ["y_ind"],
        "out": ["y_pred_ind", "probas"]
      },
      {
        "ref": "tag_vocab",
        "in": ["y_pred_ind"],
        "out": ["y_pred"]
      }
    ],
    "out": ["x_tokens", "y_pred"]
  },
  "train": {
    "epochs": 50,
    "batch_size": 8,
    "metrics": [
      {
        "name": "ner_f1",
        "inputs": ["y", "y_pred"]
      },
      {
        "name": "ner_token_f1",
        "inputs": ["y", "y_pred"]
      }
    ],
    "validation_patience": 100,
    "val_every_n_batches": 50,
    "log_every_n_batches": 50,
    "show_examples": false,
    "pytest_max_batches": 2,
    "pytest_batch_size": 8,
    "evaluation_targets": ["test", "valid"],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "~/.deeppavlov/downloads",
      "MODELS_PATH": "~/.deeppavlov/models",
      "TRANSFORMER": "distilbert-base-multilingual-cased",
      "MODEL_PATH": "{MODELS_PATH}/ner/ner_case_agnostic_mdistilbert"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/ner/ner_case_agnostic_mdistilbert.tar.gz",
        "subdir": "{MODELS_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ner/ner_collection3_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "conll2003_reader",
    "data_path": "{DOWNLOADS_PATH}/collection3/",
    "dataset_name": "collection3",
    "provide_pos": false,
    "provide_chunk": false,
    "iobes": true
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": [
          "x"
        ],
        "out": [
          "x_tokens",
          "x_subword_tokens",
          "x_subword_tok_ids",
          "startofword_markers",
          "attention_mask",
          "tokens_offsets"
        ]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": [
          "O"
        ],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": [
          "y"
        ],
        "in": [
          "y"
        ],
        "out": [
          "y_ind"
        ]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.5,
        "encoder_layer_ids": [
          -1
        ],
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 1e-06,
          "betas": [
            0.9,
            0.999
          ],
          "eps": 1e-06
        },
        "clip_norm": 1.0,
        "min_learning_rate": 1e-07,
        "learning_rate_drop_patience": 30,
        "learning_rate_drop_div": 1.5,
        "load_before_drop": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": [
          "x_subword_tok_ids",
          "attention_mask",
          "startofword_markers"
        ],
        "in_y": [
          "y_ind"
        ],
        "out": [
          "y_pred_ind",
          "probas"
        ]
      },
      {
        "ref": "tag_vocab",
        "in": [
          "y_pred_ind"
        ],
        "out": [
          "y_pred"
        ]
      }
    ],
    "out": [
      "x_tokens",
      "y_pred"
    ]
  },
  "train": {
    "epochs": 30,
    "batch_size": 10,
    "metrics": [
      {
        "name": "ner_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      },
      {
        "name": "ner_token_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      }
    ],
    "validation_patience": 100,
    "val_every_n_batches": 20,
    "log_every_n_batches": 20,
    "show_examples": false,
    "pytest_max_batches": 2,
    "pytest_batch_size": 8,
    "evaluation_targets": [
      "valid",
      "test"
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "DeepPavlov/rubert-base-cased",
      "MODEL_PATH": "{MODELS_PATH}/ner_rus_bert_coll3_torch"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_bert_coll3_torch.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ner/ner_conll2003_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "conll2003_reader",
    "data_path": "{DOWNLOADS_PATH}/conll2003/",
    "dataset_name": "conll2003",
    "provide_pos": false
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": [
          "x"
        ],
        "out": [
          "x_tokens",
          "x_subword_tokens",
          "x_subword_tok_ids",
          "startofword_markers",
          "attention_mask",
          "tokens_offsets"
        ]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": [
          "O"
        ],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": [
          "y"
        ],
        "in": [
          "y"
        ],
        "out": [
          "y_ind"
        ]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.5,
        "use_crf": true,
        "encoder_layer_ids": [
          -1
        ],
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 1e-06,
          "betas": [
            0.9,
            0.999
          ],
          "eps": 1e-06
        },
        "clip_norm": 1.0,
        "min_learning_rate": 1e-07,
        "learning_rate_drop_patience": 30,
        "learning_rate_drop_div": 1.5,
        "load_before_drop": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": [
          "x_subword_tok_ids",
          "attention_mask",
          "startofword_markers"
        ],
        "in_y": [
          "y_ind"
        ],
        "out": [
          "y_pred_ind",
          "probas"
        ]
      },
      {
        "ref": "tag_vocab",
        "in": [
          "y_pred_ind"
        ],
        "out": [
          "y_pred"
        ]
      }
    ],
    "out": [
      "x_tokens",
      "y_pred"
    ]
  },
  "train": {
    "epochs": 30,
    "batch_size": 16,
    "metrics": [
      {
        "name": "ner_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      },
      {
        "name": "ner_token_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      }
    ],
    "validation_patience": 100,
    "val_every_n_batches": 20,
    "log_every_n_batches": 20,
    "show_examples": false,
    "pytest_max_batches": 2,
    "pytest_batch_size": 8,
    "evaluation_targets": [
      "valid",
      "test"
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "bert-base-cased",
      "MODEL_PATH": "{MODELS_PATH}/ner_conll2003_torch_bert_crf"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/ner/ner_conll2003_bert_torch_crf.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ner/ner_conll2003_deberta_crf.json
================================================
{
  "dataset_reader": {
    "class_name": "conll2003_reader",
    "data_path": "{DOWNLOADS_PATH}/conll2003/",
    "dataset_name": "conll2003",
    "provide_pos": false
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": [
          "x"
        ],
        "out": [
          "x_tokens",
          "x_subword_tokens",
          "x_subword_tok_ids",
          "startofword_markers",
          "attention_mask",
          "tokens_offsets"
        ]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": [
          "O"
        ],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": [
          "y"
        ],
        "in": [
          "y"
        ],
        "out": [
          "y_ind"
        ]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.5,
        "use_crf": true,
        "encoder_layer_ids": [
          -1
        ],
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": [
          "x_subword_tok_ids",
          "attention_mask",
          "startofword_markers"
        ],
        "in_y": [
          "y_ind"
        ],
        "out": [
          "y_pred_ind",
          "probas"
        ]
      },
      {
        "ref": "tag_vocab",
        "in": [
          "y_pred_ind"
        ],
        "out": [
          "y_pred"
        ]
      }
    ],
    "out": [
      "x_tokens",
      "y_pred"
    ]
  },
  "train": {
    "metrics": [
      {
        "name": "ner_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      },
      {
        "name": "ner_token_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      }
    ],
    "evaluation_targets": [
      "valid",
      "test"
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "microsoft/deberta-v3-base",
      "MODEL_PATH": "{MODELS_PATH}/ner_conll2003_deberta_crf"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/ner/ner_conll2003_deberta_crf.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ner/ner_ontonotes_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "conll2003_reader",
    "data_path": "{DOWNLOADS_PATH}/ontonotes/",
    "dataset_name": "ontonotes",
    "provide_pos": false
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": ["x"],
        "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": ["O"],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": ["y"],
        "in": ["y"],
        "out": ["y_ind"]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.5,
        "use_crf": true,
        "encoder_layer_ids": [-1],
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 1e-06,
          "betas": [0.9, 0.999],
          "eps": 1e-06
        },
        "clip_norm": 1.0,
        "min_learning_rate": 1e-07,
        "learning_rate_drop_patience": 30,
        "learning_rate_drop_div": 1.5,
        "load_before_drop": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"],
        "in_y": ["y_ind"],
        "out": ["y_pred_ind", "probas"]
      },
      {
        "ref": "tag_vocab",
        "in": ["y_pred_ind"],
        "out": ["y_pred"]
      }
    ],
    "out": ["x_tokens", "y_pred"]
  },
  "train": {
    "epochs": 30,
    "batch_size": 60,
    "metrics": [
      {
        "name": "ner_f1",
        "inputs": ["y", "y_pred"]
      },
      {
        "name": "ner_token_f1",
        "inputs": ["y", "y_pred"]
      }
    ],
    "validation_patience": 100,
    "val_every_n_batches": 20,
    "log_every_n_batches": 20,
    "show_examples": false,
    "pytest_max_batches": 2,
    "pytest_batch_size": 8,
    "evaluation_targets": ["valid", "test"],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "bert-base-cased",
      "MODEL_PATH": "{MODELS_PATH}/ner_ontonotes_bert_torch_crf"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/ner/ner_ontonotes_bert_torch_crf.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ner/ner_ontonotes_bert_mult.json
================================================
{
  "dataset_reader": {
    "class_name": "conll2003_reader",
    "data_path": "{DOWNLOADS_PATH}/ontonotes/",
    "dataset_name": "ontonotes",
    "provide_pos": false
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": ["x"],
        "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": ["O"],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": ["y"],
        "in": ["y"],
        "out": ["y_ind"]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.5,
        "use_crf": true,
        "encoder_layer_ids": [-1],
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 1e-06,
          "betas": [0.9, 0.999],
          "eps": 1e-06
        },
        "clip_norm": 1.0,
        "min_learning_rate": 1e-07,
        "learning_rate_drop_patience": 30,
        "learning_rate_drop_div": 1.5,
        "load_before_drop": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"],
        "in_y": ["y_ind"],
        "out": ["y_pred_ind", "probas"]
      },
      {
        "ref": "tag_vocab",
        "in": ["y_pred_ind"],
        "out": ["y_pred"]
      }
    ],
    "out": ["x_tokens", "y_pred"]
  },
  "train": {
    "epochs": 30,
    "batch_size": 10,
    "metrics": [
      {
        "name": "ner_f1",
        "inputs": ["y", "y_pred"]
      },
      {
        "name": "ner_token_f1",
        "inputs": ["y", "y_pred"]
      }
    ],
    "validation_patience": 100,
    "val_every_n_batches": 20,
    "log_every_n_batches": 20,
    "show_examples": false,
    "pytest_max_batches": 2,
    "pytest_batch_size": 8,
    "evaluation_targets": ["valid", "test"],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "bert-base-multilingual-cased",
      "MODEL_PATH": "{MODELS_PATH}/ner_ontonotes_torch_bert_mult_crf"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/ner/ner_ontonotes_bert_mult_torch_crf.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ner/ner_ontonotes_deberta_crf.json
================================================
{
  "dataset_reader": {
    "class_name": "conll2003_reader",
    "data_path": "{DOWNLOADS_PATH}/ontonotes/",
    "dataset_name": "ontonotes",
    "provide_pos": false
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": ["x"],
        "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": ["O"],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": ["y"],
        "in": ["y"],
        "out": ["y_ind"]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.5,
        "use_crf": true,
        "encoder_layer_ids": [-1],
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"],
        "in_y": ["y_ind"],
        "out": ["y_pred_ind", "probas"]
      },
      {
        "ref": "tag_vocab",
        "in": ["y_pred_ind"],
        "out": ["y_pred"]
      }
    ],
    "out": ["x_tokens", "y_pred"]
  },
  "train": {
    "metrics": [
      {
        "name": "ner_f1",
        "inputs": ["y", "y_pred"]
      },
      {
        "name": "ner_token_f1",
        "inputs": ["y", "y_pred"]
      }
    ],
    "evaluation_targets": ["valid", "test"],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "microsoft/deberta-v3-base",
      "MODEL_PATH": "{MODELS_PATH}/ner_ontonotes_deberta_crf"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/ner/ner_ontonotes_deberta_crf.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ner/ner_rus_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "conll2003_reader",
    "data_path": "{DOWNLOADS_PATH}/total_rus/",
    "dataset_name": "collection_rus",
    "provide_pos": false
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": [
          "x"
        ],
        "out": [
          "x_tokens",
          "x_subword_tokens",
          "x_subword_tok_ids",
          "startofword_markers",
          "attention_mask",
          "tokens_offsets"
        ]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": [
          "O"
        ],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": [
          "y"
        ],
        "in": [
          "y"
        ],
        "out": [
          "y_ind"
        ]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.5,
        "encoder_layer_ids": [
          -1
        ],
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 1e-06,
          "betas": [
            0.9,
            0.999
          ],
          "eps": 1e-06
        },
        "clip_norm": 1.0,
        "min_learning_rate": 1e-07,
        "learning_rate_drop_patience": 30,
        "learning_rate_drop_div": 1.5,
        "load_before_drop": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": [
          "x_subword_tok_ids",
          "attention_mask",
          "startofword_markers"
        ],
        "in_y": [
          "y_ind"
        ],
        "out": [
          "y_pred_ind",
          "probas"
        ]
      },
      {
        "ref": "tag_vocab",
        "in": [
          "y_pred_ind"
        ],
        "out": [
          "y_pred"
        ]
      }
    ],
    "out": [
      "x_tokens",
      "y_pred"
    ]
  },
  "train": {
    "epochs": 30,
    "batch_size": 10,
    "metrics": [
      {
        "name": "ner_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      },
      {
        "name": "ner_token_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      }
    ],
    "validation_patience": 100,
    "val_every_n_batches": 20,
    "log_every_n_batches": 20,
    "show_examples": false,
    "pytest_max_batches": 2,
    "pytest_batch_size": 8,
    "evaluation_targets": [
      "valid",
      "test"
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "DeepPavlov/rubert-base-cased",
      "MODEL_PATH": "{MODELS_PATH}/ner_rus_bert_torch"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_bert_torch_new.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ner/ner_rus_bert_probas.json
================================================
{
  "dataset_reader": {
    "class_name": "sq_reader",
    "data_path": "{DOWNLOADS_PATH}/wiki_ner_rus/wikipedia_dataset.pickle"
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": ["x"],
        "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": ["O"],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": ["y"],
        "in": ["y"],
        "out": ["y_ind"]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.5,
        "encoder_layer_ids": [-1],
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 1e-06,
          "betas": [
            0.9,
            0.999
          ],
          "eps": 1e-06
        },
        "clip_norm": 1.0,
        "min_learning_rate": 1e-07,
        "learning_rate_drop_patience": 30,
        "learning_rate_drop_div": 1.5,
        "load_before_drop": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"],
        "in_y": ["y_ind"],
        "out": ["y_pred_ind", "probas"]
      },
      {
        "ref": "tag_vocab",
        "in": ["y_pred_ind"],
        "out": ["y_pred"]
      }
    ],
    "out": ["x_tokens", "tokens_offsets", "y_pred", "probas"]
  },
  "train": {
    "epochs": 30,
    "batch_size": 10,
    "metrics": [
      {
        "name": "ner_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      },
      {
        "name": "ner_token_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      }
    ],
    "validation_patience": 100,
    "val_every_n_batches": 20,
    "log_every_n_batches": 20,
    "show_examples": false,
    "pytest_max_batches": 2,
    "pytest_batch_size": 8,
    "evaluation_targets": [
      "valid",
      "test"
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "DeepPavlov/rubert-base-cased",
      "MODEL_PATH": "{MODELS_PATH}/wiki_ner_rus_bert"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/rus_dream_entity_detection/wiki_ner_rus_bert.tar.gz",
        "subdir": "{MODELS_PATH}/wiki_ner_rus_bert"
      },
      {
        "url": "http://files.deeppavlov.ai/datasets/wiki_ner_rus/wiki_ner_rus_dataset.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/wiki_ner_rus"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json
================================================
{
  "dataset_reader": {
    "class_name": "conll2003_reader",
    "data_path": "{DOWNLOADS_PATH}/total_rus/",
    "dataset_name": "collection_rus",
    "provide_pos": false
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": [
          "x"
        ],
        "out": [
          "x_tokens",
          "x_subword_tokens",
          "x_subword_tok_ids",
          "startofword_markers",
          "attention_mask",
          "tokens_offsets"
        ]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": [
          "O"
        ],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": [
          "y"
        ],
        "in": [
          "y"
        ],
        "out": [
          "y_ind"
        ]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.11,
        "hidden_keep_prob": 0.11,
        "encoder_layer_ids": [
          -1
        ],
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 5.45e-05,
          "weight_decay": 1e-06,
          "betas": [
            0.9,
            0.999
          ],
          "eps": 1e-06
        },
        "clip_norm": 1.0,
        "min_learning_rate": 1e-07,
        "learning_rate_drop_patience": 30,
        "learning_rate_drop_div": 1.5,
        "load_before_drop": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": [
          "x_subword_tok_ids",
          "attention_mask",
          "startofword_markers"
        ],
        "in_y": [
          "y_ind"
        ],
        "out": [
          "y_pred_ind",
          "probas"
        ]
      },
      {
        "ref": "tag_vocab",
        "in": [
          "y_pred_ind"
        ],
        "out": [
          "y_pred"
        ]
      }
    ],
    "out": [
      "x_tokens",
      "y_pred"
    ]
  },
  "train": {
    "epochs": 30,
    "batch_size": 10,
    "metrics": [
      {
        "name": "ner_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      },
      {
        "name": "ner_token_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      }
    ],
    "validation_patience": 100,
    "val_every_n_batches": 20,
    "log_every_n_batches": 20,
    "show_examples": false,
    "evaluation_targets": [
      "valid",
      "test"
    ],
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/ner_rus_conversational_distilrubert_2L",
      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_2L.tar.gz",
        "subdir": "{MODELS_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json
================================================
{
  "dataset_reader": {
    "class_name": "conll2003_reader",
    "data_path": "{DOWNLOADS_PATH}/total_rus/",
    "dataset_name": "collection_rus",
    "provide_pos": false
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": [
          "x"
        ],
        "out": [
          "x_tokens",
          "x_subword_tokens",
          "x_subword_tok_ids",
          "startofword_markers",
          "attention_mask",
          "tokens_offsets"
        ]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": [
          "O"
        ],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": [
          "y"
        ],
        "in": [
          "y"
        ],
        "out": [
          "y_ind"
        ]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.44,
        "hidden_keep_prob": 0.89,
        "encoder_layer_ids": [
          -1
        ],
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2.78e-05,
          "weight_decay": 1e-06,
          "betas": [
            0.9,
            0.999
          ],
          "eps": 1e-06
        },
        "clip_norm": 1.0,
        "min_learning_rate": 1e-07,
        "learning_rate_drop_patience": 30,
        "learning_rate_drop_div": 1.5,
        "load_before_drop": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": [
          "x_subword_tok_ids",
          "attention_mask",
          "startofword_markers"
        ],
        "in_y": [
          "y_ind"
        ],
        "out": [
          "y_pred_ind",
          "probas"
        ]
      },
      {
        "ref": "tag_vocab",
        "in": [
          "y_pred_ind"
        ],
        "out": [
          "y_pred"
        ]
      }
    ],
    "out": [
      "x_tokens",
      "y_pred"
    ]
  },
  "train": {
    "epochs": 30,
    "batch_size": 10,
    "metrics": [
      {
        "name": "ner_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      },
      {
        "name": "ner_token_f1",
        "inputs": [
          "y",
          "y_pred"
        ]
      }
    ],
    "validation_patience": 100,
    "val_every_n_batches": 20,
    "log_every_n_batches": 20,
    "show_examples": false,
    "evaluation_targets": [
      "valid",
      "test"
    ],
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/ner_rus_conversational_distilrubert_6L",
      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_6L.tar.gz",
        "subdir": "{MODELS_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/odqa/en_odqa_infer_wiki.json
================================================
{
  "chainer": {
    "in": ["question_raw"],
    "out": ["answer", "answer_score", "answer_place"],
    "pipe": [
      {
        "config_path": "{CONFIGS_PATH}/doc_retrieval/en_ranker_tfidf_wiki.json",
        "in": ["question_raw"],
        "out": ["tfidf_doc_ids"]
      },
      {
        "class_name": "bpr",
        "load_path": "{MODELS_PATH}/bpr/eng",
        "query_encoder_file": "query_encoder_en.pth.tar",
        "bpr_index": "bpr_finetuned_nq_adv.idx",
        "pretrained_model": "bert-base-uncased",
        "top_n": 100,
        "in": ["question_raw"],
        "out": ["bpr_doc_ids"]
      },
      {
        "class_name": "concat_lists",
        "in": ["tfidf_doc_ids", "bpr_doc_ids"],
        "out": ["doc_ids"]
      },
      {
        "class_name": "wiki_sqlite_vocab",
        "in": ["doc_ids"],
        "out": ["doc_text"],
        "join_docs": false,
        "shuffle": false,
        "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db"
      },
      {
        "class_name": "string_multiplier",
        "in": ["question_raw", "doc_text"],
        "out":["questions"]
      },
      {
        "class_name": "logit_ranker",
        "batch_size": 64,
        "squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_nq_psgcls_bert.json"},
        "sort_noans": true,
        "in": ["doc_text", "questions"],
        "out": ["answer", "answer_score", "answer_place"]
      }
    ]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/bpr_encoder_index_eng.tar.gz",
        "subdir": "{MODELS_PATH}/bpr/eng"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/odqa/en_odqa_pop_infer_wiki.json
================================================
{
  "chainer": {
    "in": ["question_raw"],
    "out": ["answer", "answer_score", "answer_place"],
    "pipe": [
      {
        "config_path": "{CONFIGS_PATH}/doc_retrieval/en_ranker_pop_wiki.json",
        "in": ["question_raw"],
        "out": ["tfidf_doc_ids"]
      },
      {
        "class_name": "bpr",
        "load_path": "{MODELS_PATH}/bpr/eng",
        "query_encoder_file": "query_encoder_en.pth.tar",
        "bpr_index": "bpr_finetuned_nq_adv.idx",
        "pretrained_model": "bert-base-uncased",
        "top_n": 100,
        "in": ["question_raw"],
        "out": ["bpr_doc_ids"]
      },
      {
        "class_name": "concat_lists",
        "in": ["tfidf_doc_ids", "bpr_doc_ids"],
        "out": ["doc_ids"]
      },
      {
        "class_name": "wiki_sqlite_vocab",
        "in": ["doc_ids"],
        "out": ["doc_text"],
        "join_docs": false,
        "shuffle": false,
        "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db"
      },
      {
        "class_name": "string_multiplier",
        "in": ["question_raw", "doc_text"],
        "out":["questions"]
      },
      {
        "class_name": "logit_ranker",
        "batch_size": 64,
        "squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_nq_psgcls_bert.json"},
        "sort_noans": true,
        "in": ["doc_text", "questions"],
        "out": ["answer", "answer_score", "answer_place"]
      }
    ]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/bpr_encoder_index_eng.tar.gz",
        "subdir": "{MODELS_PATH}/bpr/eng"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/odqa/ru_odqa_infer_wiki.json
================================================
{
  "chainer": {
    "in": ["question_raw"],
    "out": ["best_answer"],
    "pipe": [
      {
        "config_path": "{CONFIGS_PATH}/doc_retrieval/ru_ranker_tfidf_wiki.json",
        "in": ["question_raw"],
        "out": ["tfidf_doc_ids"]
      },
      {
        "class_name": "wiki_sqlite_vocab",
        "in": ["tfidf_doc_ids"],
        "out": ["tfidf_doc_text"],
        "join_docs": false,
        "shuffle": false,
        "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db"
      },
      {
        "class_name": "string_multiplier",
        "in": ["question_raw", "tfidf_doc_text"],
        "out":["questions"]
      },
      {
        "class_name": "logit_ranker",
        "batch_size": 64,
        "squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_multisberquad_bert.json"},
        "sort_noans": true,
        "in": ["tfidf_doc_text", "questions"],
        "out": ["best_answer", "best_answer_score"]
      }
    ]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
    },
    "download": [
    ]
  }
}


================================================
FILE: deeppavlov/configs/ranking/path_ranking_nll_roberta_en.json
================================================
{
  "chainer": {
    "in": ["question", "rels"],
    "pipe": [
      {
        "class_name": "path_ranking_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "additional_special_tokens": ["<one_rel>", "</one_rel>", "<double>", "</double>", "<first_rel>", "<mid>", "</second_rel>"],
        "max_seq_length": 96,
        "in": ["question", "rels"],
        "out": ["bert_features"]
      },
      {
        "class_name": "torch_transformers_nll_ranker",
        "in": ["bert_features"],
        "out": ["model_output"],
        "return_probas": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "encoder_save_path": "{MODEL_PATH}/encoder",
        "linear_save_path": "{MODEL_PATH}/linear",
        "pretrained_bert": "{TRANSFORMER}",
        "learning_rate_drop_patience": 5,
        "learning_rate_drop_div": 1.5,
        "optimizer_parameters": {"lr": 1e-5, "weight_decay": 0.01, "eps": 1e-6}
      }
    ],
    "out": ["model_output"]
  },
  "metadata": {
    "variables": {
      "TRANSFORMER": "haisongzhang/roberta-tiny-cased",
      "MODEL_PATH": "~/.deeppavlov/models/classifiers/path_ranking_nll_roberta_lcquad2"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/kbqa/models/path_ranking_nll_roberta_lcquad2.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ranking/ranking_ubuntu_v2_torch_bert_uncased.json
================================================
{
  "dataset_reader": {
    "class_name": "ubuntu_v2_reader",
    "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data"
  },
  "dataset_iterator": {
    "class_name": "siamese_iterator",
    "seed": 243
  },
  "chainer": {
    "in": [
      "x"
    ],
    "in_y": [
      "y"
    ],
    "pipe": [
      {
        "class_name": "torch_bert_ranker_preprocessor",
        "vocab_file": "bert-base-uncased",
        "do_lower_case": true,
        "max_seq_length": 128,
        "in": [
          "x"
        ],
        "out": [
          "bert_features"
        ]
      },
      {
        "class_name": "torch_bert_ranker",
        "pretrained_bert": "bert-base-uncased",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-5,
          "weight_decay": 1e-2,
          "betas": [
            0.9,
            0.999
          ],
          "eps": 1e-6
        },
        "clip_norm": 1.0,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "y"
        ],
        "out": [
          "predictions"
        ]
      }
    ],
    "out": [
      "predictions"
    ]
  },
  "train": {
    "batch_size": 32,
    "pytest_max_batches": 2,
    "train_metrics": [],
    "metrics": [
      "r@1",
      "r@2",
      "r@5"
    ],
    "validation_patience": 1,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "evaluation_targets": [
      "valid",
      "test"
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/ubuntu_v2_uncased_torch_bert_model"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_uncased_torch_bert_model_v2.tar.gz",
        "subdir": "{MODELS_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ranking/rel_ranking_nll_bert_ru.json
================================================
{
  "chainer": {
    "in": ["question", "rels"],
    "pipe": [
      {
        "class_name": "path_ranking_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 96,
        "in": ["question", "rels"],
        "out": ["bert_features"]
      },
      {
        "class_name": "torch_transformers_nll_ranker",
        "in": ["bert_features"],
        "out": ["model_output"],
        "return_probas": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "encoder_save_path": "{MODEL_PATH}/encoder",
        "linear_save_path": "{MODEL_PATH}/linear",
        "pretrained_bert": "{TRANSFORMER}",
        "learning_rate_drop_patience": 4,
        "learning_rate_drop_div": 1.5,
        "optimizer_parameters": {"lr": 1e-5, "weight_decay": 0.01, "eps": 1e-6}
      }
    ],
    "out": ["model_output"]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "DeepPavlov/rubert-base-cased",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/rel_ranking_nll_bert_ru"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/kbqa/models/rel_ranking_nll_bert_ru.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/ranking/rel_ranking_roberta_en.json
================================================
{
  "dataset_reader": {
    "class_name": "sq_reader",
    "data_path": "{DOWNLOADS_PATH}/rel_ranking_eng/lcquad_one_rel_ranking.json"
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator",
    "seed": 42
  },
  "chainer": {
    "in": ["question", "rel_list"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "rel_ranking_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": false,
        "max_seq_length": 64,
        "in": ["question", "rel_list"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 1e-05},
        "learning_rate_drop_patience": 5,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "epochs": 3,
    "batch_size": 30,
    "metrics": [
      "accuracy",
      "f1_macro"
    ],
    "validation_patience": 10,
    "val_every_n_batches": 100,
    "log_every_n_batches": 100,
    "show_examples": false,
    "evaluation_targets": ["valid", "test"],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "haisongzhang/roberta-tiny-cased",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/rel_ranking_roberta_en"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/kbqa/models/rel_ranking_roberta_en.tar.gz",
        "subdir": "{MODEL_PATH}"
      },
      {
        "url": "http://files.deeppavlov.ai/kbqa/wikidata/lcquad_rel_ranking.pickle",
        "subdir": "{DOWNLOADS_PATH}/rel_ranking_eng"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/regressors/translation_ranker.json
================================================
{
  "metadata":
  {
    "variables": {
      "BASE_MODEL": "cointegrated/LaBSE-en-ru",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/classifiers/ranker_labse",
      "SEED": 42
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/tmp/translation_ranker.tar.gz",
        "subdir": "{MODELS_PATH}"
      }
    ]
  },
    "dataset_iterator": {
      "class_name": "huggingface_dataset_iterator",
      "features": [
        "source",
        "hypothesis"
      ],
      "label": "agg_score",
      "seed": "{SEED}",
      "use_label_name": false
    },
    "chainer": {
      "in": [
        "source",
        "hypothesis"
      ],
      "in_y": [
        "score"
      ],
      "pipe": [
        {
          "class_name": "torch_transformers_preprocessor",
          "vocab_file": "{BASE_MODEL}",
          "do_lower_case": false,
          "max_seq_length": 256,
          "in": [
          "source",
          "hypothesis"
          ],
          "out": [
            "bert_features"
          ]
        },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": 1,
        "return_probas": false,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-06,
          "weight_decay": 0.1
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "score"
        ],
        "out": [
          "pred_score"
        ]
      }
      ],
      "out": [
        "pred_score"
      ]
    },
    "train": {
    "batch_size": 32,
    "metrics": [
      {
        "name": "mean_squared_error",
        "inputs": [
          "score",
          "pred_score"
        ]
      }
    ],
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "class_name": "torch_trainer",
    "evaluation_targets": [
      "train",
      "valid"
    ],
    "metric_optimization": "minimize",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2,
    "pytest_batch_size": 2
  }
}


================================================
FILE: deeppavlov/configs/relation_extraction/re_docred.json
================================================
{
  "dataset_reader": {
    "class_name": "docred_reader",
    "data_path": "{DOWNLOADS_PATH}/docred/",
    "rel2id_path": "{DOWNLOADS_PATH}/docred/meta/rel2id.json",
    "rel_info_path": "{DOWNLOADS_PATH}/docred/rel_info.json",
    "valid_test_data_size": 150
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator"
  },
  "chainer": {
    "in": ["tokens", "entity_pos", "entity_tags"],
    "in_y": ["y_ids"],
    "pipe": [
      {
        "in": ["tokens", "entity_pos", "entity_tags"],
        "out": ["input_ids", "attention_mask", "upd_entity_pos", "upd_entity_tags", "nf_samples"],
        "class_name": "re_preprocessor",
        "vocab_file": "bert-base-cased",
        "default_tag": "PER"
      },
      {
        "class_name": "re_classifier",
        "in": ["input_ids", "attention_mask", "upd_entity_pos", "upd_entity_tags"],
        "in_y": ["y_ids"],
        "out": ["model_output"],
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer_parameters": {"lr": 5e-5, "weight_decay": 0.01, "eps": 1e-6},
        "n_classes": 97,
        "num_ner_tags": 6,
        "pretrained_bert": "bert-base-cased",
        "return_probas": true
      },
      {
        "class_name": "re_postprocessor",
        "rel2id_path": "{DOWNLOADS_PATH}/docred/meta/rel2id.json",
        "rel2label_path": "{DOWNLOADS_PATH}/docred/rel_info.json",
        "in": ["model_output", "nf_samples"],
        "out": ["wikidata_relation_id", "relation_name"]
      }
    ],
    "out": ["wikidata_relation_id", "relation_name"]
  },
  "train": {
    "epochs": 50,
    "batch_size": 30,
    "log_every_n_batches": 100,
    "train_metrics": ["f1_weighted", "acc"],
    "evaluation_targets": ["valid", "train"],
    "metrics": ["f1_weighted", "acc"],
    "validation_patience": 50,
    "val_every_n_batches": 200,
    "show_examples": false,
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/re_docred"
      },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/docred.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/docred"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/re_docred_model_v1.tar.gz",
        "subdir": "{MODELS_PATH}/re_docred"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/rel2label.json",
        "subdir": "{DOWNLOADS_PATH}/docred"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/relation_extraction/re_rured.json
================================================
{
  "dataset_reader": {
    "class_name": "rured_reader",
    "data_path": "{DOWNLOADS_PATH}/rured/"
  },
  "dataset_iterator": {
    "class_name": "basic_classification_iterator"
  },
  "chainer": {
    "in": ["tokens", "entity_pos", "entity_tags"],
    "in_y": ["y_ids"],
    "pipe": [
      {
        "in": ["tokens", "entity_pos", "entity_tags"],
        "out": ["input_ids", "attention_mask", "upd_entity_pos", "upd_entity_tags", "nf_samples"],
        "class_name": "re_preprocessor",
        "ner_tags": ["WORK_OF_ART", "NORP", "GROUP", "LAW", "NATIONALITY", "EVENT", "DATE", "CURRENCY", "GPE",
                     "QUANTITY", "FAMILY", "ORDINAL", "RELIGION", "CITY", "MONEY", "AGE", "LOCATION", "PERCENT",
                     "BOROUGH", "STREET", "PERSON", "REGION", "COUNTRY", "PROFESSION", "ORGANIZATION", "FAC",
                     "CARDINAL", "PRODUCT", "TIME"],
        "max_seq_length": 512,
        "vocab_file": "{TRANSFORMER}",
        "default_tag": "PERSON"
      },
      {
        "class_name": "re_classifier",
        "in": ["input_ids", "attention_mask", "upd_entity_pos", "upd_entity_tags"],
        "in_y": ["y_ids"],
        "out": ["model_output"],
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer_parameters": {"lr": 5e-5, "weight_decay": 0.01, "eps": 1e-6},
        "n_classes": 30,
        "num_ner_tags": 29,
        "pretrained_bert": "{TRANSFORMER}"
      },
      {
        "class_name": "re_postprocessor",
        "rel2id_path": "{DOWNLOADS_PATH}/rured/rel2id.json",
        "rel2label_path": "{DOWNLOADS_PATH}/rured/rel2label.json",
        "in": ["model_output", "nf_samples"],
        "out": ["wikidata_relation_id", "relation_name"]
      }
    ],
    "out": ["wikidata_relation_id", "relation_name"]
  },
  "train": {
    "epochs": 50,
    "batch_size": 16,
    "train_metrics": ["acc"],
    "metrics": ["acc"],
    "validation_patience": 50,
    "val_every_n_batches": 100,
    "log_every_n_batches": 100,
    "evaluation_targets": ["valid", "train"],
    "show_examples": false,
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "DeepPavlov/rubert-base-cased",
      "MODEL_PATH": "{MODELS_PATH}/re_rured"
      },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/rured.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/rured"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/re_rured_model_v1.tar.gz",
        "subdir": "{MODELS_PATH}/re_rured"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/russian_super_glue/russian_superglue_danetqa_rubert.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test",
    "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/DaNetQA",
    "ignore_verifications": true
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["question", "passage"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["question", "passage"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "in": ["question", "passage"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "is_binary": "{BINARY_CLASSIFICATION}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 2e-05},
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "is_binary": "{BINARY_CLASSIFICATION}",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 4,
    "metrics": ["accuracy"],
    "epochs": 10,
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2,
    "pytest_batch_size": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "DeepPavlov/rubert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "russian_super_glue",
      "BINARY_CLASSIFICATION": false,
      "TASK": "danetqa",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_danetqa_rubert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/russian_super_glue/russian_superglue_lidirus_rubert.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "test": "test",
    "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/LiDiRus",
    "ignore_verifications": true
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["sentence1", "sentence2"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["sentence1", "sentence2"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 256,
        "in": ["sentence1", "sentence2"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "is_binary": "{BINARY_CLASSIFICATION}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 2e-05, "weight_decay": 0.1},
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "is_binary": "{BINARY_CLASSIFICATION}",
        "confidence_threshold": 0.5
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 16,
    "metrics": ["matthews_correlation"],
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["test"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2,
    "pytest_batch_size": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "DeepPavlov/rubert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "russian_super_glue",
      "BINARY_CLASSIFICATION": false,
      "TASK": "lidirus",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/terra/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_terra_rubert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/russian_super_glue/russian_superglue_muserc_rubert.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test",
    "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/MuSeRC",
    "ignore_verifications": true
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["context", "answer", "idx"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["context", "answer", "idx"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 512,
        "in": ["context", "answer"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "is_binary": "{BINARY_CLASSIFICATION}",
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 2e-05},
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "is_binary": "{BINARY_CLASSIFICATION}",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 8,
    "metrics": ["roc_auc","f1"],
    "epochs": 10,
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2,
    "pytest_batch_size": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "DeepPavlov/rubert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "russian_super_glue",
      "BINARY_CLASSIFICATION": false,
      "TASK": "muserc",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_muserc_rubert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/russian_super_glue/russian_superglue_parus_rubert.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test",
    "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/PARus",
    "ignore_verifications": true
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["contexts", "choices"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["contexts_list", "choices_list"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_multiplechoice_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 256,
        "in": ["contexts_list", "choices_list"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_multiplechoice",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 4e-05},
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 4,
    "metrics": ["accuracy"],
    "validation_patience": 10,
    "epochs": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2,
    "pytest_batch_size": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "DeepPavlov/rubert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "russian_super_glue",
      "TASK": "parus",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_parus_rubert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/russian_super_glue/russian_superglue_rcb_rubert.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test",
    "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/RCB",
    "ignore_verifications": true
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["premise", "hypothesis"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["premise", "hypothesis"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 256,
        "in": ["premise", "hypothesis"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 4e-05},
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 4,
    "metrics": ["accuracy", "f1_macro"],
    "validation_patience": 10,
    "epochs": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "DeepPavlov/rubert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "russian_super_glue",
      "TASK": "rcb",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_rcb_rubert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/russian_super_glue/russian_superglue_rucos_rubert.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test",
    "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/RuCoS",
    "ignore_verifications": true,
    "downsample_ratio": [1.8, 1.8, 1],
    "do_index_correction": false
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["idx", "query", "passage", "entities", "num_examples"],
    "label": "label",
    "use_label_name": false
  },
  "chainer": {
    "in": ["idx", "query", "passage", "entities", "num_examples"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 320,
        "in": ["query", "passage"],
        "out": ["bert_features"]
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": 2,
        "return_probas": true,
        "is_binary": "{BINARY_CLASSIFICATION}",
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 2e-05},
        "in": ["bert_features"],
        "in_y": ["y"],
        "out": ["y_pred_probas"]
      },
      {
        "class_name": "proba2labels",
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "is_binary": "{BINARY_CLASSIFICATION}",
        "max_proba": true
      },
      {
        "class_name": "torch_record_postprocessor",
        "is_binary": "{BINARY_CLASSIFICATION}",
        "in": ["idx", "y", "y_pred_probas", "entities", "num_examples"],
        "out": ["record_examples"]
      }
    ],
    "out": ["y_pred_probas"]
  },
  "train": {
    "batch_size": 12,
    "train_metrics": [
      {
        "name": "accuracy",
        "inputs": ["y", "y_pred_ids"]
      }
    ],
    "metrics": [
      {
        "name": "record_em_score",
        "inputs": ["record_examples"]
      },
      {
        "name": "record_f1_score",
        "inputs": ["record_examples"]
      },
      {
        "name": "accuracy",
        "inputs": ["y", "y_pred_ids"]
      }
    ],
    "epochs": 10,
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "class_name": "torch_trainer",
    "evaluation_targets": ["valid"],
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2,
    "pytest_batch_size": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "DeepPavlov/rubert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "russian_super_glue",
      "BINARY_CLASSIFICATION": false,
      "TASK": "rucos",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_rucos_rubert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/russian_super_glue/russian_superglue_russe_rubert.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test",
    "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/RUSSE",
    "ignore_verifications": true
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["sentence1", "sentence2"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["sentence1", "sentence2"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 256,
        "in": ["sentence1", "sentence2"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 2e-05},
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 4,
    "metrics": ["accuracy"],
    "epochs": 10,
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "val_every_n_batches": 1000,
    "show_examples": false,
    "evaluation_targets": ["valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "DeepPavlov/rubert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "russian_super_glue",
      "TASK": "russe",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_russe_rubert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/russian_super_glue/russian_superglue_rwsd_rubert.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test",
    "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/RWSD",
    "ignore_verifications": true
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["text", "answer"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["text", "answer"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": true,
        "max_seq_length": 256,
        "in": ["text", "answer"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "Adam",
        "optimizer_parameters": {"lr": 2e-05},
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 4,
    "metrics": ["accuracy"],
    "epochs": 10,
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "DeepPavlov/rubert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "COMPETITION": "russian_super_glue",
      "TASK": "rwsd",
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_rwsd_rubert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/russian_super_glue/russian_superglue_terra_rubert.json
================================================
{
  "dataset_reader": {
    "class_name": "huggingface_dataset_reader",
    "path": "{COMPETITION}",
    "name": "{TASK}",
    "train": "train",
    "valid": "validation",
    "test": "test",
    "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/TERRa",
    "ignore_verifications": true
  },
  "dataset_iterator": {
    "class_name": "huggingface_dataset_iterator",
    "features": ["premise", "hypothesis"],
    "label": "label",
    "seed": 42
  },
  "chainer": {
    "in": ["premise", "hypothesis"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_preprocessor",
        "vocab_file": "{BASE_MODEL}",
        "do_lower_case": false,
        "max_seq_length": 256,
        "in": ["premise", "hypothesis"],
        "out": ["bert_features"]
      },
      {
        "id": "classes_vocab",
        "class_name": "simple_vocab",
        "fit_on": ["y"],
        "save_path": "{MODEL_PATH}/classes.dict",
        "load_path": "{MODEL_PATH}/classes.dict",
        "in": ["y"],
        "out": ["y_ids"]
      },
      {
        "in": ["y_ids"],
        "out": ["y_onehot"],
        "class_name": "one_hotter",
        "depth": "#classes_vocab.len",
        "single_vector": true
      },
      {
        "class_name": "torch_transformers_classifier",
        "n_classes": "#classes_vocab.len",
        "return_probas": true,
        "pretrained_bert": "{BASE_MODEL}",
        "is_binary": "{BINARY_CLASSIFICATION}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {"lr": 2e-05},
        "in": ["bert_features"],
        "in_y": ["y_ids"],
        "out": ["y_pred_probas"]
      },
      {
        "in": ["y_pred_probas"],
        "out": ["y_pred_ids"],
        "class_name": "proba2labels",
        "max_proba": true
      },
      {
        "in": ["y_pred_ids"],
        "out": ["y_pred_labels"],
        "ref": "classes_vocab"
      }
    ],
    "out": ["y_pred_labels"]
  },
  "train": {
    "batch_size": 4,
    "metrics": ["accuracy"],
    "epochs": 10,
    "validation_patience": 10,
    "val_every_n_epochs": 1,
    "log_every_n_epochs": 1,
    "show_examples": false,
    "evaluation_targets": ["train", "valid"],
    "class_name": "torch_trainer",
    "tensorboard_log_dir": "{MODEL_PATH}/",
    "pytest_max_batches": 2,
    "pytest_batch_size": 2
  },
  "metadata": {
    "variables": {
      "BASE_MODEL": "DeepPavlov/rubert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "COMPETITION": "russian_super_glue",
      "TASK": "terra",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "BINARY_CLASSIFICATION": false,
      "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_terra_rubert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/sentence_segmentation/sentseg_dailydialog_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "conll2003_reader",
    "data_path": "{DOWNLOADS_PATH}/dailydialog/",
    "dataset_name": "dailydialog"
  },
  "dataset_iterator": {
    "class_name": "data_learning_iterator"
  },
  "chainer": {
    "in": ["x"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "torch_transformers_ner_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": true,
        "max_seq_length": 512,
        "max_subword_length": 15,
        "token_masking_prob": 0.0,
        "in": ["x"],
        "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"]
      },
      {
        "id": "tag_vocab",
        "class_name": "simple_vocab",
        "unk_token": ["O"],
        "pad_with_zeros": true,
        "save_path": "{MODEL_PATH}/tag.dict",
        "load_path": "{MODEL_PATH}/tag.dict",
        "fit_on": ["y"],
        "in": ["y"],
        "out": ["y_ind"]
      },
      {
        "class_name": "torch_transformers_sequence_tagger",
        "n_tags": "#tag_vocab.len",
        "pretrained_bert": "{TRANSFORMER}",
        "attention_probs_keep_prob": 0.5,
        "encoder_layer_ids": [-1],
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 1e-06,
          "betas": [0.9, 0.999],
          "eps": 1e-06
        },
        "clip_norm": 1.0,
        "min_learning_rate": 1e-07,
        "learning_rate_drop_patience": 6,
        "learning_rate_drop_div": 1.5,
        "load_before_drop": true,
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"],
        "in_y": ["y_ind"],
        "out": ["y_pred_ind", "probas"]
      },
      {
        "ref": "tag_vocab",
        "in": ["y_pred_ind"],
        "out": ["y_pred"]
      },
      {
        "in": ["x_tokens", "y_pred"],
        "out": "punctuated_sents",
        "class_name": "sentseg_restore_sent"
      }
    ],
    "out": ["x_tokens", "punctuated_sents"]
  },
  "train": {
    "epochs": 30,
    "batch_size": 30,
    "metrics": [
      {
        "name": "ner_f1",
        "inputs": ["y", "y_pred"]
      },
      {
        "name": "ner_token_f1",
        "inputs": ["y", "y_pred"]
      }
    ],
    "validation_patience": 20,
    "val_every_n_batches": 100,
    "log_every_n_batches": 100,
    "show_examples": false,
    "pytest_max_batches": 2,
    "pytest_batch_size": 8,
    "evaluation_targets": ["valid", "test"],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "TRANSFORMER": "bert-base-uncased",
      "MODEL_PATH": "{MODELS_PATH}/sentseg_dailydialog_bert"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/sentseg_dailydialog_bert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json
================================================
{
  "dataset_reader": {
    "class_name": "typos_wikipedia_reader",
    "data_path": "{DOWNLOADS_PATH}"
  },
  "dataset_iterator": {
    "class_name": "typos_iterator",
    "test_ratio": 0.05
  },
  "chainer":{
    "in": ["x"],
    "in_y": ["y"],
    "pipe": [
      {
        "class_name": "str_lower",
        "id": "lower",
        "in": ["x"],
        "out": ["x_lower"]
      },
      {
        "class_name": "nltk_moses_tokenizer",
        "id": "tokenizer",
        "in": ["x_lower"],
        "out": ["x_tokens"]
      },
      {
        "ref": "tokenizer",
        "in": ["y"],
        "out": ["y_tokens"]
      },
      {
        "fit_on": ["x_tokens", "y_tokens"],
        "in": ["x_tokens"],
        "out": ["tokens_candidates"],
        "class_name": "spelling_error_model",
        "window": 1,
        "candidates_count": 4,
        "dictionary": {
          "class_name": "wikitionary_100K_vocab",
          "data_dir": "{DOWNLOADS_PATH}/vocabs"
        },
        "save_path": "{MODELS_PATH}/error_model/error_model.tsv"
      },
      {
        "class_name": "kenlm_elector",
        "in": ["tokens_candidates"],
        "out": ["y_predicted_tokens"],
        "load_path": "{DOWNLOADS_PATH}/language_models/en_wiki_no_punkt.arpa.binary"
      },
      {
        "ref": "tokenizer",
        "in": ["y_predicted_tokens"],
        "out": ["y_predicted"]
      }
    ],
    "out": ["y_predicted"]
  },
  "train": {
    "evaluation_targets": ["test"],
    "class_name": "fit_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/error_model.tar.gz",
        "subdir": "{MODELS_PATH}"
      },
      {
        "url": "http://files.deeppavlov.ai/lang_models/en_wiki_no_punkt.arpa.binary.gz",
        "subdir": "{DOWNLOADS_PATH}/language_models"
      },
      {
        "url": "http://files.deeppavlov.ai/datasets/wiktionary/wikipedia_100K_vocab.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/vocabs"
      }
    ]
  }
}

================================================
FILE: deeppavlov/configs/spelling_correction/levenshtein_corrector_ru.json
================================================
{
  "chainer":{
    "in": ["x"],
    "pipe": [
      {
        "class_name": "str_lower",
        "id": "lower",
        "in": ["x"],
        "out": ["x_lower"]
      },
      {
        "class_name": "nltk_moses_tokenizer",
        "id": "tokenizer",
        "in": ["x_lower"],
        "out": ["x_tokens"]
      },
      {
        "id": "vocab",
        "class_name": "simple_vocab",
        "save_path": "{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict",
        "load_path": "{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict"
      },
      {
        "in": ["x_tokens"],
        "out": ["tokens_candidates"],
        "class_name": "spelling_levenshtein",
        "words": "#vocab.keys()"
      },
      {
        "class_name": "kenlm_elector",
        "in": ["tokens_candidates"],
        "out": ["y_predicted_tokens"],
        "load_path": "{DOWNLOADS_PATH}/language_models/ru_wiyalen_no_punkt.arpa.binary"
      },
      {
        "ref": "tokenizer",
        "in": ["y_predicted_tokens"],
        "out": ["y_predicted"]
      }
    ],
    "out": ["y_predicted"]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs/russian_words_vocab.dict.gz",
        "subdir": "{DOWNLOADS_PATH}/vocabs"
      },
      {
        "url": "http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz",
        "subdir": "{DOWNLOADS_PATH}/language_models"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/squad/qa_multisberquad_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "multi_squad_dataset_reader",
    "dataset": "MultiSQuADRuRetrClean",
    "url": "http://files.deeppavlov.ai/datasets/multi_squad_ru_retr_clean.tar.gz",
    "data_path": "{DOWNLOADS_PATH}/multi_squad_ru_retr_clean/"
  },
  "dataset_iterator": {
    "class_name": "multi_squad_retr_iterator",
    "seed": 1337,
    "shuffle": false,
    "with_answer_rate": 0.666
  },
  "chainer": {
    "in": ["context_raw", "question_raw"],
    "in_y": ["ans_raw", "ans_raw_start"],
    "pipe": [
      {
        "class_name": "torch_squad_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": "{LOWERCASE}",
        "max_seq_length": 384,
        "in": ["question_raw", "context_raw"],
        "out": ["bert_features", "subtokens", "split_context"]
      },
      {
        "class_name": "squad_bert_mapping",
        "do_lower_case": "{LOWERCASE}",
        "in": ["split_context", "bert_features", "subtokens"],
        "out": ["subtok2chars", "char2subtoks"]
      },
      {
        "class_name": "squad_bert_ans_preprocessor",
        "do_lower_case": "{LOWERCASE}",
        "in": ["ans_raw", "ans_raw_start", "char2subtoks"],
        "out": ["ans", "ans_start", "ans_end"]
      },
      {
        "class_name": "torch_transformers_squad",
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 0.01,
          "betas": [0.9, 0.999],
          "eps": 1e-06
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "in_y": ["ans_start", "ans_end"],
        "out": ["ans_start_predicted", "ans_end_predicted", "logits", "scores", "inds"]
      },
      {
        "class_name": "squad_bert_ans_postprocessor",
        "in": ["ans_start_predicted", "ans_end_predicted", "split_context", "subtok2chars", "subtokens", "inds"],
        "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"]
      }
    ],
    "out": ["ans_predicted", "ans_start_predicted", "scores"]
  },
  "train": {
    "show_examples": false,
    "evaluation_targets": ["valid"],
    "log_every_n_batches": 250,
    "val_every_n_batches": 500,
    "batch_size": 20,
    "valid_batch_size": 64,
    "validation_patience": 10,
    "metrics": [
      {
        "name": "squad_v1_f1",
        "inputs": ["ans", "ans_predicted"]
      },
      {
        "name": "squad_v1_em",
        "inputs": ["ans", "ans_predicted"]
      },
      {
        "name": "squad_v2_f1",
        "inputs": ["ans", "ans_predicted"]
      },
      {
        "name": "squad_v2_em",
        "inputs": ["ans", "ans_predicted"]
      }
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "LOWERCASE": false,
      "TRANSFORMER": "DeepPavlov/rubert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/multi_squad_ru_torch_bert_retr_noans/{TRANSFORMER}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/squad/multi_squad_ru_torch_bert_retr_noans.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/squad/qa_nq_psgcls_bert.json
================================================
{
  "chainer": {
    "in": ["context_raw", "question_raw"],
    "pipe": [
      {
        "class_name": "torch_squad_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": "{LOWERCASE}",
        "max_seq_length": 384,
        "in": ["question_raw", "context_raw"],
        "out": ["bert_features", "subtokens", "split_context"]
      },
      {
        "class_name": "squad_bert_mapping",
        "do_lower_case": "{LOWERCASE}",
        "in": ["split_context", "bert_features", "subtokens"],
        "out": ["subtok2chars", "char2subtoks"]
      },
      {
        "class_name": "torch_transformers_squad",
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "torch_seed": 1,
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 0.01,
          "betas": [0.9, 0.999],
          "eps": 1e-06
        },
        "random_seed": 1,
        "psg_cls": true,
        "learning_rate_drop_patience": 2,
        "learning_rate_drop_div": 2.0,
        "in": ["bert_features"],
        "out": ["ans_start_predicted", "ans_end_predicted", "logits", "scores", "inds"]
      },
      {
        "class_name": "squad_bert_ans_postprocessor",
        "in": ["ans_start_predicted", "ans_end_predicted", "split_context", "subtok2chars", "subtokens", "inds"],
        "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"]
      }
    ],
    "out": ["ans_predicted", "ans_start_predicted", "scores"]
  },
  "metadata": {
    "variables": {
      "LOWERCASE": true,
      "TRANSFORMER": "bert-base-uncased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/passage_reader_classifier_eng"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/nq_psgcls_bert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/squad/qa_squad2_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "squad_dataset_reader",
    "dataset": "SQuAD2.0",
    "data_path": "{DOWNLOADS_PATH}/squad2/"
  },
  "dataset_iterator": {
    "class_name": "squad_iterator",
    "seed": 1337,
    "shuffle": true
  },
  "chainer": {
    "in": [
      "context_raw",
      "question_raw"
    ],
    "in_y": [
      "ans_raw",
      "ans_raw_start"
    ],
    "pipe": [
      {
        "class_name": "torch_squad_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": "{LOWERCASE}",
        "max_seq_length": 384,
        "in": [
          "question_raw",
          "context_raw"
        ],
        "out": [
          "bert_features",
          "subtokens",
          "split_context"
        ]
      },
      {
        "class_name": "squad_bert_mapping",
        "do_lower_case": "{LOWERCASE}",
        "in": [
          "split_context",
          "bert_features",
          "subtokens"
        ],
        "out": [
          "subtok2chars",
          "char2subtoks"
        ]
      },
      {
        "class_name": "squad_bert_ans_preprocessor",
        "do_lower_case": "{LOWERCASE}",
        "in": [
          "ans_raw",
          "ans_raw_start",
          "char2subtoks"
        ],
        "out": [
          "ans",
          "ans_start",
          "ans_end"
        ]
      },
      {
        "class_name": "torch_transformers_squad",
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "torch_seed": 1,
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 0.01,
          "betas": [
            0.9,
            0.999
          ],
          "eps": 1e-06
        },
        "random_seed": 1,
        "learning_rate_drop_patience": 2,
        "learning_rate_drop_div": 2.0,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "ans_start",
          "ans_end"
        ],
        "out": [
          "ans_start_predicted",
          "ans_end_predicted",
          "logits",
          "scores",
          "inds"
        ]
      },
      {
        "class_name": "squad_bert_ans_postprocessor",
        "in": [
          "ans_start_predicted",
          "ans_end_predicted",
          "split_context",
          "subtok2chars",
          "subtokens",
          "inds"
        ],
        "out": [
          "ans_predicted",
          "ans_start_predicted",
          "ans_end_predicted"
        ]
      }
    ],
    "out": [
      "ans_predicted",
      "ans_start_predicted",
      "scores"
    ]
  },
  "train": {
    "show_examples": false,
    "evaluation_targets": [
      "valid"
    ],
    "log_every_n_batches": 50,
    "val_every_n_batches": 500,
    "batch_size": 20,
    "valid_batch_size": 32,
    "pytest_max_batches": 2,
    "pytest_batch_size": 5,
    "validation_patience": 10,
    "metrics": [
      {
        "name": "squad_v1_f1",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      },
      {
        "name": "squad_v1_em",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      },
      {
        "name": "squad_v2_f1",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      },
      {
        "name": "squad_v2_em",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      }
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "LOWERCASE": false,
      "TRANSFORMER": "bert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/squad2_bert"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/squad/squad2_bert.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/squad/squad_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "squad_dataset_reader",
    "data_path": "{DOWNLOADS_PATH}/squad/"
  },
  "dataset_iterator": {
    "class_name": "squad_iterator",
    "seed": 1337,
    "shuffle": true
  },
  "chainer": {
    "in": ["context_raw", "question_raw"],
    "in_y": ["ans_raw", "ans_raw_start"],
    "pipe": [
      {
        "class_name": "torch_squad_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": "{LOWERCASE}",
        "max_seq_length": 384,
        "in": ["question_raw", "context_raw"],
        "out": ["bert_features", "subtokens", "split_context"]
      },
      {
        "class_name": "squad_bert_mapping",
        "do_lower_case": "{LOWERCASE}",
        "in": ["split_context", "bert_features", "subtokens"],
        "out": ["subtok2chars", "char2subtoks"]
      },
      {
        "class_name": "squad_bert_ans_preprocessor",
        "do_lower_case": "{LOWERCASE}",
        "in": ["ans_raw", "ans_raw_start", "char2subtoks"],
        "out": ["ans", "ans_start", "ans_end"]
      },
      {
        "class_name": "torch_transformers_squad",
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 0.01,
          "betas": [0.9, 0.999],
          "eps": 1e-06
        },
        "learning_rate_drop_patience": 2,
        "learning_rate_drop_div": 2.0,
        "batch_size": 10,
        "in": ["bert_features"],
        "in_y": ["ans_start", "ans_end"],
        "out": ["ans_start_predicted", "ans_end_predicted", "logits", "scores", "inds"]
      },
      {
        "class_name": "squad_bert_ans_postprocessor",
        "in": ["ans_start_predicted", "ans_end_predicted", "split_context", "subtok2chars", "subtokens", "inds"],
        "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"]
      }
    ],
    "out": ["ans_predicted", "ans_start_predicted", "scores"]
  },
  "train": {
    "show_examples": false,
    "evaluation_targets": ["valid"],
    "log_every_n_batches": 250,
    "val_every_n_batches": 500,
    "batch_size": 10,
    "pytest_max_batches": 2,
    "pytest_batch_size": 5,
    "validation_patience": 10,
    "metrics": [
      {
        "name": "squad_v1_f1",
        "inputs": ["ans", "ans_predicted"]
      },
      {
        "name": "squad_v1_em",
        "inputs": ["ans", "ans_predicted"]
      },
      {
        "name": "squad_v2_f1",
        "inputs": ["ans", "ans_predicted"]
      },
      {
        "name": "squad_v2_em",
        "inputs": ["ans", "ans_predicted"]
      }
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "LOWERCASE": false,
      "TRANSFORMER": "bert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/squad_torch_bert/cased/{TRANSFORMER}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/squad/squad_torch_bert_cased.tar.gz",
        "subdir": "{MODEL_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/squad/squad_ru_bert.json
================================================
{
  "dataset_reader": {
    "class_name": "squad_dataset_reader",
    "dataset": "SberSQuADClean",
    "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
    "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
  },
  "dataset_iterator": {
    "class_name": "squad_iterator",
    "seed": 1337,
    "shuffle": true
  },
  "chainer": {
    "in": [
      "context_raw",
      "question_raw"
    ],
    "in_y": [
      "ans_raw",
      "ans_raw_start"
    ],
    "pipe": [
      {
        "class_name": "torch_squad_transformers_preprocessor",
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": "{LOWERCASE}",
        "max_seq_length": 384,
        "in": [
          "question_raw",
          "context_raw"
        ],
        "out": [
          "bert_features",
          "subtokens",
          "split_context"
        ]
      },
      {
        "class_name": "squad_bert_mapping",
        "do_lower_case": "{LOWERCASE}",
        "in": [
          "split_context",
          "bert_features",
          "subtokens"
        ],
        "out": [
          "subtok2chars",
          "char2subtoks"
        ]
      },
      {
        "class_name": "squad_bert_ans_preprocessor",
        "do_lower_case": "{LOWERCASE}",
        "in": [
          "ans_raw",
          "ans_raw_start",
          "char2subtoks"
        ],
        "out": [
          "ans",
          "ans_start",
          "ans_end"
        ]
      },
      {
        "class_name": "torch_transformers_squad",
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 2e-05,
          "weight_decay": 0.01,
          "betas": [
            0.9,
            0.999
          ],
          "eps": 1e-06
        },
        "learning_rate_drop_patience": 3,
        "learning_rate_drop_div": 2.0,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "ans_start",
          "ans_end"
        ],
        "out": [
          "ans_start_predicted",
          "ans_end_predicted",
          "logits",
          "scores",
          "inds"
        ]
      },
      {
        "class_name": "squad_bert_ans_postprocessor",
        "in": [
          "ans_start_predicted",
          "ans_end_predicted",
          "split_context",
          "subtok2chars",
          "subtokens",
          "inds"
        ],
        "out": [
          "ans_predicted",
          "ans_start_predicted",
          "ans_end_predicted"
        ]
      }
    ],
    "out": [
      "ans_predicted",
      "ans_start_predicted",
      "scores"
    ]
  },
  "train": {
    "show_examples": false,
    "evaluation_targets": [
      "valid"
    ],
    "log_every_n_batches": 250,
    "val_every_n_batches": 500,
    "batch_size": 10,
    "validation_patience": 10,
    "metrics": [
      {
        "name": "squad_v1_f1",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      },
      {
        "name": "squad_v1_em",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      },
      {
        "name": "squad_v2_f1",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      },
      {
        "name": "squad_v2_em",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      }
    ],
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "LOWERCASE": false,
      "TRANSFORMER": "DeepPavlov/rubert-base-cased",
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/squad_ru_torch_bert/{TRANSFORMER}"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/v1/squad/squad_ru_torch_bert.tar.gz",
        "subdir": "{MODELS_PATH}"
      }
    ]
  }
}


================================================
FILE: deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json
================================================
{
  "dataset_reader": {
    "class_name": "squad_dataset_reader",
    "dataset": "SberSQuADClean",
    "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
    "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
  },
  "dataset_iterator": {
    "class_name": "squad_iterator",
    "seed": 1337,
    "shuffle": true
  },
  "chainer": {
    "in": [
      "context_raw",
      "question_raw"
    ],
    "in_y": [
      "ans_raw",
      "ans_raw_start"
    ],
    "pipe": [
      {
        "class_name": "torch_squad_transformers_preprocessor",
        "add_token_type_ids": true, 
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": "{lowercase}",
        "max_seq_length": 384,
        "in": [
          "question_raw",
          "context_raw"
        ],
        "out": [
          "bert_features",
          "subtokens",
          "split_context"
        ]
      },
      {
        "class_name": "squad_bert_mapping",
        "do_lower_case": "{lowercase}",
        "in": [
          "split_context",
          "bert_features",
          "subtokens"
        ],
        "out": [
          "subtok2chars",
          "char2subtoks"
        ]
      },
      {
        "class_name": "squad_bert_ans_preprocessor",
        "do_lower_case": "{lowercase}",
        "in": [
          "ans_raw",
          "ans_raw_start",
          "char2subtoks"
        ],
        "out": [
          "ans",
          "ans_start",
          "ans_end"
        ]
      },
      {
        "class_name": "torch_transformers_squad",
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "attention_probs_keep_prob": 0.11,
        "hidden_keep_prob": 0.33, 
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 9e-05
        },
        "learning_rate_drop_patience": 2,
        "learning_rate_drop_div": 1.5,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "ans_start",
          "ans_end"
        ],
        "out": [
          "ans_start_predicted",
          "ans_end_predicted",
          "logits",
          "scores",
          "inds"
        ]
      },
      {
        "class_name": "squad_bert_ans_postprocessor",
        "in": [
          "ans_start_predicted",
          "ans_end_predicted",
          "split_context",
          "subtok2chars",
          "subtokens",
          "inds"
        ],
        "out": [
          "ans_predicted",
          "ans_start_predicted",
          "ans_end_predicted"
        ]
      }
    ],
    "out": [
      "ans_predicted",
      "ans_start_predicted",
      "scores"
    ]
  },
  "train": {
    "show_examples": false,
    "evaluation_targets": [
      "valid"
    ],
    "log_every_n_batches": 250,
    "val_every_n_batches": 500,
    "batch_size": 10,
    "validation_patience": 10,
    "metrics": [
      {
        "name": "squad_v2_f1",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      },
      {
        "name": "squad_v2_em",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      },
      {
        "name": "squad_v1_f1",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      },
      {
        "name": "squad_v1_em",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      }
    ],
    "tensorboard_log_dir": "{MODEL_PATH}/logs",
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "lowercase": false, 
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_2L"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_2L.tar.gz",
        "subdir": "{MODELS_PATH}"
      }
    ]
  }
} 


================================================
FILE: deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json
================================================
{
  "dataset_reader": {
    "class_name": "squad_dataset_reader",
    "dataset": "SberSQuADClean",
    "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
    "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
  },
  "dataset_iterator": {
    "class_name": "squad_iterator",
    "seed": 1337,
    "shuffle": true
  },
  "chainer": {
    "in": [
      "context_raw",
      "question_raw"
    ],
    "in_y": [
      "ans_raw",
      "ans_raw_start"
    ],
    "pipe": [
      {
        "class_name": "torch_squad_transformers_preprocessor", 
        "add_token_type_ids": true, 
        "vocab_file": "{TRANSFORMER}",
        "do_lower_case": "{lowercase}",
        "max_seq_length": 384,
        "in": [
          "question_raw",
          "context_raw"
        ],
        "out": [
          "bert_features",
          "subtokens",
          "split_context"
        ]
      },
      {
        "class_name": "squad_bert_mapping",
        "do_lower_case": "{lowercase}",
        "in": [
          "split_context",
          "bert_features",
          "subtokens"
        ],
        "out": [
          "subtok2chars",
          "char2subtoks"
        ]
      },
      {
        "class_name": "squad_bert_ans_preprocessor",
        "do_lower_case": "{lowercase}",
        "in": [
          "ans_raw",
          "ans_raw_start",
          "char2subtoks"
        ],
        "out": [
          "ans",
          "ans_start",
          "ans_end"
        ]
      },
      {
        "class_name": "torch_transformers_squad",
        "pretrained_bert": "{TRANSFORMER}",
        "save_path": "{MODEL_PATH}/model",
        "load_path": "{MODEL_PATH}/model",
        "attention_probs_keep_prob": 0.0,
        "hidden_keep_prob": 0.33, 
        "optimizer": "AdamW",
        "optimizer_parameters": {
          "lr": 3.67e-5
        },
        "learning_rate_drop_patience": 2,
        "learning_rate_drop_div": 1.5,
        "in": [
          "bert_features"
        ],
        "in_y": [
          "ans_start",
          "ans_end"
        ],
        "out": [
          "ans_start_predicted",
          "ans_end_predicted",
          "logits",
          "scores",
          "inds"
        ]
      },
      {
        "class_name": "squad_bert_ans_postprocessor",
        "in": [
          "ans_start_predicted",
          "ans_end_predicted",
          "split_context",
          "subtok2chars",
          "subtokens",
          "inds"
        ],
        "out": [
          "ans_predicted",
          "ans_start_predicted",
          "ans_end_predicted"
        ]
      }
    ],
    "out": [
      "ans_predicted",
      "ans_start_predicted",
      "scores"
    ]
  },
  "train": {
    "show_examples": false,
    "evaluation_targets": [
      "valid"
    ],
    "log_every_n_batches": 250,
    "val_every_n_batches": 500,
    "batch_size": 10,
    "validation_patience": 10,
    "metrics": [
      {
        "name": "squad_v2_f1",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      },
      {
        "name": "squad_v2_em",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      },
      {
        "name": "squad_v1_f1",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      },
      {
        "name": "squad_v1_em",
        "inputs": [
          "ans",
          "ans_predicted"
        ]
      }
    ],
    "tensorboard_log_dir": "{MODEL_PATH}/logs",
    "class_name": "torch_trainer"
  },
  "metadata": {
    "variables": {
      "lowercase": false, 
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
      "MODELS_PATH": "{ROOT_PATH}/models",
      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_6L"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_6L.tar.gz",
        "subdir": "{MODELS_PATH}"
      }
    ]
  }
} 


================================================
FILE: deeppavlov/core/__init__.py
================================================


================================================
FILE: deeppavlov/core/commands/__init__.py
================================================


================================================
FILE: deeppavlov/core/commands/infer.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import sys
from itertools import islice
from logging import getLogger
from pathlib import Path
from typing import Optional, Union

from deeppavlov.core.commands.utils import import_packages, parse_config
from deeppavlov.core.common.chainer import Chainer
from deeppavlov.core.common.params import from_params
from deeppavlov.core.data.utils import jsonify_data
from deeppavlov.download import deep_download
from deeppavlov.utils.pip_wrapper import install_from_config

# Module-level logger, named after this module.
log = getLogger(__name__)


def build_model(config: Union[str, Path, dict], mode: str = 'infer',
                load_trained: bool = False, install: bool = False, download: bool = False) -> Chainer:
    """Assemble a :class:`Chainer` pipeline from a configuration.

    Args:
        config: Config file path or an already loaded config dict; it is passed through ``parse_config``.
        mode: Value forwarded as ``mode`` to ``from_params`` for every pipeline component.
        load_trained: If True, every component whose config contains ``fit_on`` or ``in_y`` gets its
            ``load_path`` replaced by its ``save_path`` (a warning is logged when ``save_path`` is absent).
        install: If True, ``install_from_config`` is run on the parsed config before building.
        download: If True, ``deep_download`` is run on the parsed config before building.

    Returns:
        The chainer holding every component whose config declares an ``in`` key.
    """
    parsed = parse_config(config)

    if install:
        install_from_config(parsed)
    if download:
        deep_download(parsed)

    import_packages(parsed.get('metadata', {}).get('imports', []))

    chainer_config = parsed['chainer']
    chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))

    for params in chainer_config['pipe']:
        if load_trained and ('fit_on' in params or 'in_y' in params):
            if 'save_path' in params:
                params['load_path'] = params['save_path']
            else:
                component_name = params.get('class_name', params.get('ref', 'UNKNOWN'))
                log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                            .format(component_name))

        component = from_params(params, mode=mode)

        # Components with an explicit id are registered so they can be looked up by that id.
        if 'id' in params:
            chainer._components_dict[params['id']] = component

        # Only components that declare inputs become steps of the pipeline.
        if 'in' in params:
            chainer.append(component,
                           params['in'],
                           params['out'],
                           params.get('in_y', None),
                           params.get('main', False))

    return chainer


def interact_model(config: Union[str, Path, dict]) -> None:
    """Run an interactive console session with the model built from ``config``.

    For every model input a value is requested with ``input``; typing ``exit``, ``stop``,
    ``quit`` or ``q`` for any input ends the session. Predictions are printed after ``>>``.
    """
    model = build_model(config)
    exit_commands = {'exit', 'stop', 'quit', 'q'}

    while True:
        args = []
        for in_x in model.in_x:
            answer = input('{}::'.format(in_x))
            # check for exit command
            if answer in exit_commands:
                return
            # every input is wrapped into a one-element batch
            args.append((answer,))

        pred = model(*args)
        # with several outputs, regroup them so that each printed item is one sample
        if len(model.out_params) > 1:
            pred = zip(*pred)

        print('>>', *pred)


def predict_on_stream(config: Union[str, Path, dict],
                      batch_size: Optional[int] = None,
                      file_path: Optional[str] = None) -> None:
    """Make predictions for data read line by line from a file or from stdin.

    Lines are consumed in groups of ``batch_size * number_of_model_inputs``. Within a group the
    lines are interleaved: with ``k`` model inputs, line ``i`` belongs to input ``i % k``.
    Every prediction is printed to stdout as one JSON document per line.

    Args:
        config: Config file path or config dict of the model to build.
        batch_size: Number of samples per model call; ``None`` or ``0`` means ``1``.
        file_path: Path of a UTF-8 text file to read; ``None`` or ``'-'`` means stdin.

    Raises:
        RuntimeError: If stdin is requested but it is attached to a terminal.
    """
    batch_size = batch_size or 1
    if file_path is None or file_path == '-':
        if sys.stdin.isatty():
            raise RuntimeError('To process data from terminal please use interact mode')
        f = sys.stdin
    else:
        f = open(file_path, encoding='utf8')

    # Everything after the file is opened runs under try/finally so the file is closed
    # even when building the model or predicting raises.
    try:
        model: Chainer = build_model(config)

        args_count = len(model.in_x)
        while True:
            batch = [line.strip() for line in islice(f, batch_size * args_count)]

            if not batch:
                break

            # de-interleave the lines: one list of values per model input
            args = [batch[i::args_count] for i in range(args_count)]

            res = model(*args)
            if len(model.out_params) == 1:
                res = [res]
            for sample in zip(*res):
                print(json.dumps(jsonify_data(sample), ensure_ascii=False), flush=True)
    finally:
        # stdin is not ours to close
        if f is not sys.stdin:
            f.close()


================================================
FILE: deeppavlov/core/commands/train.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from pathlib import Path
from typing import Dict, Union, Optional, Iterable

from deeppavlov.core.commands.utils import expand_path, import_packages, parse_config
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.params import resolve
from deeppavlov.core.common.registry import get_model
from deeppavlov.core.data.data_fitting_iterator import DataFittingIterator
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
from deeppavlov.core.data.utils import get_all_elems_from_json
from deeppavlov.download import deep_download
from deeppavlov.utils.pip_wrapper import install_from_config

# Module-level logger, named after this module.
log = getLogger(__name__)


def read_data_by_config(config: dict):
    """Instantiate the dataset reader described in ``config`` and return what it reads.

    A truthy ``dataset`` section is a shorthand: it is removed from ``config`` and, for the
    ``classification`` type, expanded in place into ``dataset_reader`` and ``dataset_iterator``
    sections. All keys of ``dataset_reader`` except ``class_name`` are passed to the reader's
    ``read`` method; ``data_path`` (a single path or a list of paths) is expanded beforehand.

    Raises:
        Exception: If the ``dataset`` shorthand has an unsupported ``type``.
        ConfigError: If ``config`` has no ``dataset_reader`` section.
    """
    dataset_config = config.get('dataset', None)

    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type != 'classification':
            raise Exception("Unsupported dataset type: {}".format(ds_type))
        config['dataset_reader'] = {**dataset_config, 'class_name': 'basic_classification_reader'}
        config['dataset_iterator'] = {**dataset_config, 'class_name': 'basic_classification_iterator'}

    try:
        # work on a copy: 'class_name' is popped below and must stay in the config itself
        reader_params = dict(config['dataset_reader'])
    except KeyError:
        raise ConfigError("No dataset reader is provided in the JSON config.")

    reader_class = get_model(reader_params.pop('class_name'))
    reader = reader_class()

    data_path = reader_params.get('data_path')
    if isinstance(data_path, list):
        reader_params['data_path'] = [expand_path(single_path) for single_path in data_path]
    elif data_path is not None:
        reader_params['data_path'] = expand_path(data_path)

    return reader.read(**reader_params)


def get_iterator_from_config(config: dict, data: dict):
    """Build the dataset iterator declared in ``config['dataset_iterator']`` around ``data``.

    Every value of the section is passed through ``resolve``; ``class_name`` selects the iterator
    class and the remaining keys become its keyword arguments together with ``data``.
    """
    iterator_params = {name: resolve(value) for name, value in config['dataset_iterator'].items()}
    iterator_class = get_model(iterator_params.pop('class_name'))
    iterator: Union[DataLearningIterator, DataFittingIterator] = iterator_class(**iterator_params, data=data)
    return iterator


def train_evaluate_model_from_config(config: Union[str, Path, dict],
                                     iterator: Union[DataLearningIterator, DataFittingIterator] = None, *,
                                     to_train: bool = True,
                                     evaluation_targets: Optional[Iterable[str]] = None,
                                     install: bool = False,
                                     download: bool = False,
                                     start_epoch_num: Optional[int] = None,
                                     recursive: bool = False) -> Dict[str, Dict[str, float]]:
    """Train (optionally) and evaluate the model described by ``config``.

    Args:
        config: Config file path or config dict; it is passed through ``parse_config``.
        iterator: Ready data iterator. When omitted, data is read and an iterator is built from the config.
        to_train: Whether to call the trainer's ``train``. Forced to False when no dataset reader is configured.
        evaluation_targets: Forwarded to the trainer's ``evaluate``.
        install: If True, run ``install_from_config`` on the parsed config first.
        download: If True, run ``deep_download`` on the parsed config first.
        start_epoch_num: If given, stored as ``start_epoch_num`` in the train section before the trainer is created.
        recursive: If True (and training is requested), every ``config_path`` found in the chainer section
            is trained first with the same function.

    Returns:
        For every evaluated target, its ``metrics`` entry from the evaluation report;
        an empty dict when there is no iterator to evaluate on.

    Raises:
        AttributeError: If ``val_every_n_epochs`` is set, the reader is not ``multitask_reader``
            and the data has no ``valid`` part.
    """
    config = parse_config(config)

    if install:
        install_from_config(config)
    if download:
        deep_download(config)

    if to_train and recursive:
        for nested_config in get_all_elems_from_json(config['chainer'], 'config_path'):
            log.info(f'Training "{nested_config}"')
            train_evaluate_model_from_config(nested_config, download=False, recursive=True)

    import_packages(config.get('metadata', {}).get('imports', []))

    if iterator is None:
        try:
            data = read_data_by_config(config)
            # TODO: check class objects, not strings
            is_mtl = config['dataset_reader']['class_name'] == 'multitask_reader'
            validates_by_epoch = config.get('train', {}).get('val_every_n_epochs')
            if validates_by_epoch and not data.get('valid') and not is_mtl:
                raise AttributeError(
                    'The value "val_every_n_epochs" is set in the config but no validation data is provided')
        except ConfigError as e:
            # without a dataset reader there is nothing to train on
            to_train = False
            log.warning(f'Skipping training. {e.message}')
        else:
            iterator = get_iterator_from_config(config, data)

    if 'train' not in config:
        log.warning('Train config is missing. Populating with default values')
    trainer_params = config.get('train', {})

    if start_epoch_num is not None:
        trainer_params['start_epoch_num'] = start_epoch_num

    trainer_class = get_model(trainer_params.pop('class_name', 'torch_trainer'))
    trainer = trainer_class(config['chainer'], **trainer_params)

    if to_train:
        trainer.train(iterator)

    if iterator is None:
        return {}

    report = trainer.evaluate(iterator, evaluation_targets)
    trainer.get_chainer().destroy()

    return {target: result['metrics'] for target, result in report.items()}


================================================
FILE: deeppavlov/core/commands/utils.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from copy import deepcopy
from pathlib import Path
from typing import Any, Union, Dict, TypeVar, Optional

from deeppavlov.core.common.file import read_json, find_config
from deeppavlov.core.common.registry import inverted_registry
from deeppavlov.core.data.utils import get_all_elems_from_json

# noinspection PyShadowingBuiltins
_T = TypeVar('_T', str, float, bool, list, dict)


def _parse_config_property(item: _T, variables: Dict[str, Union[str, Path, float, bool, int, None]],
                           variables_exact: Dict[str, Union[str, Path, float, bool, int, None]]) -> _T:
    """Substitute config variables into ``item``, descending into lists and dicts.

    A string that is exactly one placeholder (a key of ``variables_exact``, e.g. ``"{NAME}"``) is
    replaced by the variable's value itself, whatever its type. Any other string is rendered with
    ``str.format(**variables)``. Values that are not str, list or dict are returned unchanged.
    """
    if isinstance(item, dict):
        return {key: _parse_config_property(value, variables, variables_exact)
                for key, value in item.items()}
    if isinstance(item, list):
        return [_parse_config_property(element, variables, variables_exact) for element in item]
    if not isinstance(item, str):
        return item
    # an exact placeholder keeps the variable's original type instead of being stringified
    if item in variables_exact:
        return variables_exact[item]
    return item.format(**variables)


def _get_variables_from_config(config: Union[str, Path, dict]):
    """Collect the values of the config's ``metadata.variables``.

    ``DEEPPAVLOV_PATH`` is always defined first: it is taken from the ``DP_DEEPPAVLOV_PATH``
    environment variable and defaults to the directory three levels above this file. Each config
    variable ``NAME`` can be overridden by the environment variable ``DP_NAME``. String values may
    refer to variables defined earlier: a value that is exactly ``"{OTHER}"`` takes the other
    variable's value as is, any other string is rendered with ``str.format``. Non-string values
    (numbers, booleans, lists, dicts, ``None``) are stored unchanged.

    Args:
        config: Config file path (it is located with ``find_config`` and loaded) or a config dict.

    Returns:
        A ``(variables, variables_exact)`` tuple: ``variables`` maps ``NAME`` to its value and
        ``variables_exact`` maps the placeholder string ``"{NAME}"`` to the same value.
    """
    if isinstance(config, (str, Path)):
        config = read_json(find_config(config))

    variables = {
        'DEEPPAVLOV_PATH': os.getenv('DP_DEEPPAVLOV_PATH', Path(__file__).parent.parent.parent)
    }
    variables_exact = {f'{{{k}}}': v for k, v in variables.items()}
    for name, value in config.get('metadata', {}).get('variables', {}).items():
        env_name = f'DP_{name}'
        if env_name in os.environ:
            value = os.getenv(env_name)
        # Only strings can be placeholders; testing an unhashable value (list/dict) for
        # membership in a dict would raise TypeError, so check the type first.
        if isinstance(value, str):
            if value in variables_exact:
                value = variables_exact[value]
            else:
                value = value.format(**variables)
        variables[name] = value
        variables_exact[f'{{{name}}}'] = value

    return variables, variables_exact


def _update_requirements(config: dict) -> dict:
    """
    Returns a copy of the config whose ``metadata.requirements`` lists the dependencies of its components.

    Every ``class_name`` value found at any nesting level of the config is translated through the inverted
    component registry (names missing there are used as is) and looked up in ``requirements_registry.json``.
    The requirements found this way are merged with the ones already present in ``metadata.requirements``
    and de-duplicated. The passed config itself is not modified.

    Args:
        config: DeepPavlov model config
    Returns:
        deep copy of the config with the ``metadata.requirements`` field rebuilt as described above.
    """
    class_names = get_all_elems_from_json(config, 'class_name')
    components = {inverted_registry.get(class_name, class_name) for class_name in class_names}

    registry_path = Path(__file__).parents[1] / 'common' / 'requirements_registry.json'
    registry = read_json(registry_path)

    collected = []
    for component in components:
        collected += registry.get(component, [])
    # keep whatever the config already requires explicitly
    collected += config.get('metadata', {}).get('requirements', [])

    updated = deepcopy(config)
    metadata = updated.setdefault('metadata', {})
    metadata['requirements'] = list(set(collected))
    return updated


def _overwrite(data: Any, value: Any, nested_keys: list) -> None:
    """Changes ``data`` nested key value to ``value`` using ``nested_keys`` as nested keys list.

    ``data`` is modified in place; ``nested_keys`` is only read and is left untouched.

    Args:
        data: Nested structure of containers supporting item access (e.g. dicts and lists).
        value: Value to store at the addressed position.
        nested_keys: Keys/indexes leading to the position, outermost first.

    Raises:
        IndexError: If ``nested_keys`` is empty.

    Example:
        >>> x = {'a': [None, {'b': 2}]}
        >>> _overwrite(x, 42, ['a', 1, 'b'])
        >>> x
        {'a': [None, {'b': 42}]}

    """
    if not nested_keys:
        raise IndexError('nested_keys must contain at least one key')
    *parent_keys, last_key = nested_keys
    # walk down to the container that directly holds the target position
    for key in parent_keys:
        data = data[key]
    data[last_key] = value


def parse_config(config: Union[str, Path, dict], overwrite: Optional[dict] = None) -> dict:
    """Load a config, apply optional overrides and substitute ``metadata.variables`` into it.

    Args:
        config: Config to parse: a path (located with ``find_config`` and loaded) or a config dict.
        overwrite: Optional mapping from dot-separated nested keys to new values. Key parts made
            only of digits are used as integer indexes, so
            ``{'chainer.pipe.0.class_name': 'simple_vocab'}`` performs
            ``config['chainer']['pipe'][0]['class_name'] = 'simple_vocab'``.

    Returns:
        The config with ``metadata.requirements`` updated and all variable placeholders replaced.
    """
    if isinstance(config, (str, Path)):
        config = read_json(find_config(config))

    if overwrite is not None:
        for dotted_key, new_value in overwrite.items():
            key_path = [int(part) if part.isdigit() else part for part in dotted_key.split('.')]
            _overwrite(config, new_value, key_path)

    with_requirements = _update_requirements(config)
    variables, variables_exact = _get_variables_from_config(with_requirements)
    return _parse_config_property(with_requirements, variables, variables_exact)


def expand_path(path: Union[str, Path]) -> Path:
    """Return ``path`` as a resolved path with a leading ``~`` expanded to the user's home directory."""
    with_home = Path(path).expanduser()
    return with_home.resolve()


def import_packages(packages: list) -> None:
    """Import every package named in ``packages`` so that its module-level code runs.

    The imported modules themselves are discarded; only the side effects of importing matter.
    """
    for package_name in packages:
        __import__(package_name)


def parse_value_with_config(value: Union[str, Path], config: Union[str, Path, dict]) -> Path:
    """Substitute the variables defined in `config` into `value`.

    `value` is converted to a string first. If that string is exactly one variable placeholder,
    the variable's own value is returned (so the result may be of any type); otherwise the
    placeholders inside the string are filled in and the resulting string is returned."""
    config_variables, exact_placeholders = _get_variables_from_config(config)
    text = str(value)
    return _parse_config_property(text, config_variables, exact_placeholders)


================================================
FILE: deeppavlov/core/common/__init__.py
================================================


================================================
FILE: deeppavlov/core/common/aliases.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Name-to-name lookup table: every key and every value is a string, and several keys may
# point to the same value.
# NOTE(review): presumably keys are legacy config names and values are the configs that
# replace them - confirm against the code that reads ALIASES.
ALIASES = {
    'kbqa_cq': 'kbqa_cq_en',
    'kbqa_cq_online': 'kbqa_cq_en',
    'kbqa_cq_rus': 'kbqa_cq_ru',
    'multi_squad_noans': 'qa_squad2_bert',
    'multi_squad_noans_infer': 'qa_squad2_bert',
    'multi_squad_retr_noans': 'qa_squad2_bert',
    'ner_collection3_m1': 'ner_collection3_bert',
    'ner_conll2003': 'ner_conll2003_bert',
    'ner_conll2003_torch_bert': 'ner_conll2003_bert',
    'ner_dstc2': 'ner_conll2003_bert',
    'ner_few_shot_ru': 'ner_rus_bert',
    'ner_few_shot_ru_simulate': 'ner_rus_bert',
    'ner_ontonotes': 'ner_ontonotes_bert',
    'ner_ontonotes_bert_emb': 'ner_ontonotes_bert',
    'ner_ontonotes_bert_mult_torch': 'ner_ontonotes_bert_mult',
    'ner_ontonotes_bert_torch': 'ner_ontonotes_bert',
    'ner_rus': 'ner_rus_bert',
    'paraphraser_bert': 'paraphraser_rubert',
    'ru_odqa_infer_wiki_rubert': 'ru_odqa_infer_wiki',
    'sentseg_dailydialog': 'sentseg_dailydialog_bert',
    'squad': 'squad_bert',
    'squad_bert_infer': 'squad_bert',
    'squad_bert_multilingual_freezed_emb': 'squad_bert',
    'squad_ru': 'squad_ru_bert',
    'squad_ru_bert_infer': 'squad_ru_bert',
    'squad_ru_convers_distilrubert_2L_infer': 'squad_ru_convers_distilrubert_2L',
    'squad_ru_convers_distilrubert_6L_infer': 'squad_ru_convers_distilrubert_6L',
    'squad_ru_rubert': 'squad_ru_bert',
    'squad_ru_rubert_infer': 'squad_ru_bert',
    'squad_torch_bert': 'squad_bert',
    'squad_torch_bert_infer': 'squad_bert'
}


================================================
FILE: deeppavlov/core/common/base.py
================================================
# Copyright 2021 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from types import FunctionType
from typing import List, Optional, Union

from deeppavlov.core.common.chainer import Chainer
from deeppavlov.core.models.component import Component


class Element:
    """One step of a DeepPavlov model pipeline: a component together with the names it is wired to."""

    def __init__(self, component: Union[Component, FunctionType],
                 x: Optional[Union[str, list]] = None,
                 out: Optional[Union[str, list]] = None,
                 y: Optional[Union[str, list]] = None,
                 main: bool = False) -> None:
        """
        Args:
            component: Pipeline component object.
            x: Names of the inference inputs of the component. Outputs of other pipeline elements carrying
                these names are fed to the component.
            out: Names given to the inference outputs of the component, under which other pipeline elements
                can consume them.
            y: Names of additional inputs (targets) used for component training and evaluation.
            main: True marks the main component, i.e. the one trained during model training.
        """
        self.component = component
        # input, target and output names are stored exactly as they were passed
        self.x, self.y, self.out = x, y, out
        self.main = main


class Model(Chainer):
    """A :class:`Chainer` that can be populated from a list of :class:`Element` objects at construction."""

    def __init__(self, x: Optional[Union[str, list]] = None,
                 out: Optional[Union[str, list]] = None,
                 y: Optional[Union[str, list]] = None,
                 pipe: Optional[List[Element]] = None) -> None:
        """
        Args:
            x: Names of the inference inputs of the pipeline.
            out: Names of the inference outputs of the pipeline.
            y: Names of additional inputs (targets) used for pipeline training and evaluation.
            pipe: Pipeline elements, appended to the pipeline in the given order.
        """
        super().__init__(in_x=x, out_params=out, in_y=y)
        if pipe is None:
            return
        for step in pipe:
            self.append(step.component, step.x, step.out, step.y, step.main)


================================================
FILE: deeppavlov/core/common/chainer.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle
from itertools import islice
from logging import getLogger
from types import FunctionType
from typing import Union, Tuple, List, Optional, Hashable, Reversible

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.nn_model import NNModel
from deeppavlov.core.models.serializable import Serializable

# Module-level logger, named after this module.
log = getLogger(__name__)


class Chainer(Component):
    """
    Builds a component pipeline from heterogeneous components (Rule-based/ML/DL). It allows training
    and inferring models in a pipeline as a whole.

    Attributes:
        pipe: list of components and their input and output variable names for inference
        train_pipe: list of components and their input and output variable names for training and evaluation
        in_x: names of inputs for pipeline inference mode
        out_params: names of pipeline inference outputs
        in_y: names of additional inputs for pipeline training and evaluation modes
        forward_map: set of all variable names in chainer's memory after running every component in ``self.pipe``
        train_map: set of all variable names in chainer's memory after running every component
            in ``self.train_pipe``
        main: reference to the main component

    Args:
        in_x: names of inputs for pipeline inference mode
        out_params: names of pipeline inference outputs
        in_y: names of additional inputs for pipeline training and evaluation modes
    """

    def __init__(self, in_x: Union[str, list] = None, out_params: Union[str, list] = None,
                 in_y: Union[str, list] = None, *args, **kwargs) -> None:
        # Every pipe entry is ``((input_keyword_names, input_variable_names), output_variable_names, component)``
        self.pipe: List[Tuple[Tuple[List[str], List[str]], List[str], Component]] = []
        self.train_pipe = []
        # a single name may be given as a plain string
        if isinstance(in_x, str):
            in_x = [in_x]
        if isinstance(in_y, str):
            in_y = [in_y]
        if isinstance(out_params, str):
            out_params = [out_params]
        self.in_x = in_x or ['x']
        self.in_y = in_y or ['y']
        # without explicit outputs the pipeline returns its own inputs
        self.out_params = out_params or self.in_x

        self.forward_map = set(self.in_x)
        self.train_map = self.forward_map.union(self.in_y)

        # name -> component lookup used by ``__getitem__`` and the repr methods;
        # it is not filled anywhere inside this class
        self._components_dict = {}

        self.main = None

    def __getitem__(self, item):
        """Returns a component by its position in ``train_pipe`` (for ``int``) or by its key in the components dict."""
        if isinstance(item, int):
            in_params, out_params, component = self.train_pipe[item]
            return component
        return self._components_dict[item]

    def _ipython_key_completions_(self):
        """Returns the keys of the components dict (hook used by IPython for key completion)."""
        return self._components_dict.keys()

    def __repr__(self):
        # hashable components can be looked up directly, the others are found by identity below
        reversed_components_dict = {v: f'{repr(k)}: ' for k, v in self._components_dict.items()
                                    if isinstance(v, Hashable)}

        components_list = []
        for in_params, out_params, component in self.train_pipe:
            component_repr = repr(component)
            if isinstance(component, Hashable) and component in reversed_components_dict:
                component_repr = reversed_components_dict[component] + component_repr
            else:
                for k, v in self._components_dict.items():
                    if v is component:
                        # same ``repr`` formatting of the key as in the hashable branch
                        component_repr = f'{repr(k)}: {component_repr}'
                        break
            components_list.append(component_repr)

        return f'Chainer[{", ".join(components_list)}]'

    def _repr_pretty_(self, p, cycle):
        """method that defines ``Chainer``'s pretty printing rules for iPython

        Args:
            p (IPython.lib.pretty.RepresentationPrinter): pretty printer object
            cycle (bool): is ``True`` if pretty detected a cycle
        """
        if cycle:
            p.text('Chainer(...)')
        else:
            with p.group(8, 'Chainer[', ']'):
                reversed_components_dict = {v: k for k, v in self._components_dict.items()
                                            if isinstance(v, Hashable)}
                # p.pretty(self.__prepare_repr())
                for i, (in_params, out_params, component) in enumerate(self.train_pipe):
                    if i > 0:
                        p.text(',')
                        p.breakable()
                    if isinstance(component, Hashable) and component in reversed_components_dict:
                        p.pretty(reversed_components_dict[component])
                        p.text(': ')
                    else:
                        for k, v in self._components_dict.items():
                            if v is component:
                                p.pretty(k)
                                p.text(': ')
                                break
                    p.pretty(component)

    def append(self, component: Union[Component, FunctionType], in_x: Union[str, list, dict] = None,
               out_params: Union[str, list] = None, in_y: Union[str, list, dict] = None, main: bool = False):
        """
        Adds a component to the pipeline.

        The component is added to the inference pipe when all of its inputs are already available in inference
        mode and to the training pipe when they are available in training mode. A component with ``in_y``
        becomes the main component and defines the chainer's ``train_on_batch`` and ``process_event``.

        Args:
            component: the component to add.
            in_x: names of the variables passed to the component; a dict maps keyword argument names of the
                component to variable names. Defaults to the chainer's own inputs.
            out_params: names under which the component's outputs are stored. Defaults to ``in_x``.
            in_y: names of additional (target) variables passed to ``component.train_on_batch``;
                has to be a dict if and only if ``in_x`` is a dict.
            main: marks the component as the main one.

        Raises:
            ConfigError: if ``in_x`` and ``in_y`` are not both lists or both dicts, or if some of the inputs
                are not available even in training mode.
            AssertionError: if ``in_y`` is given and some of ``in_x + in_y`` are not available in training mode.
        """
        if isinstance(in_x, str):
            in_x = [in_x]
        if isinstance(in_y, str):
            in_y = [in_y]
        if isinstance(out_params, str):
            out_params = [out_params]
        in_x = in_x or self.in_x

        if isinstance(in_x, dict):
            # keys are keyword argument names, values are variable names
            x_keys, in_x = zip(*in_x.items())
        else:
            x_keys = []
        out_params = out_params or in_x
        if in_y is not None:
            if isinstance(in_y, dict):
                y_keys, in_y = zip(*in_y.items())
            else:
                y_keys = []

            # validate before concatenating: a tuple of keys cannot be added to an empty list,
            # so checking afterwards would hide this error behind a TypeError
            if bool(x_keys) != bool(y_keys):
                raise ConfigError('`in` and `in_y` for a component have to both be lists or dicts')
            keys = list(x_keys) + list(y_keys)

            component: NNModel
            main = True
            assert self.train_map.issuperset(in_x + in_y), ('Arguments {} are expected but only {} are set'
                                                            .format(in_x + in_y, self.train_map))
            # a sub-pipeline of everything appended so far computes the inputs of ``train_on_batch``
            preprocessor = Chainer(self.in_x, in_x + in_y, self.in_y)
            for (t_in_x_keys, t_in_x), t_out, t_component in self.train_pipe:
                if t_in_x_keys:
                    t_in_x = dict(zip(t_in_x_keys, t_in_x))
                preprocessor.append(t_component, t_in_x, t_out)

            def train_on_batch(*args, **kwargs):
                preprocessed = preprocessor.compute(*args, **kwargs)
                if len(in_x + in_y) == 1:
                    # a single target is returned bare by ``compute``
                    preprocessed = [preprocessed]
                if keys:
                    return component.train_on_batch(**dict(zip(keys, preprocessed)))
                else:
                    return component.train_on_batch(*preprocessed)

            self.train_on_batch = train_on_batch
            self.process_event = component.process_event
        if main:
            self.main = component
        if self.forward_map.issuperset(in_x):
            self.pipe.append(((x_keys, in_x), out_params, component))
            self.forward_map = self.forward_map.union(out_params)

        if self.train_map.issuperset(in_x):
            self.train_pipe.append(((x_keys, in_x), out_params, component))
            self.train_map = self.train_map.union(out_params)
        else:
            raise ConfigError('Arguments {} are expected but only {} are set'.format(in_x, self.train_map))

    def compute(self, x, y=None, targets=None):
        """
        Runs the pipeline and returns the values of ``targets``.

        Args:
            x: value of the single pipeline input or, for several inputs, an iterable that is transposed
                with ``zip(*x)`` into one sequence per input.
            y: values of the additional inputs, in the same layout as ``x``. When given, the training pipe
                is used instead of the inference pipe.
            targets: names of the variables to return. Defaults to the pipeline outputs.

        Returns:
            the value of the single target or a list of values if there are several targets.
        """
        if targets is None:
            targets = self.out_params
        in_params = list(self.in_x)
        if len(in_params) == 1:
            args = [x]
        else:
            args = list(zip(*x))

        if y is None:
            pipe = self.pipe
        else:
            pipe = self.train_pipe
            if len(self.in_y) == 1:
                args.append(y)
            else:
                args += list(zip(*y))
            in_params += self.in_y

        return self._compute(*args, pipe=pipe, param_names=in_params, targets=targets)

    def __call__(self, *args):
        """Runs the inference pipe on positional inputs given in the order of ``in_x``."""
        return self._compute(*args, param_names=self.in_x, pipe=self.pipe, targets=self.out_params)

    @staticmethod
    def _compute(*args, param_names, pipe, targets):
        """
        Runs the components of ``pipe`` that are needed to produce ``targets``.

        Args:
            args: input values, one per name in ``param_names``.
            param_names: names of the input variables.
            pipe: pipe entries to choose the components from.
            targets: names of the variables to return.

        Returns:
            the value of the single target or a list of values if there are several targets.

        Raises:
            RuntimeError: if some variable required for ``targets`` is neither an input nor produced by ``pipe``.
        """
        # walk the pipe backwards and keep only the components whose outputs are actually needed
        expected = set(targets)
        final_pipe = []
        for (in_keys, in_params), out_params, component in reversed(pipe):
            if expected.intersection(out_params):
                expected = expected - set(out_params) | set(in_params)
                final_pipe.append(((in_keys, in_params), out_params, component))
        final_pipe.reverse()
        if not expected.issubset(param_names):
            raise RuntimeError(f'{expected} are required to compute {targets} but were not found in memory or inputs')
        pipe = final_pipe

        mem = dict(zip(param_names, args))
        del args

        for (in_keys, in_params), out_params, component in pipe:
            x = [mem[k] for k in in_params]
            if in_keys:
                res = component.__call__(**dict(zip(in_keys, x)))
            else:
                res = component.__call__(*x)
            if len(out_params) == 1:
                mem[out_params[0]] = res
            else:
                mem.update(zip(out_params, res))

        res = [mem[k] for k in targets]
        if len(res) == 1:
            res = res[0]
        return res

    def batched_call(self, *args: Reversible, batch_size: int = 16) -> Union[list, Tuple[list, ...]]:
        """
        Partitions data into mini-batches and applies :meth:`__call__` to each batch.

        Args:
            args: input data, each element of the data corresponds to a single model inputs sequence.
            batch_size: the size of a batch.

        Returns:
            the model output as if the data was passed to the :meth:`__call__` method.
        """
        args = [iter(arg) for arg in args]
        answer = [[] for _ in self.out_params]

        while True:
            batch = [list(islice(arg, batch_size)) for arg in args]
            if not any(batch):  # empty batch, reached the end
                break

            curr_answer = self.__call__(*batch)
            if len(self.out_params) == 1:
                curr_answer = [curr_answer]

            for y, curr_y in zip(answer, curr_answer):
                y.extend(curr_y)

        if len(self.out_params) == 1:
            answer = answer[0]
        return answer

    def get_main_component(self) -> Optional[Serializable]:
        """Returns the main component or, if none was marked as main, the last component of the inference pipe.

        Returns:
            the component or ``None`` (with a warning logged) if there is no main component and the pipe is empty.
        """
        # compare with None explicitly: a main component may legitimately be falsy (e.g. define ``__len__``)
        if self.main is not None:
            return self.main
        try:
            return self.pipe[-1][-1]
        except IndexError:
            log.warning('Cannot get a main component for an empty chainer')
            return None

    def save(self) -> None:
        """Saves the main component if it is ``Serializable``."""
        main_component = self.get_main_component()
        if isinstance(main_component, Serializable):
            main_component.save()

    def load(self) -> None:
        """Calls ``load`` of every component of the training pipe that has such a callable attribute."""
        for in_params, out_params, component in self.train_pipe:
            if callable(getattr(component, 'load', None)):
                component.load()

    def reset(self) -> None:
        """Calls ``reset`` of every component of the training pipe that has such a callable attribute."""
        for in_params, out_params, component in self.train_pipe:
            if callable(getattr(component, 'reset', None)):
                component.reset()

    def destroy(self):
        """Calls ``destroy`` of every component that has it, then empties both pipes."""
        # the attributes may be missing if ``__init__`` has not run
        if hasattr(self, 'train_pipe'):
            for in_params, out_params, component in self.train_pipe:
                if callable(getattr(component, 'destroy', None)):
                    component.destroy()
            self.train_pipe.clear()
        if hasattr(self, 'pipe'):
            self.pipe.clear()
        super().destroy()


================================================
FILE: deeppavlov/core/common/cross_validation.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import shutil
from collections import OrderedDict
from logging import getLogger
from pathlib import Path

import numpy as np
from sklearn.model_selection import KFold

from deeppavlov.core.commands.train import train_evaluate_model_from_config, get_iterator_from_config, \
    read_data_by_config
from deeppavlov.core.commands.utils import expand_path, parse_config
from deeppavlov.core.common.params_search import ParamsSearch

# Name of the config parameter that holds a model save path.
SAVE_PATH_ELEMENT_NAME = 'save_path'
# Name of the sub-directory inserted into save paths during cross-validation.
TEMP_DIR_FOR_CV = 'cv_tmp'
# Module-level logger named after this module.
log = getLogger(__name__)


def change_savepath_for_model(config):
    """Redirects every ``save_path`` found in the config into a ``cv_tmp`` sub-directory.

    A path ``<parent>/<name>`` is replaced with ``<parent>/cv_tmp/<name>``. The config is modified in place.

    Args:
        config: the config to modify.

    Returns:
        a tuple of the config and the set of expanded ``<parent>/cv_tmp`` directories.
    """
    searcher = ParamsSearch()
    temp_dirs = set()

    for param_location in searcher.find_model_path(config, SAVE_PATH_ELEMENT_NAME):
        # extend the found location with the parameter name to address the value itself
        param_location.append(SAVE_PATH_ELEMENT_NAME)
        original = Path(searcher.get_value_from_config(config, param_location))
        relocated = original.parent / TEMP_DIR_FOR_CV / original.name

        temp_dirs.add(expand_path(relocated.parent))
        searcher.insert_value_or_dict_into_config(config, param_location, str(relocated))

    return config, temp_dirs


def delete_dir_for_saved_models(dirs_for_saved_models):
    """Recursively removes every directory from the given iterable."""
    for directory in map(str, dirs_for_saved_models):
        shutil.rmtree(directory)


def create_dirs_to_save_models(dirs_for_saved_models):
    """Creates every directory from the given iterable, with missing parents; existing ones are left as is."""
    for directory in dirs_for_saved_models:
        directory.mkdir(parents=True, exist_ok=True)


def generate_train_valid(data, n_folds=5, is_loo=False):
    """Yields train/valid/test splits built from the joined ``train`` and ``valid`` parts of the data.

    Args:
        data: a dict with the ``train``, ``valid`` and ``test`` parts.
        n_folds: the number of folds of the shuffled K-fold split (not used for Leave One Out).
        is_loo: use Leave One Out, i.e. one split per sample with that single sample as the ``valid`` part.

    Yields:
        a dict with the ``train``, ``valid`` and ``test`` keys; ``test`` is always taken from ``data`` unchanged.
    """
    samples = data['train'] + data['valid']

    if not is_loo:
        # for Cross Validation
        splitter = KFold(n_splits=n_folds, shuffle=True)
        for train_ids, valid_ids in splitter.split(samples):
            yield {
                'train': [samples[idx] for idx in train_ids],
                'valid': [samples[idx] for idx in valid_ids],
                'test': data['test']
            }
        return

    # for Leave One Out
    for held_out in range(len(samples)):
        remaining = samples.copy()
        fold = {
            'train': remaining,
            'test': data['test']
        }
        # the held out sample is removed from the train part and becomes the only valid sample
        fold['valid'] = [remaining.pop(held_out)]
        yield fold


def calc_cv_score(config, data=None, n_folds=5, is_loo=False):
    """Trains and evaluates the model on every split and averages the validation metrics.

    Args:
        config: the model config; it is passed through ``parse_config`` first.
        data: the data to split; read according to the config if not given.
        n_folds: the number of folds for cross-validation.
        is_loo: use Leave One Out instead of K-fold cross-validation.

    Returns:
        an ``OrderedDict`` that maps every validation metric name to its mean value over the splits.
    """
    config = parse_config(config)
    if data is None:
        data = read_data_by_config(config)

    config, temp_dirs = change_savepath_for_model(config)

    scores = OrderedDict()
    for fold in generate_train_valid(data, n_folds=n_folds, is_loo=is_loo):
        iterator = get_iterator_from_config(config, fold)
        # the temporary save directories exist only for the duration of a single run
        create_dirs_to_save_models(temp_dirs)
        report = train_evaluate_model_from_config(config, iterator=iterator)
        delete_dir_for_saved_models(temp_dirs)
        for metric, metric_value in report['valid'].items():
            scores.setdefault(metric, []).append(metric_value)

    for metric in scores:
        scores[metric] = np.mean(scores[metric])
        log.info('Cross-Validation \"{}\" is: {}'.format(metric, scores[metric]))

    return scores


================================================
FILE: deeppavlov/core/common/errors.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class ConfigError(Exception):
    """Any configuration error.

    Args:
        message: description of the error; available as the ``message`` attribute.
    """

    def __init__(self, message):
        # Pass the message on to ``Exception`` so that ``args`` is filled in: copying and unpickling
        # re-create an exception from ``args`` and would fail on the missing ``message`` argument otherwise.
        super(ConfigError, self).__init__(message)
        self.message = message

    def __str__(self):
        return repr(self.message)


================================================
FILE: deeppavlov/core/common/file.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import pickle
from collections import OrderedDict
from logging import getLogger
from pathlib import Path
from typing import Union, Any, Iterable

from deeppavlov.core.common.aliases import ALIASES

# Module-level logger named after this module.
log = getLogger(__name__)

# Escape sequences that switch the text color and back, and an 80-character line of '#' used as a frame.
_red_text, _reset_text_color, _sharp_line = "\x1b[31;20m", "\x1b[0m", '#'*80
# Template of the warning logged by ``find_config`` for a config name found in ``ALIASES``:
# ``{0}`` is the requested name, ``{1}`` is the name used instead.
# NOTE(review): the constant name is misspelled ("DEPRECATOIN"); other modules may import it — confirm before renaming.
DEPRECATOIN_MSG = f"{_red_text}\n\n{_sharp_line}\n" \
                  "# The model '{0}' has been removed from the DeepPavlov configs.\n" \
                  "# The model '{1}' is used instead.\n" \
                  "# To disable this message please switch to '{1}'.\n" \
                  "# Automatic name resolving will be disabled in the deeppavlov 1.2.0,\n" \
                  "# and if you try to use '{0}' you will get an ERROR.\n" \
                  f"{_sharp_line}{_reset_text_color}\n"


def find_config(pipeline_config_path: Union[str, Path]) -> Path:
    """Resolves a config name or path into a path to a config file.

    A name found in ``ALIASES`` is replaced with its alias (a deprecation warning is logged). If the result
    is not an existing file, ``<name>.json`` is searched for recursively in the ``configs`` directory located
    three levels above this file, and the first match is used.

    Args:
        pipeline_config_path: a path to a config file or a config name without the ``.json`` extension.

    Returns:
        the path to the config; the argument itself (as a ``Path``) if nothing was found.
    """
    if pipeline_config_path in ALIASES:
        new_pipeline_config_path = ALIASES[pipeline_config_path]
        log.warning(DEPRECATOIN_MSG.format(pipeline_config_path, new_pipeline_config_path))
        pipeline_config_path = new_pipeline_config_path

    if not Path(pipeline_config_path).is_file():
        # ``str.endswith`` accepts only strings, so a ``Path`` argument has to be converted first
        config_name = str(pipeline_config_path)
        configs = [c for c in Path(__file__).parent.parent.parent.glob(f'configs/**/{config_name}.json')
                   if str(c.with_suffix('')).endswith(config_name)]  # a simple way to not allow * and ?
        if configs:
            log.debug(f"Interpreting '{pipeline_config_path}' as '{configs[0]}'")
            pipeline_config_path = configs[0]

    return Path(pipeline_config_path)


def read_json(fpath: Union[str, Path]) -> dict:
    """Reads a UTF-8 encoded JSON file; JSON objects are loaded as ``OrderedDict`` to keep the order of keys."""
    with open(fpath, encoding='utf8') as stream:
        content = json.load(stream, object_pairs_hook=OrderedDict)
    return content


def save_json(data: dict, fpath: Union[str, Path]) -> None:
    """Writes the data to a UTF-8 encoded JSON file with a 2-space indent, keeping non-ASCII characters as is."""
    with open(fpath, 'w', encoding='utf8') as stream:
        json.dump(data, fp=stream, indent=2, ensure_ascii=False)


def save_pickle(data: dict, fpath: Union[str, Path]) -> None:
    """Pickles the data into a file using pickle protocol 4."""
    with open(fpath, 'wb') as stream:
        pickle.dump(data, file=stream, protocol=4)


def load_pickle(fpath: Union[str, Path]) -> Any:
    """Loads an object from a pickle file.

    Unpickling can execute arbitrary code, so this has to be used with trusted files only.
    """
    with open(fpath, 'rb') as stream:
        restored = pickle.load(stream)
    return restored


def save_jsonl(data: Iterable[dict], fpath: Union[str, Path]) -> None:
    """Writes the items to a UTF-8 encoded JSON Lines file, one JSON document per line.

    Args:
        data: the items to serialize.
        fpath: path to the output file; an existing file is overwritten.
    """
    # Non-ASCII characters are written as is (``ensure_ascii=False``), so the encoding is fixed explicitly
    # instead of relying on the locale default, which may be unable to represent them.
    with open(fpath, 'w', encoding='utf8') as f:
        for item in data:
            f.write(f"{json.dumps(item, ensure_ascii=False)}\n")


================================================
FILE: deeppavlov/core/common/log.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import logging.config
from pathlib import Path

from .paths import get_settings_path

# Name of the logging config file inside the settings directory.
LOG_CONFIG_FILENAME = 'log_config.json'
# NOTE(review): not used in this module; presumably read by other modules — confirm.
TRACEBACK_LOGGER_ERRORS = True

# The directory three levels above the directory of this (resolved) file.
root_path = Path(__file__).resolve().parents[3]

log_config_path = get_settings_path() / LOG_CONFIG_FILENAME

# The logging config is read once, when this module is imported; ``init_logger`` adjusts and applies it.
with log_config_path.open(encoding='utf8') as log_config_json:
    log_config = json.load(log_config_json)


class ProbeFilter(logging.Filter):
    """Logging filter that drops the records about POST requests to the /probe endpoint."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Returns ``True`` for the records to log, i.e. for everything except the /probe POST requests."""
        is_probe_request = 'POST /probe HTTP' in record.getMessage()
        return not is_probe_request


def init_logger():
    """Applies the logging config that was loaded on import of this module.

    Handlers that are not referenced by the root logger or by any configured logger are removed from the
    config, and the ``filename`` of the remaining handlers is replaced with an absolute path with ``~``
    expanded. The module-level config is modified in place.
    """
    logger_sections = [log_config.get('root', {})]
    logger_sections.extend(log_config.get('loggers', {}).values())

    handlers_in_use = set()
    for section in logger_sections:
        handlers_in_use.update(section.get('handlers', []))

    # iterate over a copy of the items: handlers are deleted from the dict inside the loop
    for handler_name, settings in list(log_config['handlers'].items()):
        if handler_name not in handlers_in_use:
            del log_config['handlers'][handler_name]
            continue
        if 'filename' in settings.keys():
            resolved = Path(settings['filename']).expanduser().resolve()
            settings['filename'] = str(resolved)

    logging.config.dictConfig(log_config)


================================================
FILE: deeppavlov/core/common/log_events.py
================================================
# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import Optional
from deeppavlov.core.commands.utils import expand_path

# Module-level logger named after this module.
log = getLogger(__name__)


class TBWriter:
    """Writes scalar values to two Tensorboard logs: one for training and one for validation."""

    def __init__(self, tensorboard_log_dir: str):
        """
        Args:
            tensorboard_log_dir: the directory (expanded with ``expand_path``) in which the ``train_log``
                and ``valid_log`` sub-directories are written.
        """
        # TODO: After adding wandb logger, create common parent class for both loggers
        # imported here so that this module can be imported without torch
        from torch.utils.tensorboard import SummaryWriter
        log_root = expand_path(tensorboard_log_dir)
        train_dir = log_root / 'train_log'
        valid_dir = log_root / 'valid_log'
        self.tb_train_writer = SummaryWriter(str(train_dir))
        self.tb_valid_writer = SummaryWriter(str(valid_dir))

    # TODO: find how to write Summary
    def write_train(self, tag, scalar_value, global_step):
        """Adds a scalar to the training log."""
        writer = self.tb_train_writer
        writer.add_scalar(tag, scalar_value, global_step)

    def write_valid(self, tag, scalar_value, global_step):
        """Adds a scalar to the validation log."""
        writer = self.tb_valid_writer
        writer.add_scalar(tag, scalar_value, global_step)

    def flush(self):
        """Flushes the training writer and then the validation writer."""
        self.tb_train_writer.flush()
        self.tb_valid_writer.flush()


def get_tb_writer(tensorboard_log_dir: Optional[str]) -> Optional[TBWriter]:
    """Creates a Tensorboard writer for the given directory.

    Args:
        tensorboard_log_dir: the directory for the Tensorboard logs or ``None`` to disable logging.

    Returns:
        the writer, or ``None`` if no directory was given or the writer could not be created because of
        an ``ImportError`` (an error is logged in the latter case).
    """
    if tensorboard_log_dir is None:
        return None
    try:
        return TBWriter(tensorboard_log_dir)
    except ImportError:
        log.error('Failed to import SummaryWriter from torch.utils.tensorboard. Failed to initialize Tensorboard '
                  'logger. Install appropriate Pytorch version to use this logger or remove tensorboard_log_dir '
                  'parameter from the train parameters list in the configuration file.')
        return None


================================================
FILE: deeppavlov/core/common/metrics_registry.json
================================================
{
  "acc": "deeppavlov.metrics.accuracy:round_accuracy",
  "accuracy": "deeppavlov.metrics.accuracy:accuracy",
  "average__ner_f1__f1_macro__f1": "deeppavlov.metrics.fmeasure:ner_f1__f1_macro__f1",
  "average__roc_auc__roc_auc__ner_f1": "deeppavlov.metrics.fmeasure:roc_auc__roc_auc__ner_f1",
  "bleu": "deeppavlov.metrics.bleu:bleu",
  "bleu_advanced": "deeppavlov.metrics.bleu:bleu_advanced",
  "elmo_loss2ppl": "deeppavlov.metrics.elmo_metrics:elmo_loss2ppl",
  "f1": "deeppavlov.metrics.fmeasure:round_f1",
  "f1_macro": "deeppavlov.metrics.fmeasure:round_f1_macro",
  "f1_weighted": "deeppavlov.metrics.fmeasure:round_f1_weighted",
  "google_bleu": "deeppavlov.metrics.bleu:google_bleu",
  "kbqa_accuracy": "deeppavlov.metrics.accuracy:kbqa_accuracy",
  "log_loss": "deeppavlov.metrics.log_loss:sk_log_loss",
  "matthews_correlation": "deeppavlov.metrics.correlation:matthews_correlation",
  "mean_squared_error": "deeppavlov.metrics.mse:mse",
  "multitask_accuracy": "deeppavlov.metrics.accuracy:multitask_accuracy",
  "multitask_sequence_accuracy": "deeppavlov.metrics.accuracy:multitask_sequence_accuracy",
  "multitask_token_accuracy": "deeppavlov.metrics.accuracy:multitask_token_accuracy",
  "ner_f1": "deeppavlov.metrics.fmeasure:ner_f1",
  "ner_token_f1": "deeppavlov.metrics.fmeasure:ner_token_f1",
  "pearson_correlation": "deeppavlov.metrics.correlation:pearson_correlation",
  "per_item_bleu": "deeppavlov.metrics.bleu:per_item_bleu",
  "per_item_dialog_accuracy": "deeppavlov.metrics.accuracy:per_item_dialog_accuracy",
  "per_item_dialog_bleu": "deeppavlov.metrics.bleu:per_item_dialog_bleu",
  "per_token_accuracy": "deeppavlov.metrics.accuracy:per_token_accuracy",
  "r@1": "deeppavlov.metrics.recall_at_k:r_at_1",
  "r@10": "deeppavlov.metrics.recall_at_k:r_at_10",
  "r@1_insQA": "deeppavlov.models.ranking.metrics:r_at_1_insQA",
  "r@2": "deeppavlov.metrics.recall_at_k:r_at_2",
  "r@5": "deeppavlov.metrics.recall_at_k:r_at_5",
  "rank_response": "deeppavlov.models.ranking.metrics:rank_response",
  "roc_auc": "deeppavlov.metrics.roc_auc_score:roc_auc_score",
  "sets_accuracy": "deeppavlov.metrics.accuracy:sets_accuracy",
  "slots_accuracy": "deeppavlov.metrics.accuracy:slots_accuracy",
  "spearman_correlation": "deeppavlov.metrics.correlation:spearman_correlation",
  "squad_v1_em": "deeppavlov.metrics.squad_metrics:squad_v1_exact_match",
  "squad_v1_f1": "deeppavlov.metrics.squad_metrics:squad_v1_f1",
  "squad_v2_em": "deeppavlov.metrics.squad_metrics:squad_v2_exact_match",
  "squad_v2_f1": "deeppavlov.metrics.squad_metrics:squad_v2_f1",
  "record_f1_score": "deeppavlov.metrics.record_metrics:record_f1_score",
  "record_em_score": "deeppavlov.metrics.record_metrics:record_em_score"
}


================================================
FILE: deeppavlov/core/common/metrics_registry.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import json
from logging import getLogger
from pathlib import Path
from typing import Callable, Any

from deeppavlov.core.common.errors import ConfigError

# Module-level logger named after this module.
log = getLogger(__name__)

# The registry maps a metric name to a ``module.submodules:function_name`` string. It is loaded from
# the JSON file located next to this module and starts empty if that file does not exist.
_registry_path = Path(__file__).parent / 'metrics_registry.json'
if _registry_path.exists():
    with _registry_path.open(encoding='utf-8') as f:
        _REGISTRY = json.load(f)
else:
    _REGISTRY = {}


def fn_from_str(name: str) -> Callable[..., Any]:
    """Imports the module from a ``module.submodules:function_name`` description and returns its attribute.

    Args:
        name: the description of the function.

    Returns:
        the attribute ``function_name`` of the imported module.

    Raises:
        ConfigError: if the description has a wrong form or the module has no such attribute.
    """
    try:
        module_name, fn_name = name.split(':')
        module = importlib.import_module(module_name)
        return getattr(module, fn_name)
    except ValueError:
        raise ConfigError('Expected function description in a `module.submodules:function_name` form, but got `{}`'
                          .format(name))
    except AttributeError:
        # noinspection PyUnboundLocalVariable
        raise ConfigError(f"Incorrect metric: '{module_name}' has no attribute '{fn_name}'.")


def register_metric(metric_name: str) -> Callable[..., Any]:
    """Decorator that registers the decorated function as a metric under the given name.

    The function is stored in the registry as a ``module:name`` string and is returned unchanged.
    A warning is logged if the name is already registered for another function.
    """

    def decorate(fn):
        description = fn.__module__ + ':' + fn.__name__
        already_taken = metric_name in _REGISTRY and _REGISTRY[metric_name] != description
        if already_taken:
            log.warning('"{}" is already registered as a metric name, the old function will be ignored'
                        .format(metric_name))
        _REGISTRY[metric_name] = description
        return fn

    return decorate


def get_metric_by_name(name: str) -> Callable[..., Any]:
    """Returns the metric callable for a registered metric name.

    A name that is not in the registry is treated as a ``module.submodules:function_name`` description itself.
    """
    description = _REGISTRY.get(name, name)
    return fn_from_str(description)


================================================
FILE: deeppavlov/core/common/params.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
from logging import getLogger
from types import FunctionType
from typing import Any, Dict, Union

from deeppavlov.core.commands.utils import expand_path, parse_config
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import get_model
from deeppavlov.core.models.component import Component

# Module-level logger named after this module.
log = getLogger(__name__)

# Built components and models by their config ``id``; read by ``resolve`` and by ``ref`` lookups in ``from_params``.
_refs = {}


def resolve(val):
    """Replaces a ``#component_id.attribute...`` reference string with the object it points to.

    Any value that is not a string starting with ``#`` is returned unchanged.

    Args:
        val: a config value.

    Returns:
        the referenced component (or the result of evaluating the attribute chain on it), or ``val`` itself.

    Raises:
        ConfigError: if no component with the referenced id has been registered.
    """
    if isinstance(val, str) and val.startswith('#'):
        # '#id.a.b' is split into the component id 'id' and the attribute chain ['a', 'b']
        component_id, *attributes = val[1:].split('.')
        try:
            val = _refs[component_id]
        except KeyError:
            e = ConfigError('Component with id "{id}" was referenced but not initialized'
                            .format(id=component_id))
            log.exception(e)
            raise e
        # NOTE(review): the attribute chain comes from the config and is evaluated with ``eval`` as
        # ``val.<a>.<b>...`` in the local scope of this function, so a config string can run arbitrary
        # expressions — only configs from trusted sources should be loaded.
        attributes = ['val'] + attributes
        val = eval('.'.join(attributes))
    return val


def _init_param(param, mode):
    if isinstance(param, str):
        param = resolve(param)
    elif isinstance(param, (list, tuple)):
        param = [_init_param(p, mode) for p in param]
    elif isinstance(param, dict):
        if {'ref', 'class_name', 'config_path'}.intersection(param.keys()):
            param = from_params(param, mode=mode)
        else:
            param = {k: _init_param(v, mode) for k, v in param.items()}
    return param


def from_params(params: Dict, mode: str = 'infer', **kwargs) -> Union[Component, FunctionType]:
    """Builds and returns the Component from corresponding dictionary of parameters.

    Args:
        params: component config. Recognized keys are ``ref`` (return an already built
            component), ``config_path`` (build a whole nested pipeline from another config,
            optionally patched with ``overwrite``), ``class_name`` (registered name or
            ``module:ClassName`` of the object to build) and ``id`` (name under which the
            result is stored for later references). All other items are passed to the
            class constructor.
        mode: passed to nested components, and to the constructor when it accepts a
            ``mode`` argument or arbitrary keyword arguments.
        **kwargs: extra keyword arguments for the class constructor; they take precedence
            over values from ``params``.

    Returns:
        The referenced or newly built component. If ``class_name`` resolves to something
        that is not a class, that object is returned as is.

    Raises:
        ConfigError: if ``ref`` points to a component that was not initialized, or if the
            config has none of ``ref``, ``config_path`` and ``class_name``.
    """
    # what is passed in json:
    config_params = {k: resolve(v) for k, v in params.items()}

    # get component by reference (if any)
    if 'ref' in config_params:
        try:
            return _refs[config_params['ref']]
        except KeyError:
            e = ConfigError('Component with id "{id}" was referenced but not initialized'
                            .format(id=config_params['ref']))
            log.exception(e)
            raise e

    if 'config_path' in config_params:
        from deeppavlov.core.commands.infer import build_model
        # The nested config is built with an empty reference table of its own.
        refs = _refs.copy()
        _refs.clear()
        try:
            config = parse_config(expand_path(config_params['config_path']), config_params.get('overwrite'))
            model = build_model(config)
        finally:
            # Restore the outer references even if building the nested model failed,
            # otherwise they would be lost for the rest of the process.
            _refs.clear()
            _refs.update(refs)
        if 'id' in config_params:
            _refs[config_params['id']] = model
        return model

    cls_name = config_params.pop('class_name', None)
    if not cls_name:
        e = ConfigError('Component config has no `class_name` nor `ref` fields')
        # No exception is being handled at this point, so `log.exception` would append
        # a meaningless "NoneType: None" traceback to the message.
        log.error(e)
        raise e
    obj = get_model(cls_name)

    if inspect.isclass(obj):
        # find the submodels params recursively
        config_params = {k: _init_param(v, mode) for k, v in config_params.items()}
        try:
            spec = inspect.getfullargspec(obj)
            if 'mode' in spec.args + spec.kwonlyargs or spec.varkw is not None:
                kwargs['mode'] = mode

            component = obj(**dict(config_params, **kwargs))
            if 'id' in config_params:
                _refs[config_params['id']] = component
        except Exception:
            log.exception("Exception in {}".format(obj))
            raise
    else:
        # registered plain objects (e.g. functions) are used directly
        component = obj

    return component


================================================
FILE: deeppavlov/core/common/params_search.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
from copy import deepcopy
from logging import getLogger
from typing import List, Generator, Any, Tuple

import numpy as np

from deeppavlov.core.common.registry import register

# Module-level logger
log = getLogger(__name__)


@register('params_search')
class ParamsSearch:
    """
    Class that defines the main operations for parameters search,
    like finding all searched parameters and sampling values for them.

    Args:
        prefix: prefix to determine special keys like "`prefix`_range", "`prefix`_bool", "`prefix`_choice"
        seed: random seed for initialization
        **kwargs: basic config with parameters

    Attributes:
        basic_config: dictionary with initial config with possible values of searched parameters
        prefix: prefix to determine special keys like "`prefix`_range", "`prefix`_bool", "`prefix`_choice"
        paths_to_params: list of lists of keys and/or integers (for list)
                with relative paths to searched parameters
        n_params: number of searched parameters
        eps: threshold value
    """

    def __init__(self,
                 prefix="search",
                 seed: int = None,
                 **kwargs):
        """
        Store a copy of the config, collect paths to all searched parameters
        and, if ``seed`` is given, seed both ``numpy.random`` and ``random``
        """

        self.basic_config = deepcopy(kwargs)
        self.prefix = prefix

        self.paths_to_params = []
        for search_type in [prefix + "_range", prefix + "_choice", prefix + "_bool"]:
            self.paths_to_params.extend(self.find_model_path(self.basic_config, search_type))

        self.n_params = len(self.paths_to_params)

        self.eps = 1e-6

        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)

    def find_model_path(self, config: dict, key_model: str, path: list = None) -> Generator:
        """
        Find paths to all dictionaries in config that contain key 'key_model'

        A dictionary that contains the key is not searched any deeper.

        Args:
            config: dictionary
            key_model: key of sub-dictionary to be found
            path: list of keys and/or integers (for list) with relative path (needed for recursion)

        Returns:
            path in config -- list of keys (strings and integers)
        """
        # a fresh list per call: a shared default list could be yielded and mutated by callers
        if path is None:
            path = []

        if isinstance(config, dict):
            if key_model in config:
                yield path
            else:
                for key, sub_config in list(config.items()):
                    yield from self.find_model_path(sub_config, key_model, path + [key])
        elif isinstance(config, list):
            for i, sub_config in enumerate(config):
                yield from self.find_model_path(sub_config, key_model, path + [i])

    @staticmethod
    def _follow_path(config: Any, path: list, create_missing: bool = False) -> Any:
        """
        Return the element reached from config by following the keys/indices in path

        Args:
            config: dictionary or list to start from
            path: list of keys and/or integers (for list)
            create_missing: if True, missing dictionary keys are created in place as empty
                dictionaries; otherwise a missing key yields a detached empty dictionary

        Returns:
            element of config determined by path
        """
        pointer = config
        for el in path:
            if isinstance(pointer, dict):
                pointer = pointer.setdefault(el, {}) if create_missing else pointer.get(el, {})
            elif isinstance(pointer, list):
                pointer = pointer[el]
            # anything else can not be descended into: stay on the current element
        return pointer

    @staticmethod
    def insert_value_or_dict_into_config(config: dict, path: list,
                                         value: [int, float, str, bool, list, dict, np.ndarray]) -> None:
        """
        Insert value to dictionary determined by path[:-1] in field with key path[-1]

        The config is modified in place; missing intermediate dictionaries are created.

        Args:
            config: dictionary
            path: list of keys and/or integers (for list)
            value: value to be inserted

        Returns:
            None
        """
        container = ParamsSearch._follow_path(config, path[:-1], create_missing=True)
        container[path[-1]] = value

    @staticmethod
    def get_value_from_config(config: dict, path: list) -> Any:
        """
        Return value of config element determined by path

        The given config is never modified and the returned value is a deep copy,
        so changing it does not affect the config.

        Args:
            config: dictionary
            path: list of keys and/or integers (for list)

        Returns:
            value
        """
        container = ParamsSearch._follow_path(config, path[:-1])
        # copy only the requested value instead of the whole config
        return deepcopy(container[path[-1]])

    @staticmethod
    def remove_key_from_config(config: dict, path: list) -> Tuple[dict, Any]:
        """
        Remove config element determined by path

        The given config is left untouched, the element is removed from a deep copy.

        Args:
            config: dictionary
            path: list of keys and/or integers (for list)

        Returns:
            dictionary without value from path, value from path
        """
        config_copy = deepcopy(config)
        container = ParamsSearch._follow_path(config_copy, path[:-1])
        value = container.pop(path[-1])
        return config_copy, value

    def initialize_params_in_config(self, basic_config: dict, paths: List[list]) -> dict:
        """
        Randomly initialize all the changable parameters in config

        Args:
            basic_config: config where changable parameters are dictionaries with keys
                ``prefix`_range`, ``prefix`_bool`, ``prefix`_choice`
            paths: list of paths to changable parameters

        Returns:
            config
        """
        config = deepcopy(basic_config)
        search_keys = [self.prefix + "_choice", self.prefix + "_range", self.prefix + "_bool"]
        for path_ in paths:
            value = self.get_value_from_config(basic_config, path_)
            if isinstance(value, dict) and any(value.get(key) for key in search_keys):
                # keyword names must be strings, while the last path element
                # is an integer when the parameter is an item of a list
                param_name = str(path_[-1])
                sampled = self.sample_params(**{param_name: deepcopy(value)})[param_name]
                self.insert_value_or_dict_into_config(config, path_, sampled)

        return config

    def sample_params(self, **params) -> dict:
        """
        Sample parameters according to the given possible values

        Args:
            **params: dictionary like {"param_0": {"`prefix`_range": [0, 10]},
                                       "param_1": {"`prefix`_range": [0, 10], "discrete": true},
                                       "param_2": {"`prefix`_range": [0, 1], "scale": "log"},
                                       "param_3": {"`prefix`_bool": true},
                                       "param_4": {"`prefix`_choice": [0, 1, 2, 3]}}

        Returns:
            dictionary with randomly sampled parameters; values that are not dictionaries
            with one of the special keys are returned as (copied) they are
        """
        if not params:
            return {}

        params_copy = deepcopy(params)
        params_sample = dict()
        for param, param_val in params_copy.items():
            if not isinstance(param_val, dict):
                params_sample[param] = param_val
            elif param_val.get(self.prefix + '_bool'):
                params_sample[param] = bool(random.choice([True, False]))
            elif self.prefix + '_range' in param_val:
                params_sample[param] = self._sample_from_ranges(param_val)
            elif self.prefix + '_choice' in param_val:
                params_sample[param] = random.choice(param_val[self.prefix + '_choice'])
            else:
                params_sample[param] = param_val
        return params_sample

    def _sample_from_ranges(self, opts: dict) -> [int, float]:
        """
        Sample parameters from ranges

        Args:
            opts: dictionary  {"`prefix`_range": [0, 10]} or \
                              {"`prefix`_range": [0, 10], "discrete": true} or \
                              {"`prefix`_range": [0, 1], "scale": "log"}

        Returns:
            random parameter value from range (rounded to the nearest integer if "discrete" is set)
        """
        bounds = opts[self.prefix + '_range']
        from_, to_ = bounds[0], bounds[1]
        if opts.get('scale', None) == 'log':
            sample = self._sample_log(from_, to_)
        else:
            sample = np.random.uniform(from_, to_)
        if opts.get('discrete', False):
            sample = int(np.round(sample))
        return sample

    @staticmethod
    def _sample_log(from_: float = 0., to_: float = 1.) -> float:
        """
        Sample parameters from ranges with log scale

        The value is drawn uniformly between log(from_) and log(to_) and exponentiated,
        so both boundaries have to be positive for the logarithm to be defined.

        Args:
            from_: lower boundary of values
            to_:  upper boundary of values

        Returns:
            random parameters value from range with log scale
        """
        sample = np.exp(np.random.uniform(np.log(from_), np.log(to_)))
        return float(sample)


================================================
FILE: deeppavlov/core/common/paths.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil

from pathlib import Path

# Directory that contains the `deeppavlov` package (the fourth ancestor directory of this file)
_root_path = Path(__file__).resolve().parents[3]
# Directory with the default settings files
_default_settings_path: Path = _root_path / 'deeppavlov' / 'utils' / 'settings'
# Settings directory in effect: the DP_SETTINGS_PATH environment variable when it is set,
# the default directory otherwise; always stored as an absolute, user-expanded path
_settings_path = Path(os.getenv('DP_SETTINGS_PATH', _default_settings_path)).expanduser().resolve()
# Fail at import time if the configured path points to an existing regular file
if _settings_path.is_file():
    raise FileExistsError(f'DP_SETTINGS_PATH={_settings_path} is a file and not a directory')

# Fail at import time if the configured directory lies inside the default settings directory
# (presumably because `populate_settings_dir` copies files found recursively under the default
# directory into the configured one -- TODO confirm)
if _default_settings_path in _settings_path.parents:
    raise RecursionError(f'DP_SETTINGS_PATH={_settings_path} is relative'
                         f' to the default settings path {_default_settings_path}')


def get_settings_path() -> Path:
    """Return an absolute path to the DeepPavlov settings directory

    ``populate_settings_dir()`` is called first, so default settings files that are
    missing from the directory may be copied into it as a side effect.
    """
    populate_settings_dir()
    return _settings_path


def populate_settings_dir(force: bool = False) -> bool:
    """
    Copy default ``*.json`` settings files into the settings directory

    The directory layout below the default settings directory is reproduced in the
    target directory. Nothing is done when the settings directory is the default one.

    Args:
        force: if ``True``, replace existing settings files with default ones

    Returns:
        ``True`` if any files were copied and ``False`` otherwise
    """
    if _default_settings_path == _settings_path:
        return False

    copied = False
    # collect the file list before anything is written
    default_files = list(_default_settings_path.glob('**/*.json'))
    for default_file in default_files:
        target = _settings_path / default_file.relative_to(_default_settings_path)
        if force or not target.exists():
            copied = True
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(default_file, target)
    return copied


================================================
FILE: deeppavlov/core/common/prints.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from contextlib import redirect_stdout


class RedirectedPrints(redirect_stdout):
    """Context manager that temporarily sends everything written to stdout to another stream.

    Args:
        new_target: stream that receives the output while the context is active;
            the default is the ``sys.stderr`` object that existed when the class was defined
    """

    def __init__(self, new_target=sys.stderr):
        super().__init__(new_target)


================================================
FILE: deeppavlov/core/common/registry.json
================================================
{
  "answer_types_extractor": "deeppavlov.models.kbqa.type_define:AnswerTypesExtractor",
  "api_requester": "deeppavlov.models.api_requester.api_requester:ApiRequester",
  "api_router": "deeppavlov.models.api_requester.api_router:ApiRouter",
  "basic_classification_iterator": "deeppavlov.dataset_iterators.basic_classification_iterator:BasicClassificationDatasetIterator",
  "basic_classification_reader": "deeppavlov.dataset_readers.basic_classification_reader:BasicClassificationDatasetReader",
  "boolqa_reader": "deeppavlov.dataset_readers.boolqa_reader:BoolqaReader",
  "bpr": "deeppavlov.models.doc_retrieval.bpr:BPR",
  "chu_liu_edmonds_transformer": "deeppavlov.models.morpho_syntax_parser.dependency_decoding:ChuLiuEdmonds",
  "concat_lists": "deeppavlov.models.doc_retrieval.utils:concat_lists",
  "conll2003_reader": "deeppavlov.dataset_readers.conll2003_reader:Conll2003DatasetReader",
  "cos_sim_classifier": "deeppavlov.models.classifiers.cos_sim_classifier:CosineSimilarityClassifier",
  "data_fitting_iterator": "deeppavlov.core.data.data_fitting_iterator:DataFittingIterator",
  "data_learning_iterator": "deeppavlov.core.data.data_learning_iterator:DataLearningIterator",
  "dependency_output_prettifier": "deeppavlov.models.morpho_syntax_parser.syntax_parsing:DependencyOutputPrettifier",
  "dirty_comments_preprocessor": "deeppavlov.models.preprocessors.dirty_comments_preprocessor:DirtyCommentsPreprocessor",
  "docred_reader": "deeppavlov.dataset_readers.docred_reader:DocREDDatasetReader",
  "document_chunker": "deeppavlov.models.preprocessors.odqa_preprocessors:DocumentChunker",
  "dnnc_pair_generator": "deeppavlov.models.preprocessors.dnnc_preprocessor:PairGenerator",
  "dnnc_proba2labels": "deeppavlov.models.classifiers.dnnc_proba2labels:Proba2Labels",
  "entity_detection_parser": "deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser",
  "entity_linker": "deeppavlov.models.entity_extraction.entity_linking:EntityLinker",
  "entity_type_split": "deeppavlov.models.entity_extraction.entity_detection_parser:entity_type_split",
  "faq_reader": "deeppavlov.dataset_readers.faq_reader:FaqDatasetReader",
  "fasttext": "deeppavlov.models.embedders.fasttext_embedder:FasttextEmbedder",
  "fit_trainer": "deeppavlov.core.trainers.fit_trainer:FitTrainer",
  "hashing_tfidf_vectorizer": "deeppavlov.models.vectorizers.hashing_tfidf_vectorizer:HashingTfIdfVectorizer",
  "huggingface_dataset_iterator": "deeppavlov.dataset_iterators.huggingface_dataset_iterator:HuggingFaceDatasetIterator",
  "huggingface_dataset_reader": "deeppavlov.dataset_readers.huggingface_dataset_reader:HuggingFaceDatasetReader",
  "imdb_reader": "deeppavlov.dataset_readers.imdb_reader:ImdbReader",
  "joint_tagger_parser": "deeppavlov.models.morpho_syntax_parser.joint:JointTaggerParser",
  "kenlm_elector": "deeppavlov.models.spelling_correction.electors.kenlm_elector:KenlmElector",
  "lazy_tokenizer": "deeppavlov.models.tokenizers.lazy_tokenizer:lazy_tokenizer",
  "lcquad_reader": "deeppavlov.dataset_readers.sq_reader:LCQuADReader",
  "lemmatized_output_prettifier": "deeppavlov.models.morpho_syntax_parser.syntax_parsing:LemmatizedOutputPrettifier",
  "line_reader": "deeppavlov.dataset_readers.line_reader:LineReader",
  "logit_ranker": "deeppavlov.models.doc_retrieval.logit_ranker:LogitRanker",
  "mask": "deeppavlov.models.preprocessors.mask:Mask",
  "morphotagger_dataset_iterator": "deeppavlov.dataset_iterators.morphotagger_iterator:MorphoTaggerDatasetIterator",
  "morphotagger_dataset_reader": "deeppavlov.dataset_readers.morphotagging_dataset_reader:MorphotaggerDatasetReader",
  "multitask_reader":"deeppavlov.dataset_readers.multitask_reader:MultiTaskReader",
  "multitask_pipeline_preprocessor":"deeppavlov.models.preprocessors.multitask_preprocessor:MultiTaskPipelinePreprocessor",
  "multitask_transformer":"deeppavlov.models.torch_bert.multitask_transformer:MultiTaskTransformer",
  "multitask_iterator":"deeppavlov.dataset_iterators.multitask_iterator:MultiTaskIterator",
  "multi_squad_dataset_reader": "deeppavlov.dataset_readers.squad_dataset_reader:MultiSquadDatasetReader",
  "multi_squad_iterator": "deeppavlov.dataset_iterators.squad_iterator:MultiSquadIterator",
  "multi_squad_retr_iterator": "deeppavlov.dataset_iterators.squad_iterator:MultiSquadRetrIterator",
  "ner_chunk_model": "deeppavlov.models.entity_extraction.ner_chunker:NerChunkModel",
  "ner_chunker": "deeppavlov.models.entity_extraction.ner_chunker:NerChunker",
  "ner_vocab": "deeppavlov.models.preprocessors.ner_preprocessor:NerVocab",
  "nltk_moses_tokenizer": "deeppavlov.models.tokenizers.nltk_moses_tokenizer:NLTKMosesTokenizer",
  "nltk_tokenizer": "deeppavlov.models.tokenizers.nltk_tokenizer:NLTKTokenizer",
  "nn_trainer": "deeppavlov.core.trainers.nn_trainer:NNTrainer",
  "odqa_reader": "deeppavlov.dataset_readers.odqa_reader:ODQADataReader",
  "one_hotter": "deeppavlov.models.preprocessors.one_hotter:OneHotter",
  "params_search": "deeppavlov.core.common.params_search:ParamsSearch",
  "paraphraser_reader": "deeppavlov.dataset_readers.paraphraser_reader:ParaphraserReader",
  "path_ranking_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:PathRankingPreprocessor",
  "pop_ranker": "deeppavlov.models.doc_retrieval.pop_ranker:PopRanker",
  "proba2labels": "deeppavlov.models.classifiers.proba2labels:Proba2Labels",
  "query_formatter": "deeppavlov.models.kbqa.query_generator:QueryFormatter",
  "query_generator": "deeppavlov.models.kbqa.query_generator:QueryGenerator",
  "question_sign_checker": "deeppavlov.models.entity_extraction.entity_detection_parser:QuestionSignChecker",
  "re_classifier": "deeppavlov.models.relation_extraction.relation_extraction_bert:REBertModel",
  "re_postprocessor": "deeppavlov.models.preprocessors.re_preprocessor:REPostprocessor",
  "re_preprocessor": "deeppavlov.models.preprocessors.re_preprocessor:REPreprocessor",
  "rel_ranking_infer": "deeppavlov.models.kbqa.rel_ranking_infer:RelRankerInfer",
  "rel_ranking_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:RelRankingPreprocessor",
  "rel_ranking_reader": "deeppavlov.dataset_readers.rel_ranking_reader:ParaphraserReader",
  "response_base_loader": "deeppavlov.models.preprocessors.response_base_loader:ResponseBaseLoader",
  "ru_adj_to_noun": "deeppavlov.models.kbqa.ru_adj_to_noun:RuAdjToNoun",
  "rubq_reader": "deeppavlov.dataset_readers.sq_reader:RuBQReader",
  "rured_reader": "deeppavlov.dataset_readers.rured_reader:RuREDDatasetReader",
  "russian_words_vocab": "deeppavlov.vocabs.typos:RussianWordsVocab",
  "sanitizer": "deeppavlov.models.preprocessors.sanitizer:Sanitizer",
  "sentseg_restore_sent": "deeppavlov.models.preprocessors.sentseg_preprocessor:SentSegRestoreSent",
  "siamese_iterator": "deeppavlov.dataset_iterators.siamese_iterator:SiameseIterator",
  "simple_vocab": "deeppavlov.core.data.simple_vocab:SimpleVocabulary",
  "sklearn_component": "deeppavlov.models.sklearn.sklearn_component:SklearnComponent",
  "slovnet_syntax_parser": "deeppavlov.models.kbqa.tree_to_sparql:SlovnetSyntaxParser",
  "spacy_lemmatizer": "deeppavlov.models.morpho_syntax_parser.spacy_lemmatizer:SpacyLemmatizer",
  "spelling_error_model": "deeppavlov.models.spelling_correction.brillmoore.error_model:ErrorModel",
  "spelling_levenshtein": "deeppavlov.models.spelling_correction.levenshtein.searcher_component:LevenshteinSearcherComponent",
  "split_tokenizer": "deeppavlov.models.tokenizers.split_tokenizer:SplitTokenizer",
  "sq_reader": "deeppavlov.dataset_readers.sq_reader:SQReader",
  "sqlite_iterator": "deeppavlov.dataset_iterators.sqlite_iterator:SQLiteDataIterator",
  "squad_bert_ans_postprocessor": "deeppavlov.models.preprocessors.squad_preprocessor:SquadBertAnsPostprocessor",
  "squad_bert_ans_preprocessor": "deeppavlov.models.preprocessors.squad_preprocessor:SquadBertAnsPreprocessor",
  "squad_bert_mapping": "deeppavlov.models.preprocessors.squad_preprocessor:SquadBertMappingPreprocessor",
  "squad_dataset_reader": "deeppavlov.dataset_readers.squad_dataset_reader:SquadDatasetReader",
  "squad_iterator": "deeppavlov.dataset_iterators.squad_iterator:SquadIterator",
  "static_dictionary": "deeppavlov.vocabs.typos:StaticDictionary",
  "str_lower": "deeppavlov.models.preprocessors.str_lower:str_lower",
  "str_token_reverser": "deeppavlov.models.preprocessors.str_token_reverser:StrTokenReverser",
  "str_utf8_encoder": "deeppavlov.models.preprocessors.str_utf8_encoder:StrUTF8Encoder",
  "stream_spacy_tokenizer": "deeppavlov.models.tokenizers.spacy_tokenizer:StreamSpacyTokenizer",
  "string_multiplier": "deeppavlov.models.preprocessors.odqa_preprocessors:StringMultiplier",
  "template_matcher": "deeppavlov.models.kbqa.template_matcher:TemplateMatcher",
  "tfidf_ranker": "deeppavlov.models.doc_retrieval.tfidf_ranker:TfidfRanker",
  "tfidf_weighted": "deeppavlov.models.embedders.tfidf_weighted_embedder:TfidfWeightedEmbedder",
  "top1_elector": "deeppavlov.models.spelling_correction.electors.top1_elector:TopOneElector",
  "torch_bert_ranker": "deeppavlov.models.torch_bert.torch_bert_ranker:TorchBertRankerModel",
  "torch_bert_ranker_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchBertRankerPreprocessor",
  "torch_record_postprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchRecordPostprocessor",
  "torch_squad_transformers_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchSquadTransformersPreprocessor",
  "torch_text_classification_model": "deeppavlov.models.classifiers.torch_classification_model:TorchTextClassificationModel",
  "torch_trainer": "deeppavlov.core.trainers.torch_trainer:TorchTrainer",
  "torch_transformers_classifier": "deeppavlov.models.torch_bert.torch_transformers_classifier:TorchTransformersClassifierModel",
  "torch_transformers_el_ranker": "deeppavlov.models.torch_bert.torch_transformers_el_ranker:TorchTransformersElRanker",
  "torch_transformers_entity_ranker_infer": "deeppavlov.models.torch_bert.torch_transformers_el_ranker:TorchTransformersEntityRankerInfer",
  "torch_transformers_entity_ranker_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersEntityRankerPreprocessor",
  "torch_transformers_multiplechoice": "deeppavlov.models.torch_bert.torch_transformers_multiplechoice:TorchTransformersMultiplechoiceModel",
  "torch_transformers_multiplechoice_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersMultiplechoicePreprocessor",
  "torch_transformers_ner_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersNerPreprocessor",
  "torch_transformers_nll_ranker": "deeppavlov.models.torch_bert.torch_transformers_nll_ranking:TorchTransformersNLLRanker",
  "torch_transformers_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersPreprocessor",
  "torch_transformers_sequence_tagger": "deeppavlov.models.torch_bert.torch_transformers_sequence_tagger:TorchTransformersSequenceTagger",
  "torch_transformers_squad": "deeppavlov.models.torch_bert.torch_transformers_squad:TorchTransformersSquad",
  "torch_transformers_syntax_parser": "deeppavlov.models.torch_bert.torch_transformers_syntax_parser:TorchTransformersSyntaxParser",
  "transformers_bert_embedder": "deeppavlov.models.embedders.transformers_embedder:TransformersBertEmbedder",
  "transformers_bert_preprocessor": "deeppavlov.models.preprocessors.transformers_preprocessor:TransformersBertPreprocessor",
  "tree_to_sparql": "deeppavlov.models.kbqa.tree_to_sparql:TreeToSparql",
  "typos_custom_reader": "deeppavlov.dataset_readers.typos_reader:TyposCustom",
  "typos_iterator": "deeppavlov.dataset_iterators.typos_iterator:TyposDatasetIterator",
  "typos_kartaslov_reader": "deeppavlov.dataset_readers.typos_reader:TyposKartaslov",
  "typos_wikipedia_reader": "deeppavlov.dataset_readers.typos_reader:TyposWikipedia",
  "ubuntu_v2_reader": "deeppavlov.dataset_readers.ubuntu_v2_reader:UbuntuV2Reader",
  "wiki_parser": "deeppavlov.models.kbqa.wiki_parser:WikiParser",
  "wiki_sqlite_vocab": "deeppavlov.vocabs.wiki_sqlite:WikiSQLiteVocab",
  "wikitionary_100K_vocab": "deeppavlov.vocabs.typos:Wiki100KDictionary"
}


================================================
FILE: deeppavlov/core/common/registry.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import json
from logging import getLogger
from pathlib import Path

from deeppavlov.core.common.errors import ConfigError

logger = getLogger(__name__)

# The registry maps a short registered name to a "module.path:ClassName" string.
# It is loaded from `registry.json` located next to this file; if the file does not
# exist the registry starts empty.
_registry_path = Path(__file__).parent / 'registry.json'
if _registry_path.exists():
    with _registry_path.open(encoding='utf-8') as f:
        _REGISTRY = json.load(f)
else:
    _REGISTRY = {}

# Reverse mapping ("module.path:ClassName" -> registered name), computed once from the
# registry contents at import time.
inverted_registry = {val: key for key, val in _REGISTRY.items()}


def cls_from_str(name: str) -> type:
    """Import and return the object described by a ``module.submodules:ClassName`` string.

    Raises:
        ConfigError: if ``name`` does not consist of exactly two parts separated by ``:``
    """
    try:
        module_path, attr_name = name.split(':')
    except ValueError:
        raise ConfigError(f'Expected class description in a `module.submodules:ClassName` form, but got `{name}`')

    module = importlib.import_module(module_path)
    return getattr(module, attr_name)


def register(name: str = None) -> type:
    """
    Return a class decorator that adds the decorated class to the registry,
    so that it can be initialized from a JSON configuration file.

    If name is not passed, the result of ``short_name`` for the class is used.
    Registering a different class under an already taken name logs a warning
    and overwrites the previous entry.
    """

    def decorate(model_cls: type) -> type:
        reg_name = name or short_name(model_cls)
        qualified_name = model_cls.__module__ + ':' + model_cls.__name__
        # warn only when the name is taken by something else than this very class
        if _REGISTRY.get(reg_name, qualified_name) != qualified_name:
            logger.warning('Registry name "{}" has been already registered and will be overwritten.'.format(reg_name))
        _REGISTRY[reg_name] = qualified_name
        return model_cls

    return decorate


def short_name(cls: type) -> str:
    """Return the part of ``cls.__name__`` that follows the last dot (the whole name if there is none)."""
    _, _, last_part = cls.__name__.rpartition('.')
    return last_part


def get_model(name: str) -> type:
    """Return the object registered under ``name``.

    A name that is not registered but has the ``module:ClassName`` form is imported directly.

    Raises:
        ConfigError: if ``name`` is neither registered nor contains ``:``
    """
    if name in _REGISTRY:
        return cls_from_str(_REGISTRY[name])
    if ':' in name:
        return cls_from_str(name)
    raise ConfigError("Model {} is not registered.".format(name))


def list_models() -> list:
    """Return the names currently present in the registry as a new list."""
    return list(_REGISTRY.keys())


================================================
FILE: deeppavlov/core/common/requirements_registry.json
================================================
{
  "answer_types_extractor": [
    "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt",
    "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt"
  ],
  "chu_liu_edmonds_transformer": [
    "{DEEPPAVLOV_PATH}/requirements/dependency_decoding.txt"
  ],
  "bpr": [
    "{DEEPPAVLOV_PATH}/requirements/faiss.txt",
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "entity_linker": [
    "{DEEPPAVLOV_PATH}/requirements/hdt.txt",
    "{DEEPPAVLOV_PATH}/requirements/rapidfuzz.txt",
    "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt",
    "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt"
  ],
  "fasttext": [
    "{DEEPPAVLOV_PATH}/requirements/fasttext.txt"
  ],
  "huggingface_dataset_iterator": [
    "{DEEPPAVLOV_PATH}/requirements/datasets.txt"
  ],
  "huggingface_dataset_reader": [
    "{DEEPPAVLOV_PATH}/requirements/datasets.txt"
  ],
  "kenlm_elector": [
    "{DEEPPAVLOV_PATH}/requirements/kenlm.txt"
  ],
  "ner_chunk_model": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "ner_chunker": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "nltk_moses_tokenizer": [
    "{DEEPPAVLOV_PATH}/requirements/sacremoses.txt"
  ],
  "path_ranking_preprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "query_generator": [
    "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt",
    "{DEEPPAVLOV_PATH}/requirements/hdt.txt",
    "{DEEPPAVLOV_PATH}/requirements/rapidfuzz.txt",
    "{DEEPPAVLOV_PATH}/requirements/whapi.txt"
  ],
  "re_classifier": [
    "{DEEPPAVLOV_PATH}/requirements/opt_einsum.txt",
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "re_postprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "re_preprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "rel_ranking_infer": [
    "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt",
    "{DEEPPAVLOV_PATH}/requirements/hdt.txt"
  ],
  "rel_ranking_preprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "ru_adj_to_noun": [
    "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt"
  ],
  "russian_words_vocab": [
    "{DEEPPAVLOV_PATH}/requirements/lxml.txt"
  ],
  "slovnet_syntax_parser": [
    "{DEEPPAVLOV_PATH}/requirements/slovnet.txt",
    "{DEEPPAVLOV_PATH}/requirements/razdel.txt",
    "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt"
  ],
  "spacy_lemmatizer": [
    "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt",
    "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt"
  ],
  "spelling_error_model": [
    "{DEEPPAVLOV_PATH}/requirements/lxml.txt"
  ],
  "spelling_levenshtein": [
    "{DEEPPAVLOV_PATH}/requirements/sortedcontainers.txt"
  ],
  "static_dictionary": [
    "{DEEPPAVLOV_PATH}/requirements/lxml.txt"
  ],
  "stream_spacy_tokenizer": [
    "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt",
    "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt"
  ],
  "torch_bert_ranker": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_bert_ranker_preprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_record_postprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_squad_transformers_preprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_text_classification_model": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt"
  ],
  "torch_transformers_classifier": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "multitask_transformer": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_transformers_el_ranker": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_transformers_entity_ranker_infer": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_transformers_entity_ranker_preprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_transformers_multiplechoice": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_transformers_multiplechoice_preprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_transformers_ner_preprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt",
    "{DEEPPAVLOV_PATH}/requirements/sentencepiece.txt",
    "{DEEPPAVLOV_PATH}/requirements/protobuf.txt"
  ],
  "torch_transformers_nll_ranker": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_transformers_preprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_transformers_sequence_tagger": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/torchcrf.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "torch_transformers_syntax_parser": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/torchcrf.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "multitask_pipeline_preprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/torchcrf.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],  
  "torch_transformers_squad": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "transformers_bert_embedder": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "transformers_bert_preprocessor": [
    "{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
    "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
  ],
  "tree_to_sparql": [
    "{DEEPPAVLOV_PATH}/requirements/udapi.txt",
    "{DEEPPAVLOV_PATH}/requirements/razdel.txt",
    "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt"
  ],
  "typos_custom_reader": [
    "{DEEPPAVLOV_PATH}/requirements/lxml.txt"
  ],
  "typos_kartaslov_reader": [
    "{DEEPPAVLOV_PATH}/requirements/lxml.txt"
  ],
  "typos_wikipedia_reader": [
    "{DEEPPAVLOV_PATH}/requirements/lxml.txt"
  ],
  "wiki_parser": [
    "{DEEPPAVLOV_PATH}/requirements/hdt.txt"
  ],
  "wikitionary_100K_vocab": [
    "{DEEPPAVLOV_PATH}/requirements/lxml.txt"
  ]
}


================================================
FILE: deeppavlov/core/data/__init__.py
================================================


================================================
FILE: deeppavlov/core/data/data_fitting_iterator.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from random import Random
from typing import List, Generator, Tuple, Any, Optional

from deeppavlov.core.common.registry import register

# Module-level logger named after this module.
logger = getLogger(__name__)


@register('data_fitting_iterator')
class DataFittingIterator:
    """Iterator over documents for fitting estimator models (vocabs, kNN, vectorizers).

    The documents are given as a list of strings and can be served either in
    batches (useful for large datasets) or all at once.

    Args:
        data: list of documents
        doc_ids: provided document ids
        seed: random seed for data shuffling
        shuffle: whether to shuffle data during batching

    Attributes:
        shuffle: whether to shuffle data during batching
        random: instance of :class:`Random` initialized with a seed
        data: list of documents
        doc_ids: ids provided by a user, or automatically generated ids

    """

    def __init__(self, data: List[str], doc_ids: List[Any] = None,
                 seed: int = None, shuffle: bool = True,
                 *args, **kwargs) -> None:

        self.random = Random(seed)
        self.shuffle = shuffle
        self.data = data
        # Ids are generated only after `data` is stored, because generation reads it;
        # an empty or missing `doc_ids` triggers generation.
        self.doc_ids = doc_ids or self.get_doc_ids()

    def get_doc_ids(self):
        """Generate doc ids.

        Returns: consecutive integers starting at 0, one per element of ``self.data``

        """
        return [*range(len(self.data))]

    def get_doc_content(self, doc_id: Any) -> Optional[str]:
        """Get doc content by id.

        Args:
            doc_id: an id of the doc whose content should be extracted

        Returns:
            the element of ``self.data`` stored under ``doc_id``; a failed lookup raises

        """
        return self.data[doc_id]

    def gen_batches(self, batch_size: int, shuffle: bool = None) \
            -> Generator[Tuple[List[str], List[int]], Any, None]:
        """Gen batches of documents.

        Args:
            batch_size: a number of samples in a single batch; a non-positive value
                puts all documents into one batch
            shuffle: whether to shuffle data during batching; ``None`` falls back
                to ``self.shuffle``

        Yields:
            generated tuple of documents and their ids

        """
        do_shuffle = self.shuffle if shuffle is None else shuffle

        ordered_ids = self.doc_ids
        if do_shuffle:
            # Sampling the full length gives a shuffled copy and leaves self.doc_ids untouched.
            ordered_ids = self.random.sample(self.doc_ids, len(self.doc_ids))

        if batch_size > 0:
            id_batches = [ordered_ids[start:start + batch_size]
                          for start in range(0, len(ordered_ids), batch_size)]
        else:
            id_batches = [ordered_ids]

        for batch_ids in id_batches:
            batch_docs = [self.get_doc_content(doc_id) for doc_id in batch_ids]
            yield batch_docs, batch_ids

    def get_instances(self):
        """Get all data as a tuple of documents and a copy of their ids"""
        all_ids = list(self.doc_ids)
        all_docs = [self.get_doc_content(doc_id) for doc_id in all_ids]
        return all_docs, all_ids


================================================
FILE: deeppavlov/core/data/data_learning_iterator.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from random import Random
from typing import List, Dict, Tuple, Any, Iterator

from deeppavlov.core.common.registry import register


@register('data_learning_iterator')
class DataLearningIterator:
    """Dataset iterator for learning models, e. g. neural networks.

    Args:
        data: list of (x, y) pairs for every data type in ``'train'``, ``'valid'`` and ``'test'``
        seed: random seed for data shuffling
        shuffle: whether to shuffle data during batching

    Attributes:
        shuffle: whether to shuffle data during batching
        random: instance of ``Random`` initialized with a seed
        train: preprocessed ``'train'`` samples
        valid: preprocessed ``'valid'`` samples
        test: preprocessed ``'test'`` samples
        data: dictionary with the ``'train'``, ``'valid'`` and ``'test'`` samples and their
            concatenation under ``'all'``
    """

    def split(self, *args, **kwargs):
        """ Manipulate self.train, self.valid, and self.test into their final form.

        The base implementation does nothing; it is called once from ``__init__``
        after all three data types have been preprocessed.
        """
        pass

    def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple[Any, Any]]:
        """ Transform the data for a specific data type (e.g. ``'train'``).

        The base implementation returns ``data`` unchanged.
        """
        return data

    def __init__(self, data: Dict[str, List[Tuple[Any, Any]]], seed: int = None, shuffle: bool = True,
                 *args, **kwargs) -> None:
        self.shuffle = shuffle

        self.random = Random(seed)

        # A data type missing from `data` is treated as an empty list.
        self.train = self.preprocess(data.get('train', []), *args, **kwargs)
        self.valid = self.preprocess(data.get('valid', []), *args, **kwargs)
        self.test = self.preprocess(data.get('test', []), *args, **kwargs)
        self.split(*args, **kwargs)
        self.data = {
            'train': self.train,
            'valid': self.valid,
            'test': self.test,
            'all': self.train + self.test + self.valid
        }

    def gen_batches(self, batch_size: int, data_type: str = 'train',
                    shuffle: bool = None) -> Iterator[Tuple[tuple, tuple]]:
        """Generate batches of inputs and expected output to train neural networks

        Args:
            batch_size: number of samples in batch; a non-positive value yields the whole
                data type as a single batch
            data_type: can be either 'train', 'test', or 'valid'
            shuffle: whether to shuffle dataset before batching; ``None`` falls back
                to ``self.shuffle``

        Yields:
             a tuple of a batch of inputs and a batch of expected outputs
        """
        if shuffle is None:
            shuffle = self.shuffle

        data = self.data[data_type]
        data_len = len(data)

        # Nothing to yield for an empty data type.
        if data_len == 0:
            return

        # Shuffle a list of indices so the stored samples keep their order.
        order = list(range(data_len))
        if shuffle:
            self.random.shuffle(order)

        # Zero is handled together with negative sizes: it would otherwise reach the
        # floor division below and raise ZeroDivisionError.
        if batch_size <= 0:
            batch_size = data_len

        for i in range((data_len - 1) // batch_size + 1):
            yield tuple(zip(*[data[o] for o in order[i * batch_size:(i + 1) * batch_size]]))

    def get_instances(self, data_type: str = 'train') -> Tuple[tuple, tuple]:
        """Get all data for a selected data type

        Args:
            data_type (str): can be either ``'train'``, ``'test'``, ``'valid'`` or ``'all'``

        Returns:
             a tuple of all inputs for a data type and all expected outputs for a data type;
             an empty tuple when the data type holds no samples
        """
        data = self.data[data_type]
        return tuple(zip(*data))


================================================
FILE: deeppavlov/core/data/dataset_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Dict, Tuple, Any


class DatasetReader:
    """Base interface for components that read data from some location and build a dataset."""

    def read(self, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
        """Read the data found at ``data_path``.

        Implementations are expected to return a list of ``(input, correct output)``
        tuples for every data type in ``train``, ``valid`` and ``test``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()


================================================
FILE: deeppavlov/core/data/simple_vocab.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import Counter, defaultdict
from itertools import chain
from logging import getLogger
from typing import Iterable, Optional, Tuple

import numpy as np

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import zero_pad, is_str_batch, flatten_str_batch
from deeppavlov.core.models.estimator import Estimator

# Module-level logger named after this module.
log = getLogger(__name__)


@register('simple_vocab')
class SimpleVocabulary(Estimator):
    """Implements simple vocabulary.

    Tokens are kept in insertion order: looking up a string gives its index,
    looking up an integer gives the token stored at that index.

    Parameters:
        special_tokens: tuple of tokens that shouldn't be counted.
        max_tokens: upper bound for number of tokens in the vocabulary.
        min_freq: minimal count of a token (except special tokens).
        pad_with_zeros: if True, then batch of elements will be padded with zeros up to length of
            the longest element in batch.
        unk_token: label assigned to unknown tokens.
        freq_drop_load: if True, then frequencies of tokens are set to min_freq on the model load.
        """

    def __init__(self,
                 special_tokens: Tuple[str, ...] = tuple(),
                 max_tokens: int = 2 ** 30,
                 min_freq: int = 0,
                 pad_with_zeros: bool = False,
                 unk_token: Optional[str] = None,
                 freq_drop_load: Optional[bool] = None,
                 *args,
                 **kwargs):
        super().__init__(**kwargs)
        self.special_tokens = special_tokens
        self._max_tokens = max_tokens
        self._min_freq = min_freq
        self._pad_with_zeros = pad_with_zeros
        self.unk_token = unk_token
        self.freq_drop_load = freq_drop_load
        self.reset()
        # Restore a previously saved vocabulary right away when a load path is configured.
        if self.load_path:
            self.load()

    def fit(self, *args):
        """Rebuild the vocabulary from the given batches of tokens.

        Special tokens receive the first indices in their declared order. They are followed
        by the ``max_tokens`` most common tokens, skipping special tokens and tokens whose
        count is below ``min_freq``.
        """
        self.reset()
        tokens = chain(*args)
        # filter(None, <>) -- to filter empty tokens
        self.freqs = Counter(filter(None, flatten_str_batch(tokens)))
        for special_token in self.special_tokens:
            self._t2i[special_token] = self.count
            self._i2t.append(special_token)
            self.count += 1
        for token, freq in self.freqs.most_common()[:self._max_tokens]:
            if token in self.special_tokens:
                continue
            if freq >= self._min_freq:
                self._t2i[token] = self.count
                self._i2t.append(token)
                self.count += 1

    def _add_tokens_with_freqs(self, tokens, freqs):
        """Replace the frequency table with the given counts and append the qualifying tokens.

        A token is appended when its count reaches ``min_freq`` or when it is a special token.
        """
        self.freqs = Counter()
        self.freqs.update(dict(zip(tokens, freqs)))
        for token, freq in zip(tokens, freqs):
            if freq >= self._min_freq or token in self.special_tokens:
                self._t2i[token] = self.count
                self._i2t.append(token)
                self.count += 1

    def __call__(self, batch, is_top=True, **kwargs):
        """Look up a single token/index or an arbitrarily nested batch of them.

        Strings and non-iterable values are looked up directly. A batch consisting only of
        ``None`` values is returned untouched. Any other batch is processed sample by sample.
        On the outermost call the result is passed through ``zero_pad`` when ``pad_with_zeros``
        is set and ``is_str_batch`` does not recognise the result as a batch of strings.
        """
        if isinstance(batch, Iterable) and not isinstance(batch, str):
            if all([k is None for k in batch]):
                return batch
            else:
                looked_up_batch = [self(sample, is_top=False) for sample in batch]
        else:
            return self[batch]
        if self._pad_with_zeros and is_top and not is_str_batch(looked_up_batch):
            looked_up_batch = zero_pad(looked_up_batch)

        return looked_up_batch

    def save(self):
        """Write one ``token<TAB>count`` line per vocabulary entry to ``save_path``, in index order."""
        log.info("[saving vocabulary to {}]".format(self.save_path))
        with self.save_path.open('wt', encoding='utf8') as f:
            for token in self._i2t:
                cnt = self.freqs[token]
                f.write('{}\t{:d}\n'.format(token, cnt))

    def load(self):
        """Reset the vocabulary and fill it from the file at ``load_path``, if that file exists.

        Raises:
            ConfigError: if ``load_path`` is not set, or if it is not a file and its parent
                directory does not exist either.
        """
        self.reset()
        if self.load_path:
            if self.load_path.is_file():
                log.debug("[loading vocabulary from {}]".format(self.load_path))
                tokens, counts = [], []
                # The context manager closes the file even if a line fails to parse.
                with self.load_path.open('r', encoding='utf8') as f:
                    for ln in f:
                        token, cnt = self.load_line(ln)
                        tokens.append(token)
                        counts.append(int(cnt))
                self._add_tokens_with_freqs(tokens, counts)
            elif not self.load_path.parent.is_dir():
                raise ConfigError("Provided `load_path` for {} doesn't exist!".format(
                    self.__class__.__name__))
        else:
            raise ConfigError("`load_path` for {} is not provided!".format(self))

    def load_line(self, ln):
        """Split one line of a saved vocabulary into a token and its count.

        With ``freq_drop_load`` set, the token is the first whitespace-separated field and the
        count is replaced by ``min_freq``. Otherwise the line is split at its last tab, so the
        returned count is still a string and may carry the trailing newline.
        """
        if self.freq_drop_load:
            token = ln.strip().split()[0]
            cnt = self._min_freq
        else:
            token, cnt = ln.rsplit('\t', 1)
        return token, cnt

    @property
    def len(self):
        """Number of tokens in the vocabulary, same as ``len(self)``."""
        return len(self)

    def keys(self):
        """Return a generator over the tokens in index order."""
        return (self[n] for n in range(self.len))

    def values(self):
        """Return the list of all token indices."""
        return list(range(self.len))

    def items(self):
        """Return an iterator over ``(token, index)`` pairs."""
        return zip(self.keys(), self.values())

    def __getitem__(self, key):
        """Map an integer index to its token, or a string token to its index.

        Raises:
            NotImplementedError: for keys that are neither integers nor strings.
        """
        if isinstance(key, (int, np.integer)):
            return self._i2t[key]
        elif isinstance(key, str):
            # NOTE(review): `_t2i` is a defaultdict, so looking up an unseen token stores it
            # with the unknown index and later makes `token in vocab` true -- confirm intended.
            return self._t2i[key]
        else:
            raise NotImplementedError("not implemented for type `{}`".format(type(key)))

    def __contains__(self, item):
        return item in self._t2i

    def __len__(self):
        return len(self._i2t)

    def reset(self):
        """Drop all tokens and frequencies.

        Unseen strings resolve to the position of ``unk_token`` inside ``special_tokens``,
        or to 0 when ``unk_token`` is not one of the special tokens.
        """
        self.freqs = None
        unk_index = 0
        if self.unk_token in self.special_tokens:
            unk_index = self.special_tokens.index(self.unk_token)
        self._t2i = defaultdict(lambda: unk_index)
        self._i2t = []
        self.count = 0

    def idxs2toks(self, idxs):
        """Look up every element of ``idxs`` and return the results as a list."""
        return [self[idx] for idx in idxs]


================================================
FILE: deeppavlov/core/data/utils.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import gzip
import os
import secrets
import shutil
import tarfile
import zipfile
from hashlib import md5
from itertools import chain
from logging import getLogger
from pathlib import Path
from typing import Any, Generator, Iterable, List, Mapping, Optional, Sequence, Sized, Union, Collection
from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit, urlparse

import numpy as np
import requests
from tqdm import tqdm

# Module-level logger named after this module.
log = getLogger(__name__)

# NOTE(review): presumably the name/suffix of a marker flagging finished work;
# its uses are outside the visible part of the file -- confirm.
_MARK_DONE = '.done'

# NOTE(review): sets the class-level `monitor_interval` of tqdm to 0, presumably to turn
# off tqdm's background monitoring -- confirm against the tqdm documentation.
tqdm.monitor_interval = 0


def get_download_token() -> str:
    """Return the download token stored in the ~/.deeppavlov/token file.

    When the token file is missing it is created first and filled with a random URL-safe
    text string generated from 32 random bytes.

    Returns:
        Content of ~/.deeppavlov/token with surrounding whitespace stripped.

    """
    config_dir = Path.home() / '.deeppavlov'
    token_path = config_dir / 'token'

    if not token_path.exists():
        # A regular file occupying the directory name is removed so the directory can be created.
        if config_dir.is_file():
            config_dir.unlink()
        config_dir.mkdir(parents=True, exist_ok=True)
        token_path.write_text(secrets.token_urlsafe(32), encoding='utf8')

    stored = token_path.read_text(encoding='utf8')
    return stored.strip()


def s3_download(url: str, destination: str) -> None:
    """Fetch a file addressed by an Amazon S3 path `s3://<bucket_name>/<key>`.

    The boto3 library has to be installed, with AWS credentials supplied through
    environment variables or a credentials file. The endpoint is taken from the
    ``AWS_ENDPOINT_URL`` environment variable when it is set.

    Args:
        url: The source URL.
        destination: Path to the file destination (including file name).
    """
    import boto3

    resource = boto3.resource('s3', endpoint_url=os.environ.get('AWS_ENDPOINT_URL'))

    # Drop the 5-character 's3://' prefix; everything after the first '/' is the object key.
    bucket_name, object_key = url[5:].split('/', maxsplit=1)
    s3_object = resource.Object(bucket_name, object_key)
    with tqdm(total=s3_object.content_length, unit='B', unit_scale=True) as progress:
        s3_object.download_file(destination, Callback=progress.update)


def simple_download(url: str, destination: Union[Path, str], headers: Optional[dict] = None, n_tries: int = 3) -> None:
    """Download a file from URL to target location.

    Displays a progress bar to the terminal during the download process. Data is first
    written to ``<destination>.part`` and renamed to ``destination`` once complete; an
    existing partial file is resumed with an HTTP ``Range`` request.

    Args:
        url: The source URL. URLs starting with ``s3://`` are delegated to ``s3_download``.
        destination: Path to the file destination (including file name).
        headers: Headers for file server. The passed dictionary is never modified.
        n_tries: Number of retries if download fails.

    Raises:
        RuntimeError: if the server answers with a status other than 200 or does not
            support resuming a partial download (after all retries are used up).

    """
    try:
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        log.info('Downloading from {} to {}'.format(url, destination))

        if url.startswith('s3://'):
            return s3_download(url, str(destination))

        chunk_size = 32 * 1024
        temporary = destination.with_suffix(destination.suffix + '.part')

        r = requests.get(url, stream=True, headers=headers)
        if r.status_code != 200:
            raise RuntimeError(f'Got status code {r.status_code} when trying to download {url}')
        total_length = int(r.headers.get('content-length', 0))

        if temporary.exists() and temporary.stat().st_size > total_length:
            temporary.write_bytes(b'')  # clearing temporary file when total_length is inconsistent

        with temporary.open('ab') as f:
            # In append mode the position equals the size of an already present partial file.
            downloaded = f.tell()
            if downloaded != 0:
                log.warning(f'Found a partial download {temporary}')
            with tqdm(initial=downloaded, total=total_length, unit='B', unit_scale=True) as pbar:
                while True:
                    if downloaded != 0:
                        log.warning(f'Download stopped abruptly, trying to resume from {downloaded} '
                                    f'to reach {total_length}')
                        # Put the Range header on a copy: `headers` may be None, and writing into
                        # the caller's dictionary would leak the Range header into later requests
                        # (including the retry below), which then fail the status code check.
                        resume_headers = dict(headers) if headers else {}
                        resume_headers['Range'] = f'bytes={downloaded}-'
                        r = requests.get(url, headers=resume_headers, stream=True)
                        if 'content-length' not in r.headers or \
                                total_length - downloaded != int(r.headers['content-length']):
                            raise RuntimeError('It looks like the server does not support resuming downloads.')

                    try:
                        for chunk in r.iter_content(chunk_size=chunk_size):
                            if chunk:  # filter out keep-alive new chunks
                                downloaded += len(chunk)
                                pbar.update(len(chunk))
                                f.write(chunk)
                    except requests.exceptions.ChunkedEncodingError:
                        # Nothing received yet: start over with a fresh request. Otherwise the
                        # next loop iteration resumes from the current offset.
                        if downloaded == 0:
                            r = requests.get(url, stream=True, headers=headers)

                    if downloaded >= total_length:
                        # Note that total_length is 0 if the server didn't return the content length,
                        # in this case we perform just one iteration and assume that we are done.
                        break

        temporary.rename(destination)
    except Exception as e:
        if n_tries > 0:
            log.warning(f'Download failed: {e}, retrying')
            simple_download(url, destination, headers, n_tries - 1)
        else:
            raise


def download(dest_file_path: Union[str, Path, List[Union[str, Path]]], source_url: str, force_download: bool = True,
             headers: Optional[dict] = None) -> None:
    """Download a file from URL to one or several target locations.

    The file is downloaded once and then copied to the remaining destinations. When the
    ``DP_CACHE_DIR`` environment variable is set, the download goes to (or is reused from)
    a file in that directory named after a hash of ``source_url``.

    Args:
        dest_file_path: Path or list of paths to the file destination (including file name).
        source_url: The source URL.
        force_download: Download file if it already exists, or not.
        headers: Headers for file server.

    """

    if isinstance(dest_file_path, list):
        dest_file_paths = [Path(path) for path in dest_file_path]
    else:
        dest_file_paths = [Path(dest_file_path).absolute()]

    if not force_download:
        # Keep only the destinations that do not exist yet.
        missing_paths = []
        for p in dest_file_paths:
            if p.exists():
                log.info(f'File already exists in {p}')
            else:
                missing_paths.append(p)
        dest_file_paths = missing_paths

    if dest_file_paths:
        cache_dir = os.getenv('DP_CACHE_DIR')
        cached_exists = False
        if cache_dir:
            # With a cache every destination is filled by copying from the cached file.
            first_dest_path = Path(cache_dir) / md5(source_url.encode('utf8')).hexdigest()[:15]
            cached_exists = first_dest_path.exists()
        else:
            # Without a cache one destination receives the download and serves as the copy source.
            first_dest_path = dest_file_paths.pop()

        if not cached_exists:
            first_dest_path.parent.mkdir(parents=True, exist_ok=True)

            simple_download(source_url, first_dest_path, headers)
        else:
            log.info(f'Found cached {source_url} in {first_dest_path}')

        for dest_path in dest_file_paths:
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(str(first_dest_path), str(dest_path))


def untar(file_path: Union[Path, str], extract_folder: Optional[Union[Path, str]] = None) -> None:
    """Simple tar archive extractor.

    Args:
        file_path: Path to the tar file to be extracted.
        extract_folder: Folder to which the files will be extracted. Defaults to the
            folder that contains the archive.

    """
    file_path = Path(file_path)
    if extract_folder is None:
        extract_folder = file_path.parent
    extract_folder = Path(extract_folder)
    # The context manager closes the archive even when extraction fails part-way.
    # NOTE(review): extractall() is called without an extraction filter; for archives from
    # untrusted sources check the tarfile documentation on extraction filters.
    with tarfile.open(file_path) as tar:
        tar.extractall(extract_folder)


def ungzip(file_path: Union[Path, str], extract_path: Optional[Union[Path, str]] = None) -> None:
    """Simple .gz archive extractor.

    Args:
        file_path: Path to the gzip file to be extracted.
        extract_path: Path where the file will be extracted. Defaults to ``file_path``
            with its last suffix removed.

    """
    source = Path(file_path)
    target = source.with_suffix('') if extract_path is None else Path(extract_path)

    # Decompress in 16 KiB blocks so the whole file never has to fit in memory.
    with gzip.open(source, 'rb') as fin, target.open('wb') as fout:
        shutil.copyfileobj(fin, fout, 16 * 1024)


def download_decompress(url: str,
                        download_path: Union[Path, str],
                        extract_paths: Optional[Union[List[Union[Path, str]], Path, str]] = None,
                        headers: Optional[dict] = None) -> None:
    """Download and extract a .tar.gz, .gz or .zip file to one or several target locations.

    The archive type is chosen from the file name taken from the URL path.

    When the ``DP_CACHE_DIR`` environment variable is set, the archive is stored in that directory under a name
    derived from the md5 hash of ``url`` and extracted next to it into ``<hash>_extracted``; both are kept and
    reused by later calls, and the extracted contents are copied into every extract path.

    When ``DP_CACHE_DIR`` is not set, the archive is downloaded into ``download_path``, extracted directly into
    the last extract path, deleted after a successful extraction, and the extracted contents are then copied
    into the remaining extract paths.

    Args:
        url: URL for file downloading.
        download_path: Path to the directory where downloaded file will be stored until the end of extraction
            (used for the archive only when ``DP_CACHE_DIR`` is not set).
        extract_paths: Path or list of paths where contents of archive will be extracted.
            Defaults to ``download_path``.
        headers: Headers for file server.

    Raises:
        RuntimeError: If the file name does not end with ``.tar.gz``, ``.gz`` or ``.zip``.

    """
    file_name = Path(urlparse(url).path).name
    download_path = Path(download_path)

    # Normalize ``extract_paths`` to a fresh list of paths (the caller's list is never mutated below).
    if extract_paths is None:
        extract_paths = [download_path]
    elif isinstance(extract_paths, list):
        extract_paths = [Path(path) for path in extract_paths]
    else:
        extract_paths = [Path(extract_paths)]

    cache_dir = os.getenv('DP_CACHE_DIR')
    extracted = False
    if cache_dir:
        cache_dir = Path(cache_dir)
        # Cache entries are keyed by the first 15 hex digits of the URL's md5 hash.
        url_hash = md5(url.encode('utf8')).hexdigest()[:15]
        arch_file_path = cache_dir / url_hash
        extracted_path = cache_dir / (url_hash + '_extracted')
        extracted = extracted_path.exists()
        # Download only if neither the extracted directory nor the archive is already cached.
        if not extracted and not arch_file_path.exists():
            simple_download(url, arch_file_path, headers)
        else:
            if extracted:
                log.info(f'Found cached and extracted {url} in {extracted_path}')
            else:
                log.info(f'Found cached {url} in {arch_file_path}')
    else:
        arch_file_path = download_path / file_name
        simple_download(url, arch_file_path, headers)
        # Without a cache the archive is extracted straight into the last target;
        # the other targets (if any) receive copies in the loop at the end.
        extracted_path = extract_paths.pop()

    if not extracted:
        log.info('Extracting {} archive into {}'.format(arch_file_path, extracted_path))
        extracted_path.mkdir(parents=True, exist_ok=True)

        # ``.tar.gz`` must be tested before the more general ``.gz``.
        if file_name.endswith('.tar.gz'):
            untar(arch_file_path, extracted_path)
        elif file_name.endswith('.gz'):
            # A plain gzip holds a single file: name it after the archive without the ``.gz`` suffix.
            ungzip(arch_file_path, extracted_path / Path(file_name).with_suffix('').name)
        elif file_name.endswith('.zip'):
            with zipfile.ZipFile(arch_file_path, 'r') as zip_ref:
                zip_ref.extractall(extracted_path)
        else:
            raise RuntimeError(f'Trying to extract an unknown type of archive {file_name}')

        # The downloaded archive is kept only when it lives in the cache directory.
        if not cache_dir:
            arch_file_path.unlink()

    # Replicate the top-level entries of the extracted directory into every remaining target.
    for extract_path in extract_paths:
        for src in extracted_path.iterdir():
            dest = extract_path / src.name
            if src.is_dir():
                _copytree(src, dest)
            else:
                extract_path.mkdir(parents=True, exist_ok=True)
                shutil.copy(str(src), str(dest))


def _copytree(src: Path, dest: Path) -> None:
    """Recursively copies directory.

    Destination directory could exist (unlike if we used shutil.copytree).

    Args:
        src: Path to copied directory.
        dest: Path to destination directory.

    """
    dest.mkdir(parents=True, exist_ok=True)
    for f in src.iterdir():
        f_dest = dest / f.name
        if f.is_dir():
            _copytree(f, f_dest)
        else:
            shutil.copy(str(f), str(f_dest))


def file_md5(fpath: Union[str, Path], chunk_size: int = 2 ** 16) -> Optional[str]:
    """Compute the md5 hex digest of a file's contents.

    Args:
        fpath: Path to file.
        chunk_size: Number of bytes read from the file per hash update.

    Returns:
        The md5 digest as a hex string, or None when ``fpath`` is not an existing file.

    """
    target = Path(fpath)
    if target.is_file():
        digest = md5()
        with target.open('rb') as stream:
            while True:
                piece = stream.read(chunk_size)
                if not piece:  # empty read means end of file
                    break
                digest.update(piece)
        return digest.hexdigest()
    return None


def mark_done(path: Union[Path, str]) -> None:
    """Drop an empty marker file (named by ``_MARK_DONE``) into a directory.

    Args:
        path: Path to directory.

    Raises:
        NotADirectoryError: If ``path`` does not point to a directory.

    """
    directory = Path(path)
    if directory.is_dir():
        # ``exist_ok`` makes repeated marking a no-op instead of an error.
        (directory / _MARK_DONE).touch(exist_ok=True)
        return
    raise NotADirectoryError(f"Not a directory: '{directory}'")


def is_done(path: Union[Path, str]) -> bool:
    """Tell whether a directory holds the marker file named by ``_MARK_DONE``.

    Args:
        path: Path to directory.

    Returns:
        True if the marker file exists inside ``path``, False otherwise.

    """
    return (Path(path) / _MARK_DONE).is_file()


def _get_all_dimensions(batch: Sequence, level: int = 0, res: Optional[List[List[int]]] = None) -> List[List[int]]:
    """Return all presented element sizes of each dimension.

    Args:
        batch: Data array.
        level: Recursion level.
        res: List containing element sizes of each dimension.

    Return:
        List, i-th element of which is list containing all presented sized of batch's i-th dimension.

    Examples:
        >>> x = [[[1], [2, 3]], [[4], [5, 6, 7], [8, 9]]]
        >>> _get_all_dimensions(x)
        [[2], [2, 3], [1, 2, 1, 3, 2]]

    """
    if not level:
        res = [[len(batch)]]
    if len(batch) and isinstance(batch[0], Sized) and not isinstance(batch[0], str):
        level += 1
        if len(res) <= level:
            res.append([])
        for item in batch:
            res[level].append(len(item))
            _get_all_dimensions(item, level, res)
    return res


def get_dimensions(batch: Sequence) -> List[int]:
    """Return the largest size found along each dimension of ``batch``."""
    return [max(sizes) for sizes in _get_all_dimensions(batch)]


def zero_pad(batch: Sequence,
             zp_batch: Optional[np.ndarray] = None,
             dtype: type = np.float32,
             padding: Union[int, float] = 0) -> np.ndarray:
    """Fills the end of each array item to make its length maximal along each dimension.

    Args:
        batch: Initial (possibly ragged) nested sequence.
        zp_batch: Pre-allocated array to write the values into. When omitted, a new array
            filled with ``padding`` is created from the maximal size of every dimension.
        dtype: Type of the padded array created when ``zp_batch`` is omitted.
        padding: Number to fill the padded positions of the new array with.

    Returns:
        Padded array (``zp_batch`` itself when it was provided).

    Examples:
        >>> x = [[1, 2, 3], [4], [5, 6]]
        >>> zero_pad(x)
        array([[1., 2., 3.],
               [4., 0., 0.],
               [5., 6., 0.]], dtype=float32)

    """
    if zp_batch is None:
        dims = get_dimensions(batch)
        # ``np.full`` keeps the requested dtype; ``np.ones(...) * padding`` could silently
        # upcast the result (e.g. integer dtype with a float or numpy-scalar padding).
        zp_batch = np.full(dims, padding, dtype=dtype)
    if zp_batch.ndim == 1:
        zp_batch[:len(batch)] = batch
    else:
        # Each sub-array of ``zp_batch`` is a view, so the recursive call fills it in place.
        for b, zp in zip(batch, zp_batch):
            zero_pad(b, zp)
    return zp_batch


def is_str_batch(batch: Iterable) -> bool:
    """Checks whether following first elements of nested iterables leads to a string.

    Only the first element of every nesting level is inspected. A numpy array is judged
    by its dtype (unicode kind), and an empty container is reported as a string batch.
    """
    current = batch
    while isinstance(current, Iterable):
        if isinstance(current, str):
            return True
        if isinstance(current, np.ndarray):
            return current.dtype.kind == 'U'
        if not len(current) > 0:
            return True
        current = current[0]
    return False


def flatten_str_batch(batch: Union[str, Iterable]) -> Union[list, chain]:
    """Flattens arbitrarily nested iterables of strings into a single ``itertools.chain``.

    Args:
        batch: String or (nested) iterable of strings to flatten.

    Returns:
        ``[batch]`` for a plain string, otherwise an ``itertools.chain`` over all strings
        found in ``batch`` in their original order.

    Examples:
        >>> [string for string in flatten_str_batch(['a', ['b'], [['c', 'd']]])]
        ['a', 'b', 'c', 'd']

    """
    if not isinstance(batch, str):
        # Children are flattened eagerly; only the final concatenation is lazy.
        flattened_children = [flatten_str_batch(sample) for sample in batch]
        return chain.from_iterable(flattened_children)
    return [batch]


def zero_pad_truncate(batch: Sequence[Sequence[Union[int, float, np.integer, np.floating,
                                                     Sequence[Union[int, float, np.integer, np.floating]]]]],
                      max_len: int, pad: str = 'post', trunc: str = 'post',
                      dtype: Optional[Union[type, str]] = None) -> np.ndarray:
    """Pad with zeros or truncate every batch item to exactly ``max_len`` elements.

    Args:
        batch: assumes a batch of lists of word indexes or their vector representations
        max_len: resulting length of every batch item
        pad: how to pad shorter batch items: can be ``'post'`` or ``'pre'``
        trunc: how to truncate a batch item: can be ``'post'`` or ``'pre'``
        dtype: overrides dtype for the resulting ``ndarray`` if specified,
         otherwise ``np.int32`` is used for 2-d arrays and ``np.float32`` — for 3-d arrays

    Returns:
        a 2-d array of size ``(len(batch), max_len)`` or a 3-d array of size
        ``(len(batch), max_len, vector_size)`` where ``vector_size`` is the length of the first element
        of the first non-empty batch item. Empty batch items produce all-zero rows.
    """
    # The first non-empty item decides between the 2-d (indexes) and the 3-d (vectors) layout,
    # so that an empty leading item (or an empty batch) does not raise ``IndexError``.
    sample_item = next((item for item in batch if len(item)), None)
    # ndarray behaves like a Sequence without actually being one, hence the Collection check
    if sample_item is not None and isinstance(sample_item[0], Collection):
        size = (len(batch), max_len, len(sample_item[0]))
        dtype = dtype or np.float32
    else:
        size = (len(batch), max_len)
        dtype = dtype or np.int32

    padded_batch = np.zeros(size, dtype=dtype)
    for i, batch_item in enumerate(batch):
        item_len = len(batch_item)
        if item_len > max_len:  # trunc
            if trunc == 'post':
                padded_batch[i] = batch_item[:max_len]
            else:
                # explicit start index: ``-max_len`` would select the whole item for ``max_len == 0``
                padded_batch[i] = batch_item[item_len - max_len:]
        elif item_len:  # pad; empty items simply stay zero-filled
            if pad == 'post':
                padded_batch[i, :item_len] = batch_item
            else:
                # explicit start index: ``-item_len`` breaks for empty items (``-0 == 0``)
                padded_batch[i, max_len - item_len:] = batch_item

    return padded_batch


def get_all_elems_from_json(search_json: dict, search_key: str) -> list:
    """Collects the values stored under a key anywhere inside nested dicts and lists.

    A value found under ``search_key`` is taken as is; the search does not continue inside it.

    Args:
        search_json: Dictionary in which one needs to find all values by specific key.
        search_key: Key for search.

    Returns:
        List of values stored in nested structures by ``search_key``.

    Examples:
        >>> get_all_elems_from_json({'a':{'b': [1,2,3]}, 'b':42}, 'b')
        [[1, 2, 3], 42]

    """
    found = []
    if isinstance(search_json, dict):
        for key, value in search_json.items():
            if key == search_key:
                found.append(value)
                continue
            found += get_all_elems_from_json(value, search_key)
    elif isinstance(search_json, list):
        for entry in search_json:
            found += get_all_elems_from_json(entry, search_key)
    return found


def check_nested_dict_keys(check_dict: dict, keys: list) -> bool:
    """Checks that ``keys`` forms a path of nested keys inside ``check_dict``.

    Args:
        check_dict: Dictionary to check.
        keys: Keys list. Walking from ``check_dict``, the i-th key must be present in the
            dictionary reached by the previous keys.

    Returns:
        True if the whole key path exists, False otherwise (also for an empty or non-list ``keys``).

    Examples:
        >>> check_nested_dict_keys({'x': {'y': {'z': 42}}}, ['x', 'y', 'z'])
        True
        >>> check_nested_dict_keys({'x': {'y': {'z': 42}}}, ['x', 'z', 'y'])
        False
        >>> check_nested_dict_keys({'x': {'y': 1, 'z': 42}}, ['x', 'y', 'z'])
        False

    """
    if not (isinstance(keys, list) and len(keys) > 0):
        return False

    node = check_dict
    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return False
        node = node[key]
    return True


def jsonify_data(data: Any) -> Any:
    """Converts an object into a JSON-serializable counterpart where a conversion is known.

    Lists and tuples become lists and dictionaries are rebuilt, both with their items converted
    recursively. Numpy arrays become nested lists, numpy integers and floats become python numbers,
    and objects exposing a callable ``to_serializable_dict`` are replaced by its result.
    Everything else is returned unchanged.

    Args:
        data: Object to make JSON-serializable.

    Returns:
        Modified input data.

    """
    if isinstance(data, (list, tuple)):
        return [jsonify_data(item) for item in data]
    if isinstance(data, dict):
        return {key: jsonify_data(value) for key, value in data.items()}
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, np.integer):
        return int(data)
    if isinstance(data, np.floating):
        return float(data)
    if callable(getattr(data, "to_serializable_dict", None)):
        return data.to_serializable_dict()
    return data


def chunk_generator(items_list: list, chunk_size: int) -> Generator[list, None, None]:
    """Splits a list into consecutive slices of at most ``chunk_size`` items.

    Args:
        items_list: List to slice.
        chunk_size: Length of slice.

    Yields:
        list: ``items_list`` consecutive slices; the last one may be shorter.

    """
    total = len(items_list)
    for start in range(0, total, chunk_size):
        stop = start + chunk_size
        yield items_list[start:stop]


def update_dict_recursive(editable_dict: dict, editing_dict: Mapping) -> None:
    """Updates dict recursively.

    You need to use this function to update dictionary if depth of editing_dict is more than 1.
    Mapping values from ``editing_dict`` are merged into the corresponding nested dictionaries of
    ``editable_dict``; all other values overwrite the existing ones.

    Args:
        editable_dict: Dictionary to edit (modified in place).
        editing_dict: Dictionary containing edits.

    """
    for key, value in editing_dict.items():
        # ``collections.Mapping`` was removed in Python 3.10, so rely on the ``Mapping`` name
        # already used for the signature annotation.
        if isinstance(value, Mapping):
            nested = editable_dict.get(key)
            if not isinstance(nested, dict):
                # The key is missing (or holds a non-dict value): attach a fresh dict so the
                # nested edits are stored instead of being written into a throwaway object.
                nested = editable_dict[key] = {}
            update_dict_recursive(nested, value)
        else:
            editable_dict[key] = value


def path_set_md5(url: str) -> str:
    """Given a file URL, build the URL of its md5 file.

    The ``.md5`` suffix is appended to the path component only; query string and
    fragment are kept as they are.

    Args:
        url: A given URL.

    Returns:
        URL of the md5 file.

    """
    parts = urlsplit(url)
    md5_path = parts.path + '.md5'
    return urlunsplit((parts.scheme, parts.netloc, md5_path, parts.query, parts.fragment))


def set_query_parameter(url: str, param_name: str, param_value: str) -> str:
    """Given a URL, set or replace a query parameter and return the modified URL.

    Args:
        url: A given URL.
        param_name: The parameter name to add.
        param_value: The parameter value.

    Returns:
        URL whose query string carries ``param_name=param_value``.

    """
    parts = urlsplit(url)

    params = parse_qs(parts.query)
    params[param_name] = [param_value]  # any previous values of the parameter are dropped
    rebuilt_query = urlencode(params, doseq=True)

    return urlunsplit((parts.scheme, parts.netloc, parts.path, rebuilt_query, parts.fragment))


================================================
FILE: deeppavlov/core/models/__init__.py
================================================


================================================
FILE: deeppavlov/core/models/component.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABCMeta, abstractmethod

from logging import getLogger

log = getLogger(__name__)


class Component(metaclass=ABCMeta):
    """Abstract class for all callables that could be used in Chainer's pipe."""

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """Run the component; every concrete subclass has to implement this."""

    def reset(self):
        """Hook that does nothing by default; subclasses may override it."""
        return None

    def destroy(self):
        """Release everything stored on the instance.

        Every instance attribute that itself has a ``destroy`` attribute is destroyed first,
        then the attribute is removed from the instance.
        """
        # Iterate over a snapshot of the names because attributes are deleted inside the loop.
        for name in tuple(vars(self)):
            member = getattr(self, name)
            if hasattr(member, 'destroy'):
                member.destroy()
            delattr(self, name)


================================================
FILE: deeppavlov/core/models/estimator.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod

from .component import Component
from .serializable import Serializable


class Estimator(Component, Serializable):
    """Abstract class for components that could be fitted on the data as a whole."""

    @abstractmethod
    def fit(self, *args, **kwargs):
        """Fit the component; every concrete subclass has to implement this."""


================================================
FILE: deeppavlov/core/models/nn_model.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod

from .component import Component
from .serializable import Serializable


class NNModel(Component, Serializable):
    """Abstract class for deep learning components."""

    @abstractmethod
    def train_on_batch(self, x: list, y: list):
        """Train on a single batch; every concrete subclass has to implement this."""

    def process_event(self, event_name, data):
        """Hook for named events with their data; does nothing unless overridden."""
        return None


================================================
FILE: deeppavlov/core/models/serializable.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABCMeta, abstractmethod
from logging import getLogger
from pathlib import Path
from typing import Union, Optional

from deeppavlov.core.commands.utils import expand_path

log = getLogger(__name__)


class Serializable(metaclass=ABCMeta):
    """Abstract base class that expresses the interface for all models that can serialize data to a path."""

    def __init__(self, save_path: Optional[Union[str, Path]], load_path: Optional[Union[str, Path]] = None,
                 mode: str = 'infer',
                 *args, **kwargs) -> None:
        """Resolve ``save_path`` and ``load_path`` and store them on the instance.

        Args:
            save_path: Where the model is saved; its parent directory is created when missing.
                A falsy value leaves ``self.save_path`` set to None.
            load_path: Where the model is loaded from. When falsy, the save path is used instead
                in every mode except ``'train'`` (if a save path is available).
            mode: Mode the component is built in; only the comparison with ``'train'`` matters here.
        """
        owner_name = self.__class__.__name__
        not_training = mode != 'train'

        if not save_path:
            self.save_path = None
        else:
            self.save_path = expand_path(save_path)
            self.save_path.parent.mkdir(parents=True, exist_ok=True)

        if load_path:
            self.load_path = expand_path(load_path)
            # Outside of training, loading from somewhere other than the save location is suspicious.
            if not_training and self.save_path and self.load_path != self.save_path:
                mismatch_message = "Load path '{}' differs from save path '{}' in '{}' mode for {}."
                log.warning(mismatch_message.format(self.load_path, self.save_path, mode, owner_name))
        elif not_training and self.save_path:
            self.load_path = self.save_path
            fallback_message = "No load path is set for {} in '{}' mode. Using save path instead"
            log.warning(fallback_message.format(owner_name, mode))
        else:
            self.load_path = None
            log.warning("No load path is set for {}!".format(owner_name))

    @abstractmethod
    def save(self, *args, **kwargs):
        """Serialize the model; every concrete subclass has to implement this."""

    @abstractmethod
    def load(self, *args, **kwargs):
        """Restore the model; every concrete subclass has to implement this."""


================================================
FILE: deeppavlov/core/models/torch_model.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod
from logging import getLogger
from pathlib import Path
from typing import Optional, Union

import torch

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.models.nn_model import NNModel

log = getLogger(__name__)


class TorchModel(NNModel):
    """Class implements torch model's main methods.

    Args:
        model: torch.nn.Model-based neural network model
        device: device to use
        optimizer: name of `torch.optim` optimizer
        optimizer_parameters: dictionary with optimizer parameters
        learning_rate_drop_patience: how many validations with no improvements to wait
        learning_rate_drop_div: the divider of the learning rate after `learning_rate_drop_patience` unsuccessful
            validations
        load_before_drop: whether to load best model before dropping learning rate or not
        min_learning_rate: min value of learning rate if learning rate decay is used
        clip_norm: clip gradients by norm coefficient
        args:
        kwargs: dictionary with other model parameters

    Attributes:
        device: `cpu` or `cuda` device to use
        opt: dictionary with all model parameters
        model: torch model
        epochs_done: number of epochs that were done
        optimizer: `torch.optim` instance
        learning_rate_drop_patience: how many validations with no improvements to wait
        learning_rate_drop_div: the divider of the learning rate after `learning_rate_drop_patience` unsuccessful
            validations
        load_before_drop: whether to load best model before dropping learning rate or not
        min_learning_rate: min value of learning rate if learning rate decay is used
        clip_norm: clip gradients by norm coefficient
    """

    def __init__(self, model: torch.nn.Module,
                 device: Union[torch.device, str] = "cuda",
                 optimizer: str = "AdamW",
                 optimizer_parameters: Optional[dict] = None,
                 learning_rate_drop_patience: Optional[int] = None,
                 learning_rate_drop_div: Optional[float] = None,
                 load_before_drop: bool = True,
                 min_learning_rate: float = 1e-07,
                 clip_norm: Optional[float] = None,
                 *args, **kwargs):

        super().__init__(*args, **kwargs)
        self.model = model
        self.device = self._init_device(device)
        self.model.to(self.device)
        # wrap the model for multi-GPU execution when more than one CUDA device is visible
        if self.device.type == "cuda" and torch.cuda.device_count() > 1:
            self.model = torch.nn.DataParallel(self.model)
        if optimizer_parameters is None:
            optimizer_parameters = {"lr": 0.01}
        self.optimizer = getattr(torch.optim, optimizer)(self.model.parameters(), **optimizer_parameters)
        self.epochs_done = 0
        self.learning_rate_drop_patience = learning_rate_drop_patience
        self.learning_rate_drop_div = learning_rate_drop_div
        self.load_before_drop = load_before_drop
        self.min_learning_rate = min_learning_rate
        self.clip_norm = clip_norm
        self.load()
        # we need to switch to eval mode here because by default it's in `train` mode.
        # But in case of `interact/build_model` usage, we need to have model in eval mode.
        self.model.eval()
        log.debug(f"Model was successfully initialized! Model summary:\n {self.model}")

    def _init_device(self, device: Union[torch.device, str]) -> torch.device:
        """Convert ``device`` to ``torch.device`` (``"gpu"`` is an alias of ``"cuda"``), falling back to CPU
        with a warning when CUDA is requested but not available."""
        if device == "gpu":
            device = "cuda"
        if isinstance(device, str):
            device = torch.device(device)
        if device.type == "cuda" and not torch.cuda.is_available():
            log.warning(f"Unable to place component {self.__class__.__name__} on GPU, "
                        "since no CUDA GPUs are available. Using CPU.")
            device = torch.device('cpu')
        return device

    @property
    def is_data_parallel(self) -> bool:
        """Whether ``self.model`` is wrapped into ``torch.nn.DataParallel``."""
        return isinstance(self.model, torch.nn.DataParallel)

    def load(self, fname: Optional[Union[str, Path]] = None, *args, **kwargs) -> None:
        """Load model from `fname` (if `fname` is not given, use `self.load_path`) to `self.model` along with
            the optimizer `self.optimizer`.
            If `fname` (if `fname` is not given, use `self.load_path`) does not exist, initialize model from scratch.

        Args:
            fname: path to checkpoint (string or ``Path``); the actual file is this path with its suffix
                replaced by ``.pth.tar``
            *args:
            **kwargs:

        Returns:
            None

        Raises:
            ConfigError: if the parent directory of the load path does not exist.
        """
        if fname is not None:
            self.load_path = fname

        if self.load_path:
            log.debug(f"Load path {self.load_path} is given.")
            # the load path may be a plain string (e.g. passed as `fname`), so normalize it
            # before using `Path` methods
            load_path = Path(self.load_path)
            if not load_path.parent.is_dir():
                raise ConfigError("Provided load path is incorrect!")

            weights_path = load_path.resolve().with_suffix(".pth.tar")
            if weights_path.exists():
                log.debug(f"Load path {weights_path} exists.")
                log.debug(f"Initializing `{self.__class__.__name__}` from saved.")

                # now load the weights, optimizer from saved
                log.debug(f"Loading weights from {weights_path}.")
                checkpoint = torch.load(weights_path, map_location=self.device)
                model_state = checkpoint["model_state_dict"]
                optimizer_state = checkpoint["optimizer_state_dict"]
                # load a multi-gpu model on a single device
                if all(key.startswith("module.") for key in model_state):
                    model_state = {key.replace("module.", "", 1): val for key, val in model_state.items()}

                if self.is_data_parallel:
                    self.model.module.load_state_dict(model_state)
                else:
                    self.model.load_state_dict(model_state)
                try:  # TODO: remove this try-except after hf models deep update
                    self.optimizer.load_state_dict(optimizer_state)
                except ValueError as e:
                    log.error(f'Failed to load optimizer state due to {repr(e)}')
                self.epochs_done = checkpoint.get("epochs_done", 0)
            else:
                log.warning(f"Init from scratch. Load path {weights_path} does not exist.")
        else:
            log.warning(f"Init from scratch. Load path {self.load_path} is not provided.")
        self.model.to(self.device)

    def save(self, fname: Optional[Union[str, Path]] = None, *args, **kwargs) -> None:
        """Save torch model to `fname` (if `fname` is not given, use `self.save_path`). Checkpoint includes
            `model_state_dict`, `optimizer_state_dict`, and `epochs_done` (number of training epochs).

        Args:
            fname: path to checkpoint (string or ``Path``); the actual file is this path with its suffix
                replaced by ``.pth.tar``
            *args:
            **kwargs:

        Returns:
            None

        Raises:
            ConfigError: if no save path is available or its parent directory does not exist.
        """
        if fname is None:
            fname = self.save_path

        # `fname` may be a plain string or missing altogether; both used to fail with AttributeError
        if fname is None or not Path(fname).parent.is_dir():
            raise ConfigError("Provided save path is incorrect!")

        weights_path = Path(fname).with_suffix(".pth.tar")
        log.info(f"Saving model to {weights_path}.")
        # move the model to `cpu` before saving to provide consistency
        if self.is_data_parallel:
            model_state_dict = self.model.module.cpu().state_dict()
        else:
            model_state_dict = self.model.cpu().state_dict()
        torch.save({
            "model_state_dict": model_state_dict,
            "optimizer_state_dict": self.optimizer.state_dict(),
            "epochs_done": self.epochs_done
        }, weights_path)
        # return it back to device (necessary if it was on `cuda`)
        self.model.to(self.device)

    def process_event(self, event_name: str, data: dict) -> None:
        """Process event. After epoch, increase `self.epochs_done`. After validation, decrease learning rate in
            `self.learning_rate_drop_div` times (not lower than `self.min_learning_rate`)
            if given `self.learning_rate_drop_patience`.

        Args:
            event_name: name of the event. Events handled here: ``"after_epoch", "after_validation"``;
                any other event is ignored
            data: event data (dictionary); for ``"after_validation"`` the ``'impatience'`` value is used
        Returns:
            None
        """
        if event_name == "after_epoch":
            self.epochs_done += 1

        if event_name == "after_validation" and 'impatience' in data and self.learning_rate_drop_patience:
            if data['impatience'] == self.learning_rate_drop_patience:
                log.info(f"----------Current LR is decreased in {self.learning_rate_drop_div} times----------")
                if self.load_before_drop:
                    # restore the saved checkpoint before continuing with a smaller learning rate
                    self.load(self.save_path)
                    self.model.eval()
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = max(param_group['lr'] / self.learning_rate_drop_div, self.min_learning_rate)

    @abstractmethod
    def train_on_batch(self, x: list, y: list):
        pass

    def _make_step(self, loss: torch.Tensor) -> None:
        """Backpropagate ``loss``, clip gradients by norm if ``self.clip_norm`` is set and do an optimizer step."""
        loss.backward()
        if self.clip_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm)
        self.optimizer.step()


================================================
FILE: deeppavlov/core/trainers/__init__.py
================================================
# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .fit_trainer import FitTrainer
from .nn_trainer import NNTrainer
from .torch_trainer import TorchTrainer


================================================
FILE: deeppavlov/core/trainers/fit_trainer.py
================================================
# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import json
import time
from itertools import islice
from logging import getLogger
from typing import Tuple, Dict, Union, Optional, Iterable, Any, Collection

from tqdm import tqdm

from deeppavlov.core.commands.infer import build_model
from deeppavlov.core.common.chainer import Chainer
from deeppavlov.core.common.params import from_params
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_fitting_iterator import DataFittingIterator
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
from deeppavlov.core.models.estimator import Estimator
from deeppavlov.core.trainers.utils import Metric, parse_metrics, prettify_metrics, NumpyArrayEncoder

# Module-level logger for diagnostics
log = getLogger(__name__)
# Separate 'train_report' logger that receives JSON-encoded evaluation reports
report_log = getLogger('train_report')


@register('fit_trainer')
class FitTrainer:
    """
    Trainer class for fitting and evaluating :class:`Estimators <deeppavlov.core.models.estimator.Estimator>`

    Args:
        chainer_config: ``"chainer"`` block of a configuration file
        batch_size: batch_size to use for partial fitting (if available) and evaluation,
            the whole dataset is used if ``batch_size`` is negative or zero (default is ``-1``)
        metrics: iterable of metrics where each metric can be a registered metric name or a dict of ``name`` and
            ``inputs`` where ``name`` is a registered metric name and ``inputs`` is a collection of parameter names
            from chainer’s inner memory that will be passed to the metric function;
            default value for ``inputs`` parameter is a concatenation of chainer’s ``in_y`` and ``out`` fields
            (default is ``('accuracy',)``)
        evaluation_targets: data types on which to evaluate trained pipeline (default is ``('valid', 'test')``)
        show_examples: a flag used to print inputs, expected outputs and predicted outputs for the last batch
            in evaluation logs (default is ``False``)
        max_test_batches: maximum batches count for pipeline testing and evaluation, ignored if negative
            (default is ``-1``)
        **kwargs: additional parameters whose names will be logged but otherwise ignored
    """

    def __init__(self, chainer_config: dict, *, batch_size: int = -1,
                 metrics: Iterable[Union[str, dict]] = ('accuracy',),
                 evaluation_targets: Iterable[str] = ('valid', 'test'),
                 show_examples: bool = False,
                 max_test_batches: int = -1,
                 **kwargs) -> None:
        if kwargs:
            log.warning(f'{self.__class__.__name__} got additional init parameters {list(kwargs)} that will be ignored:')
        self.chainer_config = chainer_config
        # Pipeline skeleton; components are appended to it in ``fit_chainer``
        self._chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))
        self.batch_size = batch_size
        self.metrics = parse_metrics(metrics, self._chainer.in_y, self._chainer.out_params)
        self.evaluation_targets = tuple(evaluation_targets)
        self.show_examples = show_examples
        # ``None`` makes ``islice`` in ``test`` consume every batch
        self.max_test_batches = None if max_test_batches < 0 else max_test_batches
        # State flags: chainer was assembled / pipeline was trained / chainer was rebuilt for inference
        self._built = False
        self._saved = False
        self._loaded = False

    def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator]) -> None:
        """
        Build the pipeline :class:`~deeppavlov.core.common.chainer.Chainer` and successively fit
        :class:`Estimator <deeppavlov.core.models.estimator.Estimator>` components using a provided data iterator

        Components whose config has a ``fit_on`` key are fitted on the outputs of the components appended so far
        and saved right after fitting. ``partial_fit`` is used batch by batch when ``batch_size`` is positive
        and the component provides it, otherwise ``fit`` is called once on all instances.

        Args:
            iterator: data iterator providing ``gen_batches`` and ``get_instances``

        Raises:
            RuntimeError: if the chainer has already been built by a previous call
        """
        if self._built:
            raise RuntimeError('Cannot fit already built chainer')
        for component_index, component_config in enumerate(self.chainer_config['pipe'], 1):
            component = from_params(component_config, mode='train')
            if 'fit_on' in component_config:
                component: Estimator

                targets = component_config['fit_on']
                if isinstance(targets, str):
                    targets = [targets]

                if self.batch_size > 0 and callable(getattr(component, 'partial_fit', None)):
                    for x, y in tqdm(iterator.gen_batches(self.batch_size, shuffle=False)):
                        preprocessed = self._chainer.compute(x, y, targets=targets)
                        # A single target comes back bare (see the ``fit`` branch below and ``test``),
                        # so wrap it to keep ``*preprocessed`` from spreading the batch items
                        if len(targets) == 1:
                            preprocessed = [preprocessed]
                        # noinspection PyUnresolvedReferences
                        component.partial_fit(*preprocessed)
                else:
                    preprocessed = self._chainer.compute(*iterator.get_instances(), targets=targets)
                    if len(targets) == 1:
                        preprocessed = [preprocessed]
                    component.fit(*preprocessed)

                component.save()

            # Only components that declare inputs become part of the chainer
            if 'in' in component_config:
                c_in = component_config['in']
                c_out = component_config['out']
                in_y = component_config.get('in_y', None)
                main = component_config.get('main', False)
                self._chainer.append(component, c_in, c_out, in_y, main)
        self._built = True

    def _load(self) -> None:
        """Destroy the current chainer and rebuild it with ``build_model``; does nothing after the first call.

        Trained weights are requested (``load_trained=True``) only if :meth:`train` has completed.
        """
        if not self._loaded:
            self._chainer.destroy()
            self._chainer = build_model({'chainer': self.chainer_config}, load_trained=self._saved)
            self._loaded = True

    def get_chainer(self) -> Chainer:
        """Returns a :class:`~deeppavlov.core.common.chainer.Chainer` built from ``self.chainer_config`` for inference"""
        self._load()
        return self._chainer

    def train(self, iterator: Union[DataFittingIterator, DataLearningIterator]) -> None:
        """Calls :meth:`~fit_chainer` with provided data iterator as an argument"""
        self.fit_chainer(iterator)
        self._saved = True

    def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]],
             metrics: Optional[Collection[Metric]] = None, *,
             start_time: Optional[float] = None, show_examples: Optional[bool] = None) -> dict:
        """
        Calculate metrics and return reports on provided data for currently stored
        :class:`~deeppavlov.core.common.chainer.Chainer`

        Args:
            data: iterable of batches of inputs and expected outputs
            metrics: collection of metrics namedtuples containing names for report, metric functions
                and their inputs names (if omitted, ``self.metrics`` is used)
            start_time: start time for test report
            show_examples: a flag used to return inputs, expected outputs and predicted outputs for the last batch
                in a result report (if omitted, ``self.show_examples`` is used)

        Returns:
            a report dict containing calculated metrics, spent time value, examples count in tested data
            and maybe examples; for empty ``data`` the ``'metrics'`` value is ``None``
        """

        if start_time is None:
            start_time = time.time()
        if show_examples is None:
            show_examples = self.show_examples
        if metrics is None:
            metrics = self.metrics

        # Everything the chainer outputs plus everything any metric needs, without duplicates
        expected_outputs = list(set().union(self._chainer.out_params, *[m.inputs for m in metrics]))

        outputs = {out: [] for out in expected_outputs}
        examples = 0

        data = islice(data, self.max_test_batches)

        for x, y_true in tqdm(data):
            examples += len(x)
            y_predicted = list(self._chainer.compute(list(x), list(y_true), targets=expected_outputs))
            # A single requested target is returned bare, so wrap it to iterate per target below
            if len(expected_outputs) == 1:
                y_predicted = [y_predicted]
            # ``outputs`` keeps the key order of ``expected_outputs``, so values line up with the targets
            for out, val in zip(outputs.values(), y_predicted):
                out += list(val)
        if examples == 0:
            log.warning('Got empty data iterable for scoring')
            return {'eval_examples_count': 0, 'metrics': None, 'time_spent': str(datetime.timedelta(seconds=0))}

        metrics_values = []
        for metric in metrics:
            calculate_metric = True
            for i in metric.inputs:
                # ``None`` entries are dropped before the metric sees the values
                outputs[i] = [k for k in outputs[i] if k is not None]
                if len(outputs[i]) == 0:
                    log.info(f'Metric {metric.alias} is not calculated due to absense of true and predicted samples')
                    calculate_metric = False
                    # -1 is reported in place of a metric that could not be calculated
                    value = -1
            if calculate_metric:
                value = metric.fn(*[outputs[i] for i in metric.inputs])
            metrics_values.append((metric.alias, value))

        report = {
            'eval_examples_count': examples,
            'metrics': prettify_metrics(metrics_values),
            'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
        }

        if show_examples:
            # ``x``, ``y_true`` and ``y_predicted`` still hold the last processed batch here
            y_predicted = zip(*[y_predicted_group
                                for out_name, y_predicted_group in zip(expected_outputs, y_predicted)
                                if out_name in self._chainer.out_params])
            if len(self._chainer.out_params) == 1:
                y_predicted = [y_predicted_item[0] for y_predicted_item in y_predicted]
            report['examples'] = [{
                'x': x_item,
                'y_predicted': y_predicted_item,
                'y_true': y_true_item
            } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]

        return report

    def evaluate(self, iterator: DataLearningIterator,
                 evaluation_targets: Optional[Iterable[str]] = None) -> Dict[str, dict]:
        """
        Run :meth:`test` on multiple data types using provided data iterator

        Args:
            iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation
            evaluation_targets: iterable of data types to evaluate on

        Returns:
            a dictionary with data types as keys and evaluation reports as values
        """
        self._load()
        if evaluation_targets is None:
            evaluation_targets = self.evaluation_targets

        res = {}

        for data_type in evaluation_targets:
            data_gen = iterator.gen_batches(self.batch_size, data_type=data_type, shuffle=False)
            report = self.test(data_gen)
            res[data_type] = report
            report_log.info(json.dumps({data_type: report}, ensure_ascii=False, cls=NumpyArrayEncoder))

        return res


================================================
FILE: deeppavlov/core/trainers/nn_trainer.py
================================================
# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import json
import time
from itertools import islice
from logging import getLogger
from pathlib import Path
from typing import List, Tuple, Union, Optional, Iterable

from tqdm import tqdm

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.log_events import get_tb_writer
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
from deeppavlov.core.trainers.fit_trainer import FitTrainer
from deeppavlov.core.trainers.utils import parse_metrics, NumpyArrayEncoder

# Module-level logger for training progress messages
log = getLogger(__name__)
# Separate 'train_report' logger that receives JSON-encoded train and validation reports
report_log = getLogger('train_report')


@register('nn_trainer')
class NNTrainer(FitTrainer):
    """
    | Bases :class:`~deeppavlov.core.trainers.FitTrainer`
    | Trainer class for training and evaluating pipelines containing
      :class:`Estimators <deeppavlov.core.models.estimator.Estimator>`
      and an :class:`~deeppavlov.core.models.nn_model.NNModel`

    Args:
        chainer_config: ``"chainer"`` block of a configuration file
        batch_size: batch_size to use for partial fitting (if available) and evaluation,
            the whole dataset is used if ``batch_size`` is negative or zero (default is ``1``)
        epochs: maximum epochs number to train the pipeline, ignored if negative or zero (default is ``-1``)
        start_epoch_num: starting epoch number for reports (default is ``0``)
        max_batches: maximum batches number to train the pipeline, ignored if negative or zero (default is ``-1``)
        metrics: iterable of metrics where each metric can be a registered metric name or a dict of ``name`` and
            ``inputs`` where ``name`` is a registered metric name and ``inputs`` is a collection of parameter names
            from chainer’s inner memory that will be passed to the metric function;
            default value for ``inputs`` parameter is a concatenation of chainer’s ``in_y`` and ``out`` fields;
            the first metric is used for early stopping (default is ``('accuracy',)``)
        train_metrics: metrics calculated for train logs (if omitted, ``metrics`` argument is used)
        metric_optimization: one of ``'maximize'`` or ``'minimize'`` — strategy for metric optimization used in early
            stopping (default is ``'maximize'``)
        evaluation_targets: data types on which to evaluate a trained pipeline (default is ``('valid', 'test')``)
        show_examples: a flag used to print inputs, expected outputs and predicted outputs for the last batch
            in evaluation logs (default is ``False``)
        tensorboard_log_dir: path to a directory where tensorboard logs can be stored, ignored if None
            (default is ``None``)
        validate_first: flag used to calculate metrics on the ``'valid'`` data type before starting training
            (default is ``True``)
        validation_patience: how many times in a row the validation metric has to not improve for early stopping,
            ignored if negative or zero (default is ``5``)
        val_every_n_epochs: how often (in epochs) to validate the pipeline, ignored if negative or zero
            (default is ``-1``)
        val_every_n_batches: how often (in batches) to validate the pipeline, ignored if negative or zero
            (default is ``-1``)
        log_every_n_epochs: how often (in epochs) to calculate metrics on train data, ignored if negative or zero
            (default is ``-1``)
        log_every_n_batches: how often (in batches) to calculate metrics on train data, ignored if negative or zero
            (default is ``-1``)
        log_on_k_batches: count of random train batches to calculate metrics in log (default is ``1``)
        max_test_batches: maximum batches count for pipeline testing and evaluation, overrides ``log_on_k_batches``,
            ignored if negative (default is ``-1``)
        **kwargs: additional parameters whose names will be logged but otherwise ignored


    Trainer saves the model if it sees progress in scores. The full rules look like following:

    - For the validation savepoint:
        * 0-th validation (optional). Don't save model, establish a baseline.
        * 1st validation.
             + If we have a baseline, save the model if we see an improvement, don't save otherwise.
             + If we don't have a baseline, save the model.
        * 2nd and later validations. Save the model if we see an improvement
    - For the at-train-exit savepoint:
        * Save the model if it happened before 1st validation (to capture early training results), don't save otherwise.

    """

    def __init__(self, chainer_config: dict, *,
                 batch_size: int = 1,
                 epochs: int = -1,
                 start_epoch_num: int = 0,
                 max_batches: int = -1,
                 metrics: Iterable[Union[str, dict]] = ('accuracy',),
                 train_metrics: Optional[Iterable[Union[str, dict]]] = None,
                 metric_optimization: str = 'maximize',
                 evaluation_targets: Iterable[str] = ('valid', 'test'),
                 show_examples: bool = False,
                 tensorboard_log_dir: Optional[Union[str, Path]] = None,
                 max_test_batches: int = -1,
                 validate_first: bool = True,
                 validation_patience: int = 5, val_every_n_epochs: int = -1, val_every_n_batches: int = -1,
                 log_every_n_batches: int = -1, log_every_n_epochs: int = -1, log_on_k_batches: int = 1,
                 **kwargs) -> None:
        super().__init__(chainer_config, batch_size=batch_size, metrics=metrics, evaluation_targets=evaluation_targets,
                         show_examples=show_examples, max_test_batches=max_test_batches, **kwargs)
        if train_metrics is None:
            self.train_metrics = self.metrics
        else:
            self.train_metrics = parse_metrics(train_metrics, self._chainer.in_y, self._chainer.out_params)

        metric_optimization = metric_optimization.strip().lower()
        self.score_best = None

        def _improved(op):
            # A missing score or a missing baseline never counts as an improvement
            return lambda score, baseline: False if baseline is None or score is None \
                else op(score, baseline)

        if metric_optimization == 'maximize':
            self.improved = _improved(lambda a, b: a > b)
        elif metric_optimization == 'minimize':
            self.improved = _improved(lambda a, b: a < b)
        else:
            raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize']))

        self.validate_first = validate_first
        # Validation 0 is the optional baseline run; numbering starts from 1 when it is skipped
        self.validation_number = 0 if validate_first else 1
        self.validation_patience = validation_patience
        self.val_every_n_epochs = val_every_n_epochs
        self.val_every_n_batches = val_every_n_batches
        self.log_every_n_epochs = log_every_n_epochs
        self.log_every_n_batches = log_every_n_batches
        # ``None`` makes ``islice`` in ``_log`` take every train batch
        self.log_on_k_batches = log_on_k_batches if log_on_k_batches >= 0 else None

        self.max_epochs = epochs
        self.epoch = start_epoch_num
        self.max_batches = max_batches

        self.train_batches_seen = 0
        self.examples = 0
        self.patience = 0
        self.last_result = {}
        self.losses = []
        self.start_time: Optional[float] = None
        self.tb_writer = get_tb_writer(tensorboard_log_dir)

    def save(self) -> None:
        """Save the current chainer.

        Raises:
            RuntimeError: if the chainer has already been rebuilt for inference by ``_load``
        """
        if self._loaded:
            raise RuntimeError('Cannot save already finalized chainer')

        self._chainer.save()

    def _is_initial_validation(self):
        """Return ``True`` for the baseline validation (number 0)."""
        return self.validation_number == 0

    def _is_first_validation(self):
        """Return ``True`` for validation number 1."""
        return self.validation_number == 1

    def _validate(self, iterator: DataLearningIterator,
                  tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None:
        """Score the pipeline on ``'valid'`` data, update patience and the best score, save the model on progress.

        The first metric of the report drives both early stopping and saving. ``before_validation`` and
        ``after_validation`` events are sent to the chainer, and the report is written to the report log.

        Args:
            iterator: data iterator to draw ``'valid'`` batches from
            tensorboard_tag: tag prefix for tensorboard scalars; nothing is written to tensorboard if ``None``
            tensorboard_index: tensorboard global step (``self.train_batches_seen`` is used if ``None``)
        """
        self._send_event(event_name='before_validation')
        report = self.test(iterator.gen_batches(self.batch_size, data_type='valid', shuffle=False),
                           start_time=self.start_time)

        report['epochs_done'] = self.epoch
        report['batches_seen'] = self.train_batches_seen
        report['train_examples_seen'] = self.examples

        metrics = list(report['metrics'].items())

        if tensorboard_tag is not None and self.tb_writer is not None:
            if tensorboard_index is None:
                tensorboard_index = self.train_batches_seen
            for name, score in metrics:
                self.tb_writer.write_valid(tag=f'{tensorboard_tag}/{name}', scalar_value=score,
                                           global_step=tensorboard_index)
            self.tb_writer.flush()

        m_name, score = metrics[0]

        # Update the patience
        if self.score_best is None:
            self.patience = 0
        else:
            if self.improved(score, self.score_best):
                self.patience = 0
            else:
                self.patience += 1

        # Run the validation model-saving logic
        if self._is_initial_validation():
            log.info('Initial best {} of {}'.format(m_name, score))
            self.score_best = score
        elif self._is_first_validation() and self.score_best is None:
            log.info('First best {} of {}'.format(m_name, score))
            self.score_best = score
            log.info('Saving model')
            self.save()
        elif self.improved(score, self.score_best):
            log.info(f'Improved best {m_name} from {self.score_best} to {score}')
            self.score_best = score
            log.info('Saving model')
            self.save()
        else:
            log.info('Did not improve on the {} of {}'.format(m_name, self.score_best))

        report['impatience'] = self.patience
        if self.validation_patience > 0:
            report['patience_limit'] = self.validation_patience

        self._send_event(event_name='after_validation', data=report)
        report = {'valid': report}
        report_log.info(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder))
        self.validation_number += 1

    def _log(self, iterator: DataLearningIterator,
             tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None:
        """Report training progress: metrics on a few train batches, the last batch results and the mean loss.

        Sends ``before_log`` and ``after_train_log`` events to the chainer, writes scalars to tensorboard
        when a writer is configured and writes the report to the report log. Accumulated losses are cleared.

        Args:
            iterator: data iterator to draw shuffled ``'train'`` batches from
            tensorboard_tag: tag prefix for tensorboard scalars
            tensorboard_index: tensorboard global step
        """
        self._send_event(event_name='before_log')
        if self.log_on_k_batches == 0:
            report = {
                'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5)))
            }
        else:
            data = islice(iterator.gen_batches(self.batch_size, data_type='train', shuffle=True),
                          self.log_on_k_batches)
            report = self.test(data, self.train_metrics, start_time=self.start_time)

        report.update({
            'epochs_done': self.epoch,
            'batches_seen': self.train_batches_seen,
            'train_examples_seen': self.examples
        })

        # ``test`` reports ``'metrics': None`` when it received no batches, and the key is absent when
        # scoring was skipped above, so fall back to an empty mapping in both cases
        scored = report.get('metrics') or {}
        metrics: List[Tuple[str, float]] = list(scored.items()) + list(self.last_result.items())

        report.update(self.last_result)
        if self.losses:
            report['loss'] = sum(self.losses) / len(self.losses)
            self.losses.clear()
            metrics.append(('loss', report['loss']))

        if metrics and self.tb_writer is not None:
            for name, score in metrics:
                self.tb_writer.write_train(tag=f'{tensorboard_tag}/{name}', scalar_value=score,
                                           global_step=tensorboard_index)
            self.tb_writer.flush()

        self._send_event(event_name='after_train_log', data=report)

        report = {'train': report}
        report_log.info(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder))

    def _send_event(self, event_name: str, data: Optional[dict] = None) -> None:
        """Pass an event with the current training counters to the chainer.

        Args:
            event_name: name of the event
            data: extra values that are merged over the default counters
        """
        report = {
            'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5))),
            'epochs_done': self.epoch,
            'batches_seen': self.train_batches_seen,
            'train_examples_seen': self.examples
        }
        if data is not None:
            report.update(data)
        self._chainer.process_event(event_name=event_name, data=report)

    def train_on_batches(self, iterator: DataLearningIterator) -> None:
        """Train pipeline on batches using provided data iterator and initialization parameters"""
        self.start_time = time.time()
        if self.validate_first:
            self._validate(iterator)

        while True:
            # Set when the batch loop was left because a batch limit or the patience limit was reached
            impatient = False
            self._send_event(event_name='before_train')
            for x, y_true in tqdm(iterator.gen_batches(self.batch_size, data_type='train')):
                self.last_result = self._chainer.train_on_batch(x, y_true)
                # Normalize the result to a dict; a bare value is treated as the loss
                if self.last_result is None:
                    self.last_result = {}
                elif not isinstance(self.last_result, dict):
                    self.last_result = {'loss': self.last_result}
                if 'loss' in self.last_result:
                    self.losses.append(self.last_result.pop('loss'))

                self.train_batches_seen += 1
                self.examples += len(x)

                if self.log_every_n_batches > 0 and self.train_batches_seen % self.log_every_n_batches == 0:
                    self._log(iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen)

                if self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0:
                    self._validate(iterator,
                                   tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen)

                self._send_event(event_name='after_batch')

                if 0 < self.max_batches <= self.train_batches_seen:
                    impatient = True
                    break

                if 0 < self.validation_patience <= self.patience:
                    log.info('Ran out of patience')
                    impatient = True
                    break

            if impatient:
                break

            self.epoch += 1

            if self.log_every_n_epochs > 0 and self.epoch % self.log_every_n_epochs == 0:
                self._log(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch)

            if self.val_every_n_epochs > 0 and self.epoch % self.val_every_n_epochs == 0:
                self._validate(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch)

            self._send_event(event_name='after_epoch')

            if 0 < self.max_epochs <= self.epoch:
                break

            if 0 < self.validation_patience <= self.patience:
                log.info('Ran out of patience')
                break

    def train(self, iterator: DataLearningIterator) -> None:
        """Call :meth:`~fit_chainer` and then :meth:`~train_on_batches` with provided data iterator as an argument"""
        self.fit_chainer(iterator)
        if callable(getattr(self._chainer, 'train_on_batch', None)):
            try:
                self.train_on_batches(iterator)
            except KeyboardInterrupt:
                log.info('Stopped training')
        else:
            log.warning(f'Using {self.__class__.__name__} for a pipeline without batched training')

        # Run the at-train-exit model-saving logic
        if self.validation_number < 1:
            log.info('Save model to capture early training results')
            self.save()


================================================
FILE: deeppavlov/core/trainers/torch_trainer.py
================================================
# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import Tuple, Optional, Iterable, Collection, Any

from deeppavlov.core.trainers.utils import Metric
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
from deeppavlov.core.trainers.nn_trainer import NNTrainer

# Module-level logger
log = getLogger(__name__)


@register('torch_trainer')
class TorchTrainer(NNTrainer):
    """
    | Bases :class:`~deeppavlov.core.trainers.NNTrainer`
    | Trainer that switches the ``model`` of the chainer's main component between evaluation and training
      modes around testing and training.
    """

    def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]],
             metrics: Optional[Collection[Metric]] = None, *,
             start_time: Optional[float] = None, show_examples: Optional[bool] = None) -> dict:
        """
        Run :meth:`NNTrainer.test` with the main component's model in evaluation mode

        The model is put back into training mode afterwards if it was in training mode before the call,
        also when testing fails with an exception. Arguments and the returned report are those of the
        parent method.
        """
        model = self._chainer.get_main_component().model
        # Assume training mode if the model does not expose the flag, which matches unconditional ``train()``
        was_training = getattr(model, 'training', True)
        model.eval()
        try:
            return super(TorchTrainer, self).test(data=data, metrics=metrics, start_time=start_time,
                                                  show_examples=show_examples)
        finally:
            if was_training:
                # Look the model up again in case the component replaced its ``model`` attribute meanwhile
                self._chainer.get_main_component().model.train()

    def train_on_batches(self, iterator: DataLearningIterator) -> None:
        """
        Run :meth:`NNTrainer.train_on_batches` with the main component's model in training mode

        The model is switched to evaluation mode when training ends, including when it is interrupted
        by an exception such as ``KeyboardInterrupt``.
        """
        self._chainer.get_main_component().model.train()
        try:
            super(TorchTrainer, self).train_on_batches(iterator=iterator)
        finally:
            # Look the model up again in case the component replaced its ``model`` attribute meanwhile
            self._chainer.get_main_component().model.eval()


================================================
FILE: deeppavlov/core/trainers/utils.py
================================================
# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings
from collections import OrderedDict, namedtuple
from dataclasses import is_dataclass
from functools import partial
from json import JSONEncoder
from typing import List, Tuple, Union, Iterable

import numpy as np

from deeppavlov.core.common.metrics_registry import get_metric_by_name

# Parsed metric description as built by ``parse_metrics``:
#   name   - registered metric name the function was looked up by
#   fn     - metric function with the remaining config keys bound as keyword arguments
#   inputs - names of the values that are passed to ``fn``
#   alias  - name under which the metric appears in reports
Metric = namedtuple('Metric', ['name', 'fn', 'inputs', 'alias'])


def parse_metrics(metrics: Iterable[Union[str, dict]], in_y: List[str], out_vars: List[str]) -> List[Metric]:
    """Turn metric descriptions from a config into :class:`Metric` tuples.

    Args:
        metrics: metric names or dicts with a ``name`` key and optional ``alias`` and ``inputs`` keys;
            any other keys of a dict are bound to the metric function as keyword arguments.
            The dicts are not modified.
        in_y: names used, followed by ``out_vars``, as metric inputs when ``inputs`` is not given
        out_vars: names appended to ``in_y`` for the default metric inputs

    Returns:
        a list with one :class:`Metric` per item of ``metrics``, in the same order

    Raises:
        KeyError: if a metric dict has no ``name`` key
    """
    metrics_functions = []
    for metric in metrics:
        if isinstance(metric, str):
            metric = {'name': metric, 'alias': metric}
        else:
            # Work on a shallow copy: the ``pop`` calls below must not strip keys from the caller's
            # config, otherwise parsing the same config a second time fails on the missing ``name``
            metric = dict(metric)

        metric_name = metric.pop('name')
        alias = metric.pop('alias', metric_name)

        f = get_metric_by_name(metric_name)

        inputs = metric.pop('inputs', in_y + out_vars)
        if isinstance(inputs, str):
            inputs = [inputs]

        # Whatever is left in the description is passed to the metric function as keyword arguments
        metrics_functions.append(Metric(metric_name, partial(f, **metric), inputs, alias))

    return metrics_functions


def prettify_metrics(metrics: List[Tuple[str, float]], precision: int = 4) -> OrderedDict:
    """Prettifies the dictionary of metrics.

    Args:
        metrics: pairs of a metric name and its value
        precision: number of decimal digits ``float`` values are rounded to

    Returns:
        an ordered mapping from metric names to values in the order of first appearance;
        when a name occurs more than once a warning is issued and the last value wins
    """
    prettified_metrics = OrderedDict()
    for key, value in metrics:
        if key in prettified_metrics:
            # Actually emit the warning: merely instantiating ``Warning`` has no effect
            warnings.warn("Multiple metrics with the same name {}.".format(key), stacklevel=2)
        if isinstance(value, float):
            value = round(value, precision)
        prettified_metrics[key] = value
    return prettified_metrics


class NumpyArrayEncoder(JSONEncoder):
    """JSON encoder that also serializes numpy arrays, numpy integer and floating scalars and dataclasses."""

    def default(self, obj):
        """Convert ``obj`` to a JSON-serializable value or defer to the base encoder.

        Arrays become nested lists, numpy scalars become ``int``/``float`` and dataclass
        objects are replaced by their ``__dict__``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        for scalar_type, cast in ((np.integer, int), (np.floating, float)):
            if isinstance(obj, scalar_type):
                return cast(obj)
        if is_dataclass(obj):
            return obj.__dict__
        return super().default(obj)


================================================
FILE: deeppavlov/dataset_iterators/__init__.py
================================================


================================================
FILE: deeppavlov/dataset_iterators/basic_classification_iterator.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from collections import defaultdict
from logging import getLogger
from typing import List

from sklearn.model_selection import train_test_split

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator

log = getLogger(__name__)


@register('basic_classification_iterator')
class BasicClassificationDatasetIterator(DataLearningIterator):
    """
    Class gets data dictionary from DatasetReader instance, merges fields if necessary, splits a field if necessary

    Args:
        data: dictionary of data with fields "train", "valid" and "test" (or some of them)
        fields_to_merge: list of fields (out of ``"train", "valid", "test"``) to merge
        merged_field: name of field (out of ``"train", "valid", "test"``) to which save merged fields
        field_to_split: name of field (out of ``"train", "valid", "test"``) to split
        split_fields: list of fields (out of ``"train", "valid", "test"``) to which save splitted field
        split_proportions: list of corresponding proportions for splitting
        seed: random seed for iterating
        shuffle: whether to shuffle examples in batches
        split_seed: random seed for splitting dataset, if ``split_seed`` is None, division is based on `seed`.
        stratify: whether to use stratified split
        shot: number of examples to sample for each class in training data. If None, all examples will remain in data.
        *args: arguments
        **kwargs: arguments

    Attributes:
        data: dictionary of data with fields "train", "valid" and "test" (or some of them)
    """

    def __init__(self, data: dict,
                 fields_to_merge: List[str] = None, merged_field: str = None,
                 field_to_split: str = None, split_fields: List[str] = None, split_proportions: List[float] = None,
                 seed: int = None, shuffle: bool = True, split_seed: int = None,
                 stratify: bool = None,
                 shot: int = None,
                 *args, **kwargs):
        """
        Initialize dataset using data from DatasetReader,
        merges and splits fields according to the given parameters.

        Raises:
            IOError: if ``fields_to_merge`` is given without ``merged_field`` or
                ``field_to_split`` is given without ``split_fields``
        """
        super().__init__(data, seed=seed, shuffle=shuffle)

        if fields_to_merge is not None:
            if merged_field is None:
                raise IOError("Given fields to merge BUT not given name of merged field")
            log.info("Merging fields <<{}>> to new field <<{}>>".format(fields_to_merge,
                                                                        merged_field))
            self._merge_data(fields_to_merge=fields_to_merge,
                             merged_field=merged_field)

        if field_to_split is not None:
            if split_fields is None:
                raise IOError("Given field to split BUT not given names of split fields")
            log.info("Splitting field <<{}>> to new fields <<{}>>".format(field_to_split,
                                                                          split_fields))
            self._split_data(field_to_split=field_to_split,
                             split_fields=split_fields,
                             split_proportions=[float(s) for s in
                                                split_proportions],
                             split_seed=split_seed,
                             stratify=stratify)

        if shot is not None:
            train_data = self.data['train']
            # The train list is shuffled in place, then the generator is re-seeded with `seed`
            self.random.shuffle(train_data)
            self.random.seed(seed)

            # Keep at most `shot` texts per label, in shuffled order
            data_dict = defaultdict(list)
            for text, label in train_data:
                if len(data_dict[label]) < shot:
                    data_dict[label].append(text)

            # `min()` of an empty sequence raises, so empty train data is checked first
            if data_dict and min(len(x) for x in data_dict.values()) < shot:
                log.warning(f"Some labels have less than {shot} examples")

            self.data['train'] = [(text, label) for label in data_dict for text in data_dict[label]]

    def _split_data(self, field_to_split: str = None, split_fields: List[str] = None,
                    split_proportions: List[float] = None, split_seed: int = None, stratify: bool = None) -> bool:
        """
        Split given field of dataset to the given list of fields with corresponding proportions

        Every field except the last one receives ``int(size * proportion)`` samples, where ``size``
        is the length of the field being split; the last field receives whatever remains.

        Args:
            field_to_split: field name (out of ``"train", "valid", "test"``) which to split
            split_fields: list of names (out of ``"train", "valid", "test"``) of fields to which split
            split_proportions: corresponding proportions
            split_seed: random seed for splitting dataset
            stratify: whether to use stratified split

        Returns:
            True
        """
        if split_seed is None:
            split_seed = self.random.randint(0, 10000)
        data_to_div = self.data[field_to_split].copy()
        data_size = len(self.data[field_to_split])

        for i in range(len(split_fields) - 1):
            # `train_test_split` treats any non-None `stratify` as an array of labels,
            # so a falsy flag has to be translated into None instead of being forwarded.
            labels = [sample[1] for sample in data_to_div] if stratify else None
            self.data[split_fields[i]], data_to_div = train_test_split(
                data_to_div,
                test_size=len(data_to_div) - int(data_size * split_proportions[i]),
                random_state=split_seed,
                stratify=labels)

        # The remainder goes to the last field (only when at least one split was performed)
        if len(split_fields) > 1:
            self.data[split_fields[-1]] = data_to_div
        return True

    def _merge_data(self, fields_to_merge: List[str] = None, merged_field: str = None) -> bool:
        """
        Merge given fields of dataset

        The merged list is built from scratch, so previous content of ``merged_field`` is kept
        only if ``merged_field`` is itself listed in ``fields_to_merge``.

        Args:
            fields_to_merge: list of fields (out of ``"train", "valid", "test"``) to merge
            merged_field: name of field (out of ``"train", "valid", "test"``) to which save merged fields

        Returns:
            True
        """
        data = self.data.copy()
        data[merged_field] = []
        for name in fields_to_merge:
            data[merged_field] += self.data[name]
        self.data = data
        return True


================================================
FILE: deeppavlov/dataset_iterators/huggingface_dataset_iterator.py
================================================
# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Tuple, Any, Union

from datasets import Dataset

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator


@register('huggingface_dataset_iterator')
class HuggingFaceDatasetIterator(DataLearningIterator):
    """Dataset iterator that converts HuggingFace Datasets into ``(features, label)`` pairs."""

    def preprocess(self,
                   data: Dataset,
                   features: Union[str, List[str]],
                   label: str = 'label',
                   use_label_name: bool = True,
                   *args, **kwargs) -> List[Tuple[Any, Any]]:
        """Builds a list of ``(features, label)`` pairs from a HuggingFace Dataset

        Args:
            data: instance of HuggingFace Dataset
            features: name of a single Dataset field, or list of field names whose values
                are packed into a tuple
            label: Dataset field name to be used as label.
            use_label_name: Use actual label name instead of its index (0, 1, ...). Defaults to True.

        Returns:
            List[Tuple[Any, Any]]: list of pairs of extracted features and labels

        Raises:
            RuntimeError: if ``features`` is neither a string nor a list
            Exception: if some of the listed feature fields cannot be read from an example
        """

        pairs = []
        for index in range(len(data)):
            sample = data[index]

            if isinstance(features, str):
                extracted = sample[features]
            elif not isinstance(features, list):
                raise RuntimeError(f"features should be str or list, but found: {features}")
            else:
                try:
                    extracted = tuple(sample[name] for name in features)
                except Exception as e:
                    raise Exception(f"{e} for example {sample} while trying to find keys {features}")

            target = sample[label]
            # -1 label is used if there is no label (test set), it has no name to look up
            if use_label_name and target != -1:
                target = data.info.features[label].names[target]

            pairs.append((extracted, target))
        return pairs


================================================
FILE: deeppavlov/dataset_iterators/morphotagger_iterator.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple, List, Dict, Any, Iterator

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator


@register('morphotagger_dataset_iterator')
class MorphoTaggerDatasetIterator(DataLearningIterator):
    """
    Iterates over data for Morphological Tagging.
    A subclass of :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator`.

    Args:
        seed: random seed for data shuffling
        shuffle: whether to shuffle data during batching
        validation_split: the fraction of validation data
            (is used only if there is no `valid` subset in `data`)
    """

    def __init__(self, data: Dict[str, List[Tuple[Any, Any]]], seed: int = None,
                 shuffle: bool = True, validation_split: float = 0.2) -> None:
        # Must be set before the parent constructor runs, because `split` reads it
        # (assumes the parent constructor invokes `split` - TODO confirm).
        self.validation_split = validation_split
        super().__init__(data, seed, shuffle)

    def split(self, *args, **kwargs) -> None:
        """
        Splits the `train` part to `train` and `valid`, if the `valid` part is empty.

        The last ``validation_split`` fraction of `train` (shuffled first when ``shuffle`` is set)
        becomes `valid`. Nothing is changed when `valid` already contains data.
        """
        if len(self.valid) == 0:
            if self.shuffle:
                self.random.shuffle(self.train)
            n_train = int(len(self.train) * (1.0 - self.validation_split))
            self.train, self.valid = self.train[:n_train], self.train[n_train:]

    def gen_batches(self, batch_size: int, data_type: str = 'train',
                    shuffle: bool = None, return_indexes: bool = False) -> Iterator[tuple]:
        """Generate batches of inputs and expected output to train neural networks

        Samples are ordered by the length of their input, so every batch contains inputs of
        similar length; shuffling only changes the order in which the batches are yielded.

        Args:
            batch_size: number of samples in batch, a negative value means a single batch
                with the whole subset
            data_type: can be either 'train', 'test', or 'valid'
            shuffle: whether to shuffle batches, ``None`` means using the iterator default
            return_indexes: whether to return indexes of batch elements in initial dataset
        Yields:
            a tuple of a batch of inputs and a batch of expected outputs.
            If `return_indexes` is True, yields a pair of the indexes of batch elements
            and that tuple instead.
        """
        if shuffle is None:
            shuffle = self.shuffle
        data = self.data[data_type]
        n_samples = len(data)
        if n_samples == 0:
            # Nothing to yield; also avoids `range()` with a zero step for negative batch sizes
            return
        lengths = [len(x[0]) for x in data]
        indexes = np.argsort(lengths)
        if batch_size < 0:
            batch_size = n_samples
        starts = list(range(0, n_samples, batch_size))
        if shuffle:
            self.random.shuffle(starts)
        for start in starts:
            indexes_to_yield = indexes[start:start + batch_size]
            # Transpose the list of (x, y) samples into ([x, ...], [y, ...])
            data_to_yield = tuple(list(x) for x in zip(*([data[i] for i in indexes_to_yield])))
            if return_indexes:
                yield indexes_to_yield, data_to_yield
            else:
                yield data_to_yield


================================================
FILE: deeppavlov/dataset_iterators/multitask_iterator.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import math
import random
from logging import getLogger
from typing import Iterator, Optional, Tuple, Union

import numpy as np

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.params import from_params
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator

log = getLogger(__name__)


@register('multitask_iterator')
class MultiTaskIterator:
    """
    Class merges data from several dataset iterators. When used for batch generation batches from
    merged dataset iterators are united into one batch. If sizes of merged datasets are different
    smaller datasets are repeated until their size becomes equal to the largest dataset.

    Args:
        data: dictionary which keys are task names and values are dictionaries with fields
            ``"train", "valid", "test"``.
        num_train_epochs: number of training epochs
        tasks: dictionary which keys are task names and values are init params of dataset iterators. If task has
            key-value pair ``'use_task_defaults': False`` task_defaults for this task dataset iterator will be ignored.
        batch_size: batch_size
        sampling_mode: mode of sampling we use. It can be plain, uniform or anneal.
        gradient_accumulation_steps: number of gradient accumulation steps. Default is 1
        steps_per_epoch: number of steps per epoch. Necessary if gradient_accumulation_steps > 1
        one_element_tuples: if True, tuple of x consisting of one element is returned in this element. Default: True
        task_defaults: default task parameters.
        seed: random seed for task sampling

    Attributes:
        data: dictionary of data with fields "train", "valid" and "test" (or some of them)
    """

    def __init__(
            self,
            data: dict,
            num_train_epochs: int,
            tasks: dict,
            batch_size: int = 8,
            sampling_mode: str = 'plain',
            gradient_accumulation_steps: int = 1,
            steps_per_epoch: int = 0,
            one_element_tuples: bool = True,
            task_defaults: dict = None,
            seed: int = 42,
            **kwargs
    ):
        if data.keys() != tasks.keys():
            raise ConfigError("Task names from dataset reader don't mach task names from dataset iterator: "
                              f"{data.keys()} != {tasks.keys()}.")
        self.task_iterators = {}
        if task_defaults is None:
            task_defaults = dict()
        for task_name, task_params in tasks.items():
            # Work on a shallow copy so that popping the flag does not alter the caller's config
            task_params = dict(task_params)
            if task_params.pop('use_task_defaults', True) is True:
                task_config = copy.deepcopy(task_defaults)
                task_config.update(task_params)
            else:
                task_config = task_params
            try:
                self.task_iterators[task_name] = from_params(task_config, data=data[task_name])
            except Exception:
                log.error(f'Failed to initialize dataset_iterator for "{task_name}" task. Make sure that all parameters'
                          ' from `task_defaults` and task parameters are correct.')
                raise
        self.n_tasks = len(tasks.keys())
        self.num_train_epochs = num_train_epochs
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.epochs_done = 0
        self.steps_taken = 0
        self.task_id = None
        self.sampling_mode = sampling_mode
        self.data = {
            "train": self._extract_data_type("train"),
            "valid": self._extract_data_type("valid"),
            "test": self._extract_data_type("test"),
        }
        for mode in ["train", "valid", "test"]:
            log.info(f'For {mode}')
            for task_name in self.data[mode]:
                log.info(f'{task_name} has {len(self.data[mode][task_name])} examples')
        self.train_sizes = self._get_data_size("train")
        # 0 means "derive the number of steps from the total amount of training data"
        if steps_per_epoch == 0:
            self.steps_per_epoch = sum(self.train_sizes) // batch_size
        else:
            self.steps_per_epoch = steps_per_epoch

        def is_nan(a):
            # NaN is the only float value that is not equal to itself
            return a != a

        for mode in ['train', 'valid', 'test']:
            for task in self.data[mode]:
                # Lists are edited in place (they are shared with the task iterators) and are
                # traversed backwards so that deletions do not shift the indexes still to visit.
                for i in range(len(self.data[mode][task]) - 1, -1, -1):
                    x = self.data[mode][task][i][0]
                    y = self.data[mode][task][i][1]
                    if is_nan(x) or any([is_nan(z) for z in x]) or is_nan(y):
                        log.info(f'NAN detected {self.data[mode][task][i:i + 1]}')
                        del self.data[mode][task][i]
                        log.info(f'NAN for mode {mode} task {task} element {i} CLEARED')
                    elif isinstance(x, tuple) and len(x) == 1 and one_element_tuples:
                        # x is a tuple consisting of 1 element. return it as string
                        self.data[mode][task][i] = (x[0], y)
        self.max_task_data_len = dict()
        for data_type in self.data:
            sizes = self._get_data_size(data_type)
            self.max_task_data_len[data_type] = max(sizes)
        random.seed(seed)
        # Tasks are sampled with NumPy, which is not affected by `random.seed`;
        # a dedicated generator makes the sampling reproducible for the given seed.
        self._sampling_rng = np.random.RandomState(seed)

    def _get_data_size(self, data_type):
        """Returns list of sizes of each dataset for the given data_type: train,test or valid."""
        return [len(self.data[data_type][key]) for key in self.data[data_type]]

    def _get_probs(self, data_type):
        """Returns sampling probabilities for different sampling modes - plain, uniform or anneal

        ``uniform`` gives every task the same probability, ``plain`` makes the probability proportional
        to the dataset size and ``anneal`` makes it proportional to ``size ** alpha``, where ``alpha``
        decreases linearly from 1.0 to 0.2 as ``epochs_done`` approaches ``num_train_epochs``.

        Raises:
            ValueError: if the sampling mode is not one of the above
        """
        if self.sampling_mode == 'uniform':
            sizes = [1 for _ in self._get_data_size(data_type)]
            # as we sample uniformly
            s = sum(sizes)
            probs = [p / s for p in sizes]
        elif self.sampling_mode == 'plain':
            sizes = self._get_data_size(data_type)
            n_samples = sum(sizes)
            probs = [p / n_samples for p in sizes]
        elif self.sampling_mode == 'anneal':
            alpha = 1.0 - 0.8 * (self.epochs_done / self.num_train_epochs)
            annealed_sizes = [p ** alpha for p in self._get_data_size(data_type)]
            n_samples = sum(annealed_sizes)
            probs = [p / n_samples for p in annealed_sizes]
        else:
            raise ValueError(f'Unsupported sampling mode {self.sampling_mode}')
        return probs

    def _extract_data_type(self, data_type):
        """Function that merges data of the current data_type (e.g. train) from all task_iterators into one dict"""
        dataset_part = {}
        for task, iterator in self.task_iterators.items():
            dataset_part[task] = getattr(iterator, data_type)
        return dataset_part

    def _transform_before_yielding(self, x, y, batch_size):
        """Function that transforms data from dataset before yielding

        Turns per-task batches (``x[task][sample]``) into per-sample tuples (``result[sample][task]``).
        With a single task the one-element tuples are unwrapped.

        Raises:
            Exception: if ``x`` and ``y`` contain a different number of tasks
        """

        if len(x) != len(y):
            raise Exception(f'x has len {len(x)} but y has len {len(y)}')
        new_x, new_y = [], []
        for i in range(batch_size):
            x_tuple = tuple([x[t_id][i] for t_id in range(self.n_tasks)])
            y_tuple = tuple([y[t_id][i] for t_id in range(self.n_tasks)])
            if self.n_tasks == 1:
                x_tuple = x_tuple[0]
                y_tuple = y_tuple[0]
            new_x.append(x_tuple)
            new_y.append(y_tuple)
        batches = (tuple(new_x), tuple(new_y))
        return batches

    def gen_batches(self, batch_size: int, data_type: str = "train",
                    shuffle: bool = None) -> Iterator[Tuple[tuple, tuple]]:
        """
        Generates batches and expected output to train neural networks.
        If there are not enough samples from any task, samples are padded with None

        For ``"train"`` every step yields data of one sampled task only (other tasks are filled
        with None) and the batch size is divided by ``gradient_accumulation_steps``.
        For other data types every step yields one sample of every task.

        Args:
            batch_size: number of samples in batch
            data_type: can be either 'train', 'test', or 'valid'
            shuffle: whether to shuffle dataset before batching
        Yields:
            A tuple of a batch of inputs and a batch of expected outputs.
            Inputs and outputs are tuples. Element of inputs or outputs is a tuple which
            elements are x values of merged tasks in the order tasks are present in
            `tasks` argument of `__init__` method.
        """

        max_task_data_len = self.max_task_data_len[data_type]
        log.info(f'Batch size {batch_size} with gradient accumulation steps {self.gradient_accumulation_steps}')
        log.info(f'Efficient batch size {batch_size // self.gradient_accumulation_steps}')
        batch_size = batch_size // self.gradient_accumulation_steps

        if data_type == "train":
            generators = [
                SingleTaskBatchGenerator(iter_, batch_size, data_type, shuffle)
                for iter_ in self.task_iterators.values()
            ]
            # probs only required while training
            probs = self._get_probs("train")
            for step in range(self.steps_per_epoch):
                # A new task is drawn only on accumulation boundaries (or on the very first step)
                if (self.steps_taken + 1) % self.gradient_accumulation_steps == 0 or self.task_id is None:
                    self.task_id = self._sampling_rng.choice(self.n_tasks, p=probs)
                x = [[None for _ in range(batch_size)] for _ in range(self.n_tasks)]
                y = [[None for _ in range(batch_size)] for _ in range(self.n_tasks)]
                x[self.task_id], y[self.task_id] = generators[self.task_id].__next__()
                # An all-None batch means the sampled task is exhausted, such steps are skipped
                if not all([s is None for s in x[self.task_id]]):
                    batch_to_yield = self._transform_before_yielding(
                        x, y, batch_size)
                    yield batch_to_yield

            self.epochs_done += 1
            # one additional step is taken while logging training metrics
            self.steps_taken -= 1
        else:
            eval_batch_size = 1
            x = [[None for _ in range(eval_batch_size)] for _ in range(self.n_tasks)]
            y = [[None for _ in range(eval_batch_size)] for _ in range(self.n_tasks)]
            generators = [
                SingleTaskBatchGenerator(
                    iter_, batch_size=eval_batch_size, data_type=data_type, shuffle=shuffle)
                for iter_ in self.task_iterators.values()
            ]
            for step in range(max_task_data_len):
                for task_id in range(self.n_tasks):
                    x[task_id], y[task_id] = generators[task_id].__next__()

                batches = self._transform_before_yielding(x, y, eval_batch_size)
                yield batches

    def get_instances(self, data_type: str = "train"):
        """
        Returns a tuple of inputs and outputs from all datasets. Lengths of inputs
        and outputs are equal to the size of the largest dataset. Smaller
        datasets are repeated until their sizes are equal to the size of the
        largest dataset.

        A task without any instances for ``data_type`` is not supported here: computing the
        number of repeats divides by the task size.

        Args:
            data_type: can be either 'train', 'test', or 'valid'
        Returns:
            A tuple of all inputs for a data type and all expected outputs
            for a data type.
        """

        max_task_data_len = max(
            [
                len(iter_.get_instances(data_type)[0])
                for iter_ in self.task_iterators.values()
            ]
        )
        x_instances = []
        y_instances = []
        for task_name, iter_ in self.task_iterators.items():
            x, y = iter_.get_instances(data_type)
            n_repeats = math.ceil(max_task_data_len / len(x))
            x_instances.append((x * n_repeats)[:max_task_data_len])
            y_instances.append((y * n_repeats)[:max_task_data_len])
        error_msg = f'Len of x_instances {len(x_instances)} and y_instances {len(y_instances)} dont match'
        if len(x_instances) != len(y_instances):
            raise Exception(error_msg)
        instances = (tuple(zip(*x_instances)), tuple(zip(*y_instances)))
        return instances


class SingleTaskBatchGenerator:
    """
    Batch generator for a single task.
    If there are no elements in the dataset to form another batch, Nones are returned.
    Args:
        dataset_iterator: dataset iterator from which batches are drawn.
        batch_size: size of the batch.
        data_type: "train", "valid", or "test"
        shuffle: whether dataset will be shuffled.
        n_batches: the number of batches that will be generated. If None, the number is not limited.
        size_of_last_batch: size to which the batch number ``n_batches`` is truncated.
            Defaults to ``batch_size``.
    """

    def __init__(
            self,
            dataset_iterator: 'DataLearningIterator',
            batch_size: int,
            data_type: str,
            shuffle: bool,
            n_batches: Optional[int] = None,
            size_of_last_batch: Optional[int] = None,
    ):
        self.dataset_iterator = dataset_iterator
        self.batch_size = batch_size
        self.data_type = data_type
        self.shuffle = shuffle
        self.n_batches = n_batches
        self.size_of_last_batch = (
            self.batch_size if size_of_last_batch is None else size_of_last_batch)

        # Batches are requested in chunks that divide both the dataset size and the batch size
        # and are concatenated in `__next__` until `batch_size` samples are collected.
        self.inner_batch_size = math.gcd(
            len(self.dataset_iterator.data[data_type]), batch_size
        )
        self.gen = self.dataset_iterator.gen_batches(
            self.inner_batch_size, self.data_type, self.shuffle
        )
        self.batch_count = 0

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``(x, y)`` batch.

        Returns:
            A pair of tuples with ``batch_size`` inputs and outputs. When the underlying
            iterator runs out of data, a pair of tuples filled with ``batch_size`` Nones is
            returned instead (samples collected for an incomplete batch are dropped).

        Raises:
            StopIteration: if ``n_batches`` is set and that many batches were already produced
        """
        # `>=`: exactly `n_batches` batches are produced, the truncated one being the last
        if self.n_batches is not None and self.batch_count >= self.n_batches:
            raise StopIteration
        x, y = (), ()
        while len(x) < self.batch_size or len(y) < self.batch_size:
            try:
                xx, yy = next(self.gen)
                x += xx
                y += yy
            except StopIteration:
                x_nones = tuple([None for _ in range(self.batch_size)])
                y_nones = x_nones
                return x_nones, y_nones

        self.batch_count += 1
        if self.batch_count == self.n_batches:
            x = x[:self.size_of_last_batch]
            y = y[:self.size_of_last_batch]
        return x, y


================================================
FILE: deeppavlov/dataset_iterators/siamese_iterator.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import Dict, List, Tuple

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator

log = getLogger(__name__)


@register('siamese_iterator')
class SiameseIterator(DataLearningIterator):
    """Dataset iterator for ranking data that can carve validation and test parts out of the train part."""

    def split(self, *args, len_valid=1000, len_test=1000, **kwargs) -> None:
        """Fill empty `valid` and `test` parts with samples taken from the end of shuffled `train`.

        Args:
            len_valid: number of train samples moved to `valid` when `valid` is empty, 0 disables it
            len_test: number of train samples moved to `test` when `test` is empty, 0 disables it
        """
        valid_is_missing = len(self.valid) == 0
        if valid_is_missing and len_valid != 0:
            self.random.shuffle(self.train)
            self.train, self.valid = self.train[:-len_valid], self.train[-len_valid:]

        test_is_missing = len(self.test) == 0
        if test_is_missing and len_test != 0:
            self.random.shuffle(self.train)
            self.train, self.test = self.train[:-len_test], self.train[-len_test:]


================================================
FILE: deeppavlov/dataset_iterators/sqlite_iterator.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sqlite3
from logging import getLogger
from pathlib import Path
from random import Random
from typing import List, Any, Dict, Optional, Union, Generator, Tuple

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_fitting_iterator import DataFittingIterator

logger = getLogger(__name__)


@register('sqlite_iterator')
class SQLiteDataIterator(DataFittingIterator):
    """Iterate over SQLite database.
    Gen batches from SQLite data.
    Get document ids and document.

    Args:
        load_path: a path to local DB file
        batch_size: a number of samples in a single batch
        shuffle: whether to shuffle data during batching
        seed: random seed for data shuffling

    Attributes:
        connect: a DB connection
        db_name: name of the table the documents are read from
        doc_ids: DB document ids
        doc2index: a dictionary mapping document ids to their integer indices
        batch_size: a number of samples in a single batch
        shuffle: whether to shuffle data during batching
        random: an instance of :class:`Random` class.

    """

    def __init__(self, load_path: Union[str, Path], batch_size: Optional[int] = None,
                 shuffle: Optional[bool] = None, seed: Optional[int] = None, **kwargs) -> None:

        load_path = str(expand_path(load_path))
        logger.info("Connecting to database, path: {}".format(load_path))
        try:
            self.connect = sqlite3.connect(load_path, check_same_thread=False)
        except sqlite3.OperationalError as e:
            e.args = e.args + ("Check that DB path exists and is a valid DB file",)
            raise e
        try:
            self.db_name = self.get_db_name()
        except TypeError as e:
            # ``fetchone()`` returns None when the DB has no tables, so indexing it raises TypeError
            e.args = e.args + (
                'Check that DB path was created correctly and is not empty. '
                'Check that a correct dataset_format is passed to the ODQAReader config',)
            raise e
        self.doc_ids = self.get_doc_ids()
        self.doc2index = self.map_doc2idx()
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.random = Random(seed)

    def get_doc_ids(self) -> List[Any]:
        """Get document ids.

        Returns:
            document ids
        """
        cursor = self.connect.cursor()
        try:
            cursor.execute('SELECT id FROM {}'.format(self.db_name))
            return [row[0] for row in cursor.fetchall()]
        finally:
            # release the cursor even if the query fails
            cursor.close()

    def get_db_name(self) -> str:
        """Get DB name.

        Returns:
            name of the first table listed in ``sqlite_master``

        Raises:
            TypeError: if the database contains no tables

        """
        cursor = self.connect.cursor()
        try:
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            # NOTE(review): ``arraysize`` is the default ``fetchmany`` size of the cursor,
            # not a number of tables, so this check does not verify that the DB has a single table.
            assert cursor.arraysize == 1
            return cursor.fetchone()[0]
        finally:
            cursor.close()

    def map_doc2idx(self) -> Dict[int, Any]:
        """Map DB ids to integer ids.

        Returns:
            a dictionary of document ids and correspondent integer indices

        """
        doc2idx = {doc_id: i for i, doc_id in enumerate(self.doc_ids)}
        logger.info(
            "SQLite iterator: The size of the database is {} documents".format(len(doc2idx)))
        return doc2idx

    def get_doc_content(self, doc_id: Any) -> Optional[str]:
        """Get document content by id.

        Args:
            doc_id: a document id

        Returns:
            document content, or ``None`` if there is no document with such id

        """
        cursor = self.connect.cursor()
        try:
            cursor.execute(
                "SELECT text FROM {} WHERE id = ?".format(self.db_name),
                (doc_id,)
            )
            result = cursor.fetchone()
        finally:
            cursor.close()
        return None if result is None else result[0]

    def _load_docs(self, doc_ids: List[Any]):
        """Fetch contents for ``doc_ids`` and pair every id with its integer index."""
        docs = [self.get_doc_content(doc_id) for doc_id in doc_ids]
        doc_nums = [self.doc2index[doc_id] for doc_id in doc_ids]
        return docs, zip(doc_ids, doc_nums)

    def gen_batches(self, batch_size: int, shuffle: bool = None) \
            -> Generator[Tuple[List[str], List[int]], Any, None]:
        """Gen batches of documents.

        Args:
            batch_size: a number of samples in a single batch, a non-positive value
                puts all documents into one batch
            shuffle: whether to shuffle data during batching, ``self.shuffle`` is used if ``None``

        Yields:
            generated tuple of documents and a ``zip`` iterator over ``(document id, document index)`` pairs

        """
        if shuffle is None:
            shuffle = self.shuffle

        if shuffle:
            # sample without replacement keeps ``self.doc_ids`` itself in the original order
            _doc_ids = self.random.sample(self.doc_ids, len(self.doc_ids))
        else:
            _doc_ids = self.doc_ids

        if batch_size > 0:
            batches = [_doc_ids[i:i + batch_size] for i in
                       range(0, len(_doc_ids), batch_size)]
        else:
            batches = [_doc_ids]

        for doc_ids in batches:
            yield self._load_docs(doc_ids)

    def get_instances(self):
        """Get all data

        Returns:
            tuple of all documents and a ``zip`` iterator over ``(document id, document index)`` pairs
        """
        return self._load_docs(list(self.doc_ids))


================================================
FILE: deeppavlov/dataset_iterators/squad_iterator.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
from typing import Dict, Any, List, Tuple, Generator, Optional

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator


@register('squad_iterator')
class SquadIterator(DataLearningIterator):
    """Iterator over examples of SQuAD-like datasets.

    Every example is a pair ``((context, question), (answer_texts, answer_starts))`` built from the
    ``context``, ``question``, ``text`` and ``answer_start`` fields of the dataset.

    Attributes:
        train: train examples
        valid: validation examples
        test: test examples

    """

    def preprocess(self, data: Dict[str, Any], *args, **kwargs) -> \
            List[Tuple[Tuple[str, str], Tuple[List[str], List[int]]]]:
        """Flatten SQuAD-formatted data into a list of question answering examples.

        Args:
            data: data in squad format

        Returns:
            list of ``((context, question), (answer_texts, answer_starts))``; questions without
            answers get ``['']`` and ``[-1]``, empty ``data`` gives an empty list

        """
        if not data:
            return []

        examples = []
        for article in data['data']:
            for paragraph in article['paragraphs']:
                passage = paragraph['context']
                for qa in paragraph['qas']:
                    question = qa['question']
                    answers = qa['answers']
                    if answers:
                        pairs = [(answer['text'], answer['answer_start']) for answer in answers]
                        texts = [text for text, _ in pairs]
                        starts = [start for _, start in pairs]
                    else:
                        # placeholder values for unanswerable questions
                        texts, starts = [''], [-1]
                    examples.append(((passage, question), (texts, starts)))
        return examples


@register('multi_squad_iterator')
class MultiSquadIterator(DataLearningIterator):
    """Dataset iterator for multiparagraph-SQuAD dataset.

    With ``with_answer_rate`` rate samples context with answer and with ``1 - with_answer_rate`` samples context
    from the same article, but without an answer. Contexts without an answer are sampled according to
    their tfidf scores (tfidf score between question and context).

    It extracts ``context``, ``question``, ``answer_text`` and ``answer_start`` position from dataset.
    Example from a dataset is a tuple of ``(context, question)`` and ``(answer_text, answer_start)``. In batches
    generated by :meth:`gen_batches` a context without an answer gets ``['']`` as ``answer_text`` and ``[-1]``
    as ``answer_start``; :meth:`get_instances` returns empty lists for such contexts.

    Args:
        data: dict with keys ``'train'``, ``'valid'`` and ``'test'`` and values
        seed: random seed for data shuffling
        shuffle: whether to shuffle data during batching
        with_answer_rate: sampling rate of contexts with answer

    Attributes:
        shuffle: whether to shuffle data during batching
        with_answer_rate: sampling rate of contexts with answer
        seed: random seed used to re-create the generator for non-train data
        np_random: instance of ``numpy.random.RandomState`` initialized with a seed
    """

    def __init__(self, data, seed: Optional[int] = None, shuffle: bool = True, with_answer_rate: float = 0.666,
                 *args, **kwargs) -> None:
        self.with_answer_rate = with_answer_rate
        self.seed = seed
        self.np_random = np.random.RandomState(seed)
        super().__init__(data, seed, shuffle, *args, **kwargs)

    @staticmethod
    def _answer_fields(context: Dict[str, Any]) -> Tuple[List[str], List[int]]:
        """Return answer texts and start positions of a context, ``['']`` and ``[-1]`` if it has no answer."""
        answers = context['answer']
        if len(answers) > 0:
            return [ans['text'] for ans in answers], [ans['answer_start'] for ans in answers]
        return [''], [-1]

    def gen_batches(self, batch_size: int, data_type: str = 'train', shuffle: bool = None) \
            -> Generator[Tuple[Tuple[Tuple[str, str]], Tuple[List[str], List[int]]], None, None]:
        """Generate batches with one sampled context per question.

        Args:
            batch_size: number of questions in a batch
            data_type: name of the data part to iterate over
            shuffle: whether to shuffle the data part (in place) before batching,
                ``self.shuffle`` is used if ``None``

        Yields:
            tuple of ``(context, question)`` pairs and tuple of ``(answer_text, answer_start)`` pairs

        Raises:
            ValueError: if a question has no contexts at all
        """
        if shuffle is None:
            shuffle = self.shuffle

        if data_type == 'train':
            random = self.np_random
        else:
            # a fresh generator makes sampling for non-train data independent of training progress
            random = np.random.RandomState(self.seed)

        if shuffle:
            random.shuffle(self.data[data_type])

        data = self.data[data_type]
        data_len = len(data)

        for i in range((data_len - 1) // batch_size + 1):
            batch = []
            for j in range(i * batch_size, min((i + 1) * batch_size, data_len)):
                q = data[j]['question']
                contexts = data[j]['contexts']
                ans_contexts = [c for c in contexts if len(c['answer']) > 0]
                noans_contexts = [c for c in contexts if len(c['answer']) == 0]
                # the random number is always drawn first to keep the random stream stable
                draw = random.rand()
                # a context with answer can be chosen only if there is one; without this check
                # questions having only contexts without answer crashed in ``random.choice``
                if len(noans_contexts) == 0 or (len(ans_contexts) > 0 and draw < self.with_answer_rate):
                    # select random context with answer
                    context = random.choice(ans_contexts)
                else:
                    # select random context without answer
                    # prob ~ context tfidf score
                    noans_scores = np.array([x['score'] for x in noans_contexts])
                    total_score = np.sum(noans_scores)
                    if total_score > 0:
                        noans_scores = noans_scores / total_score
                    else:
                        # scores can not be normalized, fall back to the uniform distribution
                        noans_scores = np.full(len(noans_contexts), 1.0 / len(noans_contexts))
                    context = noans_contexts[np.argmax(random.multinomial(1, noans_scores))]

                batch.append(((context['context'], q), self._answer_fields(context)))
            yield tuple(zip(*batch))

    def get_instances(self, data_type: str = 'train') -> Tuple[Tuple[Tuple[str, str]], Tuple[List[str], List[int]]]:
        """Return all (context, question) pairs of a data part together with their answers.

        Args:
            data_type: name of the data part

        Returns:
            tuple of ``(context, question)`` pairs and tuple of ``(answer_text, answer_start)`` pairs,
            both lists of a pair are empty for contexts without an answer
        """
        data_examples = []
        for qcas in self.data[data_type]:  # question, contexts, answers
            question = qcas['question']
            for context in qcas['contexts']:
                answer_text = [x['text'] for x in context['answer']]
                answer_start = [x['answer_start'] for x in context['answer']]
                data_examples.append(((context['context'], question), (answer_text, answer_start)))
        return tuple(zip(*data_examples))


@register('multi_squad_retr_iterator')
class MultiSquadRetrIterator(DataLearningIterator):
    """Dataset iterator for multiparagraph-SQuAD dataset.

    reads data from jsonl files

    With ``with_answer_rate`` rate samples context with answer and with ``1 - with_answer_rate`` samples context
    from the same article, but without an answer. Contexts without an answer are sampled from uniform distribution.
    If ``with_answer_rate`` is None then we compute actual ratio for each data example.

    It extracts ``context``, ``question``, ``answer_text`` and ``answer_start`` position from dataset.
    Example from a dataset is a tuple of ``(context, question)`` and ``(answer_text, answer_start)``. In batches
    generated by :meth:`gen_batches` a context without an answer gets ``['']`` as ``answer_text`` and ``[-1]``
    as ``answer_start``; :meth:`get_instances` returns empty lists for such contexts.

    Args:
        data: dict with keys ``'train'``, ``'valid'`` and ``'test'``; every value is an object with
            an ``open`` method (e.g. ``pathlib.Path``) pointing to a jsonl file or an iterable of such objects
        seed: random seed for context sampling
        shuffle: must be ``False``, shuffling is not supported
        with_answer_rate: sampling rate of contexts with answer
        squad_rate: sampling rate of context from squad dataset (actual rate would be with_answer_rate * squad_rate)

    Attributes:
        shuffle: whether to shuffle data during batching (always ``False``)
        np_random: instance of ``numpy.random.RandomState`` initialized with a seed
    """

    def __init__(self, data, seed: Optional[int] = None, shuffle: bool = False,
                 with_answer_rate: Optional[float] = None,
                 squad_rate: Optional[float] = None, *args, **kwargs) -> None:
        self.with_answer_rate = with_answer_rate
        self.squad_rate = squad_rate
        self.seed = seed
        self.np_random = np.random.RandomState(seed)
        self.shuffle = shuffle

        self.train = data.get('train', [])
        self.valid = data.get('valid', [])
        self.test = data.get('test', [])

        self.data = {
            'train': self.train,
            'valid': self.valid,
            'test': self.test,
        }

        if self.shuffle:
            raise RuntimeError('MultiSquadIterator doesn\'t support shuffling.')

    def _data_files(self, data_type: str) -> list:
        """Return jsonl files of a data part as a list, accepting a single file or an iterable of files."""
        entry = self.data[data_type]
        if hasattr(entry, 'open'):
            return [entry]
        return list(entry)

    def _sample_example(self, qcas: Dict[str, Any], random) \
            -> Tuple[Tuple[str, str], Tuple[List[str], List[int]]]:
        """Sample one context for a question and build ``((context, question), (answer_text, answer_start))``."""
        q = qcas['question']
        contexts = qcas['contexts']
        ans_contexts = [c for c in contexts if len(c['answer']) > 0]
        noans_contexts = [c for c in contexts if len(c['answer']) == 0]
        ans_clen = len(ans_contexts)
        noans_clen = len(noans_contexts)
        # sample context with answer or without answer
        with_answer_rate = self.with_answer_rate
        if with_answer_rate is None:
            with_answer_rate = 1.0 if noans_clen == 0 else ans_clen / (ans_clen + noans_clen)

        # the random number is always drawn first to keep the random stream stable
        draw = random.rand()
        # a context with answer can be chosen only if there is one
        if noans_clen == 0 or (ans_clen > 0 and draw < with_answer_rate):
            # select random context with answer
            if self.squad_rate is not None:
                if random.rand() < self.squad_rate or ans_clen == 1:
                    # first context is always from squad dataset
                    context = ans_contexts[0]
                else:
                    context = random.choice(ans_contexts[1:])
            else:
                context = random.choice(ans_contexts)
        else:
            # select random context without answer from uniform distribution
            context = random.choice(noans_contexts)

        answers = context['answer']
        if len(answers) > 0:
            answer_text = [ans['text'] for ans in answers]
            answer_start = [ans['answer_start'] for ans in answers]
        else:
            answer_text, answer_start = [''], [-1]
        return (context['context'], q), (answer_text, answer_start)

    def gen_batches(self, batch_size: int, data_type: str = 'train', shuffle: bool = None) \
            -> Generator[Tuple[Tuple[Tuple[str, str]], Tuple[List[str], List[int]]], None, None]:
        """Generate batches with one sampled context per line of the jsonl data.

        Args:
            batch_size: number of examples in a batch, a non-positive value puts all examples into one batch
            data_type: name of the data part to iterate over
            shuffle: must be falsy, ``self.shuffle`` is used if ``None``

        Yields:
            tuple of ``(context, question)`` pairs and tuple of ``(answer_text, answer_start)`` pairs

        Raises:
            RuntimeError: if shuffling is requested
        """
        if shuffle is None:
            shuffle = self.shuffle

        if data_type == 'train':
            random = self.np_random
        else:
            # a fresh generator makes sampling for non-train data independent of training progress
            random = np.random.RandomState(self.seed)

        if shuffle:
            raise RuntimeError('MultiSquadIterator doesn\'t support shuffling.')

        batch = []
        for datafile in self._data_files(data_type):
            with datafile.open('r', encoding='utf8') as fin:
                for line in fin:
                    batch.append(self._sample_example(json.loads(line), random))
                    if len(batch) == batch_size:
                        yield tuple(zip(*batch))
                        batch = []
        if batch:
            yield tuple(zip(*batch))

    def get_instances(self, data_type: str = 'train') -> Tuple[Tuple[Tuple[str, str]], Tuple[List[str], List[int]]]:
        """Return all (context, question) pairs of a data part together with their answers.

        Args:
            data_type: name of the data part

        Returns:
            tuple of ``(context, question)`` pairs and tuple of ``(answer_text, answer_start)`` pairs,
            both lists of a pair are empty for contexts without an answer
        """
        data_examples = []
        for datafile in self._data_files(data_type):
            # the file is closed as soon as it is read
            with datafile.open('r', encoding='utf8') as fin:
                for line in fin:
                    qcas = json.loads(line)  # question, contexts, answers
                    question = qcas['question']
                    for context in qcas['contexts']:
                        answer_text = [x['text'] for x in context['answer']]
                        answer_start = [x['answer_start'] for x in context['answer']]
                        data_examples.append(((context['context'], question), (answer_text, answer_start)))
        return tuple(zip(*data_examples))


================================================
FILE: deeppavlov/dataset_iterators/typos_iterator.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator


@register('typos_iterator')
class TyposDatasetIterator(DataLearningIterator):
    """Implementation of :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for training
     :class:`~deeppavlov.models.spelling_correction.brillmoore.ErrorModel`

    """

    def split(self, test_ratio: float = 0., *args, **kwargs):
        """Merge all data, shuffle it and split into train and test

        The validation part is always left empty.

        Args:
            test_ratio: ratio of test data to all data, from 0. to 1.
        """
        # build a new list instead of extending ``self.train`` in place:
        # the original lists may be shared with the caller and must not be modified
        samples = list(self.train) + list(self.valid) + list(self.test)

        split = int(len(samples) * test_ratio)

        self.random.shuffle(samples)

        self.test = samples[:split]
        self.train = samples[split:]
        self.valid = []


================================================
FILE: deeppavlov/dataset_readers/__init__.py
================================================


================================================
FILE: deeppavlov/dataset_readers/basic_classification_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from logging import getLogger
from pathlib import Path

import pandas as pd

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download

log = getLogger(__name__)


@register('basic_classification_reader')
class BasicClassificationDatasetReader(DatasetReader):
    """
    Class provides reading dataset in .csv or .json format
    """

    def read(self, data_path: str, url: str = None,
             format: str = "csv", class_sep: str = None,
             *args, **kwargs) -> dict:
        """
        Read dataset from data_path directory.
        Reading files are all data_types + extension
        (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form
        data_path will be read)

        Args:
            data_path: directory with files
            url: download train file if it does not exist in data_path
            format: extension of files. Set of Values: ``"csv", "json"``
            class_sep: string separator of labels in column with labels.
                Default: None -> only one class per sample
            train (str): name of the train file. Default: ``"train." + format``.
                ``None`` -> train data is not read
            valid (str): name of the validation file. Default: ``"valid." + format``.
                ``None`` -> validation data is not read
            test (str): name of the test file. Default: ``"test." + format``.
                ``None`` -> test data is not read
            x (str or list): name of the column with text or list of such names. Default: ``"text"``
            y (str): name of the column with labels. Default: ``"labels"``
            sep (str): delimeter for ``"csv"`` files
            header (int): row number to use as the column names
            names (array): list of column names to use
            orient (str): indication of expected JSON string format
            lines (boolean): read the file as a json object per line

        Returns:
            dictionary with types from data_types.
            Each field of dictionary is a list of tuples (x_i, y_i)

        Raises:
            Exception: if the train file does not exist and url is not specified
                or if format of an existing file is not supported
        """
        data_types = ["train", "valid", "test"]

        # default name follows ``format`` to agree with the names of files that are read below
        train_file = kwargs.get('train', 'train.{}'.format(format))

        # ``train=None`` disables reading of the train data so there is nothing to check or download
        if train_file is not None and not Path(data_path, train_file).exists():
            if url is None:
                raise Exception(
                    "data path {} does not exist or is empty, and download url parameter not specified!".format(
                        data_path))
            log.info("Loading train data from {} to {}".format(url, data_path))
            download(source_url=url, dest_file_path=Path(data_path, train_file))

        x = kwargs.get("x", "text")
        y = kwargs.get('y', 'labels')
        multiple_inputs = isinstance(x, list)

        data = {"train": [],
                "valid": [],
                "test": []}
        for data_type in data_types:
            file_name = kwargs.get(data_type, '{}.{}'.format(data_type, format))
            if file_name is None:
                continue

            file = Path(data_path).joinpath(file_name)
            if not file.exists():
                log.warning("Cannot find {} file".format(file))
                continue

            if format == 'csv':
                keys = ('sep', 'header', 'names')
                options = {k: kwargs[k] for k in keys if k in kwargs}
                df = pd.read_csv(file, **options)
            elif format == 'json':
                keys = ('orient', 'lines')
                options = {k: kwargs[k] for k in keys if k in kwargs}
                df = pd.read_json(file, **options)
            else:
                raise Exception('Unsupported file format: {}'.format(format))

            samples = []
            for _, row in df.iterrows():
                # a sample has a list of values if ``x`` is a list of columns and a single value otherwise
                features = [row[x_] for x_ in x] if multiple_inputs else row[x]
                # a sample has a single label "label" or a list of labels ["label", "label", ...]
                labels = str(row[y])
                if class_sep is not None:
                    labels = labels.split(class_sep)
                samples.append((features, labels))
            data[data_type] = samples

        return data


================================================
FILE: deeppavlov/dataset_readers/boolqa_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path
from typing import Dict, List, Tuple

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download_decompress


@register('boolqa_reader')
class BoolqaReader(DatasetReader):
    """
    The class to read the BoolQ dataset from files.
    BoolQ is a question answering dataset for yes/no questions containing 15942 examples.
    Each example is a triplet of (question, passage, answer).

    More details about the English BoolQ are available in https://arxiv.org/abs/1905.10044
    https://github.com/google-research-datasets/boolean-questions

    The details about the Russian DaNetQA are available in
    https://russiansuperglue.com/ru/tasks/task_info/DaNetQA

    The reader supports English and Russian variants of the dataset.
    The config example is boolqa_rubert.json.
    """

    urls = {
            'en': 'http://files.deeppavlov.ai/datasets/BoolQ.tar.gz',
            'ru': 'http://files.deeppavlov.ai/datasets/DaNetQA.tar.gz'
           }

    def read(self,
             data_path: str,
             language: str = 'en',
             *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:

        """
        Reads BoolQ dataset from files.

        Args:
            data_path: A path to a folder with dataset files.
            language: The dataset language ('ru', 'en' are available)

        Returns:
            dataset: dictionary with keys ``'train'`` and ``'valid'``,
            items of the dataset are [(question, passage), label]

        Raises:
            RuntimeError: if there is no dataset for the language
        """

        if language in self.urls:
            self.url = self.urls[language]
        else:
            raise RuntimeError(f'The dataset for {language} is unavailable')

        data_path = expand_path(data_path)
        data_path.mkdir(parents=True, exist_ok=True)

        download_decompress(self.url, data_path)
        dataset = {}

        for filename in ['train.jsonl', 'valid.jsonl']:
            dataset[filename.split('.')[0]] = self._build_data(language, data_path / filename)

        return dataset

    @staticmethod
    def _build_data(ln: str, data_path: Path) -> List[Tuple[Tuple[str, str], int]]:
        """
        Reads examples from a jsonl file.

        Args:
            ln: The dataset language, it defines the name of the field with a label
                (``'label'`` for 'ru' and ``'answer'`` for 'en')
            data_path: A path to a jsonl file

        Returns:
            list of [(question, passage), label]; lines without a label are skipped,
            only the last label is kept for repeated (question, passage) pairs
        """

        label_key = {'ru': 'label', 'en': 'answer'}.get(ln)

        data = {}
        # the encoding is set explicitly: the default one depends on the locale
        # and can fail to decode non-ASCII (e.g. Russian) texts
        with open(data_path, 'r', encoding='utf8') as f:
            for line in f:
                jline = json.loads(line)
                if label_key is not None and label_key in jline:
                    data[jline['question'], jline['passage']] = int(jline[label_key])

        return list(data.items())


================================================
FILE: deeppavlov/dataset_readers/conll2003_reader.py
================================================
from logging import getLogger
from pathlib import Path

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download_decompress

log = getLogger(__name__)


@register('conll2003_reader')
class Conll2003DatasetReader(DatasetReader):
    """Class to read training datasets in CoNLL-2003 format"""

    # archives which are downloaded when ``train.txt`` is not found in the data folder
    _DATASET_URLS = {
        'conll2003': 'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz',
        'collection_rus': 'http://files.deeppavlov.ai/deeppavlov_data/collection3_v2.tar.gz',
        'ontonotes': 'http://files.deeppavlov.ai/deeppavlov_data/ontonotes_ner.tar.gz',
        'vlsp2016': 'http://files.deeppavlov.ai/deeppavlov_data/vlsp2016.tar.gz',
        'dailydialog': 'http://files.deeppavlov.ai/deeppavlov_data/dailydialog.tar.gz',
        'collection3': 'http://files.deeppavlov.ai/deeppavlov_data/collection3_anh.tar.gz',
    }

    def read(self,
             data_path: str,
             dataset_name: str = None,
             provide_pos: bool = False,
             provide_chunk: bool = False,
             provide_doc_ids: bool = False,
             iob: bool = False,
             iobes: bool = False,
             docstart_token: str = None,
            *args, **kwargs):
        """Read all ``*.txt`` files of the data folder.

        Args:
            data_path: a path to a folder with dataset files
            dataset_name: name of the dataset to download if ``train.txt`` is not found in the folder
            provide_pos: whether to add POS tags (the second column) to the input
            provide_chunk: whether to add chunk tags (the third column) to the input
            provide_doc_ids: whether to add the document counter to the input
            iob: whether to convert tags with :meth:`_iob2_to_iob`
            iobes: whether to convert tags with :meth:`_iob2_to_iobes`, ignored if ``iob`` is set
            docstart_token: token which is put at the beginning of every document

        Returns:
            dictionary with file names without extension as keys and lists of ``(x, tags)`` samples as values;
            ``x`` is a list of tokens if no additional inputs are requested and
            a tuple starting with the list of tokens otherwise

        Raises:
            RuntimeError: if ``train.txt`` is not found and ``dataset_name`` is unknown
        """
        self.provide_pos = provide_pos
        self.provide_chunk = provide_chunk
        self.provide_doc_ids = provide_doc_ids
        self.iob = iob
        self.iobes = iobes
        self.docstart_token = docstart_token
        self.num_docs = 0
        # chunk tags are an additional input too: without ``provide_chunk`` here
        # adding chunk tags to a plain list of tokens raised TypeError
        self.x_is_tuple = self.provide_pos or self.provide_chunk or self.provide_doc_ids
        data_path = Path(data_path)
        files = list(data_path.glob('*.txt'))
        if 'train.txt' not in {file_path.name for file_path in files}:
            url = self._DATASET_URLS.get(dataset_name)
            if url is None:
                raise RuntimeError('train.txt not found in "{}"'.format(data_path))
            data_path.mkdir(exist_ok=True, parents=True)
            download_decompress(url, data_path)
            files = list(data_path.glob('*.txt'))
        dataset = {}

        for file_name in files:
            name = file_name.with_suffix('').name
            dataset[name] = self.parse_ner_file(file_name)
        return dataset

    def _make_sample(self, tokens, pos_tags, chunk_tags, tags):
        """Build ``(x, tags)`` sample with the inputs requested by the ``provide_*`` flags."""
        if not (self.provide_pos or self.provide_chunk or self.provide_doc_ids):
            return tokens, tags
        x = (tokens,)
        if self.provide_pos:
            x = x + (pos_tags,)
        if self.provide_chunk:
            x = x + (chunk_tags,)
        if self.provide_doc_ids:
            x = x + (self.num_docs,)
        return x, tags

    def parse_ner_file(self, file_name: Path):
        """Read samples from a file in CoNLL-2003 format.

        Sentences are separated by empty lines, documents are separated by lines containing ``DOCSTART``.
        The first item of a line is a token and the last one is a tag.

        Args:
            file_name: a path to the file

        Returns:
            list of ``(x, tags)`` samples

        Raises:
            Exception: if a line has fewer items than it is required by the ``provide_*`` flags
        """
        samples = []
        tokens, pos_tags, chunk_tags, tags = [], [], [], []
        expected_items = 2 + int(self.provide_pos) + int(self.provide_chunk)
        with file_name.open(encoding='utf8') as f:
            for line in f:
                # Check end of the document
                if 'DOCSTART' in line:
                    # a lone token (e.g. the docstart token) is not stored as a sample
                    if len(tokens) > 1:
                        samples.append(self._make_sample(tokens, pos_tags, chunk_tags, tags))
                        tokens, pos_tags, chunk_tags, tags = [], [], [], []
                    self.num_docs += 1
                    if self.docstart_token is not None:
                        tokens = [self.docstart_token]
                        pos_tags = ['O']
                        chunk_tags = ['O']
                        tags = ['O']
                elif len(line) < 2:
                    # end of the sentence, the docstart token alone is kept for the next sentence
                    if (len(tokens) > 0) and (tokens != [self.docstart_token]):
                        samples.append(self._make_sample(tokens, pos_tags, chunk_tags, tags))
                        tokens, pos_tags, chunk_tags, tags = [], [], [], []
                else:
                    items = line.split()
                    if len(items) < expected_items:
                        raise Exception(f"Input is not valid {line}")
                    tokens.append(items[0])
                    tags.append(items[-1])
                    if self.provide_pos:
                        pos_tags.append(items[1])
                    if self.provide_chunk:
                        chunk_tags.append(items[2])

        # the last sentence if the file does not end with an empty line
        if tokens:
            samples.append(self._make_sample(tokens, pos_tags, chunk_tags, tags))
            self.num_docs += 1

        if self.iob:
            return [(x, self._iob2_to_iob(tags)) for x, tags in samples]
        if self.iobes:
            return [(x, self._iob2_to_iobes(tags)) for x, tags in samples]

        return samples

    @staticmethod
    def _iob2_to_iob(tags):
        """Replace ``B-`` with ``I-`` for tags which do not follow a tag of the same type."""
        iob_tags = []

        for n, tag in enumerate(tags):
            if tag.startswith('B-') and (not n or (tags[n - 1][2:] != tag[2:])):
                # only the prefix is changed, "B-" inside of the type name is left as is
                tag = 'I-' + tag[2:]
            iob_tags.append(tag)

        return iob_tags

    @staticmethod
    def _iob2_to_iobes(tags):
        """Mark tags followed by ``B`` or ``O`` tags: ``B`` is replaced with ``S`` and ``I`` with ``E``."""
        tag_map = {"BB": "S", "BO": "S", "IB": "E", "IO": "E"}
        # the additional "O" tag allows to treat the last tag the same way as the others
        tags = tags + ["O"]
        iobes_tags = []
        for tag, next_tag in zip(tags, tags[1:]):
            tagtag = tag[0] + next_tag[0]
            if tagtag in tag_map:
                iobes_tags.append(tag_map[tagtag] + tag[1:])
            else:
                iobes_tags.append(tag)
        return iobes_tags


================================================
FILE: deeppavlov/dataset_readers/docred_reader.py
================================================
# Copyright 2021 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import json
import os
import random
from logging import getLogger
from pathlib import Path
from typing import Dict, List, Tuple, Union

import numpy as np
import pandas as pd

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader

logger = getLogger(__name__)


@register('docred_reader')
class DocREDDatasetReader(DatasetReader):
    """Reads relation extraction data stored in the DocRED format and converts it to DeepPavlov samples."""

    # how many negative samples may be kept per positive sample for every supported ``neg_samples`` mode
    _NEG_PER_POS = {"equal": 1, "twice": 2, "thrice": 3}

    def read(
            self,
            data_path: str,
            rel2id_path: str,
            rel_info_path: str,
            negative_label: str = "Na",
            train_valid_test_proportion: int = None,
            valid_test_data_size: int = None,
            generate_additional_neg_samples: bool = False,
            num_neg_samples: int = None
    ) -> Dict[str, List[Tuple]]:
        """
        Processes the DocRED relation extraction dataset (https://arxiv.org/abs/1906.06127v3).

        ``train_annotated.json`` and ``dev.json`` found in ``data_path`` are merged, shuffled and split again into
        train, valid and test parts; each part is then converted with ``process_docred_file`` (two negatives per
        positive sample at most for train, one for valid and test).

        Args:
            data_path: a path to a folder with dataset files.
            rel2id_path: a path to a file where information about relation to relation id corresponding is stored.
            rel_info_path: a path to a file where information about relations and their real names is stored
            negative_label: a label which will be used as a negative one (by default in DocRED: "Na")
            train_valid_test_proportion: number of equal parts the data is divided into; valid and test get one part
                each, train gets the rest (see ``split_by_relative``)
            valid_test_data_size: absolute amount of dev & test sets
            generate_additional_neg_samples: boolean; whether to generate additional negative samples or not.
            num_neg_samples: a number of additional negative samples that will be generated for each positive sample.
        Returns:
            DocRED output dictionary with the keys "train", "valid" and "test" in the following format:
            {"data_type":
                List[
                    Tuple(
                        List[
                            List[all tokens of the document],
                            List[
                                List[Tuple(start pos of mention 1 of ent 1, end pos of mention 1 of ent 1), ...],
                                List[Tuple(start position of entity 2, end position of entity 2), ...]
                            ],
                            List[str(NER tag of entity 1), str(NER tag of entity 2)]
                        ],
                        List(int(one-hot encoded relation label))
                    )
                ]
            }
        Raises:
            ValueError: if additional negative samples are requested without ``num_neg_samples``, or if both or
                none of ``train_valid_test_proportion`` and ``valid_test_data_size`` are set.
        """

        with open(str(expand_path(rel2id_path)), encoding="utf-8") as file:
            self.rel2id = json.load(file)
        self.id2rel = {value: key for key, value in self.rel2id.items()}

        with open(str(expand_path(rel_info_path)), encoding="utf-8") as file:
            self.relid2rel = json.load(file)
        self.rel2relid = {value: key for key, value in self.relid2rel.items()}

        self.negative_label = negative_label
        self.if_add_neg_samples = generate_additional_neg_samples
        self.num_neg_samples = num_neg_samples

        if self.if_add_neg_samples and not self.num_neg_samples:
            raise ValueError("Please provide a number of negative samples to be generated!")

        if train_valid_test_proportion and valid_test_data_size:
            raise ValueError(
                "The train, valid and test splitting should be done either basing on their proportional values to "
                "each other (train_valid_test_proportion parameter), or on the absolute size of valid and test data "
                "(valid_test_data_size parameter). They can't be used simultaneously."
            )

        self.train_valid_test_proportion = train_valid_test_proportion
        self.valid_test_data_size = valid_test_data_size

        data_path = Path(data_path).resolve()

        with open(data_path / "train_annotated.json", encoding="utf-8") as file_ann:
            train_data = json.load(file_ann)

        with open(data_path / "dev.json", encoding="utf-8") as file:
            valid_data = json.load(file)

        # if you want to use test data from the original docred without labels (e.g. as negatives...),
        # uncomment lines below
        # with open(os.path.join(data_path, "test.json")) as file:
        #     test_data = json.load(file)
        #     test_processed = self.process_docred_file(test_data, neg_samples=None)

        # merge valid and train data and split them again into train, valid & test;
        # both helpers return the parts in the order (train, valid, test)
        all_labeled_data = list(train_data + valid_data)
        if self.train_valid_test_proportion:
            train_data, valid_data, test_data = self.split_by_relative(all_labeled_data)
        elif self.valid_test_data_size:
            train_data, valid_data, test_data = self.split_by_absolute(all_labeled_data)
        else:
            raise ValueError(
                "The train, valid and test splitting should be done either basing on their proportional values to "
                "each other (train_valid_test_proportion parameter), or on the absolute size of valid and test data "
                "(valid_test_data_size parameter). One of them should be set to the not-None value."
            )

        logger.info("Train data processing...")
        train_data, train_stat = self.process_docred_file(train_data, neg_samples="twice")

        logger.info("Valid data processing...")
        valid_data, valid_stat = self.process_docred_file(valid_data, neg_samples="equal")

        logger.info("Test data processing...")
        test_data, test_stat = self.process_docred_file(test_data, neg_samples="equal")

        self.print_statistics(train_stat, valid_stat, test_stat)

        return {"train": train_data, "valid": valid_data, "test": test_data}

    def split_by_absolute(self, all_labeled_data: List) -> Tuple[List, List, List]:
        """
        All annotated data from DocRED is splitted into train, valid and test sets in following proportions:
          len(valid_data) = len(test_data) = self.valid_test_data_size
          len(train_data) = len(all data) - 2 * self.valid_test_data_size
        The passed list is shuffled in place before it is cut.

        Args:
            all_labeled_data: List of all annotated data samples
        Return:
            Lists of train, valid and test data
        Raises:
            ValueError: if three times ``self.valid_test_data_size`` exceeds the number of samples
        """
        size = int(self.valid_test_data_size)
        if size * 3 > len(all_labeled_data):
            raise ValueError(
                f"The dataset size {len(all_labeled_data)} is too small for taking {self.valid_test_data_size} samples "
                f"for valid and test. Reduce the size of valid and test set."
            )

        random.shuffle(all_labeled_data)
        # consecutive, non-overlapping slices: slice ends are exclusive, so the next part starts exactly there
        valid_data = all_labeled_data[:size]
        test_data = all_labeled_data[size:2 * size]
        train_data = all_labeled_data[2 * size:]
        return train_data, valid_data, test_data

    def split_by_relative(self, all_labeled_data: List) -> Tuple[List, List, List]:
        """
        All annotated data from DocRED is splitted into train, valid and test sets: the data is divided into
        ``self.train_valid_test_proportion`` parts of equal size, one part becomes valid data, one part becomes
        test data and everything that is left becomes train data:
          len(valid_data) = len(test_data) = len(all data) // train_valid_test_proportion
          len(train_data) = len(all data) - 2 * len(valid_data)
        Consequently a proportion of 2 or less leaves no samples for training.
        The passed list is shuffled in place before it is cut.

        Args:
            all_labeled_data: List of all annotated data samples
        Return:
            Lists of train, valid and test data
        """
        random.shuffle(all_labeled_data)
        one_prop = len(all_labeled_data) // int(self.train_valid_test_proportion)

        # consecutive, non-overlapping slices: slice ends are exclusive, so the next part starts exactly there
        valid_data = all_labeled_data[:one_prop]
        test_data = all_labeled_data[one_prop:2 * one_prop]
        train_data = all_labeled_data[2 * one_prop:]
        return train_data, valid_data, test_data

    def process_docred_file(self, data: List[Dict], neg_samples: str = None) -> Tuple[List, Dict]:
        """
        Processes a DocRED data and returns a DeepPavlov relevant output

        Args:
            data: List of data units (documents); each unit provides "sents", "vertexSet" and optionally "labels"
            neg_samples: how many negative samples are to be generated for documents with labels
                Possible values:
                    - None: no negative samples will be generated
                    - equal: there will be at most one negative sample pro positive sample
                    - twice: there will be at most twice as many negative samples as positive ones
                    - thrice: there will be at most thrice as many negative samples as positive ones
                Documents without "labels" are always turned into negative samples only.
        Returns:
            a tuple of the list of processed samples and a dictionary {relation name: number of positive labels}
        """
        stat_rel_name = {rel_name: 0 for rel_name in self.relid2rel.values()}
        self.stat = {"POS_REL": 0, "NEG_REL": 0}  # collect statistics of positive and negative samples
        processed_data_samples = []

        for data_unit in data:
            ent_ids2ent_pos, ent_ids2ent_tag = {}, {}
            sents = data_unit["sents"]

            # get list of all tokens from the document
            doc = [token for sent in sents for token in sent]

            # the sentence start indices are needed for entities' indices recalculation to the whole text
            sents_begins = list(np.cumsum([0] + [len(sent) for sent in sents]))

            for ent_set_id, ent_set in enumerate(data_unit["vertexSet"]):
                # the list of tuples with each mention's new indices (recalculated regarding to the whole doc)
                ent_ids2ent_pos[ent_set_id] = [
                    (ent["pos"][0] + sents_begins[ent["sent_id"]], ent["pos"][1] + sents_begins[ent["sent_id"]])
                    for ent in ent_set
                ]
                # get the sample NER tag (logically, the same for all entity mentions)
                ent_ids2ent_tag[ent_set_id] = ent_set[0]["type"]

            # if no labels are provided for the data, handle all samples as negative ones
            if "labels" not in data_unit:
                processed_data_samples += self.construct_neg_samples(ent_ids2ent_pos, ent_ids2ent_tag, doc)

            # if labels are provided, save samples as positive samples and generate negatives
            else:
                curr_processed_data_samples, stat_rel_name = self.construct_pos_neg_samples(
                    data_unit["labels"], ent_ids2ent_pos, ent_ids2ent_tag, doc, stat_rel_name,
                    neg_samples=neg_samples,
                )
                processed_data_samples += curr_processed_data_samples

        logger.info(f"Pos samples: {self.stat['POS_REL']}  Neg samples: {self.stat['NEG_REL']}.")
        self.stat.pop("POS_REL")
        self.stat.pop("NEG_REL")

        return processed_data_samples, stat_rel_name

    def construct_pos_neg_samples(
            self, labels: List, ent_id2ent: Dict, ent_id2ent_tag: Dict, doc: List, stat_rel: Dict, neg_samples: str,
    ) -> Tuple[List, Dict]:
        """
        Transforms the relevant information into an entry of the DocRED reader output. The entities between which
        the relation is hold will serve as an annotation for positive samples, while all other entity pairs will be
        used to construct the negative samples.

        Entity pairs are visited in the order of ``itertools.permutations``; a negative pair is kept only while the
        number of negatives collected so far is below the allowed multiple of the positives collected so far.

        Args:
            labels: information about relation found in a document (whole labels list of the original DocRED)
            ent_id2ent: a dictionary {entity id: [entity mentions' positions]}
            ent_id2ent_tag: a dictionary {entity id: entity NER tag}
            doc: list of all tokens of the document
            stat_rel: a dictionary with relation statistics (will be updated)
            neg_samples: amount of negative samples that are to be generated ("equal", "twice" or "thrice");
                any other value results in no negative samples
        Returns:
            a tuple of the list of generated samples and the updated relation statistics
        """

        num_pos_samples, num_neg_samples = 0, 0
        data_samples = []

        # collect all relation ids annotated for every (head, tail) entity pair
        rel_triples = {}
        for label_info in labels:
            pair = (label_info["h"], label_info["t"])
            rel_triples.setdefault(pair, []).append(self.rel2id[label_info['r']])

        # the one hot encoding of the negative label
        neg_label_one_hot = self.label_to_one_hot([self.rel2id[self.negative_label]])

        # None means that no negative samples should be generated at all
        neg_per_pos = self._NEG_PER_POS.get(neg_samples) if neg_samples else None

        # iterate over all ordered entity pairs
        for ent1, ent2 in itertools.permutations(ent_id2ent, 2):
            pair_labels = rel_triples.get((ent1, ent2))

            # if there is a relation hold between entities, save them (and a corresponding sample) as positive one
            if pair_labels is not None:
                num_pos_samples += 1
                data_samples.append(
                    self.generate_data_sample(
                        doc, ent1, ent2, self.label_to_one_hot(pair_labels), ent_id2ent, ent_id2ent_tag
                    )
                )
                self.stat["POS_REL"] += 1

                for label in pair_labels:
                    stat_rel[self.relid2rel[self.id2rel[label]]] += 1
                continue

            # if there is no relation hold between entities, save them (and a corresponding sample) as negative one
            # as long as the quota of negatives for the positives seen so far is not exhausted
            if neg_per_pos is not None and num_neg_samples < neg_per_pos * num_pos_samples:
                num_neg_samples += 1
                data_samples.append(
                    self.generate_data_sample(doc, ent1, ent2, neg_label_one_hot, ent_id2ent, ent_id2ent_tag)
                )
                self.stat["NEG_REL"] += 1

        return data_samples, stat_rel

    def construct_neg_samples(
            self, ent_id2ent: Dict, ent_id2ent_tag: Dict, doc: List
    ) -> List[Tuple[Tuple[List, List], List]]:
        """
        Turn the annotated documents but without any positive relation label to the negative samples in a format of
            the DocRED reader output.

        Args:
            ent_id2ent: a dictionary {entity id: [entity mentions' positions]}
            ent_id2ent_tag: a dictionary {entity id: entity NER tag}
            doc: list of all tokens of the document
        Returns:
            a list with one negative sample (doc tokens, entity positions & NER tags, negative label) for every
            ordered pair of entities.
        """
        neg_data_samples = []
        neg_label_one_hot = self.label_to_one_hot([self.rel2id[self.negative_label]])
        for ent1, ent2 in itertools.permutations(ent_id2ent.keys(), 2):
            neg_data_samples.append(
                self.generate_data_sample(doc, ent1, ent2, neg_label_one_hot, ent_id2ent, ent_id2ent_tag)
            )
            self.stat["NEG_REL"] += 1
        return neg_data_samples

    @staticmethod
    def generate_data_sample(
            doc: List, ent1: int, ent2: int, label: List, ent_id2ent: Dict, ent_id2ent_tag: Dict
    ) -> Tuple[List[Union[List, List]], List]:
        """
        Creates an entry of processed docred corpus

        Args:
            doc: list of all tokens of the document
            ent1: id of the first entity of the pair
            ent2: id of the second entity of the pair
            label: encoded relation label of the pair
            ent_id2ent: a dictionary {entity id: [entity mentions' positions]}
            ent_id2ent_tag: a dictionary {entity id: entity NER tag}
        Returns:
            a tuple ([doc, [positions of ent1, positions of ent2], [tag of ent1, tag of ent2]], label)
        """
        features = [
            doc,
            [ent_id2ent[ent1], ent_id2ent[ent2]],
            [ent_id2ent_tag[ent1], ent_id2ent_tag[ent2]]
        ]
        return features, label

    def generate_additional_neg_samples(self, doc: List, forbidden_entities: List, num_neg_samples: int):
        """
        <CURRENTLY NOT USED>
        Generated negative samples, i.e. the same document that is used for positive samples, but labeled with
        "no_relation" label and with entities, that are not connected with any relation, marked as such.

        Note that the entries built here are 3-element tuples (doc, entity information, label) and therefore differ
        from the entries produced by ``generate_data_sample``.

        Args:
             doc: list of positive sentences
             forbidden_entities: list of entities that participate in any of the relations (and, therefore, cannot be
                chosen for negative sample)
             num_neg_samples: number of negative samples that are to be generated out of this document
        Returns:
             a list of tuples with list of all doc tokens, entity information (tokens & NER tags) and relation
             (=neg_label).
        Raises:
            ValueError: raised by ``random.sample`` if fewer than ``2 * num_neg_samples`` candidate tokens exist
        """
        # ATTENTION! To make it work, please run the following command: python3 -m deeppavlov install ner_ontonotes_bert

        from deeppavlov import build_model, configs
        ner = build_model(configs.ner.ner_ontonotes_bert_mult, download=True)
        neg_data_samples = []
        analysed_sentences = ner([" ".join(doc)])  # returns [[[tokens]], [[ner tags]]]
        tokens, ner_tags = analysed_sentences[0][0], analysed_sentences[1][0]

        # select ids of tokens that were not part of any relation so far
        neg_entities_idx = random.sample(
            [ent_idx for ent_idx in range(len(tokens)) if tokens[ent_idx] not in forbidden_entities],
            num_neg_samples * 2
        )

        # the one hot encoding of the negative label
        neg_label_one_hot = self.label_to_one_hot([self.rel2id[self.negative_label]])

        for n_ent_1_idx, n_ent_2_idx in itertools.permutations(neg_entities_idx, 2):
            # if already sufficient number of negative samples have been generated
            if len(neg_data_samples) == num_neg_samples:
                break
            neg_data_samples.append(
                (
                    doc,
                    [[tokens[n_ent_1_idx]], [tokens[n_ent_2_idx]], ner_tags[n_ent_1_idx], ner_tags[n_ent_2_idx]],
                    neg_label_one_hot
                )
            )
            self.stat["NEG_REL"] += 1

        return neg_data_samples

    def label_to_one_hot(self, labels: List[int]) -> List:
        """
        Turn labels to one hot encodings

        Args:
            labels: relation ids to be switched on
        Returns:
            a list of zeros of length ``len(self.rel2id)`` with ones at the positions given in ``labels``
        """
        relation = [0] * len(self.rel2id)
        for label in labels:
            relation[label] = 1
        return relation

    def print_statistics(self, train_stat: Dict, valid_stat: Dict, test_stat: Dict) -> None:
        """
        Log the relation statistics as a table with one row per relation name

        Args:
            train_stat: a dictionary {relation name: number of labels} of the train data
            valid_stat: a dictionary {relation name: number of labels} of the valid data
            test_stat: a dictionary {relation name: number of labels} of the test data
        """
        df = pd.DataFrame([self.rel2relid, train_stat, valid_stat, test_stat]).T
        # columns are named d1, d2, ... in the order: relation id, train, valid, test
        df.columns = [f"d{i}" for i in range(1, len(df.columns) + 1)]
        logger.info("\n")
        logger.info(df)


================================================
FILE: deeppavlov/dataset_readers/faq_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict

from pandas import read_csv

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('faq_reader')
class FaqDatasetReader(DatasetReader):
    """Reader for FAQ dataset"""

    def read(self, data_path: str = None, data_url: str = None, x_col_name: str = 'x', y_col_name: str = 'y') -> Dict:
        """
        Read FAQ dataset from specified csv file or remote url

        If both ``data_url`` and ``data_path`` are given, ``data_url`` takes precedence.

        Parameters:
            data_path: path to csv file of FAQ
            data_url: url to csv file of FAQ
            x_col_name: name of Question column in csv file
            y_col_name: name of Answer column in csv file

        Returns:
            A dictionary containing training, validation and test parts of the dataset obtainable via
            ``train``, ``valid`` and ``test`` keys. All question-answer pairs (stripped of surrounding
            whitespace) are put into ``train``; ``valid`` and ``test`` are empty lists.

        Raises:
            ValueError: if neither ``data_path`` nor ``data_url`` is specified
        """

        if data_url is not None:
            source = data_url
        elif data_path is not None:
            source = data_path
        else:
            raise ValueError("Please specify data_path or data_url parameter")

        data = read_csv(source)

        # iterate over the column values positionally so that the result does not depend on the frame's index labels
        train_xy_tuples = [
            (question.strip(), answer.strip())
            for question, answer in zip(data[x_col_name], data[y_col_name])
        ]

        return {"train": train_xy_tuples, "valid": [], "test": []}


================================================
FILE: deeppavlov/dataset_readers/huggingface_dataset_reader.py
================================================
# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import re
from collections import Counter
from math import floor
from typing import Dict, Optional, List, Union

from datasets import load_dataset, Dataset, Features, ClassLabel, concatenate_datasets

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('huggingface_dataset_reader')
class HuggingFaceDatasetReader(DatasetReader):
    """Adds HuggingFace Datasets https://huggingface.co/datasets/ to DeepPavlov
    """

    def read(self,
             path: str,
             name: Optional[str] = None,
             train: Optional[str] = None,  # for lidirus with no train
             valid: Optional[str] = None,
             test: Optional[str] = None,
             **kwargs) -> Dict[str, Dataset]:
        """Wraps datasets.load_dataset method

        Args:
            path: datasets.load_dataset path argument (e.g., `glue`)
            name: datasets.load_dataset name argument (e.g., `mrpc`). For `russian_super_glue` the suffix
                `_mixed` is removed before loading; `terra_mixed`, `rcb_mixed` and `danetqa_mixed` additionally
                get the train split of the corresponding `super_glue` task prepended to their train split.
            train: split name to use as training data.
            valid: split name to use as validation data.
            test: split name to use as test data.
            **kwargs: the keys below are consumed here, everything else is passed to datasets.load_dataset:
                downsample_ratio: a number or a list with one number per requested split (default 1.),
                    used for `record`/`rucos` downsampling
                seed: seed for the downsampling (default 42)
                dev_percentage: percentage of the second split moved to the first one for `boolq` (default 50)
                do_index_correction: passed to the downsampling as `do_correction` (default True)

        Returns:
            Dict[str, Dataset]: dictionary that maps `train`, `valid`, `test` (only those that were requested)
            to the loaded and preprocessed datasets

        Raises:
            RuntimeError: if `split` is passed as a keyword argument
            ValueError: if the number of downsample ratios differs from the number of requested splits
        """
        if 'split' in kwargs:
            raise RuntimeError('Split argument was used. Use train, valid, test arguments instead of split.')

        # pop elements not relevant to BuilderConfig
        downsample_ratio: Union[List[float], float] = kwargs.pop("downsample_ratio", 1.)
        seed = kwargs.pop("seed", 42)
        percentage = kwargs.pop("dev_percentage", 50)
        do_index_correction = kwargs.pop("do_index_correction", True)

        # keep only the splits that were actually requested
        requested = {'train': train, 'valid': valid, 'test': test}
        split_mapping = {key: split for key, split in requested.items() if split}
        split_names = list(split_mapping.values())

        # a single number (int as well as float) is broadcast to all splits
        if isinstance(downsample_ratio, (int, float)):
            downsample_ratio = [downsample_ratio] * len(split_mapping)
        elif isinstance(downsample_ratio, list) and len(downsample_ratio) != len(split_mapping):
            raise ValueError("The number of downsample ratios must be the same as the number of splits")

        # remember the name as it was requested: the `_mixed` suffix is unknown to the hub and has to be removed
        # for loading, but it is still needed below to decide whether English data has to be mixed in
        requested_name = name
        if path == "russian_super_glue" and name is not None and "_mixed" in name:
            name = name.replace("_mixed", "")

        # boolq is loaded directly with the interleaved split specification instead of being loaded twice
        # (danetqa doesn't require the same preprocessing)
        is_boolq = path == "super_glue" and name == "boolq"
        if is_boolq:
            load_splits = interleave_splits(splits=split_names, percentage=percentage)
        else:
            load_splits = split_names

        dataset = load_dataset(path=path, name=name, split=load_splits, **kwargs)

        if (path == "super_glue" and name == "copa") or (path == "russian_super_glue" and name == "parus"):
            lang = "en" if name == "copa" else "ru"
            dataset = [
                dataset_split.map(preprocess_copa, batched=True, fn_kwargs={"lang": lang}) for dataset_split in dataset
            ]
        elif is_boolq:
            dataset = [dataset_split.map(preprocess_boolq, batched=True) for dataset_split in dataset]
        elif (path == "super_glue" and name == "record") or (path == "russian_super_glue" and name == "rucos"):
            label_column = "label"
            dataset = [
                binary_downsample(
                    add_label_names(
                        dataset_split.map(preprocess_record,
                                          batched=True,
                                          remove_columns=["answers"]),
                        label_column=label_column,
                        label_names=["False", "True"]
                    ),
                    ratio=ratio,
                    seed=seed,
                    label_column=label_column,
                    do_correction=do_index_correction
                ).map(add_num_examples, batched=True, batch_size=None)
                for dataset_split, ratio
                in zip(dataset, downsample_ratio)
            ]
        elif (path == "super_glue" and name == "multirc") or (path == "russian_super_glue" and name == "muserc"):
            dataset = [
                dataset_split.map(
                    preprocess_multirc, batched=True, remove_columns=["paragraph", "question"]
                ) for dataset_split in dataset
            ]
        elif (path == "super_glue" and name == "wsc") or (path == "russian_super_glue" and name == "rwsd"):
            dataset = [
                dataset_split.map(
                    preprocess_wsc,
                    batched=True,
                    remove_columns=["span1_index", "span2_index", "span1_text", "span2_text"],
                ) for dataset_split in dataset
            ]
        elif path == "russian_super_glue" and requested_name == "terra_mixed" and "train" in split_names:
            tmp_dataset = []
            for d, split in zip(dataset, split_names):
                if split == "train":
                    to_mix = load_dataset("super_glue", "rte", split="train")
                    tmp_dataset.append(concatenate_datasets([to_mix, d]))
                else:
                    tmp_dataset.append(d)
            dataset = tmp_dataset
        elif path == "russian_super_glue" and requested_name == "rcb_mixed" and "train" in split_names:
            tmp_dataset = []
            for d, split in zip(dataset, split_names):
                # the columns missing in the English dataset are dropped from every split
                d = d.remove_columns(["verb", "negation"])
                if split == "train":
                    to_mix = load_dataset("super_glue", "cb", split="train")
                    tmp_dataset.append(concatenate_datasets([to_mix, d]))
                else:
                    tmp_dataset.append(d)
            dataset = tmp_dataset
        elif path == "russian_super_glue" and requested_name == "danetqa_mixed" and "train" in split_names:
            tmp_dataset = []
            for d, split in zip(dataset, split_names):
                if split == "train":
                    to_mix = load_dataset(
                        "super_glue", "boolq", split="train"
                    ).map(
                        preprocess_boolq, batched=True
                    ).cast(d.features)
                    tmp_dataset.append(concatenate_datasets([to_mix, d]))
                else:
                    tmp_dataset.append(d)
            dataset = tmp_dataset
        return dict(zip(split_mapping.keys(), dataset))


def interleave_splits(splits: List[str], percentage: int = 50) -> List[str]:
    """Adds a portion of `dev` (or, `test` if there's only `train` and `test`) set to the `train` set.
    Assumes that at least two splits are passed, ordered as (train, dev, test).
    The first `percentage` percent of the second split are appended to the first split, the remaining part of
    the second split is kept as the second split, so the two resulting splits never share samples.
    Args:
        splits: list of strings
        percentage: percentage (represented as an integer value between 0 and 100)
                    of samples to extract from `dev` and add to `train`
    Returns:
        List[str] containing mixing instructions (e.g. ['train+validation[:50%]', 'validation[50%:]']);
        splits after the second one are passed through unchanged
    Raises:
        ValueError: if fewer than two splits are passed
    """
    if len(splits) < 2:
        raise ValueError("At least two splits should be passed to this function")
    first, second = splits[0], splits[1]
    mixed_splits = [
        f"{first}+{second}[:{percentage}%]",
        # the complement of the slice above; `[-percentage%:]` would overlap with it for percentage > 50
        # and would silently drop samples for percentage < 50
        f"{second}[{percentage}%:]",
    ]
    mixed_splits.extend(splits[2:])
    return mixed_splits


def preprocess_copa(examples: Dataset, *, lang: str = "en") -> Dict[str, List[List[str]]]:
    """Batched COPA preprocessing meant to be passed to the map function.

    Every premise is joined with a natural-language question that corresponds to the
    question type of the example (`cause` or `effect`); the resulting context is repeated
    once per answer choice.
    Args:
        examples: a batch providing the `question`, `premise`, `choice1` and `choice2` columns
        lang: task language. Either `en` or `ru`.
    Returns:
        Dict[str, List[List[str]]]: `contexts` and `choices`, both nested lists whose inner
        lists have one element per choice (2 in this case)
    Raises:
        ValueError: if `lang` is neither `en` nor `ru`
    """
    if lang not in ("en", "ru"):
        raise ValueError(f"Incorrect `lang` value '{lang}'. Should be either 'en' or 'ru'.")

    if lang == "en":
        cause_question, effect_question = "What was the cause of this?", "What happened as a result?"
    else:
        cause_question, effect_question = "Что было причиной этого?", "Что случилось в результате?"
    question_by_type = {"cause": cause_question, "effect": effect_question}

    options_per_example = 2

    question_texts = [question_by_type[kind] for kind in examples["question"]]
    contexts = []
    for premise, question_text in zip(examples["premise"], question_texts):
        # the same context string is paired with each of the choices
        contexts.append([f"{premise} {question_text}"] * options_per_example)

    choices = [list(pair) for pair in zip(examples["choice1"], examples["choice2"])]

    return {"contexts": contexts, "choices": choices}


def preprocess_boolq(examples: Dataset) -> Dict[str, List[str]]:
    """BoolQ preprocessing to be applied by the map function. The preprocessing boils down
    to removing redundant titles from the passages.
    Args:
        examples: a batch providing the `passage` column
    Returns:
        Dict[str, List[str]]: processed features (just the passage in this case)
    """
    # The match is anchored at the start of the passage and is non-greedy on purpose:
    # it stops at the FIRST "-- " separator, so a "-- " that occurs later in the text
    # body does not cause a part of the body to be removed together with the title.
    title_pattern = re.compile(r"^.+?-- ")

    def remove_passage_title(passage: str) -> str:
        """Removes the title of a given passage. The motivation is that the title duplicates
        the beginning of the text body, which means that it's redundant. We remove to save space.
        Args:
            passage: a single `passage` string
        Returns:
            str: the same `passage` string with the title removed (unchanged if no title is found)
        """
        return title_pattern.sub("", passage)

    passages = [remove_passage_title(passage) for passage in examples["passage"]]

    return {"passage": passages}


def preprocess_record(examples: Dataset, *, clean_entities: bool = True) -> Dict[str, Union[List[str], List[int]]]:
    """ReCoRD preprocessing to be applied by the map function. This transforms the original
    nested structure of the dataset into a flat one. New indices are generated to allow for
    the restoration of the original structure. The resulting dataset amounts to a binary
    classification problem.

    Examples whose labels sum up to zero (i.e. none of the candidate entities is among
    the answers, or there are no candidates at all) are dropped.
    Args:
        examples: a batch providing the `query`, `passage`, `answers`, `entities` and `idx` columns
        clean_entities: a boolean flag indicating whether to clean-up given entities and answers
            (strip surrounding whitespace/punctuation and remove duplicates, keeping the
            order of the first occurrence)
    Returns:
        Dict[str, Union[List[str], List[int]]]: flattened features of the dataset
    """

    def fill_placeholder(sentence: str, candidate: str) -> str:
        """Fills `@placeholder` of a given query with the provided entity
        Args:
            sentence: query to fill
            candidate: entity candidate for the query
        Returns:
            str: `sentence` with `@placeholder` replaced with `candidate`
            (backslashes are removed from the candidate)
        """
        return sentence.replace("@placeholder", candidate.replace("\\", ""))

    def remove_highlight(context: str) -> str:
        """Removes highlights from a given passage
        Args:
            context: a passage to remove highlights from
        Returns:
            str: `context` with highlights removed
        """
        return context.replace("\n@highlight\n", ". ")

    def clean_and_deduplicate(items: List[str]) -> List[str]:
        """Strips surrounding whitespace/punctuation and removes duplicates
        Args:
            items: entities or answers of a single example
        Returns:
            List[str]: unique cleaned items in the order of their first occurrence
        """
        # `dict.fromkeys` keeps insertion order, so the result is reproducible between
        # runs, unlike `list(set(...))` whose order depends on string hash randomization
        return list(dict.fromkeys(item.strip("\n ,.!") for item in items))

    queries: List[str] = examples["query"]
    passages: List[str] = [remove_highlight(passage) for passage in examples["passage"]]
    answers: List[List[str]] = examples["answers"]
    entities: List[List[str]] = examples["entities"]
    indices: List[Dict[str, int]] = examples["idx"]

    if clean_entities:
        entities = [clean_and_deduplicate(list_of_entities) for list_of_entities in entities]
        answers = [clean_and_deduplicate(list_of_answers) for list_of_answers in answers]

    # new indices for flat examples
    merged_indices: List[str] = []
    # queries with placeholders filled
    filled_queries: List[str] = []
    # duplicated passages
    extended_passages: List[str] = []
    # contains one entity per flat example
    flat_entities: List[str] = []
    # whether the entity in this example is found in the answers (0 or 1),
    # or -1 if the example has no answers at all
    labels: List[int] = []

    for query, passage, list_of_answers, list_of_entities, index in zip(queries,
                                                                        passages,
                                                                        answers,
                                                                        entities,
                                                                        indices):
        num_candidates: int = len(list_of_entities)

        candidate_queries: List[str] = [fill_placeholder(query, entity) for entity in list_of_entities]
        cur_labels: List[int] = [
            int(entity in list_of_answers) if list_of_answers else -1 for entity in list_of_entities
        ]

        # examples without a single positive candidate are skipped
        if sum(cur_labels) == 0:
            continue

        # keep track of the indices to be able to use target metrics
        passage_index: int = index["passage"]
        query_index: int = index["query"]
        example_index: str = f"{passage_index}-{query_index}-{num_candidates}"

        merged_indices.extend([example_index] * num_candidates)
        filled_queries.extend(candidate_queries)
        extended_passages.extend([passage] * num_candidates)
        flat_entities.extend(list_of_entities)
        labels.extend(cur_labels)

    return {"idx": merged_indices,
            "query": filled_queries,
            "passage": extended_passages,
            "entities": flat_entities,
            "label": labels}


def add_label_names(dataset: Dataset, label_column: str, label_names: List[str]):
    """Attach human-readable class names to the label column of a dataset.

    The label column is re-declared as a `ClassLabel` feature built from `label_names`
    and the dataset is cast to the resulting set of features.
    All labels (i.e. integers) in the dataset should be < than the number of label names.
    Args:
        dataset: a Dataset to add label names to
        label_column: the name of the label column (such as `label` or `labels`) in the dataset
        label_names: a list of label names
    Returns:
        Dataset: the result of casting `dataset` to the features with added label names
    """
    # a copy is modified so that the features of the passed dataset stay untouched
    features_with_names: Features = dataset.features.copy()
    named_label: ClassLabel = ClassLabel(names=label_names)
    features_with_names[label_column] = named_label
    return dataset.cast(features_with_names)


def binary_downsample(dataset: Dataset,
                      ratio: float = 0.,
                      seed: int = 42,
                      label_column: str = "label",
                      *,
                      do_correction: bool = True) -> Dataset:
    """Downsamples a given dataset to the specified negative to positive examples ratio. Only works with
    binary classification datasets with labels denoted as `0` and `1`.
    Args:
        dataset: a Dataset to downsample
        ratio: negative to positive examples ratio to maintain. If it is not positive, all negative
            examples are kept. If it requests more negative examples than the dataset contains,
            all negative examples are kept as well.
        seed: a seed for shuffling
        label_column: the name of `label` column such as 'label' or 'labels'
        do_correction: correct resampled indices. If indices aren't corrected then examples with mismatched
            indices will not be accounted for by ReCoRD metrics. This is not necessarily undesirable because
            examples with such indices will have less negative examples (or even none), which makes them easier
            for the model, thus inflating the resulting metrics.
    Returns:
        Dataset: a downsampled dataset (the dataset itself if all of its labels are `-1`)
    Raises:
        ValueError: if the set of labels in the dataset is neither `[-1]` nor `{0, 1}`
    """

    def replace_indices(data: Dataset, index_map: Dict[str, str]) -> Dict[str, List[str]]:
        """Batched `map` callback that swaps outdated `idx` values for the corrected ones
        Args:
            data: a batch with the `idx` column
            index_map: a dictionary containing replacement indices
        Returns:
            Dict[str, List[str]]: the `idx` column with replacements applied
        """
        return {"idx": [index_map.get(el, el) for el in data["idx"]]}

    def get_correct_indices_map(data: Dataset) -> Dict[str, str]:
        """Generate a dictionary with replacements for indices that
        are no longer correct due to downsampling (i.e. the total number
        of elements denoted by the last part of an index has changed)
        Args:
            data: a downsampled Dataset
        Returns:
            Dict[str, str]: a dictionary containing replacement indices
        """
        corrected_index_map: Dict[str, str] = dict()
        for idx, n_elements in Counter(data["idx"]).items():
            # the last dash-separated part of an index is the expected number of elements
            *prefix, expected_n_elements = idx.split("-")
            if int(expected_n_elements) != n_elements:
                corrected_index_map[idx] = "-".join([*prefix, str(n_elements)])
        return corrected_index_map

    def correct_indices(data: Dataset) -> Dataset:
        """Sets correct number of examples in downsampled indices
        Args:
            data: a downsampled dataset
        Returns:
            Dataset: the same dataset with correct indices
        """
        index_map: Dict[str, str] = get_correct_indices_map(data)
        return data.map(replace_indices, batched=True, fn_kwargs={"index_map": index_map})

    dataset_labels = dataset.unique(label_column)
    # `test` split shouldn't be downsampled
    if dataset_labels == [-1]:
        return dataset
    # the same logic is not applicable to cases with != 2 classes
    if set(dataset_labels) != {0, 1}:
        raise ValueError(f"Only binary classification labels are supported (i.e. [0, 1]), but {dataset_labels} were given")

    # positive examples are denoted with `1`
    num_positive: int = sum(dataset[label_column])
    num_available_negative: int = len(dataset) - num_positive
    if ratio > 0:
        # never request more negative examples than the dataset actually has,
        # otherwise the selection below would address rows beyond the end of the dataset
        num_negative: int = min(floor(num_positive * ratio), num_available_negative)
    else:
        # the original number of negative examples is returned if `ratio` is not explicitly specified
        num_negative = num_available_negative
    # first `num_positive` examples in a sorted dataset are labeled with `1`
    # while the rest are labeled with `0`
    sorted_dataset: Dataset = dataset.sort(label_column, reverse=True)
    # but we need to reshuffle the dataset before returning it
    shuffled_dataset: Dataset = sorted_dataset.select(range(num_positive + num_negative)).shuffle(seed=seed)
    if do_correction:
        shuffled_dataset = correct_indices(shuffled_dataset)
    return shuffled_dataset


def add_num_examples(dataset: Dataset) -> Dict[str, List[int]]:
    """Produce a `num_examples` column that stores the size of the dataset in every row.

    Must be applied to the whole dataset at once (i.e. `batched=True, batch_size=None`),
    otherwise the number will be the size of a batch rather than of the dataset.
    Args:
        dataset: a Dataset to add number of examples to
    Returns:
        Dict[str, List[int]]: total number of examples repeated for each example
    """
    # the length of the first column is taken as the number of examples
    first_column = next(iter(dataset))
    total = len(dataset[first_column])
    return {"num_examples": [total for _ in range(total)]}


def preprocess_multirc(examples: Dataset, *, clean_paragraphs: bool = True) -> Dict[str, List[str]]:
    """Join every paragraph with the question that follows it into a single string.

    Args:
        examples: A batch providing the `paragraph` and `question` columns.
        clean_paragraphs: Whether to remove parenthesized one- or two-digit markers
            such as ``(1)`` or ``(12)`` from the paragraphs, trim them and collapse
            every run of whitespace into a single space.

    Returns:
        Dict[str, List[str]]: Composed strings stored under the `context` key.

    """
    paragraphs: List[str] = examples["paragraph"]
    questions: List[str] = examples["question"]

    if clean_paragraphs:
        cleaned_paragraphs: List[str] = []
        for paragraph in paragraphs:
            # drop the markers first, then normalize whitespace of the trimmed text
            without_markers = re.sub(r"\(\d{1,2}\)", "", paragraph).strip()
            cleaned_paragraphs.append(re.sub(r"\s+", " ", without_markers))
        paragraphs = cleaned_paragraphs

    return {"context": [f"{paragraph} {question}" for paragraph, question in zip(paragraphs, questions)]}


def preprocess_wsc(dataset: Dataset) -> Dict[str, List[str]]:
    """Compose answers by putting capitalized span2 in front of span1.

    Args:
        dataset: A batch providing the `span1_text` and `span2_text` columns.

    Returns:
        Dict[str, List[str]]: Strings of the form "<capitalized span2> <span1>" stored
        under the `answer` key.

    """
    entities: List[str] = dataset["span1_text"]
    descriptions: List[str] = dataset["span2_text"]

    answers = []
    for entity, description in zip(entities, descriptions):
        # `str.capitalize` upper-cases the first character and lower-cases the rest
        answers.append(f"{description.capitalize()} {entity}")
    return {"answer": answers}


================================================
FILE: deeppavlov/dataset_readers/imdb_reader.py
================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from logging import getLogger
from typing import List, Dict, Any, Optional, Tuple
from pathlib import Path

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download_decompress, mark_done, is_done

# Module-level logger named after this module.
log = getLogger(__name__)


@register('imdb_reader')
class ImdbReader(DatasetReader):
    """This class downloads and reads the IMDb sentiment classification dataset.

    https://ai.stanford.edu/~amaas/data/sentiment/

    Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts.
    (2011). Learning Word Vectors for Sentiment Analysis. The 49th Annual Meeting of the Association
    for Computational Linguistics (ACL 2011).
    """

    def read(self, data_path: str, url: Optional[str] = None,
             *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
        """Download the dataset if necessary and read its `train` and `test` parts.

        Args:
            data_path: A path to a folder with dataset files.
            url: A url to the archive with the dataset to download if the data folder
                is not marked as done. The Stanford archive is used when omitted.

        Returns:
            A dictionary with ``train`` and ``test`` keys. Each value is a list of
            ``(text, [label])`` pairs where the label is either ``neg`` or ``pos``.

        Raises:
            RuntimeError: if a folder with labelled reviews is missing or a part of
                the dataset contains no reviews.
        """
        root = Path(data_path)
        source_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz" if url is None else url

        if not is_done(root):
            log.info('[downloading data from {} to {}]'.format(source_url, root))
            download_decompress(source_url, root)
            mark_done(root)

        # the dataset files may be located in the nested `aclImdb` folder
        nested_root = root / "aclImdb"
        if nested_root.exists():
            root = nested_root

        data = {"train": [],
                "test": []}
        for data_type, samples in data.items():
            for label in ("neg", "pos"):
                label_dir = root / data_type / label
                if not label_dir.exists():
                    raise RuntimeError(f"Cannot load data: {label_dir} does not exist")
                for review_path in label_dir.glob("*.txt"):
                    with review_path.open(encoding='utf-8') as review_file:
                        review_text = review_file.read()
                    samples.append((review_text, [label]))

            if not samples:
                raise RuntimeError(f"Could not load the '{data_type}' dataset, "
                                   "probably data dirs are empty")

        return data


================================================
FILE: deeppavlov/dataset_readers/line_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('line_reader')
class LineReader(DatasetReader):
    """Read txt file by lines"""

    def read(self, data_path: str = None, *args, **kwargs) -> Dict:
        """Read lines from txt file

        Args:
            data_path: path to txt file (UTF-8 encoded)

        Returns:
            A dictionary containing training, validation and test parts of the dataset obtainable
            via ``train``, ``valid`` and ``test`` keys. All lines of the file (with their line
            endings preserved) are put into ``train`` as one-element tuples, while ``valid``
            and ``test`` are empty.
        """

        # The encoding is set explicitly so that the result does not depend on the
        # locale of the machine.
        with open(data_path, encoding='utf-8') as f:
            content = f.readlines()

        dataset = dict()
        dataset["train"] = [(line,) for line in content]
        dataset["valid"] = []
        dataset["test"] = []

        return dataset


================================================
FILE: deeppavlov/dataset_readers/morphotagging_dataset_reader.py
================================================
# Copyright 2018 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from logging import getLogger
from pathlib import Path
from typing import Dict, List, Union, Tuple, Optional

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download_decompress, mark_done

# Default zero-based indices of the tab-separated columns read by ``read_infile``:
# the word, its part-of-speech label and its fine-grained tag ...
WORD_COLUMN, POS_COLUMN, TAG_COLUMN = 1, 3, 5
# ... and the syntactic head position together with the dependency label.
HEAD_COLUMN, DEP_COLUMN = 6, 7

# Module-level logger named after this module.
log = getLogger(__name__)


def get_language(filepath: str) -> str:
    """Extracts language from typical UD filename

    The language is the part of the name that precedes the first ``-``
    (the whole name if there is no ``-`` in it).
    """
    language, _, _ = filepath.partition("-")
    return language


def read_infile(infile: Union[Path, str], *, from_words=False,
                word_column: int = WORD_COLUMN, pos_column: int = POS_COLUMN,
                tag_column: int = TAG_COLUMN, head_column: int = HEAD_COLUMN,
                dep_column: int = DEP_COLUMN, max_sents: int = -1,
                read_only_words: bool = False, read_syntax: bool = False) -> List[Tuple[List, Union[List, None]]]:
    """Reads input file in CONLL-U format

    Sentences are separated by empty lines, lines starting with ``#`` are skipped.
    Unless ``from_words`` is set, lines whose first column is not a plain number
    are skipped as well.

    Args:
        infile: a path to a file or ``sys.stdin``
        from_words: whether the input contains only words; in this case words are read
            from the first column and no tags are returned (``read_only_words`` is forced)
        word_column: column containing words (default=1)
        pos_column: column containing part-of-speech labels (default=3)
        tag_column: column containing fine-grained tags (default=5)
        head_column: column containing syntactic head position (default=6)
        dep_column: column containing syntactic dependency label (default=7)
        max_sents: maximal number of sentences to read
        read_only_words: whether to read only words
        read_syntax: whether to return ``heads`` and ``deps`` alongside ``tags``. Ignored if read_only_words is ``True``

    Returns:
        a list of sentences. Each item contains a word sequence and an output sequence.
        The output sentence is ``None``, if ``read_only_words`` is ``True``,
        a single list of word tags if ``read_syntax`` is False,
        and a list of the form [``tags``, ``heads``, ``deps``] in case ``read_syntax`` is ``True``.

    """
    if from_words:
        word_column, read_only_words = 0, True

    def pack_sentence(words: List, tags: List, heads: List, deps: List) -> Tuple[List, Union[List, None]]:
        """Builds a ``(words, output)`` item according to the requested output format."""
        if read_only_words:
            return words, None
        if read_syntax:
            return words, [tags, heads, deps]
        return words, tags

    answer = []
    curr_word_sent, curr_tag_sent = [], []
    curr_head_sent, curr_dep_sent = [], []
    fin = sys.stdin if infile is sys.stdin else open(infile, "r", encoding="utf8")
    # try/finally guarantees that the opened file is closed even if a malformed line
    # raises an error (e.g. a non-integer head or a missing column)
    try:
        for line in fin:
            line = line.strip()
            if line.startswith("#"):
                continue
            if line == "":
                # an empty line finishes the current sentence
                if curr_word_sent:
                    answer.append(pack_sentence(curr_word_sent, curr_tag_sent, curr_head_sent, curr_dep_sent))
                curr_word_sent, curr_tag_sent = [], []
                curr_head_sent, curr_dep_sent = [], []
                if len(answer) == max_sents:
                    break
                continue
            splitted = line.split("\t")
            index = splitted[0]
            if not from_words and not index.isdigit():
                continue
            curr_word_sent.append(splitted[word_column])
            if read_only_words:
                continue
            pos, tag = splitted[pos_column], splitted[tag_column]
            # "_" stands for an empty fine-grained tag, only the POS label is kept then
            curr_tag_sent.append(pos if tag == "_" else "{},{}".format(pos, tag))
            if read_syntax:
                curr_head_sent.append(int(splitted[head_column]))
                curr_dep_sent.append(splitted[dep_column])
        # the last sentence may be not followed by an empty line
        if curr_word_sent:
            answer.append(pack_sentence(curr_word_sent, curr_tag_sent, curr_head_sent, curr_dep_sent))
    finally:
        if fin is not sys.stdin:
            fin.close()
    return answer


@register('morphotagger_dataset_reader')
class MorphotaggerDatasetReader(DatasetReader):
    """Class to read training datasets in UD format"""

    # base URL of the archives; the archive name is ``{language}.tar.gz``
    URL = 'http://files.deeppavlov.ai/datasets/UD2.0_source/'

    def read(self, data_path: Union[List, str],
             language: Optional[str] = None,
             data_types: Optional[List[str]] = None,
             **kwargs) -> Dict[str, List]:
        """Reads UD dataset from data_path.

        If some of the files are missing, the archive for the language is downloaded
        from ``URL`` into the folder of the first data file.

        Args:
            data_path: can be either
                1. a directory containing files. The file for data_type 'mode'
                is then data_path / {language}-ud-{mode}.conllu
                (data_path / {language} / {language}-ud-{mode}.conllu is tried as a fallback)
                2. a path to a single file (only when a single data type is requested)
                3. a list of files, containing the same number of items as data_types
            language: a language to detect filename when it is not given
            data_types: which dataset parts among 'train', 'dev', 'test' are returned;
                a single data type may be passed as a string
            **kwargs: additional keyword arguments passed to ``read_infile``

        Returns:
            a dictionary containing dataset fragments (see ``read_infile``) for given data types.
            The 'dev' part is stored under the ``valid`` key.

        Raises:
            ValueError: if an unknown data type is requested, if ``language`` is not given
                for a data directory, or if the number of files differs from
                the number of data types.
        """
        if data_types is None:
            data_types = ["train", "dev"]
        elif isinstance(data_types, str):
            # a bare string denotes a single data type; `list("train")` would split it into characters
            data_types = [data_types]
        for data_type in data_types:
            if data_type not in ["train", "dev", "test"]:
                raise ValueError("Unknown data_type: {}, only train, dev and test "
                                 "datatypes are allowed".format(data_type))
        if isinstance(data_path, str):
            data_path = Path(data_path)
        if isinstance(data_path, Path):
            if data_path.exists():
                is_file = data_path.is_file()
            else:
                # a non-existing path is treated as a file only if a single data type is requested
                is_file = (len(data_types) == 1)
            if is_file:
                # path to a single file
                data_path, reserve_data_path = [data_path], None
            else:
                # path to data directory
                if language is None:
                    raise ValueError("You must explicitly provide language "
                                     "when providing data directory as source")
                data_dir = data_path
                data_path = [data_dir / "{}-ud-{}.conllu".format(language, mode)
                             for mode in data_types]
                # alternative layout: files are placed in a subdirectory named after the language
                reserve_data_path = [
                    data_dir / language / "{}-ud-{}.conllu".format(language, mode)
                    for mode in data_types]
        else:
            data_path = [Path(path) for path in data_path]
            reserve_data_path = None
        if len(data_path) != len(data_types):
            raise ValueError("The number of input files in data_path and data types "
                             "in data_types must be equal")
        has_missing_files = any(not filepath.exists() for filepath in data_path)
        if has_missing_files and reserve_data_path is not None:
            has_missing_files = any(not filepath.exists() for filepath in reserve_data_path)
            if not has_missing_files:
                data_path = reserve_data_path
        if has_missing_files:
            # Files are downloaded from the Web repository
            dir_path = data_path[0].parent
            language = language or get_language(data_path[0].parts[-1])
            url = self.URL + "{}.tar.gz".format(language)
            log.info('[downloading data from {} to {}]'.format(url, dir_path))
            dir_path.mkdir(exist_ok=True, parents=True)
            download_decompress(url, dir_path)
            mark_done(dir_path)
        data = {}
        for mode, filepath in zip(data_types, data_path):
            if mode == "dev":
                mode = "valid"
            data[mode] = read_infile(filepath, **kwargs)
        return data


================================================
FILE: deeppavlov/dataset_readers/multitask_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
from logging import getLogger
from typing import Dict

from deeppavlov.core.common.registry import get_model, register
from deeppavlov.core.data.dataset_reader import DatasetReader

# Module-level logger named after this module.
log = getLogger(__name__)


@register('multitask_reader')
class MultiTaskReader(DatasetReader):
    """Class to read several datasets simultaneously."""

    def read(self, tasks: Dict[str, Dict[str, dict]], task_defaults: dict = None, **kwargs):
        """Creates dataset readers for tasks and returns what task dataset readers `read()` methods return.

        The passed ``tasks`` and ``task_defaults`` dictionaries are not modified.

        Args:
            tasks: dictionary which keys are task names and values are dictionaries with param name - value pairs for
                nested dataset readers initialization. If task has key-value pair ``'use_task_defaults': False``,
                task_defaults for this task dataset reader will be ignored. The reader of a task is looked up
                in the registry by the ``class_name`` parameter, the remaining parameters are passed
                to its `read()` method.
            task_defaults: default task parameters.

        Returns:
            dictionary which keys are task names and values are what task readers `read()` methods returned.
        """
        data = dict()
        if task_defaults is None:
            task_defaults = dict()
        for task_name, task_params in tasks.items():
            # Service keys are popped from a copy: popping them from the caller's dictionary
            # would make a repeated `read()` call with the same `tasks` fail or behave differently.
            task_params = dict(task_params)
            if task_params.pop('use_task_defaults', True) is True:
                task_config = copy.deepcopy(task_defaults)
                # task parameters take precedence over the defaults
                task_config.update(task_params)
            else:
                task_config = task_params
            reader = get_model(task_config.pop('class_name'))()
            data[task_name] = reader.read(**task_config)
        return data


================================================
FILE: deeppavlov/dataset_readers/odqa_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import sqlite3
import unicodedata
from multiprocessing import Pool
from pathlib import Path
from typing import Union, List, Tuple, Generator, Any, Optional

from tqdm import tqdm

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


@register('odqa_reader')
class ODQADataReader(DatasetReader):
    """Build a SQLite database from folder with txt files, json files or
    `Wiki Extractor <https://github.com/attardi/wikiextractor>`_ files.

    """

    def read(self, data_path: Union[Path, str], db_url: Optional[str] = None, *args,
             **kwargs) -> None:
        """Build a SQLite database from provided files, download SQLite database from a provided URL,
         or do nothing.

        Nothing is done if both the database file and its ``.done`` marker already exist.

        Args:
            data_path: a directory/file with texts to create a database from
            db_url: a URL to download a ready database from; if it is set, the database
                is downloaded to the parent folder of ``save_path`` instead of being built
            kwargs:
                save_path: a path where a database should be saved to, or path to a ready database
                dataset_format: initial data format; should be selected from ['txt', 'wiki', 'json']

        Raises:
            ConfigError: if ``save_path`` or ``dataset_format`` is not provided

        Returns:
            None

        """
        logger.info('Reading files...')
        try:
            save_path = expand_path(kwargs['save_path'])
        except KeyError:
            raise ConfigError(
                f'"save_path" attribute should be set for {self.__class__.__name__} in the JSON config.')
        if save_path.exists() and save_path.with_suffix(f'{save_path.suffix}.done').exists():
            return
        try:
            dataset_format = kwargs['dataset_format']
        except KeyError:
            raise ConfigError(
                f'"dataset_format" attribute should be set for {self.__class__.__name__} in the JSON config.')

        save_path.parent.mkdir(parents=True, exist_ok=True)

        if db_url:
            download_dir = save_path.parent
            logger.info(f'Downloading database from {db_url} to {download_dir}')
            download(download_dir, db_url, force_download=False)
            return

        self._build_db(save_path, dataset_format, expand_path(data_path))

    def iter_files(self, path: Union[Path, str]) -> Generator[Path, Any, Any]:
        """Iterate over folder with files or a single file and generate file paths.
        Nested folders are traversed recursively.

        Args:
            path: path to a folder or a file

        Raises:
            RuntimeError if the provided `path` doesn't exist

        Yields:
            file paths one by one

        Returns:
            None

        """
        path = Path(path)
        if path.is_file():
            yield path
        elif path.is_dir():
            for item in path.iterdir():
                yield from self.iter_files(item)
        else:
            raise RuntimeError("Path doesn't exist: {}".format(path))

    def _build_db(self, save_path: Union[Path, str], dataset_format: str,
                  data_path: Union[Path, str],
                  num_workers: int = 8) -> None:
        """Build a SQLite database in parallel and save it to a pointed path.

        An existing database and its ``.done`` marker are removed first. The marker
        is created again only after the database has been built and committed.

        Args:
            save_path: a path where the ready database should be saved
            dataset_format: a data format, should be selected from ['txt', 'json', 'wiki']
            data_path: path to a folder/file from which to build a database
            num_workers: a number of workers for parallel database building

        Raises:
            sqlite3.OperationalError if `save_path` doesn't exist.
            RuntimeError if dataset_format is not in ['txt', 'json', 'wiki']
            or if `data_path` doesn't exist

        Returns:
            None

        """
        save_path = Path(save_path)
        done_path = save_path.with_suffix(f'{save_path.suffix}.done')

        if save_path.exists():
            save_path.unlink()
        if done_path.exists():
            done_path.unlink()

        # The input is validated before the connection and the pool are created,
        # so that nothing is left open if it is invalid.
        if dataset_format == 'txt':
            fn = self._get_file_contents
        elif dataset_format == 'json':
            fn = self._get_json_contents
        elif dataset_format == 'wiki':
            fn = self._get_wiki_contents
        else:
            raise RuntimeError('Unknown dataset format.')
        files = list(self.iter_files(data_path))

        logger.info('Building the database...')

        try:
            conn = sqlite3.connect(str(save_path))
        except sqlite3.OperationalError as e:
            e.args = e.args + ("Check that DB path exists.",)
            raise e
        try:
            c = conn.cursor()
            c.execute("CREATE TABLE documents (id PRIMARY KEY, text);")

            # The pool is used as a context manager to make sure that worker processes
            # do not outlive the building of the database.
            with Pool(num_workers) as workers, tqdm(total=len(files)) as pbar:
                for data in workers.imap_unordered(fn, files):
                    try:
                        c.executemany("INSERT INTO documents VALUES (?,?)", data)
                    except sqlite3.IntegrityError as e:
                        # e.g. a document with a duplicated id; building goes on
                        logger.warning(e)
                    pbar.update()

            conn.commit()
        finally:
            conn.close()
        done_path.touch()

    @staticmethod
    def _get_file_contents(fpath: Union[Path, str]) -> List[Tuple[str, str]]:
        """Extract file contents from '.txt' file.

        Args:
            fpath: path to a '.txt' file.

        Returns:
             a list with a single tuple of file name and NFD-normalized file contents

        """
        with open(fpath, encoding='utf-8') as fin:
            text = fin.read()
        normalized_text = unicodedata.normalize('NFD', text)
        # `Path(...)` allows to pass the path as a string as well
        return [(Path(fpath).name, normalized_text)]

    @staticmethod
    def _get_json_contents(fpath: Union[Path, str]) -> List[Tuple[str, str]]:
        """Extract file contents from '.json' file. Every line of the file should be a JSON list
        with dicts which contain 'title' and 'text' keywords. Empty items are skipped.

        Args:
            fpath: path to a '.json' file.

        Returns:
            a list with tuples of document title and NFD-normalized document text

        """
        docs = []
        with open(fpath, encoding='utf-8') as fin:
            for line in fin:
                data = json.loads(line)
                for doc in data:
                    if not doc:
                        continue
                    text = doc['text']
                    normalized_text = unicodedata.normalize('NFD', text)
                    docs.append((doc['title'], normalized_text))
        return docs

    @staticmethod
    def _get_wiki_contents(fpath: Union[Path, str]) -> List[Tuple[str, str]]:
        """Extract file contents from wiki extractor formatted files. Every line of the file
        should be a JSON dict which contains 'title' and 'text' keywords. Empty items are skipped.

        Args:
            fpath: path to a '.txt' file in wiki extractor format

        Returns:
            a list with tuples of document title and NFD-normalized document text

        """
        docs = []
        with open(fpath, encoding='utf-8') as fin:
            for line in fin:
                doc = json.loads(line)
                if not doc:
                    continue
                text = doc['text']
                normalized_text = unicodedata.normalize('NFD', text)
                docs.append((doc['title'], normalized_text))
        return docs


================================================
FILE: deeppavlov/dataset_readers/paraphraser_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Dict, List, Tuple

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('paraphraser_reader')
class ParaphraserReader(DatasetReader):
    """Dataset reader for the paraphraser.ru corpus stored as XML files.

    Please, see https://paraphraser.ru.
    """

    def read(self,
             data_path: str,
             do_lower_case: bool = True,
             *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:
        """Load the train and test parts of the paraphraser.ru dataset.

        Args:
            data_path: A path to a folder containing ``paraphrases.xml`` (train part)
                and ``paraphrases_gold.xml`` (test part).
            do_lower_case: Whether both texts of every pair are lowercased.

        Returns:
            A dictionary with ``"train"``, ``"valid"`` and ``"test"`` keys. ``"valid"`` is always
            an empty list; the other two hold ``((text_1, text_2), label)`` items.
        """
        folder = expand_path(data_path)
        return {
            "train": self._build_data(folder / 'paraphrases.xml', do_lower_case),
            "valid": [],
            "test": self._build_data(folder / 'paraphrases_gold.xml', do_lower_case),
        }

    @staticmethod
    def _build_data(data_path: Path, do_lower_case: bool) -> List[Tuple[Tuple[str, str], int]]:
        """Parse one XML file into a list of labelled sentence pairs.

        Pairs are collected in a dictionary keyed by the (optionally lowercased) texts, so a
        repeated pair appears once and carries the label of its last occurrence. The label is
        1 when the ``class`` value is non-negative and 0 otherwise.
        """
        corpus = ET.fromstring(data_path.read_text(encoding='utf8'))
        labelled_pairs = {}
        for item in corpus.findall('corpus/paraphrase'):
            first = item.find('value[@name="text_1"]').text
            second = item.find('value[@name="text_2"]').text
            pair = (first.lower(), second.lower()) if do_lower_case else (first, second)

            paraphrase_class = int(item.find('value[@name="class"]').text)
            labelled_pairs[pair] = int(paraphrase_class >= 0)
        return list(labelled_pairs.items())


================================================
FILE: deeppavlov/dataset_readers/rel_ranking_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Dict, List, Tuple

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('rel_ranking_reader')
class ParaphraserReader(DatasetReader):
    """Reader of paraphraser.ru-style XML files that keeps the raw class of every pair.

    Please, see https://paraphraser.ru.
    """

    def read(self,
             data_path: str,
             do_lower_case: bool = True,
             *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:
        """Load the train and test parts of the dataset.

        Args:
            data_path: A path to a folder containing ``paraphrases.xml`` (train part)
                and ``paraphrases_gold.xml`` (test part).
            do_lower_case: Whether both texts of every pair are lowercased.

        Returns:
            A dictionary with ``"train"``, ``"valid"`` and ``"test"`` keys. ``"valid"`` is always
            an empty list; the other two hold ``((text_1, text_2), class)`` items.
        """
        folder = expand_path(data_path)
        return {
            "train": self._build_data(folder / 'paraphrases.xml', do_lower_case),
            "valid": [],
            "test": self._build_data(folder / 'paraphrases_gold.xml', do_lower_case),
        }

    @staticmethod
    def _build_data(data_path: Path, do_lower_case: bool) -> List[Tuple[Tuple[str, str], int]]:
        """Parse one XML file into a list of ``((text_1, text_2), class)`` items.

        Every ``corpus/paraphrase`` element produces one item, in document order; the integer
        ``class`` value is stored unchanged.
        """
        corpus = ET.fromstring(data_path.read_text(encoding='utf8'))
        items = []
        for entry in corpus.findall('corpus/paraphrase'):
            first = entry.find('value[@name="text_1"]').text
            second = entry.find('value[@name="text_2"]').text
            pair = (first.lower(), second.lower()) if do_lower_case else (first, second)

            items.append((pair, int(entry.find('value[@name="class"]').text)))
        return items


================================================
FILE: deeppavlov/dataset_readers/rured_reader.py
================================================
import json
import os
import random
from typing import Dict, List, Tuple
from pathlib import Path
from logging import getLogger

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader

logger = getLogger(__name__)


@register('rured_reader')
class RuREDDatasetReader(DatasetReader):
    """ Class to read the datasets in RuRED format"""

    def read(self, data_path: str, rel2id: Dict = None) -> Dict[str, List[Tuple]]:
        """
        Reads and preprocesses the RuRED relation extraction dataset
        (http://www.dialog-21.ru/media/5093/gordeevdiplusetal-031.pdf).

        The train split keeps twice as many negative samples as positive ones, the dev and test
        splits keep as many negative samples as positive ones.

        Args:
            data_path: a path to a folder with ``train.json``, ``dev.json`` and ``test.json`` files.
            rel2id: a mapping from relation name to relation id; when it is empty or ``None``,
                the default mapping returned by :meth:`add_default_rel_dict` is used.
        Returns:
            RuRED output dictionary in the following format:
            {"data_type":
                List[
                    Tuple(
                        List[
                            List[all tokens of the document],
                            List[
                                List[Tuple(start pos of the subject, end pos of the subject)],
                                List[Tuple(start pos of the object, end pos of the object)]
                            ],
                            List[str(NER tag of the subject), str(NER tag of the object)]
                        ],
                        List[int(one-hot encoded relation label)]
                    )
                ]
            }
            where "data_type" is one of "train", "valid" and "test".
        """

        data_path = Path(data_path).resolve()

        self.rel2id = rel2id if rel2id else self.add_default_rel_dict()
        self.stat = {}
        self.ner_stat = {}

        # load all three files before processing any of them
        raw_data = {}
        for split in ("train", "dev", "test"):
            with open(data_path / f"{split}.json", encoding='utf-8') as file:
                raw_data[split] = json.load(file)

        train_data, self.stat["train"] = self.process_rured_file(raw_data["train"], num_neg_samples="twice")
        dev_data, self.stat["dev"] = self.process_rured_file(raw_data["dev"], num_neg_samples="equal")
        test_data, self.stat["test"] = self.process_rured_file(raw_data["test"], num_neg_samples="equal")

        return {"train": train_data, "valid": dev_data, "test": test_data}

    def process_rured_file(self, data: List[Dict], num_neg_samples: str) -> Tuple[List, Dict]:
        """
        Processes a RuRED data and returns a DeepPavlov relevant output

        Args:
            data: List of data units
            num_neg_samples: how many negative ("no_relation") samples are kept
                Possible values:
                    - none: all negative samples are dropped
                    - equal: there will be one negative sample pro positive sample
                    - twice: there will be twice as many negative samples as positive ones
                    - all: take all negative samples from the dataset
                If the data contains fewer negative samples than "equal" or "twice" asks for,
                all of them are kept.
        Returns:
            a list of processed samples and a dictionary that maps a relation id to the number
            of kept samples with this relation
        Raises:
            ValueError: if ``num_neg_samples`` is not one of the values listed above
        """
        processed_samples = []
        neg_samples = []        # list of indices of negative samples
        pos_samples = 0         # counter of positive samples

        for sample in data:
            # record negative sample ids
            if sample["relation"] == "no_relation":
                neg_samples.append(len(processed_samples))
            else:
                pos_samples += 1

            # NER tag statistics are accumulated over all processed files
            for type_key in ("subj_type", "obj_type"):
                ner_tag = sample[type_key]
                self.ner_stat[ner_tag] = self.ner_stat.get(ner_tag, 0) + 1

            processed_samples.append(
                (
                    [
                        sample["token"],
                        [[(sample["subj_start"], sample["subj_end"])], [(sample["obj_start"], sample["obj_end"])]],
                        [sample["subj_type"], sample["obj_type"]]
                    ],
                    self.label_to_one_hot(self.rel2id[sample["relation"]])
                )
            )

        # choose which negative samples have to be dropped
        if num_neg_samples == "equal" or num_neg_samples == "twice":
            neg_per_pos = 1 if num_neg_samples == "equal" else 2
            # never ask random.sample for a negative amount: with too few negatives keep them all
            num_to_eliminate = max(0, len(neg_samples) - neg_per_pos * pos_samples)
            # a set makes the membership test below O(1)
            neg_to_eliminate = set(random.sample(neg_samples, num_to_eliminate))
        elif num_neg_samples == "none":
            # eliminate all negative samples
            neg_to_eliminate = set(neg_samples)
        elif num_neg_samples == "all":
            # keep every negative sample
            neg_to_eliminate = set()
        else:
            raise ValueError(
                "Unknown negative samples amount! Currently available are 'equal', 'twice', 'none' and 'all'"
            )

        if neg_to_eliminate:
            processed_samples = [
                sample for sample_idx, sample in enumerate(processed_samples) if sample_idx not in neg_to_eliminate
            ]

        # collect statistics: number of kept samples per relation id
        stat = {}
        for _, one_hot_label in processed_samples:
            rel = one_hot_label.index(1)
            stat[rel] = stat.get(rel, 0) + 1

        return processed_samples, stat

    def label_to_one_hot(self, label: int) -> List[int]:
        """ Turn a relation id into a one hot encoding of length ``len(self.rel2id)`` """
        relation = [0] * len(self.rel2id)
        relation[label] = 1
        return relation

    @staticmethod
    def add_default_rel_dict():
        """ Creates a default relation to relation id dictionary with RuRED relations """
        return dict(no_relation=0, MEMBER=1, WORKS_AS=2, WORKPLACE=3, OWNERSHIP=4, SUBORDINATE_OF=5, TAKES_PLACE_IN=6,
                    EVENT_TAKES_PART_IN=7, SELLS_TO=8, ALTERNATIVE_NAME=9, HEADQUARTERED_IN=10, PRODUCES=11,
                    ABBREVIATION=12, DATE_DEFUNCT_IN=13, SUBEVENT_OF=14, DATE_FOUNDED_IN=15, DATE_TAKES_PLACE_ON=16,
                    NUMBER_OF_EMPLOYEES_FIRED=17, ORIGINS_FROM=18, ACQUINTANCE_OF=19, PARENT_OF=20, ORGANIZES=21,
                    FOUNDED_BY=22, PLACE_RESIDES_IN=23, BORN_IN=24, AGE_IS=25, RELATIVE=26, NUMBER_OF_EMPLOYEES=27,
                    SIBLING=28, DATE_OF_BIRTH=29)


================================================
FILE: deeppavlov/dataset_readers/sq_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import pickle
from typing import List

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.common.file import load_pickle
from deeppavlov.core.common.file import read_json


@register('sq_reader')
class SQReader(DatasetReader):
    """Class to read training datasets"""

    def read(self, data_path: str, valid_size: int = None):
        """Load a dataset from a single ``.pickle`` or ``.json`` file.

        Args:
            data_path: path to the dataset file; the loader is chosen by the file name suffix
            valid_size: if truthy, the ``"valid"`` part is cut to its first ``valid_size`` items

        Returns:
            the loaded dataset object

        Raises:
            TypeError: if the path ends with neither ``.pickle`` nor ``.json``
        """
        # NOTE(review): load_pickle presumably unpickles the file, so it should only be pointed
        # at trusted data - confirm against its implementation.
        loaders = ((".pickle", load_pickle), (".json", read_json))
        for suffix, loader in loaders:
            if str(data_path).endswith(suffix):
                dataset = loader(data_path)
                break
        else:
            raise TypeError(f'Unsupported file type: {data_path}')

        if valid_size:
            dataset["valid"] = dataset["valid"][:valid_size]

        return dataset


@register('rubq_reader')
class RuBQReader(SQReader):
    """Class to read RuBQ datasets"""

    def read(self, data_path: str, version: str = "2.0", question_types: List[str] = ["all"],
             not_include_question_types: List[str] = None, num_samples: int = -1):
        """Load a RuBQ file and filter and preprocess its "valid" and "test" parts.

        Args:
            data_path: path to the dataset file, passed to the parent reader
            version: samples whose "RuBQ_version" is greater than this value are dropped
            question_types: a sample is kept if any of these values is in its "tags";
                ``["all"]`` keeps samples regardless of their tags
            not_include_question_types: if given, samples whose "tags" contain any of these
                values are dropped
            num_samples: if positive, only the first ``num_samples`` samples of each part are kept

        Returns:
            the dataset with the "valid" and "test" parts replaced by lists of preprocessed samples
        """

        def is_selected(sample):
            # version check comes first, so "tags" is only consulted for samples that pass it
            if not float(sample["RuBQ_version"]) <= float(version):
                return False
            return any(tp in sample["tags"] for tp in question_types) or question_types == ["all"]

        def is_not_excluded(sample):
            return all([tp not in sample["tags"] for tp in not_include_question_types])

        dataset = super().read(data_path)
        for data_type in ("valid", "test"):
            chosen = list(filter(is_selected, dataset[data_type]))
            if not_include_question_types:
                chosen = list(filter(is_not_excluded, chosen))
            chosen = list(map(self.preprocess, chosen))
            if num_samples > 0:
                chosen = chosen[:num_samples]
            dataset[data_type] = chosen
        return dataset

    def preprocess(self, sample):
        """Turn a raw sample into ``[question, [answer_ids, answer_labels, query]]``.

        Answer ids and labels are the parts of the "value" and "label" fields after the last
        slash; in the query, newlines are replaced with spaces and double spaces are collapsed
        once. A ``None`` query becomes an empty string.
        """
        question = sample.get("question_text", "")
        answers = sample.get("answers", [])
        answer_ids = [answer.get("value", "").rsplit("/", 1)[-1] for answer in answers]
        answer_labels = [answer.get("label", "").rsplit("/", 1)[-1] for answer in answers]
        raw_query = sample.get("query", "")
        query = "" if raw_query is None else raw_query.replace("\n", " ").replace("  ", " ")
        return [question, [answer_ids, answer_labels, query]]


@register('lcquad_reader')
class LCQuADReader(SQReader):
    """Class to read LCQuAD dataset"""

    def read(self, data_path: str, question_types: List[str] = "all",
             not_include_question_types: List[str] = None, num_samples: int = -1):
        """Load an LCQuAD file and filter and preprocess its "valid" and "test" parts.

        Args:
            data_path: path to the dataset file, passed to the parent reader
            question_types: values of the "subgraph" field to keep; ``["all"]`` (or the bare
                string ``"all"``, which is the default) keeps samples of every type
            not_include_question_types: if given, samples whose "subgraph" is in this list
                are dropped
            num_samples: if positive, only the first ``num_samples`` samples of each part are kept

        Returns:
            the dataset with the "valid" and "test" parts replaced by lists of preprocessed samples
        """
        # A bare string (including the default "all") used to be iterated character by
        # character and never compared equal to ["all"], so the default silently dropped
        # every sample. Treat a string as a one-element list instead.
        if isinstance(question_types, str):
            question_types = [question_types]
        take_all = question_types == ["all"]

        dataset = super().read(data_path)
        for data_type in ["valid", "test"]:
            samples = dataset[data_type]
            if not take_all:
                samples = [sample for sample in samples
                           if any(tp == sample["subgraph"] for tp in question_types)]
            if not_include_question_types:
                samples = [sample for sample in samples
                           if sample["subgraph"] not in not_include_question_types]
            samples = [self.preprocess(sample) for sample in samples]
            if num_samples > 0:
                samples = samples[:num_samples]
            dataset[data_type] = samples
        return dataset

    def preprocess(self, sample):
        """Turn a raw sample into ``[question, [answers, answer_labels, query]]``.

        Missing fields default to an empty string (question, query) or an empty list
        (answers, answer labels).
        """
        question = sample.get("question", "")
        answers = sample.get("answer", [])
        answer_labels = sample.get("answer_label", [])
        query = sample.get("sparql_wikidata", "")
        return [question, [answers, answer_labels, query]]


================================================
FILE: deeppavlov/dataset_readers/squad_dataset_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
from pathlib import Path
from typing import Dict, Any, Optional

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download_decompress


@register('squad_dataset_reader')
class SquadDatasetReader(DatasetReader):
    """
    Downloads dataset files and prepares train/valid split.

    Default archives are available for the following dataset names:

    SQuAD:
    Stanford Question Answering Dataset
    https://rajpurkar.github.io/SQuAD-explorer/

    SQuAD2.0:
    Stanford Question Answering Dataset, version 2.0
    https://rajpurkar.github.io/SQuAD-explorer/

    SberSQuAD:
    Dataset from SDSJ Task B
    https://www.sdsj.ru/ru/contest.html

    MultiSQuAD:
    SQuAD dataset with additional contexts retrieved (by tfidf) from original Wikipedia article.

    """

    url_squad = 'http://files.deeppavlov.ai/datasets/squad-v1.1.tar.gz'
    url_sber_squad = 'http://files.deeppavlov.ai/datasets/sber_squad-v1.1.tar.gz'
    url_multi_squad = 'http://files.deeppavlov.ai/datasets/multiparagraph_squad.tar.gz'
    url_squad2 = 'http://files.deeppavlov.ai/datasets/squad-v2.0.tar.gz'

    def read(self, data_path: str, dataset: Optional[str] = 'SQuAD', url: Optional[str] = None, *args, **kwargs) \
            -> Dict[str, Dict[str, Any]]:
        """Make sure the dataset files are present in ``data_path`` and load them.

        Args:
            data_path: path to save data
            dataset: default dataset names: ``'SQuAD'``, ``'SberSQuAD'``, ``'MultiSQuAD'``
                or ``'SQuAD2.0'``
            url: link to archive with dataset, use url argument if non-default dataset is used

        Returns:
            dataset split on train/valid: parsed contents of the ``train-*.json`` file under
            ``'train'`` and of the ``dev-*.json`` file under ``'valid'``

        Raises:
            RuntimeError: if ``url`` is not given and `dataset` is not one of the default names.
        """
        if url is not None:
            self.url = url
        else:
            default_urls = (
                ('SQuAD', self.url_squad),
                ('SberSQuAD', self.url_sber_squad),
                ('MultiSQuAD', self.url_multi_squad),
                ('SQuAD2.0', self.url_squad2),
            )
            for known_name, known_url in default_urls:
                if dataset == known_name:
                    self.url = known_url
                    break
            else:
                raise RuntimeError(f'Dataset {dataset} is unknown')

        data_path = Path(data_path)
        # only SQuAD2.0 uses v2.0 file names, every other dataset is expected in v1.1 files
        version_tag = 'v2.0' if dataset == "SQuAD2.0" else 'v1.1'
        required_files = [f'{split}-{version_tag}.json' for split in ('train', 'dev')]
        data_path.mkdir(parents=True, exist_ok=True)

        if any(not (data_path / fname).exists() for fname in required_files):
            download_decompress(self.url, data_path)

        loaded = {}
        for fname in required_files:
            with (data_path / fname).open('r', encoding='utf8') as fp:
                content = json.load(fp)
            part = 'valid' if fname in {'dev-v1.1.json', 'dev-v2.0.json'} else 'train'
            loaded[part] = content

        return loaded


@register('multi_squad_dataset_reader')
class MultiSquadDatasetReader(DatasetReader):
    """
    Downloads dataset files and returns paths to the train/valid files.

    MultiSQuADRetr:
    Multiparagraph SQuAD dataset with additional contexts retrieved by tfidf document ranker from full En Wikipedia.

    MultiSQuADRuRetr:
    Multiparagraph SberSQuAD dataset with additional contexts retrieved by tfidf document ranker from  Ru Wikipedia.

    """

    url_multi_squad_retr = 'http://files.deeppavlov.ai/datasets/multi_squad_retr_enwiki20161221.tar.gz'
    url_multi_squad_ru_retr = 'http://files.deeppavlov.ai/datasets/multi_squad_ru_retr.tar.gz'

    def read(self, data_path: str, dataset: Optional[str] = 'MultiSQuADRetr', url: Optional[str] = None, *args,
             **kwargs) -> Dict[str, Dict[str, Any]]:
        """Make sure ``train.jsonl`` and ``dev.jsonl`` are present in ``data_path``.

        The files are not parsed here: the returned dictionary holds their paths.

        Args:
            data_path: path to save data
            dataset: default dataset names: ``'MultiSQuADRetr'``, ``'MultiSQuADRuRetr'``
            url: link to archive with dataset, use url argument if non-default dataset is used

        Returns:
            a dictionary with the path to ``train.jsonl`` under ``'train'`` and the path to
            ``dev.jsonl`` under ``'valid'``

        Raises:
            RuntimeError: if ``url`` is not given and `dataset` is not one of these:
                ``'MultiSQuADRetr'``, ``'MultiSQuADRuRetr'``.
        """
        if url is not None:
            self.url = url
        else:
            default_urls = (
                ('MultiSQuADRetr', self.url_multi_squad_retr),
                ('MultiSQuADRuRetr', self.url_multi_squad_ru_retr),
            )
            for known_name, known_url in default_urls:
                if dataset == known_name:
                    self.url = known_url
                    break
            else:
                raise RuntimeError(f'Dataset {dataset} is unknown')

        data_path = Path(data_path)
        required_files = [f'{split}.jsonl' for split in ('train', 'dev')]
        if not data_path.exists():
            data_path.mkdir(parents=True)

        if any(not (data_path / fname).exists() for fname in required_files):
            download_decompress(self.url, data_path)

        return {
            ('valid' if 'dev' in fname else 'train'): data_path.joinpath(fname)
            for fname in required_files
        }


================================================
FILE: deeppavlov/dataset_readers/typos_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
from logging import getLogger
from pathlib import Path
from typing import Dict, List, Tuple

import requests
from lxml import html

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import is_done, download, mark_done

log = getLogger(__name__)


@register('typos_custom_reader')
class TyposCustom(DatasetReader):
    """Base class for reading spelling corrections dataset files

    """

    def __init__(self):
        pass

    @staticmethod
    def build(data_path: str) -> Path:
        """Base method that interprets ``data_path`` argument.

        Args:
            data_path: path to the tsv-file containing erroneous and corrected words

        Returns:
            the same path as a :class:`~pathlib.Path` object
        """
        return Path(data_path)

    @classmethod
    def read(cls, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str]]]:
        """Read train data for spelling corrections algorithms

        The first row of the tab-separated file is skipped; every other row has to consist of
        exactly two columns: a misspelled word and its correction.

        Args:
            data_path: path that needs to be interpreted with :meth:`~deeppavlov.dataset_readers.typos_reader.TyposCustom.build`

        Returns:
            train data to pass to a :class:`~deeppavlov.dataset_iterators.typos_iterator.TyposDatasetIterator`
        """
        tsv_path = cls.build(data_path)
        pairs = []
        with tsv_path.open(newline='', encoding='utf8') as tsvfile:
            rows = csv.reader(tsvfile, delimiter='\t')
            next(rows)  # drop the first row
            for mistake, correct in rows:
                pairs.append((mistake, correct))
        return {'train': pairs}


@register('typos_wikipedia_reader')
class TyposWikipedia(TyposCustom):
    """Implementation of :class:`~deeppavlov.dataset_readers.typos_reader.TyposCustom` that works with
     English Wikipedia's list of common misspellings

    """

    @staticmethod
    def build(data_path: str) -> Path:
        """Download and parse common misspellings list from `Wikipedia <https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines>`_

        The list is only downloaded when the target directory is not marked as done yet.
        The resulting tsv-file starts with a header row, because
        :meth:`~deeppavlov.dataset_readers.typos_reader.TyposCustom.read` always skips the first row.

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting tsv-file
        """
        data_path = Path(data_path) / 'typos_wiki'

        fname = data_path / 'misspelings.tsv'

        if not is_done(data_path):
            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

            page = requests.get(url)
            # fail with a clear HTTP error instead of an IndexError on a page without <pre>
            page.raise_for_status()
            tree = html.fromstring(page.content)
            raw = tree.xpath('//pre/text()')[0].splitlines()
            data = []
            for pair in raw:
                typo, arrow, corrects = pair.strip().partition('->')
                if not arrow:
                    # blank or malformed line without a "typo->correction" mapping
                    continue
                for correct in corrects.split(','):
                    data.append([typo.strip(), correct.strip()])

            fname.parent.mkdir(parents=True, exist_ok=True)
            with fname.open('w', newline='', encoding='utf8') as tsvfile:
                writer = csv.writer(tsvfile, delimiter='\t')
                # the reader drops the first row, so without a header the first pair would be lost
                writer.writerow(['mistake', 'correct'])
                writer.writerows(data)

            mark_done(data_path)

            log.info('Built')
        return fname


@register('typos_kartaslov_reader')
class TyposKartaslov(DatasetReader):
    """Dataset reader that works with a Russian misspellings dataset
     from `kartaslov <https://github.com/dkulagin/kartaslov>`_

    """

    def __init__(self):
        pass

    @staticmethod
    def build(data_path: str) -> Path:
        """Download misspellings list from `github <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_

        Nothing is downloaded when the target directory is already marked as done.

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting csv-file
        """
        target_dir = Path(data_path) / 'kartaslov'
        csv_path = target_dir / 'orfo_and_typos.L1_5.csv'

        if is_done(target_dir):
            return csv_path

        url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'
        download(csv_path, url)
        mark_done(target_dir)
        log.info('Built')
        return csv_path

    @staticmethod
    def read(data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str]]]:
        """Read train data for spelling corrections algorithms

        The first row of the semicolon-separated file is skipped; every other row has to consist
        of three columns: the correct word, the misspelled word and a weight that is not used.

        Args:
            data_path: path that needs to be interpreted with :meth:`~deeppavlov.dataset_readers.typos_reader.TyposKartaslov.build`

        Returns:
            train data to pass to a :class:`~deeppavlov.dataset_iterators.typos_iterator.TyposDatasetIterator`
        """
        csv_path = TyposKartaslov.build(data_path)
        pairs = []
        with open(str(csv_path), newline='', encoding='utf8') as csvfile:
            rows = csv.reader(csvfile, delimiter=';')
            next(rows)  # drop the first row
            for correct, mistake, weight in rows:
                pairs.append((mistake, correct))
        return {'train': pairs}


================================================
FILE: deeppavlov/dataset_readers/ubuntu_v2_reader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
from pathlib import Path
from typing import List, Dict, Tuple, Union

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('ubuntu_v2_reader')
class UbuntuV2Reader(DatasetReader):
    """The class to read the Ubuntu V2 dataset from csv files.

    Please, see https://github.com/rkadlec/ubuntu-ranking-dataset-creator.
    """

    def read(self, data_path: str,
             positive_samples=False,
             *args, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
        """Read the Ubuntu V2 dataset from csv files.

        Args:
            data_path: A path to a folder with dataset csv files (``train.csv``, ``valid.csv``
                and ``test.csv``).
            positive_samples: if `True`, only positive context-response pairs will be taken for train

        Returns:
            A dictionary with ``'train'``, ``'valid'`` and ``'test'`` keys.
        """

        data_path = expand_path(data_path)
        dataset = {'train': None, 'valid': None, 'test': None}
        train_fname = Path(data_path) / 'train.csv'
        valid_fname = Path(data_path) / 'valid.csv'
        test_fname = Path(data_path) / 'test.csv'
        self.positive_samples = positive_samples
        # these attributes are not used by this class itself; they are kept because
        # other code may read them
        self.sen2int_vocab = {}
        self.classes_vocab_train = {}
        self.classes_vocab_valid = {}
        self.classes_vocab_test = {}
        dataset["train"] = self.preprocess_data_train(train_fname)
        dataset["valid"] = self.preprocess_data_validation(valid_fname)
        dataset["test"] = self.preprocess_data_validation(test_fname)
        return dataset

    def preprocess_data_train(self, train_fname: Union[Path, str]) -> List[Tuple[List[str], int]]:
        """Read the train csv file, skipping its first (header) row.

        Returns:
            A list of ``((context, response), label)`` items. If ``self.positive_samples`` is
            set, only the pairs with label 1 are kept and each label is replaced by the
            position of the pair among the kept ones.
        """
        # explicit encoding and newline='' (as the csv module documentation recommends) keep the
        # parsing independent of the platform defaults
        with open(train_fname, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header; an empty file simply yields no samples
            data = [((row[0], row[1]), int(row[2])) for row in reader]
        if self.positive_samples:
            positive_pairs = [pair for pair, label in data if label == 1]
            data = [(pair, idx) for idx, pair in enumerate(positive_pairs)]
        return data

    def preprocess_data_validation(self, fname: Union[Path, str]) -> List[Tuple[List[str], int]]:
        """Read a validation or test csv file, skipping its first (header) row.

        Returns:
            A list of ``([context, response_1, response_2, ...], 1)`` items, one per row; all the
            columns after the first one are treated as responses.
        """
        with open(fname, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header; an empty file simply yields no samples
            data = [([row[0]] + row[1:], 1) for row in reader]
        return data


================================================
FILE: deeppavlov/deep.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from logging import getLogger

from deeppavlov.core.commands.infer import interact_model, predict_on_stream
from deeppavlov.core.commands.train import train_evaluate_model_from_config
from deeppavlov.core.common.cross_validation import calc_cv_score
from deeppavlov.core.common.file import find_config
from deeppavlov.download import deep_download
from deeppavlov.utils.pip_wrapper import install_from_config
from deeppavlov.utils.server import start_model_server
from deeppavlov.utils.socket import start_socket_server

# Logger of the command-line entry point (main() reports an invalid fold count through it).
log = getLogger(__name__)

# Argument parser shared by every mode handled in main().
parser = argparse.ArgumentParser()

# Positional arguments: the action to perform and the pipeline config to use.
parser.add_argument(
    "mode",
    type=str,
    choices={'train', 'evaluate', 'interact', 'predict', 'riseapi', 'risesocket', 'download', 'install',
             'crossval'},
    help="select a mode, train or interact",
)
parser.add_argument(
    "config_path",
    type=str,
    help="path to a pipeline json config",
)

# Training / evaluation options.
parser.add_argument(
    "-e", "--start-epoch-num",
    dest="start_epoch_num",
    type=int,
    default=None,
    help="Start epoch number",
)
parser.add_argument(
    "--recursive",
    action="store_true",
    help="Train nested configs",
)

# Inference options and the install/download flags usable with any mode.
parser.add_argument(
    "-b", "--batch-size",
    dest="batch_size",
    type=int,
    default=None,
    help="inference batch size",
)
parser.add_argument(
    "-f", "--input-file",
    dest="file_path",
    type=str,
    default=None,
    help="Path to the input file",
)
parser.add_argument(
    "-d", "--download",
    action="store_true",
    help="download model components",
)
parser.add_argument(
    "-i", "--install",
    action="store_true",
    help="install model requirements",
)

# Cross-validation options.
parser.add_argument(
    "--folds",
    type=int,
    default=5,
    help="number of folds",
)

# HTTPS options of the API server.
parser.add_argument(
    "--https",
    action="store_true",
    default=None,
    help="run model in https mode",
)
parser.add_argument(
    "--key",
    type=str,
    default=None,
    help="ssl key",
)
parser.add_argument(
    "--cert",
    type=str,
    default=None,
    help="ssl certificate",
)

# Port shared by the API and socket servers.
parser.add_argument(
    "-p", "--port",
    type=int,
    default=None,
    help="api port",
)

# Socket server options.
parser.add_argument(
    "--socket-type",
    type=str,
    default="TCP",
    choices={"TCP", "UNIX"},
)
parser.add_argument(
    "--socket-file",
    type=str,
    default="/tmp/deeppavlov_socket.s",
)


def main():
    """Parse the command line and run the requested mode on the given pipeline config.

    Requirements installation and data download are performed first, either
    because the corresponding mode was selected or because the ``--install`` /
    ``--download`` flag accompanies another mode. The remaining modes are
    mutually exclusive and are dispatched afterwards.
    """
    args = parser.parse_args()
    mode = args.mode
    pipeline_config_path = find_config(args.config_path)

    if mode == 'install' or args.install:
        install_from_config(pipeline_config_path)
    if mode == 'download' or args.download:
        deep_download(pipeline_config_path)

    if mode == 'train':
        train_evaluate_model_from_config(pipeline_config_path,
                                         recursive=args.recursive,
                                         start_epoch_num=args.start_epoch_num)
    elif mode == 'evaluate':
        train_evaluate_model_from_config(pipeline_config_path,
                                         to_train=False,
                                         start_epoch_num=args.start_epoch_num)
    elif mode == 'interact':
        interact_model(pipeline_config_path)
    elif mode == 'riseapi':
        start_model_server(pipeline_config_path, args.https, args.key, args.cert, port=args.port)
    elif mode == 'risesocket':
        start_socket_server(pipeline_config_path, args.socket_type, port=args.port, socket_file=args.socket_file)
    elif mode == 'predict':
        predict_on_stream(pipeline_config_path, args.batch_size, args.file_path)
    elif mode == 'crossval':
        # Cross-validation needs at least two folds; anything less is only reported.
        if args.folds >= 2:
            calc_cv_score(pipeline_config_path, n_folds=args.folds, is_loo=False)
        else:
            log.error('Minimum number of Folds is 2')


if __name__ == "__main__":
    main()


================================================
FILE: deeppavlov/download.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import secrets
import shutil
import sys
from argparse import ArgumentParser, Namespace
from collections import defaultdict
from logging import getLogger
from pathlib import Path
from typing import Union, Optional, Dict, Iterable, Set, Tuple, List
from urllib.parse import urlparse
import requests
from filelock import FileLock

import deeppavlov
from deeppavlov.core.commands.utils import expand_path, parse_config
from deeppavlov.core.data.utils import download, download_decompress, get_all_elems_from_json, file_md5, \
    set_query_parameter, path_set_md5, get_download_token

# Logger of the download utilities.
log = getLogger(__name__)

# Command-line interface of the standalone download script (see main()).
parser = ArgumentParser()

parser.add_argument(
    '--config', '-c',
    type=str,
    default=None,
    help="path to a pipeline json config",
)
parser.add_argument(
    '-all',
    action='store_true',
    help="Download everything. Warning! There should be at least 10 GB space"
         " available on disk.",
)


def get_config_downloads(config: Union[str, Path, dict]) -> Set[Tuple[str, Path]]:
    """Collect the ``(url, destination)`` pairs required by a config and the configs it references.

    Entries of ``config['metadata']['download']`` may be plain URL strings or
    dicts with a ``url`` key and an optional ``subdir`` key. Every value stored
    under a ``config_path`` key anywhere in the config is treated as a nested
    config whose downloads are gathered recursively.

    Args:
        config: path to a config file or an already loaded config.

    Returns:
        A set of ``(url, destination directory)`` tuples.
    """
    config = parse_config(config)

    downloads = set()
    metadata = config['metadata'] if 'metadata' in config else {}
    if 'download' in metadata:
        for entry in metadata['download']:
            # A bare string is shorthand for {'url': <string>}.
            spec = {'url': entry} if isinstance(entry, str) else entry
            downloads.add((spec['url'], expand_path(spec.get('subdir', ''))))

    nested_configs = [expand_path(ref) for ref in get_all_elems_from_json(config, 'config_path')]
    for nested_config in nested_configs:
        downloads.update(get_config_downloads(nested_config))

    return downloads


def get_configs_downloads(config: Optional[Union[str, Path, dict]] = None) -> Dict[str, Set[Path]]:
    """Group the destinations of all required downloads by URL.

    Args:
        config: a single config to inspect. When it is falsy, every ``*.json``
            file found under the package's ``configs`` directory is inspected.

    Returns:
        A mapping from a URL to the set of destinations it must be placed in.
    """
    if config:
        candidates = [config]
    else:
        configs_root = Path(deeppavlov.__path__[0], 'configs')
        candidates = list(configs_root.glob('**/*.json'))

    destinations_by_url = defaultdict(set)
    for candidate in candidates:
        for url, dest in get_config_downloads(candidate):
            destinations_by_url[url].add(dest)

    return destinations_by_url


def check_md5(url: str, dest_paths: List[Path], headers: Optional[dict] = None) -> bool:
    """Check whether the files behind ``url`` are already present with matching md5 hashes.

    The hash listing is fetched from the location returned by ``path_set_md5(url)``
    (from S3 via ``boto3`` for ``s3://`` locations, over HTTP otherwise). Each
    non-empty line is expected to look like ``<md5> *<file name>`` (binary mode) or
    ``<md5>  <file name>`` (text mode, accepted with a warning).

    If at least one destination in ``dest_paths`` holds every listed file with the
    expected hash, the files are copied from it to the destinations that do not.

    Args:
        url: URL of the resource to be downloaded.
        dest_paths: directories the resource has to be available in.
        headers: optional HTTP headers sent with the request for the hash listing.

    Returns:
        ``True`` if every destination now contains the verified files, ``False`` if
        the listing could not be fetched or parsed, lists no files, or no
        destination matches it.
    """
    url_md5 = path_set_md5(url)

    try:
        if url_md5.startswith('s3://'):
            import boto3

            s3 = boto3.resource('s3')
            bucket, key = url_md5[5:].split('/', maxsplit=1)
            obj = s3.Object(bucket, key)
            data = obj.get()['Body'].read().decode('utf8')
        else:
            r = requests.get(url_md5, headers=headers)
            if r.status_code != 200:
                return False
            data = r.text
    except Exception as e:
        log.debug(f'Could not download {url_md5} because of an exception {type(e)}: {e}')
        return False

    expected = {}
    for line in data.splitlines():
        if not line.strip():
            continue
        # partition never fails: a line without a space simply yields an empty file name part
        _md5, _, fname = line.partition(' ')
        mode_flag = fname[:1]
        if mode_flag == ' ':
            log.warning(f'Hash generated in text mode for {fname}, comparison could be incorrect')
        elif mode_flag != '*':
            # report the location that was actually fetched
            log.error(f'Unknown hash content format in {url_md5}')
            return False
        expected[fname[1:]] = _md5

    if not expected:
        # An empty listing verifies nothing: all() below would be vacuously true
        # and the download would be skipped without any file being checked.
        log.debug(f'No hashes found in {url_md5}')
        return False

    done = None
    not_done = []
    for base_path in dest_paths:
        if all(file_md5(base_path / p) == _md5 for p, _md5 in expected.items()):
            done = base_path
        else:
            not_done.append(base_path)

    if done is None:
        return False

    for base_path in not_done:
        log.info(f'Copying data from {done} to {base_path}')
        for p in expected:
            target = base_path / p
            # the destination directory (or a nested one) may not exist yet
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(done / p, target)
    return True


def download_resource(url: str, dest_paths: Iterable[Union[Path, str]], headers: Optional[dict] = None) -> None:
    """Make the resource behind ``url`` available in every destination directory.

    The work is done while holding a file lock placed next to the first
    destination, so concurrent calls for the same file name wait for each other.
    Nothing is downloaded when ``check_md5`` reports matching hashes; otherwise
    URLs containing an archive extension are downloaded and decompressed, and any
    other URL is downloaded as a single file into each destination.

    Args:
        url: URL of the resource.
        dest_paths: directories the resource has to be placed in.
        headers: optional HTTP headers passed to the hash check and the download.
    """
    dest_paths = list(map(Path, dest_paths))
    download_path = dest_paths[0].parent
    download_path.mkdir(parents=True, exist_ok=True)

    file_name = urlparse(url).path.rsplit('/', 1)[-1]
    lockfile = download_path / f'.{file_name}.lock'
    is_archive = any(ext in url for ext in ('.tar.gz', '.gz', '.zip'))

    # NOTE(review): `poll_intervall` is the keyword spelling of older filelock
    # releases - confirm against the pinned filelock version before renaming it.
    with FileLock(lockfile).acquire(poll_intervall=10):
        if check_md5(url, dest_paths, headers):
            log.info(f'Skipped {url} download because of matching hashes')
            return
        if is_archive:
            download_decompress(url, download_path, dest_paths, headers=headers)
        else:
            download([dest_path / file_name for dest_path in dest_paths], url, headers=headers)


def download_resources(args: Namespace) -> None:
    """Download the resources selected on the command line.

    Args:
        args: parsed arguments with the ``all`` flag and the ``config`` path.
            With ``all`` set, the downloads of every bundled config are fetched;
            otherwise those of the resolved ``config`` path. If neither is
            given, an error is logged and the process exits with status 1.
    """
    if args.all:
        downloads = get_configs_downloads()
    elif args.config:
        downloads = get_configs_downloads(config=Path(args.config).resolve())
    else:
        log.error('You should provide either model config path or -all flag')
        sys.exit(1)

    for url, dest_paths in downloads.items():
        download_resource(url, dest_paths)


def deep_download(config: Union[str, Path, dict]) -> None:
    """Download every resource required by ``config``, sending statistics headers.

    All requests of one call share a random session id, and each request carries
    a countdown file id (the last file gets ``'0'``), the download token and the
    package version. For non-S3 URLs the config file stem is added as the
    ``config`` query parameter, unless the config was passed as a dict.

    Args:
        config: path to a config file or an already loaded config.
    """
    downloads = get_configs_downloads(config)
    session_id = secrets.token_urlsafe(32)
    config_has_name = not isinstance(config, dict)
    countdown = range(len(downloads) - 1, -1, -1)

    for files_left, (url, dest_paths) in zip(countdown, downloads.items()):
        headers = {
            'dp-token': get_download_token(),
            'dp-session': session_id,
            'dp-file-id': str(files_left),
            'dp-version': deeppavlov.__version__
        }
        if config_has_name and not url.startswith('s3://'):
            url = set_query_parameter(url, 'config', Path(config).stem)
        download_resource(url, dest_paths, headers)


def main(args: Optional[List[str]] = None) -> None:
    """Run the download script.

    Args:
        args: command-line arguments to parse; ``None`` makes the parser read
            them from ``sys.argv``.
    """
    parsed_args = parser.parse_args(args)
    log.info("Downloading...")
    download_resources(parsed_args)
    log.info("\nDownload successful!")


if __name__ == "__main__":
    main()


================================================
FILE: deeppavlov/metrics/__init__.py
================================================


================================================
FILE: deeppavlov/metrics/accuracy.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import itertools
import re
from logging import getLogger
from typing import List

import numpy as np

from deeppavlov.core.common.metrics_registry import register_metric

# Module-level logger; kbqa_accuracy writes its per-question debug output to it.
log = getLogger(__name__)


@register_metric('accuracy')
def accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray]) -> float:
    """
    Calculate accuracy as the share of samples whose prediction equals the true value.

    Args:
        y_true: array of true values
        y_predicted: array of predicted values

    Returns:
        fraction of samples that coincide exactly, ``0`` when ``y_true`` is empty
    """
    def _same(expected, predicted):
        # For arrays `==` is element-wise, so the result has to be collapsed
        # into a single "all elements coincide" verdict.
        verdict = (expected == predicted)
        return verdict.all() if isinstance(verdict, np.ndarray) else verdict

    total = len(y_true)
    matched = sum(_same(expected, predicted) for expected, predicted in zip(y_true, y_predicted))
    return matched / total if total else 0


@register_metric('kbqa_accuracy')
def kbqa_accuracy(questions_batch, pred_answer_labels_batch, pred_answer_ids_batch, pred_query_batch,
                  gold_answer_labels_batch, gold_answer_ids_batch, gold_query_batch) -> float:
    """
    Calculate the share of questions that are answered correctly.

    A sample is counted as correct when at least one of the following holds:
    the sets of predicted and gold answer ids are equal; the gold query is
    contained in the predicted query; the 3-4 digit groups found in the first
    predicted id are non-empty and equal to those of the first gold id; the
    predicted label (longer than one character) equals the only gold label;
    the predicted label is "Not Found" and there are no gold answer ids.

    Returns:
        fraction of correct samples, ``0`` when the batch is empty
    """
    num_samples = len(pred_answer_ids_batch)
    samples = zip(questions_batch, pred_answer_labels_batch, pred_answer_ids_batch, pred_query_batch,
                  gold_answer_labels_batch, gold_answer_ids_batch, gold_query_batch)
    correct = 0
    for question, pred_answer_label, pred_answer_ids, pred_query, gold_answer_labels, gold_answer_ids, \
            gold_query in samples:
        found_date = False
        if pred_answer_ids and gold_answer_ids:
            pred_numbers = re.findall(r"[\d]{3,4}", pred_answer_ids[0])
            if pred_numbers and pred_numbers == re.findall(r"[\d]{3,4}", gold_answer_ids[0]):
                found_date = True

        found_label = False
        if len(gold_answer_labels) == 1 and len(pred_answer_label) > 1:
            if pred_answer_label == gold_answer_labels[0]:
                found_label = True

        no_answer = False
        if pred_answer_label == "Not Found":
            if not gold_answer_ids:
                no_answer = True

        if set(pred_answer_ids) == set(gold_answer_ids) or gold_query in pred_query or found_date or found_label \
                or no_answer:
            correct += 1
        log.debug(f"question: {question} -- gold_answer_ids: {gold_answer_ids} -- pred_answer_ids: {pred_answer_ids}")
    return correct / num_samples if num_samples else 0


@register_metric('multitask_accuracy')
def multitask_accuracy(*args) -> float:
    """
    Accuracy for multiple simultaneous tasks.

    Args:
        *args: a list of `2n` inputs. The first `n` inputs are the correct answers for `n` tasks,
            and the last `n` are the predicted ones.

    Returns:
        The mean of the per-task accuracies (each task is scored independently with
        `accuracy`), or `0` when no task pairs are given.
    """
    n = len(args)
    y_true_by_tasks, y_predicted_by_tasks = args[:n // 2], args[n // 2:]
    answers = [accuracy(true, pred) for true, pred in zip(y_true_by_tasks, y_predicted_by_tasks)]
    if not answers:
        # nothing to average: mirror `accuracy`, which returns 0 for empty input
        return 0
    return sum(answers) / len(answers)


@register_metric('multitask_sequence_accuracy')
def multitask_sequence_accuracy(*args) -> float:
    """
    Accuracy for multiple simultaneous sequence labeling (tagging) tasks.
    A sequence counts as correct only if every one of its elements
    is labeled correctly by every individual tagger.

    Args:
        *args: a list of `2n` inputs. The first `n` inputs are the correct answers for `n` tasks,
            and the last `n` are the predicted ones. Each input holds one label sequence per sentence.

    Returns:
        The fraction of sequences where all the items have correct answers for all `n` tasks.

    """
    half = len(args) // 2
    gold_by_tasks, predicted_by_tasks = args[:half], args[half:]
    # Regroup task-major data into one list per sentence whose items are
    # tuples holding the labels of all tasks for a single token.
    gold_by_sents = list(zip(*gold_by_tasks))
    predicted_by_sents = list(zip(*predicted_by_tasks))
    y_true = [list(zip(*sent)) for sent in gold_by_sents]
    y_predicted = [list(zip(*sent)) for sent in predicted_by_sents]
    return accuracy(y_true, y_predicted)


@register_metric('multitask_token_accuracy')
def multitask_token_accuracy(*args) -> float:
    """
    Per-item accuracy for multiple simultaneous sequence labeling (tagging) tasks.

    Args:
        *args: a list of `2n` inputs. The first `n` inputs are the correct answers for `n` tasks
            and the last `n` are the predicted ones. Each input holds one label sequence per sentence.

    Returns:
        The fraction of sequence elements for which the answers for all `n` tasks are correct.

    """
    half = len(args) // 2
    gold_by_tasks, predicted_by_tasks = args[:half], args[half:]
    # Regroup task-major data into one list per sentence whose items are
    # tuples holding the labels of all tasks for a single token.
    gold_by_sents = list(zip(*gold_by_tasks))
    predicted_by_sents = list(zip(*predicted_by_tasks))
    y_true = [list(zip(*sent)) for sent in gold_by_sents]
    y_predicted = [list(zip(*sent)) for sent in predicted_by_sents]
    return per_token_accuracy(y_true, y_predicted)


@register_metric('sets_accuracy')
def sets_accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray]) -> float:
    """
    Calculate accuracy where a sample is correct if its true and predicted values form equal sets

    Args:
        y_true: true values
        y_predicted: predicted values

    Returns:
        fraction of samples whose set of predicted values equals the set of true values,
        ``0`` when ``y_true`` is empty

    Alias:
        sets_accuracy
    """
    total = len(y_true)
    hits = 0
    for expected, predicted in zip(y_true, y_predicted):
        # order and duplicates inside a sample are ignored
        if set(expected) == set(predicted):
            hits += 1
    return hits / total if total else 0


@register_metric('slots_accuracy')
def slots_accuracy(y_true, y_predicted):
    """
    Calculate the share of samples whose predicted slot names equal the tagged ones.

    Args:
        y_true: tag sequences; every tag other than 'O' contributes the part after its last '-'
        y_predicted: mappings whose keys are the predicted slot names

    Returns:
        accuracy computed over the two lists of slot-name sets
    """
    true_slots = []
    for tags in y_true:
        true_slots.append({tag.rsplit('-', 1)[-1] for tag in tags if tag != 'O'})
    predicted_slots = [set(slots.keys()) for slots in y_predicted]
    return accuracy(true_slots, predicted_slots)


@register_metric('per_token_accuracy')
def per_token_accuracy(y_true, y_predicted):
    """
    Calculate accuracy over the individual items of all sequences.

    Args:
        y_true: sequences of true values
        y_predicted: sequences of predicted values

    Returns:
        fraction of items, taken over the flattened sequences, whose prediction
        equals the true value; ``0`` when there are no true items
    """
    flat_true = list(itertools.chain.from_iterable(y_true))
    flat_predicted = itertools.chain.from_iterable(y_predicted)
    total = len(flat_true)
    hits = sum(expected == predicted for expected, predicted in zip(flat_true, flat_predicted))
    return hits / total if total else 0


# region go-bot metrics

@register_metric('per_item_dialog_accuracy')
def per_item_dialog_accuracy(y_true, y_predicted: List[List[str]]):
    """
    Calculate accuracy over the individual utterances of all dialogs.

    Args:
        y_true: dialogs, each a sequence of items whose 'text' entry is the expected utterance
        y_predicted: predicted utterances grouped by dialog

    Returns:
        fraction of utterances whose prediction equals the expected text after
        stripping surrounding whitespace and lowercasing; ``0`` when there are none
    """
    # todo metric classes???
    expected_texts = []
    for dialog in y_true:
        for turn in dialog:
            expected_texts.append(turn['text'])
    predicted_texts = itertools.chain(*y_predicted)
    total = len(expected_texts)
    hits = sum(expected.strip().lower() == predicted.strip().lower()
               for expected, predicted in zip(expected_texts, predicted_texts))
    return hits / total if total else 0


@register_metric('acc')
def round_accuracy(y_true, y_predicted):
    """
    Rounds predictions and calculates accuracy in terms of absolute coincidence.

    Args:
        y_true: list of true values
        y_predicted: list of predicted values

    Returns:
        portion of absolutely coincidental samples, ``0`` when ``y_true`` is empty
    """
    # Look at the first prediction only when there is one: empty input must reach
    # the empty-case handling below instead of failing on `y_predicted[0]`.
    if len(y_predicted) > 0 and isinstance(y_predicted[0], np.ndarray):
        predictions = [np.round(x) for x in y_predicted]
    else:
        predictions = [round(x) for x in y_predicted]
    examples_len = len(y_true)
    correct = sum([y1 == y2 for y1, y2 in zip(y_true, predictions)])
    return correct / examples_len if examples_len else 0


================================================
FILE: deeppavlov/metrics/bleu.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
from typing import List, Tuple, Any

from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction, brevity_penalty, closest_ref_length

from deeppavlov.core.common.metrics_registry import register_metric
from deeppavlov.metrics.google_bleu import compute_bleu

# Shared nltk SmoothingFunction instance; its `method1` is the default smoothing of `bleu_advanced`.
SMOOTH = SmoothingFunction()


@register_metric('bleu_advanced')
def bleu_advanced(y_true: List[Any], y_predicted: List[Any],
                  weights: Tuple = (1,), smoothing_function=SMOOTH.method1,
                  auto_reweigh=False, penalty=True) -> float:
    """Calculate the sentence-level BLEU score of a hypothesis against one reference

    Parameters:
        y_true: list of reference tokens
        y_predicted: list of query tokens
        weights: n-gram weights
        smoothing_function: SmoothingFunction
        auto_reweigh: Option to re-normalize the weights uniformly
        penalty: when ``True`` the score is returned as computed; otherwise it is
            divided by the brevity penalty (unless that penalty is zero)

    Return:
        BLEU score
    """
    score = sentence_bleu([y_true], y_predicted, weights, smoothing_function, auto_reweigh)

    hypothesis_len = len(y_predicted)
    reference_len = closest_ref_length([y_true], hypothesis_len)
    bpenalty = brevity_penalty(reference_len, hypothesis_len)

    # Divide the penalty out only when it was explicitly switched off and the division is defined.
    if penalty is not True and bpenalty != 0:
        return score / bpenalty
    return score


@register_metric('bleu')
def bleu(y_true, y_predicted):
    """Corpus-level BLEU of lowercased, whitespace-tokenized predictions, one reference per prediction."""
    references = [[reference.lower().split()] for reference in y_true]
    hypotheses = [hypothesis.lower().split() for hypothesis in y_predicted]
    return corpus_bleu(references, hypotheses)


@register_metric('google_bleu')
def google_bleu(y_true, y_predicted):
    """First element of `compute_bleu` for lowercased, whitespace-tokenized texts, one reference each."""
    # Both corpora are handed over as lazy generators.
    references = ([reference.lower().split()] for reference in y_true)
    hypotheses = (hypothesis.lower().split() for hypothesis in y_predicted)
    return compute_bleu(references, hypotheses)[0]


@register_metric('per_item_bleu')
def per_item_bleu(y_true, y_predicted):
    """Corpus-level BLEU where the grouped predictions are flattened by one level before scoring."""
    flat_predictions = itertools.chain(*y_predicted)
    references = [[reference.lower().split()] for reference in y_true]
    hypotheses = [hypothesis.lower().split() for hypothesis in flat_predictions]
    return corpus_bleu(references, hypotheses)


@register_metric('per_item_dialog_bleu')
def per_item_dialog_bleu(y_true, y_predicted):
    """Corpus-level BLEU over the utterances of all dialogs.

    References are the 'text' entries of every item of every dialog in ``y_true``;
    hypotheses are the predictions of all dialogs, flattened by one level.
    """
    expected_texts = (turn['text'] for dialog in y_true for turn in dialog)
    references = [[text.lower().split()] for text in expected_texts]
    hypotheses = [utterance.lower().split() for dialog in y_predicted for utterance in dialog]
    return corpus_bleu(references, hypotheses)


================================================
FILE: deeppavlov/metrics/correlation.py
================================================
# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef

from deeppavlov.core.common.metrics_registry import register_metric


@register_metric('pearson_correlation')
def pearson_correlation(y_true, y_predicted) -> float:
    """Return the first element of scipy's `pearsonr` result for the predictions and true values."""
    result = pearsonr(y_predicted, y_true)
    return result[0]


@register_metric('spearman_correlation')
def spearman_correlation(y_true, y_predicted) -> float:
    """Return the first element of scipy's `spearmanr` result for the predictions and true values."""
    result = spearmanr(y_predicted, y_true)
    return result[0]


@register_metric('matthews_correlation')
def matthews_correlation(y_true, y_predicted) -> float:
    """Return the value of scikit-learn's `matthews_corrcoef` for the true and predicted values."""
    coefficient = matthews_corrcoef(y_true, y_predicted)
    return coefficient


================================================
FILE: deeppavlov/metrics/elmo_metrics.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

import numpy as np

from deeppavlov.core.common.metrics_registry import register_metric


@register_metric('elmo_loss2ppl')
def elmo_loss2ppl(losses: List[np.ndarray]) -> float:
    """ Converts model losses to perplexity

    Args:
        losses: list of numpy arrays of model losses

    Returns:
        perplexity : exponent of the mean of all loss values, as a Python float
    """
    return float(np.exp(np.mean(losses)))


================================================
FILE: deeppavlov/metrics/fmeasure.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
from collections import OrderedDict
from itertools import chain
from logging import getLogger

import numpy as np
from sklearn.metrics import f1_score

from deeppavlov.core.common.metrics_registry import register_metric

# Module-level logger; the CoNLL-style reports built in this module are written to it at debug level.
log = getLogger(__name__)


@register_metric('ner_f1')
def ner_f1(y_true, y_predicted):
    """
    Calculates F1 measure for Named Entity Recognition task.

    The tag sequences are flattened and passed to `precision_recall_f1`
    (with result printing enabled); the F1 stored under its '__total__' entry is returned.

    Args:
        y_true: list of true tag sequences
        y_predicted: list of predicted tag sequences

    Returns:
        F1 score

    Alias:
        ner_f1
    """
    flat_true = list(chain.from_iterable(y_true))
    flat_predicted = list(chain.from_iterable(y_predicted))
    results = precision_recall_f1(flat_true, flat_predicted, print_results=True)
    return results['__total__']['f1']


@register_metric('ner_token_f1')
def ner_token_f1(y_true, y_predicted, print_results=False):
    """
    Calculates token-level F1 measure for Named Entity Recognition task,
    ignoring the BIO or BIOES prefixes of the tags.

    Args:
        y_true: list of true tag sequences
        y_predicted: list of predicted tag sequences
        print_results: if True, then a report with the scores of each entity type is logged

    Returns:
        F1 score aggregated over all entity types (the 'O' tag is excluded)

    Alias:
        ner_token_f1
    """
    flat_true = list(chain.from_iterable(y_true))
    flat_pred = list(chain.from_iterable(y_predicted))

    # A tag may contain at most one '-' separating the markup prefix from the entity type
    assert all(len(tag.split('-')) <= 2 for tag in flat_true)

    # Drop BIO or BIOES markup
    flat_true = [tag.split('-')[-1] for tag in flat_true]
    flat_pred = [tag.split('-')[-1] for tag in flat_pred]

    # Encode entity types as integers so that the counts can be taken with array masks
    tag_to_index = {tag: n for n, tag in enumerate(set(flat_true) | set(flat_pred))}
    true_inds = np.array([tag_to_index[tag] for tag in flat_true])
    pred_inds = np.array([tag_to_index[tag] for tag in flat_pred])

    results = {}
    for tag, tag_ind in tag_to_index.items():
        if tag == 'O':
            continue
        is_true = true_inds == tag_ind
        is_pred = pred_inds == tag_ind
        tp = np.sum(is_true & is_pred)
        fn = np.sum(is_true & ~is_pred)
        fp = np.sum(~is_true & is_pred)
        n_pred = np.sum(is_pred)
        n_true = np.sum(is_true)

        precision = tp / (tp + fp) * 100 if tp + fp > 0 else 0
        recall = tp / (tp + fn) * 100 if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        results[tag] = {'precision': precision, 'recall': recall,
                        'f1': f1, 'n_true': n_true, 'n_pred': n_pred,
                        'tp': tp, 'fp': fp, 'fn': fn}

    total, accuracy, total_true_entities, total_predicted_entities, total_correct = _global_stats_f1(results)
    results['__total__'] = total
    n_tokens = len(flat_true)
    if print_results:
        log.debug('TOKEN LEVEL F1')
        _print_conll_report(results, accuracy, total_true_entities, total_predicted_entities, n_tokens, total_correct)
    return total['f1']


def _print_conll_report(results, accuracy, total_true_entities, total_predicted_entities, n_tokens, total_correct,
                        short_report=False, entity_of_interest=None):
    """Build a CoNLL-style evaluation report and write it to the debug log.

    Args:
        results: per-tag statistics plus the aggregated '__total__' entry; every
            reported entry provides 'precision', 'recall', 'f1' (and 'n_pred' for tags)
        accuracy: accepted for interface compatibility; it does not appear in the report
        total_true_entities: number of true phrases
        total_predicted_entities: number of found phrases
        n_tokens: number of processed tokens
        total_correct: number of correct phrases
        short_report: if True, per-tag lines are omitted (except for `entity_of_interest`)
        entity_of_interest: when given, restricts the per-tag lines to tags containing it
            (full report) or to exactly this entry (short report)
    """
    def _tag_line(name):
        stats = results[name]
        return '\t' + name + (f": precision:  {stats['precision']:.2f}%; "
                              f"recall:  {stats['recall']:.2f}%; "
                              f"F1:  {stats['f1']:.2f} "
                              f"{stats['n_pred']}\n\n")

    tags = list(results.keys())
    total = results['__total__']

    parts = [
        f'processed {n_tokens} tokens '
        f'with {total_true_entities} phrases; '
        f'found: {total_predicted_entities} phrases;'
        f' correct: {total_correct}.\n\n',
        f"precision:  {total['precision']:.2f}%; "
        f"recall:  {total['recall']:.2f}%; "
        f"FB1:  {total['f1']:.2f}\n\n",
    ]

    if short_report:
        if entity_of_interest is not None:
            parts.append(_tag_line(entity_of_interest))
    else:
        for tag in tags:
            if entity_of_interest is None:
                wanted = tag != '__total__'
            else:
                wanted = entity_of_interest in tag
            if wanted:
                parts.append(_tag_line(tag))

    log.debug(''.join(parts))


def _global_stats_f1(results):
    total_true_entities = 0
    total_predicted_entities = 0
    total_precision = 0
    total_recall = 0
    total_f1 = 0
    total_correct = 0
    for tag in results:
        if tag == '__total__':
            continue

        n_pred = results[tag]['n_pred']
        n_true = results[tag]['n_true']
        total_correct += results[tag]['tp']
        total_true_entities += n_true
        total_predicted_entities += n_pred
        total_precision += results[tag]['precision'] * n_pred
        total_recall += results[tag]['recall'] * n_true
        total_f1 += results[tag]['f1'] * n_true
    if total_true_entities > 0:
        accuracy = total_correct / total_true_entities * 100
        total_recall = total_recall / total_true_entities
    else:
        accuracy = 0
        total_recall = 0
    if total_predicted_entities > 0:
        total_precision = total_precision / total_predicted_entities
    else:
        total_precision = 0

    if total_precision + total_recall > 0:
        total_f1 = 2 * total_precision * total_recall / (total_precision + total_recall)
    else:
        total_f1 = 0

    total_res = {'n_predicted_entities': total_predicted_entities,
                 'n_true_entities': total_true_entities,
                 'precision': total_precision,
                 'recall': total_recall,
                 'f1': total_f1}
    return total_res, accuracy, total_true_entities, total_predicted_entities, total_correct


@register_metric('f1')
def round_f1(y_true, y_predicted):
    """
    Calculates F1 (binary) measure.

    Numeric predictions are rounded with ``np.round`` before scoring. If rounding
    fails with ``TypeError`` and all labels are the strings ``"True"`` / ``"False"``,
    both sequences are converted to booleans instead.

    Args:
        y_true: list of true values
        y_predicted: list of predicted values

    Returns:
        F1 score

    Raises:
        RuntimeError: if the predictions cannot be rounded and the labels are not
            limited to the strings ``"True"`` and ``"False"``

    Alias:
        f1
    """
    try:
        predictions = [np.round(x) for x in y_predicted]
    except TypeError:
        if set(y_true) | set(y_predicted) in ({"True"}, {"False"}, {"False", "True"}):
            y_true = [y == "True" for y in y_true]
            predictions = [y == "True" for y in y_predicted]
        else:
            # ``predictions`` is not bound on this path (the rounding above failed),
            # so the message must be built from the raw ``y_predicted``.
            raise RuntimeError(f"Unexpectible type for {y_true} and {y_predicted}")

    return f1_score(y_true, predictions)


@register_metric('f1_macro')
def round_f1_macro(y_true, y_predicted):
    """
    Calculates macro-averaged F1 measure.

    Predictions are rounded with ``np.round`` when possible; if rounding raises
    ``TypeError`` they are scored as given.

    Args:
        y_true: list of true values
        y_predicted: list of predicted values

    Returns:
        F1 score

    Alias:
        f1_macro
    """
    try:
        rounded = [np.round(value) for value in y_predicted]
    except TypeError:
        # Non-numeric labels: nothing to round, use them unchanged.
        rounded = y_predicted

    true_labels = np.array(y_true)
    predicted_labels = np.array(rounded)
    return f1_score(true_labels, predicted_labels, average="macro")


@register_metric('f1_weighted')
def round_f1_weighted(y_true, y_predicted):
    """
    Calculates weighted-average F1 measure.

    Predictions are rounded with ``np.round`` when possible; if rounding raises
    ``TypeError`` they are scored as given.

    Args:
        y_true: list of true values
        y_predicted: list of predicted values

    Returns:
        F1 score

    Alias:
        f1_weighted
    """
    try:
        rounded = [np.round(value) for value in y_predicted]
    except TypeError:
        # Non-numeric labels: nothing to round, use them unchanged.
        rounded = y_predicted

    true_labels = np.array(y_true)
    predicted_labels = np.array(rounded)
    return f1_score(true_labels, predicted_labels, average="weighted")


def chunk_finder(current_token, previous_token, tag):
    """Decide whether a chunk of type ``tag`` starts and/or ends between two labels.

    Args:
        current_token: label of the current position (e.g. ``'B-PER'`` or ``'O'``)
        previous_token: label of the preceding position
        tag: entity type being tracked; labels of any other type are treated as ``'O'``

    Returns:
        Tuple ``(create_chunk, pop_out)``: ``create_chunk`` is True when a new chunk
        of ``tag`` begins at the current position, ``pop_out`` is True when the chunk
        that was open at the previous position ends there.
    """

    def _entity_or_outside(token):
        # Everything after the first '-' is the entity type; other types count as 'O'.
        entity = token.split('-', 1)[-1]
        return entity if entity == tag else 'O'

    def _explicit_boundary():
        # The previous label closes a chunk (E/L/S/U) or the current one opens one (B/S/U).
        closing = tuple(prefix + tag for prefix in ('E-', 'L-', 'S-', 'U-'))
        opening = tuple(prefix + tag for prefix in ('B-', 'S-', 'U-'))
        return previous_token in closing or current_token in opening

    inside_now = _entity_or_outside(current_token) != 'O'
    inside_before = _entity_or_outside(previous_token) != 'O'

    create_chunk = inside_now and (not inside_before or _explicit_boundary())
    pop_out = inside_before and (not inside_now or _explicit_boundary())
    return create_chunk, pop_out


def precision_recall_f1(y_true, y_pred, print_results=True, short_report=False, entity_of_interest=None):
    """Compute chunk-level precision, recall and F1 for every entity tag and overall.

    Entity types are taken from the labels by dropping their first two characters
    (labels other than ``'O'`` are expected to look like ``'B-PER'``). A predicted
    chunk counts as correct only if a true chunk of the same tag has exactly the
    same start and end positions.

    Args:
        y_true: flat sequence of true labels
        y_pred: flat sequence of predicted labels; indexed by the positions of ``y_true``
        print_results: if True, a textual report is sent to ``log.debug``
        short_report: if True, per-tag lines are omitted from the report (except the
            line for ``entity_of_interest`` when it is given)
        entity_of_interest: restricts the per-tag lines of the report; in the full
            report a tag is listed when it contains this string, in the short report
            it must be an existing tag (``KeyError`` otherwise)

    Returns:
        OrderedDict mapping every tag to an OrderedDict with the keys ``'precision'``,
        ``'recall'``, ``'f1'`` (all in percent), ``'n_pred'``, ``'n_true'``, ``'tp'``,
        ``'fn'`` and ``'fp'``. The ``'__total__'`` entry holds the aggregated
        ``'precision'``, ``'recall'``, ``'f1'``, ``'n_predicted_entities'``,
        ``'n_true_entities'``, ``'n_pred'`` and ``'n_true'``.
    """
    # Find all tags
    tags = set()
    for label in itertools.chain(y_true, y_pred):
        if label != 'O':
            tags.add(label[2:])
    tags = sorted(tags)

    results = OrderedDict()
    for tag in tags:
        results[tag] = OrderedDict()
    results['__total__'] = OrderedDict()
    n_tokens = len(y_true)

    # Labels are compared as strings; convert once instead of once per tag.
    y_true = [str(y) for y in y_true]
    y_pred = [str(y) for y in y_pred]

    for tag in tags:
        # Firstly we find all chunks in the ground truth and prediction
        # For each chunk we write starting and ending indices
        true_chunk = _tag_chunk_spans(y_true, tag, n_tokens)
        pred_chunk = _tag_chunk_spans(y_pred, tag, n_tokens)

        # Then we find all correctly classified intervals
        # True positive results
        tp = len(set(pred_chunk).intersection(set(true_chunk)))
        # And then just calculate errors of the first and second kind
        # False negative
        fn = len(true_chunk) - tp
        # False positive
        fp = len(pred_chunk) - tp
        precision = tp / (tp + fp) * 100 if tp + fp > 0 else 0
        recall = tp / (tp + fn) * 100 if tp + fn > 0 else 0
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0
        results[tag]['precision'] = precision
        results[tag]['recall'] = recall
        results[tag]['f1'] = f1
        results[tag]['n_pred'] = len(pred_chunk)
        results[tag]['n_true'] = len(true_chunk)
        results[tag]['tp'] = tp
        results[tag]['fn'] = fn
        results[tag]['fp'] = fp

    # The last element returned by _global_stats_f1 is the number of correct chunks;
    # it must be bound to ``total_correct`` so the report below shows the real count.
    (results['__total__'], _accuracy, total_true_entities,
     total_predicted_entities, total_correct) = _global_stats_f1(results)
    results['__total__']['n_pred'] = total_predicted_entities
    results['__total__']['n_true'] = total_true_entities

    if print_results:
        tag_line = ': precision:  {tot_prec:.2f}%; ' \
                   'recall:  {tot_recall:.2f}%; ' \
                   'F1:  {tot_f1:.2f} ' \
                   '{tot_predicted}\n\n'

        def _report_line(name):
            # One report line with the statistics stored under ``results[name]``.
            stats = results[name]
            return '\t' + name + tag_line.format(tot_prec=stats['precision'],
                                                 tot_recall=stats['recall'],
                                                 tot_f1=stats['f1'],
                                                 tot_predicted=stats['n_pred'])

        s = 'processed {len} tokens ' \
            'with {tot_true} phrases; ' \
            'found: {tot_pred} phrases;' \
            ' correct: {tot_cor}.\n\n'.format(len=n_tokens,
                                              tot_true=total_true_entities,
                                              tot_pred=total_predicted_entities,
                                              tot_cor=total_correct)

        s += 'precision:  {tot_prec:.2f}%; ' \
             'recall:  {tot_recall:.2f}%; ' \
             'FB1:  {tot_f1:.2f}\n\n'.format(tot_prec=results['__total__']['precision'],
                                             tot_recall=results['__total__']['recall'],
                                             tot_f1=results['__total__']['f1'])

        if not short_report:
            for tag in tags:
                if entity_of_interest is not None:
                    if entity_of_interest in tag:
                        s += _report_line(tag)
                elif tag != '__total__':
                    s += _report_line(tag)
        elif entity_of_interest is not None:
            s += _report_line(entity_of_interest)
        log.debug(s)
    return results


def _tag_chunk_spans(labels, tag, n_tokens):
    """Return the ``(start, end)`` spans of all ``tag`` chunks among the first ``n_tokens`` labels."""
    spans = []
    start = None
    previous = 'O'
    for position in range(n_tokens):
        current = labels[position]
        create_chunk, pop_out = chunk_finder(current, previous, tag)
        if pop_out:
            # The chunk that was open ends at the previous position.
            spans.append((start, position - 1))
            start = None
        if create_chunk:
            start = position
        previous = current
    if start is not None:
        # A chunk still open after the last label ends at the last position.
        spans.append((start, n_tokens - 1))
    return spans


@register_metric("average__ner_f1__f1_macro__f1")
def ner_f1__f1_macro__f1(ner_true, ner_pred, macro_true, macro_pred, f1_true, f1_pred):
    """Average three metrics, each computed on its own pair of true/predicted values.

    Args:
        ner_true: true values passed to ``ner_f1``
        ner_pred: predicted values passed to ``ner_f1``
        macro_true: true values passed to ``round_f1_macro``
        macro_pred: predicted values passed to ``round_f1_macro``
        f1_true: true values passed to ``round_f1``
        f1_pred: predicted values passed to ``round_f1``

    Returns:
        Mean of ``ner_f1 / 100``, ``round_f1_macro`` and ``round_f1``
    """
    ner_part = ner_f1(ner_true, ner_pred) / 100
    macro_part = round_f1_macro(macro_true, macro_pred)
    binary_part = round_f1(f1_true, f1_pred)
    combined = ner_part + macro_part + binary_part
    return combined / 3


@register_metric("average__roc_auc__roc_auc__ner_f1")
def roc_auc__roc_auc__ner_f1(true_onehot1, pred_probas1, true_onehot2, pred_probas2, ner_true3, ner_pred3):
    """Average two ROC AUC scores and one NER F1 score.

    Args:
        true_onehot1: true values for the first ROC AUC
        pred_probas1: predicted scores for the first ROC AUC
        true_onehot2: true values for the second ROC AUC
        pred_probas2: predicted scores for the second ROC AUC
        ner_true3: true values passed to ``ner_f1``
        ner_pred3: predicted values passed to ``ner_f1``

    Returns:
        Mean of the two ROC AUC values and ``ner_f1 / 100``
    """
    # Imported at call time, exactly as in the original implementation.
    from .roc_auc_score import roc_auc_score

    first_auc = roc_auc_score(true_onehot1, pred_probas1)
    second_auc = roc_auc_score(true_onehot2, pred_probas2)
    ner_part = ner_f1(ner_true3, ner_pred3) / 100
    combined = first_auc + second_auc + ner_part
    return combined / 3


================================================
FILE: deeppavlov/metrics/google_bleu.py
================================================
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Python implementation of BLEU and smooth-BLEU.

This module provides a Python implementation of BLEU and smooth-BLEU.
Smooth BLEU is computed following the method outlined in the paper:
Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
evaluation metrics for machine translation. COLING 2004.
"""

import collections
import math


def _get_ngrams(segment, max_order):
    """Counts every n-gram of a segment for all orders from 1 to ``max_order``.

    Args:
      segment: text segment (sequence of tokens) from which n-grams are extracted.
      max_order: maximum length in tokens of the n-grams returned by this
          function.

    Returns:
      A Counter keyed by n-gram (a tuple of tokens) whose values are the number
      of times each n-gram occurs in the segment.
    """
    segment_length = len(segment)
    return collections.Counter(
        tuple(segment[start:start + size])
        for size in range(1, max_order + 1)
        for start in range(segment_length - size + 1)
    )


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
    """Computes BLEU score of translated segments against one or more references.

    Args:
      reference_corpus: list of lists of references for each translation. Each
          reference should be tokenized into a list of tokens.
      translation_corpus: list of translations to score. Each translation
          should be tokenized into a list of tokens.
      max_order: Maximum n-gram order to use when computing BLEU score.
      smooth: Whether or not to apply Lin et al. 2004 smoothing.

    Returns:
      6-tuple ``(bleu, precisions, bp, ratio, translation_length,
      reference_length)``: the BLEU score, the list of n-gram precisions (one
      per order), the brevity penalty, the translation/reference length ratio,
      the total translation length and the total reference length (sum of the
      shortest reference length of every segment). For an empty translation
      corpus the ratio and the brevity penalty are 0, so BLEU is 0; if only the
      reference length is 0 the ratio is ``inf`` and no penalty is applied.
    """
    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    reference_length = 0
    translation_length = 0
    for (references, translation) in zip(reference_corpus,
                                         translation_corpus):
        reference_length += min(len(r) for r in references)
        translation_length += len(translation)

        # Clip every translation n-gram count by its maximum count over the references.
        merged_ref_ngram_counts = collections.Counter()
        for reference in references:
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
        translation_ngram_counts = _get_ngrams(translation, max_order)
        overlap = translation_ngram_counts & merged_ref_ngram_counts
        for ngram in overlap:
            matches_by_order[len(ngram) - 1] += overlap[ngram]
        for order in range(1, max_order + 1):
            possible_matches = len(translation) - order + 1
            if possible_matches > 0:
                possible_matches_by_order[order - 1] += possible_matches

    precisions = [0] * max_order
    for i in range(0, max_order):
        if smooth:
            precisions[i] = ((matches_by_order[i] + 1.) /
                             (possible_matches_by_order[i] + 1.))
        else:
            if possible_matches_by_order[i] > 0:
                precisions[i] = (float(matches_by_order[i]) /
                                 possible_matches_by_order[i])
            else:
                precisions[i] = 0.0

    if min(precisions) > 0:
        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0

    # Guard both divisions: the reference length and the ratio itself may be zero
    # (empty corpora or empty translations), which used to raise ZeroDivisionError.
    if reference_length > 0:
        ratio = float(translation_length) / reference_length
    elif translation_length > 0:
        ratio = float('inf')
    else:
        ratio = 0.0

    if ratio > 1.0:
        bp = 1.
    elif ratio > 0:
        bp = math.exp(1 - 1. / ratio)
    else:
        # Limit of exp(1 - 1 / ratio) as ratio approaches 0 from above.
        bp = 0.

    bleu = geo_mean * bp

    return (bleu, precisions, bp, ratio, translation_length, reference_length)


================================================
FILE: deeppavlov/metrics/log_loss.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List, Union

import numpy as np
from sklearn.metrics import log_loss

from deeppavlov.core.common.metrics_registry import register_metric


@register_metric('log_loss')
def sk_log_loss(y_true: Union[List[List[float]], List[List[int]], np.ndarray],
                y_predicted: Union[List[List[float]], List[List[int]], np.ndarray]) -> float:
    """
    Calculates log loss by delegating to ``sklearn.metrics.log_loss``.

    Args:
        y_true: list or array of true values
        y_predicted: list or array of predicted values

    Returns:
        Log loss

    Alias:
        log_loss
    """
    loss_value = log_loss(y_true, y_predicted)
    return loss_value


================================================
FILE: deeppavlov/metrics/mse.py
================================================
# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from sklearn.metrics import mean_squared_error
from typing import Union

from deeppavlov.core.common.metrics_registry import register_metric


@register_metric('mean_squared_error')
def mse(y_true: Union[np.array, list],
        y_predicted: Union[np.array, list],
        *args,
        **kwargs) -> float:
    """
    Calculates mean squared error.

    Both inputs are asserted to contain only finite values before scoring.

    Args:
        y_true: list of true values
        y_predicted: list of predicted values
        *args: extra positional arguments forwarded to ``mean_squared_error``
        **kwargs: extra keyword arguments forwarded to ``mean_squared_error``

    Returns:
        float: Mean squared error
    """
    # Reject NaN / infinite values in either input (checked in this order).
    assert (np.isfinite(y_true).all())
    assert (np.isfinite(y_predicted).all())
    error = mean_squared_error(y_true, y_predicted, *args, **kwargs)
    return error


================================================
FILE: deeppavlov/metrics/recall_at_k.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List

import numpy as np

from deeppavlov.core.common.metrics_registry import register_metric


def recall_at_k(y_true: List[int], y_pred: List[List[np.ndarray]], k: int):
    """
    Calculates recall at k ranking metric.

    Args:
        y_true: Labels. Not used in the calculation of the metric.
        y_pred: Predictions.
            Each prediction contains ranking score of all ranking candidates for the particular data sample.
            It is supposed that the ranking score for the true candidate goes first in the prediction.
        k: Number of top-scored candidates among which the true candidate (index 0) is looked for.

    Returns:
        Recall at k: the share of samples whose true candidate is among the ``k``
        highest-scored candidates; ``0.0`` when there are no predictions.
    """
    # ``len`` (not truthiness) so that numpy arrays are accepted as well; an empty
    # input cannot be sliced as a 2-D array and would divide by zero below.
    num_examples = len(y_pred)
    if num_examples == 0:
        return 0.0

    scores = np.array(y_pred)
    # Candidate indices sorted by descending score, truncated to the top k.
    top_k = np.flip(np.argsort(scores, -1), -1)[:, :k]
    num_correct = sum(0 in ranked for ranked in top_k)
    return float(num_correct) / float(num_examples)


@register_metric('r@1')
def r_at_1(y_true, y_pred):
    """Recall at 1: ``recall_at_k`` with ``k`` fixed to 1."""
    top_k = 1
    return recall_at_k(y_true, y_pred, k=top_k)


@register_metric('r@2')
def r_at_2(y_true, y_pred):
    """Recall at 2: ``recall_at_k`` with ``k`` fixed to 2."""
    top_k = 2
    return recall_at_k(y_true, y_pred, k=top_k)


@register_metric('r@5')
def r_at_5(labels, predictions):
    """Recall at 5: ``recall_at_k`` with ``k`` fixed to 5."""
    top_k = 5
    return recall_at_k(labels, predictions, k=top_k)


@register_metric('r@10')
def r_at_10(labels, predictions):
    """Recall at 10: ``recall_at_k`` with ``k`` fixed to 10."""
    top_k = 10
    return recall_at_k(labels, predictions, k=top_k)


================================================
FILE: deeppavlov/metrics/record_metrics.py
================================================
import re
import string
import collections
from typing import List

import numpy as np

from deeppavlov.models.preprocessors.torch_transformers_preprocessor import RecordNestedExample
from deeppavlov.core.common.metrics_registry import register_metric


@register_metric("record_f1_score")
def record_f1_score(record_examples: List[RecordNestedExample]):
    """Calculate F1 score for given nested ReCoRD examples

    For every example the token-level F1 of its prediction is computed against each
    of its answers and the best value is kept; the result is the mean over examples.

    Args:
        record_examples: processed ReCoRD examples

    Returns:
        float: F1 score; ``0.`` when no examples are given and ``-1`` when none of
        the examples has answers (same fallback as ``record_em_score``)
    """
    if not record_examples:
        return 0.
    f1_scores = []
    for example in record_examples:
        # Token-level F1 (not exact match) against every reference answer.
        example_f1s = [string_f1_score(example.prediction, answer) for answer in example.answers]
        if example_f1s:
            f1_scores.append(max(example_f1s))
    # np.mean of an empty list is NaN (with a RuntimeWarning), so fall back explicitly.
    return np.mean(f1_scores) if f1_scores else -1


@register_metric("record_em_score")
def record_em_score(record_examples: List[RecordNestedExample]):
    """Calculate Exact Match score for given nested ReCoRD examples

    An example counts as matched when its normalized prediction equals at least one
    of its normalized answers; the result is the mean over examples.

    Args:
        record_examples: processed ReCoRD examples

    Returns:
        float: Exact Match score; ``0.`` when no examples are given and ``-1`` when
        none of the examples has answers
    """
    if not record_examples:
        return 0.
    em_scores = []
    for example in record_examples:
        # Exact match (not token-level F1) against every reference answer.
        example_ems = [exact_match_score(example.prediction, answer) for answer in example.answers]
        if example_ems:
            em_scores.append(max(example_ems))
    return np.mean(em_scores) if em_scores else -1


def normalize_answer(s):
    """Lowercase the text, then strip punctuation, English articles and extra whitespace.
    From official ReCoRD eval script
    """
    punctuation = set(string.punctuation)

    lowered = s.lower()
    without_punctuation = "".join(ch for ch in lowered if ch not in punctuation)
    # Articles are replaced by a space; the final split/join collapses the gaps.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation)
    return " ".join(without_articles.split())


def string_f1_score(prediction, ground_truth):
    """Token-level F1 between the normalized prediction and the normalized ground truth.
    From official ReCoRD eval script
    """
    predicted_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()

    # Multiset intersection: every shared token counts as often as it occurs in both.
    shared = collections.Counter(predicted_tokens) & collections.Counter(truth_tokens)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(predicted_tokens)
    recall = 1.0 * overlap / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Tell whether prediction and ground truth are equal after normalization.
    From official ReCoRD eval script
    """
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


================================================
FILE: deeppavlov/metrics/roc_auc_score.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List, Union

import numpy as np
import sklearn.metrics

from deeppavlov.core.common.metrics_registry import register_metric


@register_metric('roc_auc')
def roc_auc_score(y_true: Union[List[List[float]], List[List[int]], np.ndarray],
                  y_pred: Union[List[List[float]], List[List[int]], np.ndarray]) -> float:
    """
    Compute macro-averaged Area Under the Curve (AUC) from prediction scores.

    Both inputs are converted to arrays and squeezed before being passed to
    ``sklearn.metrics.roc_auc_score``.

    Args:
        y_true: true binary labels
        y_pred: target scores, can either be probability estimates of the positive class

    Returns:
        Area Under the Curve (AUC) from prediction scores, or ``0.`` if a
        ``ValueError`` is raised while computing it

    Alias:
        roc_auc
    """
    try:
        labels = np.squeeze(np.array(y_true))
        scores = np.squeeze(np.array(y_pred))
        return sklearn.metrics.roc_auc_score(labels, scores, average="macro")
    except ValueError:
        return 0.


================================================
FILE: deeppavlov/metrics/squad_metrics.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import string
from collections import Counter
from typing import List

from deeppavlov.core.common.metrics_registry import register_metric


@register_metric('squad_v2_em')
def squad_v2_exact_match(y_true: List[List[str]], y_predicted: List[str]) -> float:
    """ Calculates Exact Match score between y_true and y_predicted
        EM score uses the best matching y_true answer:
            if y_pred equal at least to one answer in y_true then EM = 1, else EM = 0

    The same as in SQuAD-v2.0

    Args:
        y_true: list of correct answers (correct answers are represented by list of strings)
        y_predicted: list of predicted answers

    Returns:
        exact match score : float (in percent; 0 for empty ``y_true``)
    """
    matched = 0
    for answers, predicted in zip(y_true, y_predicted):
        target = normalize_answer(predicted)
        # One matching reference answer is enough for the example to count.
        if any(normalize_answer(answer) == target for answer in answers):
            matched += 1

    n_examples = len(y_true)
    if n_examples > 0:
        return 100 * matched / n_examples
    return 0


@register_metric('squad_v1_em')
def squad_v1_exact_match(y_true: List[List[str]], y_predicted: List[str]) -> float:
    """ Calculates Exact Match score between y_true and y_predicted
        EM score uses the best matching y_true answer:
            if y_pred equal at least to one answer in y_true then EM = 1, else EM = 0
        Skips examples without an answer (first correct answer is empty).
    Args:
        y_true: list of correct answers (correct answers are represented by list of strings)
        y_predicted: list of predicted answers
    Returns:
        exact match score : float (in percent; 0 if every example was skipped)
    """
    hits = 0
    answered = 0
    for answers, predicted in zip(y_true, y_predicted):
        if len(answers[0]) == 0:
            # skip empty answers
            continue
        answered += 1
        target = normalize_answer(predicted)
        hits += max(int(normalize_answer(answer) == target) for answer in answers)

    if answered > 0:
        return 100 * hits / answered
    return 0


@register_metric('squad_v2_f1')
def squad_v2_f1(y_true: List[List[str]], y_predicted: List[str]) -> float:
    """ Calculates F-1 score between y_true and y_predicted
        F-1 score uses the best matching y_true answer

    The same as in SQuAD-v2.0

    Args:
        y_true: list of correct answers (correct answers are represented by list of strings)
        y_predicted: list of predicted answers

    Returns:
        F-1 score : float (in percent; 0 for empty ``y_true``)
    """

    def _token_f1(predicted_tokens, answer_tokens):
        # If either side has no tokens, the score is 1.0 only when both are empty.
        if len(answer_tokens) == 0 or len(predicted_tokens) == 0:
            return float(answer_tokens == predicted_tokens)
        shared = Counter(predicted_tokens) & Counter(answer_tokens)
        overlap = sum(shared.values())
        if overlap == 0:
            return 0.0
        precision = 1.0 * overlap / len(predicted_tokens)
        recall = 1.0 * overlap / len(answer_tokens)
        return (2 * precision * recall) / (precision + recall)

    f1_sum = 0.0
    for answers, predicted in zip(y_true, y_predicted):
        predicted_tokens = normalize_answer(predicted).split()
        f1_sum += max(_token_f1(predicted_tokens, normalize_answer(answer).split())
                      for answer in answers)

    n_examples = len(y_true)
    if n_examples > 0:
        return 100 * f1_sum / n_examples
    return 0


@register_metric('squad_v1_f1')
def squad_v1_f1(y_true: List[List[str]], y_predicted: List[str]) -> float:
    """ Calculates F-1 score between y_true and y_predicted
        F-1 score uses the best matching y_true answer

        Skips examples without an answer (first correct answer is empty).
    Args:
        y_true: list of correct answers (correct answers are represented by list of strings)
        y_predicted: list of predicted answers
    Returns:
        F-1 score : float (in percent; 0 if every example was skipped)
    """

    def _token_f1(predicted_tokens, answer_tokens):
        # Token-level F1 over the multiset intersection of the two token lists.
        shared = Counter(predicted_tokens) & Counter(answer_tokens)
        overlap = sum(shared.values())
        if overlap == 0:
            return 0.0
        precision = 1.0 * overlap / len(predicted_tokens)
        recall = 1.0 * overlap / len(answer_tokens)
        return (2 * precision * recall) / (precision + recall)

    f1_sum = 0.0
    answered = 0
    for answers, predicted in zip(y_true, y_predicted):
        if len(answers[0]) == 0:
            # skip empty answers
            continue
        answered += 1
        predicted_tokens = normalize_answer(predicted).split()
        f1_sum += max(_token_f1(predicted_tokens, normalize_answer(answer).split())
                      for answer in answers)

    if answered > 0:
        return 100 * f1_sum / answered
    return 0


def normalize_answer(s: str) -> str:
    """Lowercase the text, then strip punctuation, English articles and extra whitespace."""
    excluded = set(string.punctuation)

    text = s.lower()
    text = ''.join(ch for ch in text if ch not in excluded)
    # Articles become a single space; the split/join below collapses all whitespace runs.
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())


================================================
FILE: deeppavlov/models/__init__.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import nltk

from deeppavlov.core.common.prints import RedirectedPrints

# Download the NLTK resources below when this package is imported, unless the
# DP_SKIP_NLTK_DOWNLOAD environment variable is set to a non-empty value.
if not os.environ.get('DP_SKIP_NLTK_DOWNLOAD'):
    # NOTE(review): RedirectedPrints presumably captures anything the downloads
    # print - confirm in deeppavlov.core.common.prints.
    with RedirectedPrints():
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('perluniprops', quiet=True)
        nltk.download('nonbreaking_prefixes', quiet=True)


================================================
FILE: deeppavlov/models/api_requester/__init__.py
================================================
from .api_requester import *


================================================
FILE: deeppavlov/models/api_requester/api_requester.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
from typing import Any, List, Dict, AsyncIterable

import requests

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register('api_requester')
class ApiRequester(Component):
    """Component that forwards its inputs to an HTTP API as JSON POST requests

    Args:
        url: url of the API.
        out: count of expected returned values or their names in a chainer.
        param_names: list of parameter names for API requests. When omitted, the value of the
            ``in`` keyword argument is used (an empty tuple if it is absent too).
        debatchify: if ``True``, single instances will be sent to the API endpoint instead of batches.

    Attributes:
        url: url of the API.
        out_count: count of expected returned values.
        param_names: list of parameter names for API requests.
        debatchify: if True, single instances will be sent to the API endpoint instead of batches.
    """

    def __init__(self, url: str, out: [int, list], param_names: [list, tuple] = None, debatchify: bool = False,
                 *args, **kwargs):
        self.url = url
        self.param_names = kwargs.get('in', ()) if param_names is None else param_names
        self.out_count = len(out) if not isinstance(out, int) else out
        self.debatchify = debatchify

    def __call__(self, *args: List[Any], **kwargs: Dict[str, Any]):
        """Send the given parameters to the API endpoint

        Args:
            *args: list of parameters sent to the API endpoint. Parameter names are taken from self.param_names.
            **kwargs: named parameters to send to the API endpoint. If not empty, args are ignored

        Returns:
            result of the API request(s)
        """
        payload = kwargs if kwargs else dict(zip(self.param_names, args))

        if not self.debatchify:
            # the whole batch goes out in a single request
            return requests.post(self.url, json=payload).json()

        # the batch size is the length of the first parameter (0 when there are no parameters)
        columns = list(payload.values())
        n_samples = len(columns[0]) if columns else 0

        assert n_samples > 0

        async def gather_all():
            collected = []
            async for item in self.get_async_response(payload, n_samples):
                collected.append(item)
            return collected

        results = asyncio.get_event_loop().run_until_complete(gather_all())
        if self.out_count > 1:
            # turn a list of per-sample tuples into a list of per-output tuples
            results = list(zip(*results))

        return results

    async def get_async_response(self, data: dict, batch_size: int) -> AsyncIterable:
        """Helper function for sending requests asynchronously if the API endpoint does not support batching

        Args:
            data: data to be passed to the API endpoint
            batch_size: requests count

        Yields:
            requests results parsed as json
        """
        event_loop = asyncio.get_event_loop()
        pending = []
        for idx in range(batch_size):
            sample = {name: values[idx] for name, values in data.items()}
            # positional arguments of requests.post are (url, data, json): the sample is sent as json
            pending.append(event_loop.run_in_executor(None, requests.post, self.url, None, sample))

        responses = await asyncio.gather(*pending)
        for response in responses:
            yield response.json()


================================================
FILE: deeppavlov/models/api_requester/api_router.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import concurrent
from concurrent.futures import ProcessPoolExecutor
from logging import getLogger
from typing import List

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.models.api_requester import ApiRequester

logger = getLogger(__name__)


@register("api_router")
class ApiRouter(Component):
    """Runs several API requesters on the same input in parallel subprocesses

    Args:
        api_requesters: list of ApiRequester objects
        n_workers: The maximum number of subprocesses to run

    Attributes:
        api_requesters: list of ApiRequester objects
        n_workers: The maximum number of subprocesses to run
    """

    def __init__(self, api_requesters: List[ApiRequester], n_workers: int = 1, *args, **kwargs):
        self.n_workers = n_workers
        self.api_requesters = api_requesters

    def __call__(self, *args):
        """Call every requester with the same arguments and merge their outputs

        Args:
            *args: list of arguments to forward to the API requesters

        Returns:
            results of the requests, in the order of ``api_requesters``; a requester with
            ``out_count > 1`` contributes each of its outputs as a separate element
        """
        merged = []
        with ProcessPoolExecutor(self.n_workers) as pool:
            pending = []
            for requester in self.api_requesters:
                pending.append(pool.submit(requester, *args))

            # block until every request has finished
            concurrent.futures.wait(pending)

            for requester, finished in zip(self.api_requesters, pending):
                outcome = finished.result()
                if requester.out_count > 1:
                    merged.extend(outcome)
                else:
                    merged.append(outcome)

        return merged


================================================
FILE: deeppavlov/models/classifiers/__init__.py
================================================


================================================
FILE: deeppavlov/models/classifiers/cos_sim_classifier.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from logging import getLogger
from typing import List, Tuple, Union

import numpy as np
from scipy.sparse import vstack, csr_matrix
from scipy.sparse.linalg import norm as sparse_norm

from deeppavlov.core.common.file import load_pickle
from deeppavlov.core.common.file import save_pickle
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.estimator import Estimator
from deeppavlov.core.models.serializable import Serializable

logger = getLogger(__name__)


@register("cos_sim_classifier")
class CosineSimilarityClassifier(Estimator, Serializable):
    """
    Classifier based on cosine similarity between vectorized sentences

    Parameters:
        top_n: number of best-scoring labels returned for every question
        save_path: path to save the model
        load_path: path to load the model
    """

    def __init__(self, top_n: int = 1, save_path: str = None, load_path: str = None, **kwargs) -> None:
        super().__init__(save_path=save_path, load_path=load_path, **kwargs)
        self.top_n = top_n

        self.x_train_features = self.y_train = None

        # a missing ``mode`` is handled like any other non-training mode instead of raising KeyError
        if kwargs.get('mode') != 'train':
            self.load()

    def __call__(self, q_vects: Union[csr_matrix, List]) -> Tuple[List[str], List[float]]:
        """Find the most similar answers for the input vectorized questions

        Parameters:
            q_vects: vectorized questions: a sparse matrix, a list of dense vectors
                or a list of ``None`` values (no vectors available)

        Returns:
            Tuple of Answer and Score: two flat lists holding ``top_n`` labels and their
            normalised scores for every question, best label first

        Raises:
            NotImplementedError: if the vectors are of an unsupported type
        """

        if isinstance(q_vects[0], csr_matrix):
            # q_norm is the norm of the whole batch: a scalar shared by all rows, which cancels out
            # when the label scores of every row are normalised to sum up to one below
            q_norm = sparse_norm(q_vects)
            if q_norm == 0.0:
                cos_similarities = np.zeros((q_vects.shape[0], self.x_train_features.shape[0]))
            else:
                norm = q_norm * sparse_norm(self.x_train_features, axis=1)
                cos_similarities = np.array(q_vects.dot(self.x_train_features.T).todense())
                cos_similarities = cos_similarities / norm
        elif isinstance(q_vects[0], np.ndarray):
            q_vects = np.array(q_vects)
            self.x_train_features = np.array(self.x_train_features)
            norm = np.linalg.norm(q_vects) * np.linalg.norm(self.x_train_features, axis=1)
            cos_similarities = q_vects.dot(self.x_train_features.T) / norm
        elif q_vects[0] is None:
            # one all-zero row per question: the code below indexes the similarities as a 2D matrix
            # of shape (number of questions, number of train samples)
            cos_similarities = np.zeros((len(q_vects), len(self.y_train)))
        else:
            raise NotImplementedError('Not implemented this type of vectors')

        # get cosine similarity for each class: the best similarity among the train samples of the class
        y_labels = np.unique(self.y_train)
        labels_scores = np.zeros((len(cos_similarities), len(y_labels)))
        for label_idx, label in enumerate(y_labels):
            labels_scores[:, label_idx] = np.max(
                [cos_similarities[:, sample_idx]
                 for sample_idx, value in enumerate(self.y_train) if value == label],
                axis=0)

        # normalise the scores of every question to sum up to one; all-zero rows stay zero
        labels_scores_sum = labels_scores.sum(axis=1, keepdims=True)
        labels_scores = np.divide(labels_scores, labels_scores_sum,
                                  out=np.zeros_like(labels_scores), where=(labels_scores_sum != 0))

        answer_ids = np.argsort(labels_scores)[:, -self.top_n:]

        # generate top_n answers and scores
        answers = []
        scores = []
        for row_scores, row_ids in zip(labels_scores, answer_ids):
            best_first = row_ids[::-1]  # argsort is ascending, so reverse to get the best label first
            answers.extend(y_labels[label_id] for label_id in best_first)
            scores.extend(np.round(row_scores[label_id], 2) for label_id in best_first)

        return answers, scores

    def fit(self, x_train_vects: Tuple[Union[csr_matrix, List]], y_train: Tuple[str]) -> None:
        """Train classifier

        Parameters:
            x_train_vects: vectorized question for train dataset; a tuple of vectors is stacked
                into a single matrix, any other value is stored as is
            y_train: answers for train dataset

        Returns:
            None

        Raises:
            ValueError: if an empty tuple of vectors is given
            NotImplementedError: if the vectors in the tuple are of an unsupported type
        """
        if not isinstance(x_train_vects, tuple):
            self.x_train_features = x_train_vects
        elif len(x_train_vects) == 0:
            raise ValueError("Train vectors can't be empty")
        elif isinstance(x_train_vects[0], csr_matrix):
            self.x_train_features = vstack(list(x_train_vects))
        elif isinstance(x_train_vects[0], np.ndarray):
            self.x_train_features = np.vstack(list(x_train_vects))
        else:
            raise NotImplementedError('Not implemented this type of vectors')

        self.y_train = list(y_train)

    def save(self) -> None:
        """Save classifier parameters"""
        logger.info("Saving faq_model to {}".format(self.save_path))
        save_pickle((self.x_train_features, self.y_train), self.save_path)

    def load(self) -> None:
        """Load classifier parameters"""
        logger.debug("Loading faq_model from {}".format(self.load_path))
        # NOTE(review): load_pickle presumably unpickles the file, so load_path must point to a trusted file
        self.x_train_features, self.y_train = load_pickle(self.load_path)


================================================
FILE: deeppavlov/models/classifiers/dnnc_proba2labels.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import List

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

log = getLogger(__name__)


@register('dnnc_proba2labels')
class Proba2Labels(Component):
    """
    Converts pairwise similarity scores into class labels

    Args:
        confidence_threshold: used to determine whether example belongs to one
                              of the classes in 'y_support' or not
        pooling: strategy for pooling similarity scores for each label, ``'max'`` or ``'avg'``
        is_binary: determines whether the similarity is a number or a probability vector
    """

    def __init__(self,
                 confidence_threshold: float = 0.0,
                 pooling: str = 'max',
                 is_binary: bool = True,
                 **kwargs) -> None:

        self.confidence_threshold = confidence_threshold
        self.pooling = pooling
        self.is_binary = is_binary

    def __call__(self,
                 simmilarity_scores: List[float],
                 x: List[str],
                 x_populated: List[str],
                 x_support: List[str],
                 y_support: List[str]
                ) -> List[str]:
        """Predict a label for every example in ``x`` from the scores of its pairs

        Args:
            simmilarity_scores: one score per pair, or one vector per pair (column 1 is used
                as the score) when ``is_binary`` is ``False``
            x: examples to classify
            x_populated: first elements of the scored pairs
            x_support: second elements of the scored pairs
            y_support: labels of the scored pairs

        Returns:
            for every example the label with the highest pooled score, or ``"oos"`` when
            this score is below ``confidence_threshold``

        Raises:
            ValueError: if ``pooling`` is neither ``'avg'`` nor ``'max'``
        """
        # fail loudly on an unknown strategy instead of reusing a stale or undefined score
        if self.pooling == 'avg':
            pool = np.mean
        elif self.pooling == 'max':
            pool = np.max
        else:
            raise ValueError(f"Unsupported pooling strategy '{self.pooling}', expected 'avg' or 'max'")

        y_pred = []

        simmilarity_scores = np.array(simmilarity_scores)
        x_populated = np.array(x_populated)
        x_support = np.array(x_support)
        y_support = np.array(y_support)
        unique_labels = np.unique(y_support)

        # Transform the probability vector into a similarity score
        if not self.is_binary:
            simmilarity_scores = simmilarity_scores[:, 1]

        for example in x:
            # pairs where exactly one of the two sides is the current example
            example_mask = np.where(np.logical_xor(x_populated == example, x_support == example))
            example_simmilarity_scores = simmilarity_scores[example_mask]
            example_y_support = y_support[example_mask]

            probability_by_label = []
            for label in unique_labels:
                label_mask = np.where(example_y_support == label)
                label_simmilarity_scores = example_simmilarity_scores[label_mask]
                probability_by_label.append(pool(label_simmilarity_scores))

            probability_by_label = np.array(probability_by_label)
            max_probability = max(probability_by_label)
            max_probability_label = unique_labels[np.argmax(probability_by_label)]
            prediction = "oos" if max_probability < self.confidence_threshold else max_probability_label

            y_pred.append(prediction)

        return y_pred


================================================
FILE: deeppavlov/models/classifiers/proba2labels.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger

import numpy as np

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

log = getLogger(__name__)


@register('proba2labels')
class Proba2Labels(Component):
    """
    Class implements probability to labels processing using the following ways: \
     choosing one or top_n indices with maximal probability or choosing any number of indices \
      which probabilities to belong with are higher than given confident threshold

    Args:
        max_proba: whether to choose label with maximal probability
        confidence_threshold: boundary probability value for sample to belong with the class (best use for multi-label)
        top_n: how many top labels with the highest probabilities to return
        is_binary: whether every sample is a single value compared with ``confidence_threshold``
            (giving 0 or 1) instead of a vector of probabilities

    Attributes:
        max_proba: whether to choose label with maximal probability
        confidence_threshold: boundary probability value for sample to belong with the class (best use for multi-label)
        top_n: how many top labels with the highest probabilities to return
        is_binary: whether every sample is a single value instead of a vector of probabilities
    """

    def __init__(self,
                 max_proba: bool = None,
                 confidence_threshold: float = None,
                 top_n: int = None,
                 is_binary: bool = False,
                 **kwargs) -> None:
        """ Initialize class with given parameters"""

        self.max_proba = max_proba
        self.confidence_threshold = confidence_threshold
        self.top_n = top_n
        self.is_binary = is_binary

    def __call__(self,
                 *args,
                 **kwargs):
        """
        Process probabilities to labels
        Args:
            Every argument is a list of vectors with probability distribution
        Returns:
            list of labels (only label classification) or list of lists of labels (multi-label classification),
            or list of the following lists (in multitask setting) for every argument
        Raises:
            ConfigError: if none of `confidence_threshold`, `max_proba` and `top_n` is set
        """
        answer = []
        log.debug(f'input {args}')
        for data in args:
            if all(k is None for k in data):
                answer.append([])
            # compare with None: a threshold of 0.0 is a valid setting and must not be treated as unset
            elif self.confidence_threshold is not None:
                if self.is_binary:
                    answer.append([int(el > self.confidence_threshold) for el in data])
                else:
                    answer.append([list(np.where(np.array(d) > self.confidence_threshold)[0]) for d in data])
            elif self.max_proba:
                answer.append([np.argmax(d) for d in data])
            elif self.top_n:
                # indices of the top_n highest probabilities, the highest first
                answer.append([np.argsort(d)[::-1][:self.top_n] for d in data])
            else:
                raise ConfigError("Proba2Labels requires one of three arguments: bool `max_proba` or "
                                  "float `confidence_threshold` for multi-label classification or "
                                  "integer `top_n` for choosing several labels with the highest probabilities")
        if len(args) == 1:  # only one argument
            answer = answer[0]
        log.debug(f'output {answer}')
        return answer


================================================
FILE: deeppavlov/models/classifiers/re_bert.py
================================================
import logging
from pathlib import Path
from typing import Tuple, Union, Any, List

import torch
from torch import Tensor
import torch.nn as nn
from opt_einsum import contract
from transformers import AutoConfig, BertModel, BertTokenizer

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.models.relation_extraction.losses import ATLoss

log = logging.getLogger(__name__)


class BertWithAdaThresholdLocContextPooling(nn.Module):
    """BERT encoder with entity-pair heads for relation classification trained with ``ATLoss``

    For every sample the embeddings of the first two entities and an attention-weighted context
    vector are combined with the entities' NER tag vectors, passed through two linear extractors
    and a block-wise bilinear layer that produces ``n_classes`` logits.

    Args:
        n_classes: number of output classes
        pretrained_bert: name or path of the pretrained BERT model
        bert_tokenizer_config_file: path to a vocabulary file for the tokenizer; when it is not given
            or is not a file, the tokenizer is loaded from ``pretrained_bert``
        bert_config_file: path to a BERT json config, used when ``pretrained_bert`` is not given
        emb_size: output size of the head and tail extractors
        block_size: block size of the block-wise bilinear layer
        num_ner_tags: length of the NER tag vector concatenated to every entity representation
        threshold: threshold passed to ``ATLoss.get_label`` when no labels are given
        device: ``"gpu"`` to use CUDA when it is available, anything else for CPU
    """

    def __init__(
            self,
            n_classes: int = 97,
            pretrained_bert: str = None,
            bert_tokenizer_config_file: str = None,
            bert_config_file: str = None,
            emb_size: int = 768,
            block_size: int = 8,       # 64
            num_ner_tags: int = 6,        # number of ner tags
            threshold: float = None,
            device: str = "gpu"
    ):
        super().__init__()
        self.n_classes = n_classes
        self.pretrained_bert = pretrained_bert
        self.bert_config_file = bert_config_file
        self.num_ner_tags = num_ner_tags
        self.emb_size = emb_size
        self.block_size = block_size
        self.threshold = threshold

        self.loss_fnt = ATLoss()
        self.device = torch.device("cuda" if torch.cuda.is_available() and device == "gpu" else "cpu")

        # initialize parameters that would be filled later
        self.model, self.config, self.bert_config = None, None, None
        self.load()

        # initialize tokenizer to call resize_token_embeddings function for model with increased tokenizer size (due to
        # the additional <ENT> token) and get CLS and SEP token ids
        if bert_tokenizer_config_file and Path(bert_tokenizer_config_file).is_file():
            vocab_file = str(expand_path(bert_tokenizer_config_file))
            tokenizer = BertTokenizer(vocab_file=vocab_file)
        else:
            tokenizer = BertTokenizer.from_pretrained(pretrained_bert)
        # keep the tokenizer available on the module regardless of how it was created
        self.tokenizer = tokenizer
        self.model.resize_token_embeddings(len(tokenizer) + 1)
        self.cls_token_id = tokenizer.cls_token_id
        self.sep_token_id = tokenizer.sep_token_id

        self.hidden_size = self.config.hidden_size
        self.head_extractor = nn.Linear(2 * self.hidden_size + self.num_ner_tags, self.emb_size)
        self.tail_extractor = nn.Linear(2 * self.hidden_size + self.num_ner_tags, self.emb_size)
        self.bilinear = nn.Linear(self.emb_size * self.block_size, self.n_classes)

    def forward(
            self,
            input_ids: Tensor,
            attention_mask: Tensor,
            entity_pos: List,
            ner_tags: List,
            labels: List = None
    ) -> Union[Tuple[Any, Tensor], Tuple[Tensor]]:
        """Compute predictions, logits and (when labels are given) the loss for a batch

        Args:
            input_ids: token ids of the batch
            attention_mask: attention mask of the batch
            entity_pos: for every sample, a list of entities, each a list of (start, end) mention positions
            ner_tags: for every sample, the NER tag vectors of the first and the second entity
            labels: gold labels; when given, the loss is computed and prepended to the output

        Returns:
            ``(predicted labels, logits)`` or, when labels are given, ``(loss, predicted labels, logits)``
        """
        # for training: no set threshold but adaptive one;
        # for development and test: threshold set in config
        curr_threshold = None if labels is not None else self.threshold

        output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = output[0]  # Tensor (batch_size x input_length x 768)
        attention = output[-1][-1]  # Tensor (batch_size x 12 x input_length x input_length)

        hs, rs, ts = self.get_hrt(sequence_output, attention, entity_pos)       # Tensors (batch_size x 768)

        # get ner tags of entities
        hs_ner_tags, ts_ner_tags = torch.Tensor([list(ele) for ele in list(zip(*ner_tags))]).to(self.device)
        hs_inp = torch.cat([hs, rs, hs_ner_tags], dim=1)
        ts_inp = torch.cat([ts, rs, ts_ner_tags], dim=1)

        hs = torch.tanh(self.head_extractor(hs_inp))
        ts = torch.tanh(self.tail_extractor(ts_inp))
        # block-wise bilinear product of the head and the tail representations
        b1 = hs.view(-1, self.emb_size // self.block_size, self.block_size)
        b2 = ts.view(-1, self.emb_size // self.block_size, self.block_size)
        bl = (b1.unsqueeze(3) * b2.unsqueeze(2)).view(-1, self.emb_size * self.block_size)
        logits = self.bilinear(bl)

        output = (self.loss_fnt.get_label(logits, num_labels=self.n_classes, threshold=curr_threshold), logits)
        if labels is not None:
            labels_tensors = [torch.tensor(label) for label in labels]
            labels_tensors = torch.stack(labels_tensors).to(logits)
            loss = self.loss_fnt(logits.float(), labels_tensors.float())
            output = (loss.to(sequence_output),) + output
        return output

    def get_hrt(self, sequence_output: Tensor, attention: Tensor, entity_pos: List) -> Tuple[Tensor, Tensor, Tensor]:
        """Build head, context and tail representations of the first two entities of every sample

        Args:
            sequence_output: token embeddings returned by the encoder
            attention: attention weights of the last encoder layer
            entity_pos: for every sample, a list of entities, each a list of (start, end) mention positions

        Returns:
            head embeddings, context embeddings and tail embeddings, concatenated over the batch
        """
        _, h, _, max_sequence_length = attention.size()
        # indices selecting the first (head) and the second (tail) entity of a sample
        head_idx = torch.tensor([0]).to(self.device)
        tail_idx = torch.tensor([1]).to(self.device)

        hss, tss, rss = [], [], []
        for i, sample_entities in enumerate(entity_pos):            # for each training sample (= doc)
            entity_embs, entity_atts = [], []
            for e in sample_entities:             # for each entity (= list of entity mentions)
                if len(e) == 0:
                    continue
                if len(e) > 1:
                    e_emb, e_att = [], []
                    for start, _ in e:        # only the start position of each mention is used
                        # skip the entity mention if it is truncated due to limited max seq length.
                        if start + 1 < max_sequence_length:
                            e_emb.append(sequence_output[i, start + 1])
                            e_att.append(attention[i, :, start + 1])
                    if len(e_emb) > 0:
                        # pool the mentions: logsumexp over embeddings, mean over attentions
                        e_emb = torch.logsumexp(torch.stack(e_emb, dim=0), dim=0)
                        e_att = torch.stack(e_att, dim=0).mean(0)
                    else:
                        e_emb = torch.zeros(self.hidden_size).to(sequence_output)
                        e_att = torch.zeros(h, max_sequence_length).to(attention)
                else:
                    start, _ = e[0]
                    if start + 1 < max_sequence_length:
                        e_emb = sequence_output[i, start + 1]
                        e_att = attention[i, :, start + 1]
                    else:
                        e_emb = torch.zeros(self.hidden_size).to(sequence_output)
                        e_att = torch.zeros(h, max_sequence_length).to(attention)
                entity_embs.append(e_emb)           # get an embedding of an entity
                entity_atts.append(e_att)       # get attention of an entity

            entity_embs = torch.stack(entity_embs, dim=0)  # [n_e, d]           # entity embeddings for each document
            entity_atts = torch.stack(entity_atts, dim=0)  # [n_e, h, seq_len]

            hs = torch.index_select(entity_embs, 0, head_idx)  # embeddings of the first entity
            ts = torch.index_select(entity_embs, 0, tail_idx)  # embeddings of the second entity

            h_att = torch.index_select(entity_atts, 0, head_idx)
            t_att = torch.index_select(entity_atts, 0, tail_idx)
            # attention shared by both entities, averaged over heads and normalised over tokens
            ht_att = (h_att * t_att).mean(1)
            ht_att = ht_att / (ht_att.sum(1, keepdim=True) + 1e-5)
            rs = contract("ld,rl->rd", sequence_output[i], ht_att)  # ht_i.shape[0] x sequence_output.shape[2]
            hss.append(hs)
            tss.append(ts)
            rss.append(rs)

        hss = torch.cat(hss, dim=0)
        tss = torch.cat(tss, dim=0)
        rss = torch.cat(rss, dim=0)

        return hss, rss, tss

    def load(self) -> None:
        """Create the BERT config and model and move the model to ``self.device``

        The model is loaded from ``pretrained_bert`` when it is given; otherwise a randomly
        initialised model is built from ``bert_config_file``.

        Raises:
            ConfigError: if neither a pretrained model nor an existing config file is given
        """
        if self.pretrained_bert:
            log.debug(f"From pretrained {self.pretrained_bert}.")
            self.config = AutoConfig.from_pretrained(
                self.pretrained_bert, num_labels=self.n_classes, output_attentions=True, output_hidden_states=True
            )
            self.model = BertModel.from_pretrained(self.pretrained_bert, config=self.config)

        elif self.bert_config_file and Path(self.bert_config_file).is_file():
            # local import: BertConfig is not among the module-level imports
            from transformers import BertConfig

            self.config = BertConfig.from_json_file(str(expand_path(self.bert_config_file)))
            # forward() reads the attention weights from the model output,
            # so request the same outputs as in the pretrained branch
            self.config.output_attentions = True
            self.config.output_hidden_states = True
            self.model = BertModel(self.config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        self.model.to(self.device)


================================================
FILE: deeppavlov/models/classifiers/torch_classification_model.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import List, Union, Optional

import numpy as np
import torch

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.torch_model import TorchModel
from .torch_nets import ShallowAndWideCnn

log = logging.getLogger(__name__)


@register('torch_text_classification_model')
class TorchTextClassificationModel(TorchModel):
    """Class implements torch model for classification of texts.
    Input can either be embedded tokenized texts OR indices of words in the vocabulary.
    Number of tokens is not fixed while the samples in batch should be padded to the same (e.g. longest) lengths.

    Args:
        n_classes: number of classes
        kernel_sizes_cnn: list of kernel sizes of convolutions
        filters_cnn: number of filters for convolutions
        dense_size: number of units for dense layer
        dropout_rate: dropout rate, after convolutions and between dense
        embedding_size: size of vector representation of words
        multilabel: is multi-label classification (if so, `sigmoid` activation will be used, otherwise, softmax)
        criterion: criterion name from `torch.nn`
        embedded_tokens: True, if input contains embedded tokenized texts;
                         False, if input contains indices of words in the vocabulary
        vocab_size: vocabulary size in case of `embedded_tokens=False`, and embedding is a layer in the Network
        return_probas: whether to return probabilities or index of classes (only for `multilabel=False`)

    Attributes:
        model: torch model itself
        epochs_done: number of epochs that were done
        criterion: torch criterion instance
    """

    def __init__(self, n_classes: int,
                 kernel_sizes_cnn: List[int],
                 filters_cnn: int,
                 dense_size: int,
                 dropout_rate: float = 0.0,
                 embedding_size: Optional[int] = None,
                 multilabel: bool = False,
                 criterion: str = "CrossEntropyLoss",
                 embedded_tokens: bool = True,
                 vocab_size: Optional[int] = None,
                 return_probas: bool = True,
                 **kwargs):

        if n_classes == 0:
            raise ConfigError("Please, provide vocabulary with considered classes or number of classes.")

        if multilabel and not return_probas:
            raise RuntimeError('Set return_probas to True for multilabel classification!')

        self.multilabel = multilabel
        self.return_probas = return_probas
        model = ShallowAndWideCnn(
            n_classes=n_classes, embedding_size=embedding_size,
            kernel_sizes_cnn=kernel_sizes_cnn, filters_cnn=filters_cnn,
            dense_size=dense_size, dropout_rate=dropout_rate,
            embedded_tokens=embedded_tokens,
            vocab_size=vocab_size
        )
        # the criterion is looked up by name in torch.nn and created with its default arguments
        self.criterion = getattr(torch.nn, criterion)()
        super().__init__(model, **kwargs)

    def __call__(self, texts: List[np.ndarray], *args) -> Union[List[List[float]], List[int]]:
        """Infer on the given data.

        Args:
            texts: list of tokenized text samples
            *args: additional arguments, ignored

        Returns:
            for each sentence:
                vector of probabilities to belong with each class
                or index of the most probable class if `return_probas` is False
        """
        with torch.no_grad():
            features = np.array(texts)
            inputs = torch.from_numpy(features)
            inputs = inputs.to(self.device)
            outputs = self.model(inputs)
            if self.multilabel:
                # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid
                outputs = torch.sigmoid(outputs)
            else:
                outputs = torch.nn.functional.softmax(outputs, dim=-1)

        outputs = outputs.cpu().detach().numpy()
        if self.return_probas:
            return outputs.tolist()
        else:
            return np.argmax(outputs, axis=-1).tolist()

    def train_on_batch(self, texts: List[List[np.ndarray]], labels: list) -> Union[float, List[float]]:
        """Train the model on the given batch.

        Args:
            texts: vectorized texts
            labels: list of labels

        Returns:
            loss value on the given batch
        """
        features, labels = np.array(texts), np.array(labels)

        inputs, labels = torch.from_numpy(features), torch.from_numpy(labels)
        inputs, labels = inputs.to(self.device), labels.to(self.device)
        # zero the parameter gradients
        self.optimizer.zero_grad()

        # forward + backward + optimize
        outputs = self.model(inputs)
        # NOTE(review): labels are flattened and cast to long, which fits class-index criteria such as
        # the default CrossEntropyLoss; confirm that this is intended for multilabel criteria
        labels = labels.view(-1).long()
        loss = self.criterion(outputs, labels)
        self._make_step(loss)
        return loss.item()


================================================
FILE: deeppavlov/models/classifiers/torch_nets.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Union, Optional

import torch
import torch.nn as nn


class ShallowAndWideCnn(nn.Module):
    """Text classifier made of parallel 1d convolution branches followed by two dense layers.

    Every branch is ``Conv1d -> BatchNorm1d -> ReLU -> AdaptiveMaxPool1d(1)``; the pooled branch
    outputs are concatenated and passed through ``dropout -> dense -> ReLU -> dropout -> final_dense``.

    Args:
        n_classes: size of the output layer
        embedding_size: number of input channels of every convolution (token embedding size)
        kernel_sizes_cnn: kernel size of each convolution branch; also used as its padding
        filters_cnn: number of filters per branch, or a single number shared by all branches
        dense_size: size of the hidden dense layer
        dropout_rate: dropout probability applied before each dense layer
        embedded_tokens: whether the input is already embedded (``True``) or consists of token indices
        vocab_size: vocabulary size of the embedding layer, created only when ``embedded_tokens``
            is ``False`` and ``vocab_size`` is truthy
    """

    def __init__(self, n_classes: int, embedding_size: int, kernel_sizes_cnn: List[int],
                 filters_cnn: Union[int, List[int]], dense_size: int, dropout_rate: float = 0.0,
                 embedded_tokens: bool = True, vocab_size: Optional[int] = None, **kwargs):
        super().__init__()
        self.embedded_tokens = embedded_tokens
        self.kernel_sizes_cnn = kernel_sizes_cnn

        if vocab_size and not embedded_tokens:
            self.embedding = nn.Embedding(vocab_size, embedding_size)

        # a single number of filters is shared by every branch
        if isinstance(filters_cnn, int):
            filters_cnn = [filters_cnn] * len(kernel_sizes_cnn)

        # submodule names ("conv_0", "bn_0", ...) are part of the state dict layout, keep them stable
        for branch, kernel_size in enumerate(kernel_sizes_cnn):
            n_filters = filters_cnn[branch]
            suffix = str(branch)
            self.add_module("conv_" + suffix,
                            nn.Conv1d(embedding_size, n_filters, kernel_size, padding=kernel_size))
            self.add_module("bn_" + suffix, nn.BatchNorm1d(n_filters))
            self.add_module("relu_" + suffix, nn.ReLU())
            self.add_module("pool_" + suffix, nn.AdaptiveMaxPool1d(1))

        self.dropout = nn.Dropout(dropout_rate)
        self.dense = nn.Linear(sum(filters_cnn), dense_size)
        self.relu_dense = nn.ReLU()
        self.final_dense = nn.Linear(dense_size, n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute class logits for a batch.

        Args:
            x: ``[batch_size, number of tokens, embedding_size]`` when tokens are already embedded,
                otherwise ``[batch_size, number of tokens]`` token indices (number of tokens is variable)

        Returns:
            tensor of shape ``[batch_size, n_classes]``
        """
        tokens = x if self.embedded_tokens else self.embedding(x)
        # Conv1d expects [batch_size, embedding_size, number of tokens]
        channels_first = tokens.permute(0, 2, 1)

        pooled = []
        for branch in range(len(self.kernel_sizes_cnn)):
            suffix = str(branch)
            feature_map = getattr(self, "conv_" + suffix)(channels_first)
            feature_map = getattr(self, "bn_" + suffix)(feature_map)
            feature_map = getattr(self, "relu_" + suffix)(feature_map)
            # max over the token axis, then drop that axis: [batch_size, filters]
            pooled.append(getattr(self, "pool_" + suffix)(feature_map).squeeze(-1))

        merged = torch.cat(pooled, dim=-1)
        hidden = self.relu_dense(self.dense(self.dropout(merged)))
        return self.final_dense(self.dropout(hidden))


================================================
FILE: deeppavlov/models/classifiers/utils.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from logging import getLogger
from typing import List

import numpy as np

log = getLogger(__name__)


def labels2onehot(labels: [List[str], List[List[str]], np.ndarray], classes: [list, np.ndarray]) -> np.ndarray:
    """
    Encode labels of every sample as a 0/1 vector over the given classes (multi-class, multi-label)

    Args:
        labels: samples, each being either a single class or a list of classes the sample belongs to
        classes: array of classes' names

    Returns:
        2d array with one row per sample; unknown labels leave the row untouched (all zeros for them)
    """
    class_names = np.array(classes)
    n_classes = len(classes)

    rows = []
    for sample in labels:
        row = np.zeros(n_classes)
        if not isinstance(sample, list):
            # single label: mark every position whose class name equals it
            row[np.where(class_names == sample)[0]] = 1
        else:
            for label in sample:
                if label in classes:
                    row[np.where(class_names == label)[0]] = 1
                else:
                    log.warning('Unknown label {} detected. Assigning no class'.format(label))
        rows.append(row)

    return np.asarray(rows)


def proba2labels(proba: [list, np.ndarray], confidence_threshold: float, classes: [list, np.ndarray]) -> List[List]:
    """
    Convert vectors of probabilities to labels using a confidence threshold
    (if the probability of a class is bigger than confidence_threshold, the sample belongs to the class;
    if no probability is bigger than the threshold, the sample belongs to the class with the biggest probability)

    Args:
        proba: list of samples where each sample is a vector (list or array) of class probabilities
        confidence_threshold (float): boundary of probability to belong to a class
        classes: array of classes' names

    Returns:
        list of lists of labels for each sample
    """
    class_names = np.array(classes)

    labels = []
    for sample in proba:
        # plain lists do not support elementwise comparison with a float, so convert first
        sample = np.asarray(sample)
        confident = np.where(sample > confidence_threshold)[0]
        if len(confident) > 0:
            labels.append(class_names[confident].tolist())
        else:
            # nothing passed the threshold: fall back to the single most probable class
            labels.append(class_names[[np.argmax(sample)]].tolist())

    return labels


def proba2onehot(proba: [list, np.ndarray], confidence_threshold: float, classes: [list, np.ndarray]) -> np.ndarray:
    """
    Turn vectors of probabilities into 0/1 vectors over classes using a confidence threshold

    Args:
        proba: samples where each sample is a vector of probabilities to belong with given classes
        confidence_threshold: boundary of probability to belong with a class
        classes: array of classes' names

    Returns:
        2d array with one-hot representation of given samples
    """
    # first pick the labels of every sample, then encode those labels
    predicted_labels = proba2labels(proba, confidence_threshold, classes)
    return labels2onehot(predicted_labels, classes)


================================================
FILE: deeppavlov/models/doc_retrieval/__init__.py
================================================


================================================
FILE: deeppavlov/models/doc_retrieval/bpr.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple

import faiss
import numpy as np
import torch
from tqdm import trange
from transformers import AutoTokenizer, BertModel

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.serializable import Serializable


class FaissBinaryIndex:
    """Search helper that retrieves candidates from a binary faiss index and reranks them with dense queries.

    Args:
        index: faiss index; its ``index`` attribute is used for search and reconstruction and its
            ``id_map`` attribute for translating internal positions to passage ids
            (presumably an ID-map wrapper around a binary index - verify against the index builder)
    """

    def __init__(self, index: faiss.Index):
        self.index = index

    def search(self, query_embs: np.ndarray, k: int, binary_k=1000, rerank=True) -> Tuple[np.ndarray, np.ndarray]:
        """Find ``k`` best passages for every query.

        Args:
            query_embs: dense query embeddings, one row per query
            k: number of results to return for each query
            binary_k: number of candidates retrieved from the binary index before reranking
            rerank: not used by the current implementation, candidates are always reranked

        Returns:
            tuple of two arrays with one row per query: scores and passage ids of the ``k`` best candidates
        """
        faiss.omp_set_num_threads(12)
        num_queries = query_embs.shape[0]
        # binarize queries by sign and pack the bits into bytes, the format the binary index is searched with
        bin_query_embs = np.packbits(np.where(query_embs > 0, 1, 0)).reshape(num_queries, -1)

        raw_index = self.index.index
        _, ids_arr = raw_index.search(bin_query_embs, binary_k)
        # restore the bit vectors of all candidates and arrange them as [query, candidate, dimension]
        psg_embs = np.vstack([np.unpackbits(raw_index.reconstruct(int(id_))) for id_ in ids_arr.reshape(-1)])
        psg_embs = psg_embs.reshape(query_embs.shape[0], binary_k, query_embs.shape[1])
        psg_embs = psg_embs.astype(np.float32)

        # map bits {0, 1} to {-1, +1} and score candidates by inner product with the dense query
        psg_embs = psg_embs * 2 - 1
        scores_arr = np.einsum("ijk,ik->ij", psg_embs, query_embs)
        sorted_indices = np.argsort(-scores_arr, axis=1)

        ids_arr = ids_arr[np.arange(num_queries)[:, None], sorted_indices]
        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the type it used to alias
        ids_arr = np.array([self.index.id_map.at(int(id_)) for id_ in ids_arr.reshape(-1)], dtype=int)
        ids_arr = ids_arr.reshape(num_queries, -1)
        scores_arr = scores_arr[np.arange(num_queries)[:, None], sorted_indices]

        return scores_arr[:, :k], ids_arr[:, :k]


@register('bpr')
class BPR(Component, Serializable):
    """Retrieve passage ids for queries using a BERT query encoder and a binary faiss index.

    Args:
        pretrained_model: name or path given to ``AutoTokenizer.from_pretrained`` and ``BertModel.from_pretrained``
        load_path: directory with the query encoder checkpoint and the index file
        bpr_index: name of the binary faiss index file inside ``load_path``
        query_encoder_file: name of the query encoder checkpoint file inside ``load_path``
        max_query_length: number of tokens every query is padded or truncated to
        top_n: number of passage ids to return for each query
        device: ``"gpu"`` to run on CUDA when it is available, any other value to run on CPU
    """

    def __init__(self, pretrained_model: str,
                 load_path: str,
                 bpr_index: str,
                 query_encoder_file: str,
                 max_query_length: int = 256,
                 top_n: int = 100,
                 device: str = "gpu",
                 *args, **kwargs
                 ):
        super().__init__(save_path=None, load_path=load_path)
        self.device = torch.device("cuda" if torch.cuda.is_available() and device == "gpu" else "cpu")
        self.bpr_index = bpr_index
        self.top_n = top_n
        self.max_query_length = max_query_length
        self.query_encoder_file = query_encoder_file
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
        self.q_encoder = BertModel.from_pretrained(pretrained_model).to(self.device)
        self.load()
        self.index = FaissBinaryIndex(self.base_index)

    def load(self):
        """Load query encoder weights from the checkpoint and read the binary faiss index."""
        checkpoint = torch.load(str(self.load_path / self.query_encoder_file), map_location=self.device)
        # strict=False: keys missing from or unexpected in the checkpoint are ignored
        self.q_encoder.load_state_dict(checkpoint["model_state_dict"], strict=False)
        self.base_index = faiss.read_index_binary(str(self.load_path / self.bpr_index))

    def save(self) -> None:
        """Do nothing: the component has no state to save."""
        pass

    def encode_queries(self, queries, batch_size: int = 256) -> np.ndarray:
        """Embed queries with the query encoder.

        Args:
            queries: list of query strings
            batch_size: number of queries encoded in one forward pass

        Returns:
            array with one row per query: the encoder output at the first token position
        """
        embeddings = []
        with torch.no_grad():
            for start in trange(0, len(queries), batch_size):
                model_inputs = self.tokenizer.batch_encode_plus(
                    queries[start: start + batch_size],
                    return_tensors="pt",
                    max_length=self.max_query_length,
                    padding="max_length",
                    # without explicit truncation ``max_length`` only affects padding, so a longer
                    # query would produce a ragged batch that cannot be converted to a tensor
                    truncation=True,
                )
                model_inputs = {k: v.to(self.device) for k, v in model_inputs.items()}
                sequence_output = self.q_encoder(**model_inputs)[0]
                emb = sequence_output[:, 0, :].contiguous().cpu().numpy()
                embeddings.append(emb)

        return np.vstack(embeddings)

    def __call__(self, queries):
        """Return ids of ``top_n`` passages for every query.

        Args:
            queries: list of query strings; they are lower-cased before encoding

        Returns:
            list of lists of passage ids, one list per query
        """
        queries = [query.lower() for query in queries]
        query_embeddings = self.encode_queries(queries)
        _, ids_batch = self.index.search(query_embeddings, self.top_n)
        ids_batch = ids_batch.tolist()
        return ids_batch


================================================
FILE: deeppavlov/models/doc_retrieval/logit_ranker.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from operator import itemgetter
from typing import List, Union, Tuple, Optional

from deeppavlov.core.common.chainer import Chainer
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.estimator import Component
from deeppavlov.models.doc_retrieval.utils import find_answer_sentence

logger = getLogger(__name__)


@register("logit_ranker")
class LogitRanker(Component):
    """Select best answer using squad model logits. Make several batches for a single batch, send each batch
     to the squad model separately and get a single best answer for each batch.

     Args:
        squad_model: a loaded squad model
        batch_size: batch size to use with squad model
        sort_noans: whether to downgrade noans tokens in the most possible answers
        top_n: number of answers to return
        return_answer_sentence: whether to additionally return the context sentence containing each answer

     Attributes:
        squad_model: a loaded squad model
        batch_size: batch size to use with squad model
        sort_noans: whether to downgrade noans tokens in the most possible answers
        top_n: number of answers to return
        return_answer_sentence: whether to additionally return the context sentence containing each answer

    """

    def __init__(self, squad_model: Union[Chainer, Component], batch_size: int = 50,
                 sort_noans: bool = False, top_n: int = 1, return_answer_sentence: bool = False, **kwargs):
        self.squad_model = squad_model
        self.batch_size = batch_size
        self.sort_noans = sort_noans
        self.top_n = top_n
        self.return_answer_sentence = return_answer_sentence

    def __call__(self, contexts_batch: List[List[str]], questions_batch: List[List[str]],
                 doc_ids_batch: Optional[List[List[str]]] = None) -> \
            Union[
                Tuple[List[str], List[float], List[int], List[str]],
                Tuple[List[List[str]], List[List[float]], List[List[int]], List[List[str]]],
                Tuple[List[str], List[float], List[int]],
                Tuple[List[List[str]], List[List[float]], List[List[int]]]
            ]:

        """
        Sort obtained results from squad reader by logits and get the answer with a maximum logit.

        Args:
            contexts_batch: a batch of contexts which should be treated as a single batch in the outer JSON config
            questions_batch: a batch of questions which should be treated as a single batch in the outer JSON config
            doc_ids_batch (optional): names of the documents from which the contexts_batch was derived
        Returns:
             a batch of best answers, their scores, places in contexts,
             doc_ids for this answers if doc_ids_batch were passed
             and answer sentences if ``return_answer_sentence`` is set;
             with ``top_n == 1`` every element is a single value instead of a list
        """
        if doc_ids_batch is None:
            logger.warning("you didn't pass tfidf_doc_ids as input in logit_ranker config so "
                           "batch_best_answers_doc_ids can't be compute")

        # each result is (answer, place, score, context)
        if self.sort_noans:
            def sort_key(result):
                # non-empty answers first, then by score
                return result[0] != '', result[2]
        else:
            sort_key = itemgetter(2)

        batch_best_answers = []
        batch_best_answers_score = []
        batch_best_answers_place = []
        batch_best_answers_doc_ids = []
        batch_best_answers_sentences = []
        for quest_ind, (contexts, questions) in enumerate(zip(contexts_batch, questions_batch)):
            results = []
            for i in range(0, len(contexts), self.batch_size):
                c_batch = contexts[i: i + self.batch_size]
                q_batch = questions[i: i + self.batch_size]
                results.extend(zip(*self.squad_model(c_batch, q_batch), c_batch))

            # Rank positions instead of the results themselves: the position is also the index of the
            # source document, so equal result tuples can no longer be attributed to the wrong document
            # (``results.index`` always returned the first of the duplicates).
            ranking = sorted(range(len(results)), key=lambda pos: sort_key(results[pos]), reverse=True)
            top_positions = ranking[:self.top_n]
            top_results = [results[pos] for pos in top_positions]

            best_answers = [res[0] for res in top_results]
            best_answers_place = [res[1] for res in top_results]
            best_answers_score = [res[2] for res in top_results]
            batch_best_answers.append(best_answers)
            batch_best_answers_place.append(best_answers_place)
            batch_best_answers_score.append(best_answers_score)

            # sentence splitting is needed only when the sentences are actually returned
            if self.return_answer_sentence:
                batch_best_answers_sentences.append(
                    [find_answer_sentence(res[1], res[3]) for res in top_results])

            if doc_ids_batch is not None:
                batch_best_answers_doc_ids.append([doc_ids_batch[quest_ind][pos] for pos in top_positions])

        if self.top_n == 1:
            batch_best_answers = [x[0] for x in batch_best_answers]
            batch_best_answers_place = [x[0] for x in batch_best_answers_place]
            batch_best_answers_score = [x[0] for x in batch_best_answers_score]
            batch_best_answers_doc_ids = [x[0] for x in batch_best_answers_doc_ids]
            batch_best_answers_sentences = [x[0] for x in batch_best_answers_sentences]

        if doc_ids_batch is None:
            if self.return_answer_sentence:
                return batch_best_answers, batch_best_answers_score, batch_best_answers_place, \
                       batch_best_answers_sentences
            return batch_best_answers, batch_best_answers_score, batch_best_answers_place

        if self.return_answer_sentence:
            return batch_best_answers, batch_best_answers_score, batch_best_answers_place, batch_best_answers_doc_ids, \
                   batch_best_answers_sentences
        return batch_best_answers, batch_best_answers_score, batch_best_answers_place, batch_best_answers_doc_ids


================================================
FILE: deeppavlov/models/doc_retrieval/pop_ranker.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from operator import itemgetter
from typing import List, Any, Tuple

import numpy as np
import joblib

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.file import read_json
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.estimator import Component

logger = getLogger(__name__)


@register('pop_ranker')
class PopRanker(Component):
    """Re-rank documents by combining their tfidf scores with article popularities. The ranker is not
    standalone: it re-orders the output of TF-IDF Ranker.

    A Logistic Regression is applied to 3 features:

    * tfidf score of the article
    * popularity of the article obtained via Wikimedia REST API as a mean number of views for the period since 2017/11/05 to 2018/11/05
    * multiplication of the two features above

    Args:
        pop_dict_path: a path to json file with article title to article popularity map
        load_path: a path to saved logistic regression classifier
        top_n: a number of doc ids to return
        active: whether to return a number specified by :attr:`top_n` (``True``) or all ids
         (``False``)

    Attributes:
        pop_dict: a map of article titles to their popularity
        mean_pop: mean popularity of all popularities in :attr:`pop_dict`, use it when popularity is not found
        clf: a loaded logistic regression classifier
        top_n: a number of doc ids to return
        active: whether to return a number specified by :attr:`top_n` or all ids

    """

    def __init__(self, pop_dict_path: str, load_path: str, top_n: int = 3, active: bool = True,
                 **kwargs) -> None:
        self.top_n = top_n
        self.active = active

        pop_dict_path = expand_path(pop_dict_path)
        logger.debug(f"Reading popularity dictionary from {pop_dict_path}")
        self.pop_dict = read_json(pop_dict_path)
        # fallback popularity for articles missing from the dictionary
        self.mean_pop = np.mean(list(self.pop_dict.values()))

        load_path = expand_path(load_path)
        logger.debug(f"Loading popularity ranker from {load_path}")
        self.clf = joblib.load(load_path)

    def __call__(self, input_doc_ids: List[List[Any]], input_doc_scores: List[List[float]]) -> \
            Tuple[List[List], List[List]]:
        """Score every document with the classifier and order documents of each instance by that score.

         Args:
            input_doc_ids: top input doc ids of tfidf ranker
            input_doc_scores: top input doc scores of tfidf ranker corresponding to doc ids

        Returns:
            top doc ids of pop ranker and their corresponding scores

        """
        batch_ids, batch_scores = [], []
        for doc_ids, tfidf_scores in zip(input_doc_ids, input_doc_scores):
            probas = []
            for doc_id, tfidf_score in zip(doc_ids, tfidf_scores):
                popularity = self.pop_dict.get(doc_id, self.mean_pop)
                feature_row = [tfidf_score, popularity, tfidf_score * popularity]
                # second column of predict_proba is used as the document score
                probas.append(self.clf.predict_proba([feature_row])[0][1])

            # positions of documents ordered by descending probability (ties keep the input order)
            ranking = sorted(range(len(probas)), key=probas.__getitem__, reverse=True)
            ranked_ids = [doc_ids[pos] for pos in ranking]
            ranked_probas = [probas[pos] for pos in ranking]

            if self.active:
                ranked_ids = ranked_ids[:self.top_n]
                ranked_probas = ranked_probas[:self.top_n]

            batch_ids.append(ranked_ids)
            batch_scores.append(ranked_probas)

        return batch_ids, batch_scores


================================================
FILE: deeppavlov/models/doc_retrieval/tfidf_ranker.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import List, Any, Tuple

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.estimator import Component
from deeppavlov.models.vectorizers.hashing_tfidf_vectorizer import HashingTfIdfVectorizer

logger = getLogger(__name__)


@register("tfidf_ranker")
class TfidfRanker(Component):
    """Rank documents according to input strings.

    Args:
        vectorizer: a vectorizer class
        top_n: a number of doc ids to return
        active: whether to return a number specified by :attr:`top_n` (``True``) or all ids
         (``False``)

    Attributes:
        top_n: a number of doc ids to return
        vectorizer: an instance of vectorizer class
        active: whether to return a number specified by :attr:`top_n` or all ids

    """

    def __init__(self, vectorizer: HashingTfIdfVectorizer, top_n=5, active: bool = True, **kwargs):

        self.top_n = top_n
        self.vectorizer = vectorizer
        self.active = active

    def __call__(self, questions: List[str]) -> Tuple[List[List[Any]], List[np.ndarray]]:
        """Rank documents and return top n document titles with scores.

        Args:
            questions: list of queries used in ranking

        Returns:
            a tuple of two lists with one element per question: selected doc ids
            and an array of their scores, both ordered by descending score
        """

        batch_doc_ids, batch_docs_scores = [], []
        q_tfidfs = self.vectorizer(questions)

        for q_tfidf in q_tfidfs:
            scores = q_tfidf * self.vectorizer.tfidf_matrix
            # Flatten to 1d with ravel: squeeze would turn the scores of a single-document index
            # into a 0-d array and break ``len(scores)`` below.
            scores = np.ravel(
                scores.toarray() + 0.0001)  # add a small value to eliminate zero scores

            if self.active:
                thresh = self.top_n
            else:
                thresh = len(self.vectorizer.doc_index)

            # argpartition requires kth < len(scores), so clamp it when all documents are requested
            if thresh >= len(scores):
                o = np.argpartition(-scores, len(scores) - 1)[0:thresh]
            else:
                o = np.argpartition(-scores, thresh)[0:thresh]
            # argpartition does not order the selected part, sort it by descending score
            o_sort = o[np.argsort(-scores[o])]

            doc_scores = scores[o_sort]
            # fall back to the numeric index when it has no entry in index2doc
            doc_ids = [self.vectorizer.index2doc.get(i, int(i)) for i in o_sort]
            batch_doc_ids.append(doc_ids)
            batch_docs_scores.append(doc_scores)

        return batch_doc_ids, batch_docs_scores


================================================
FILE: deeppavlov/models/doc_retrieval/utils.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, List

import nltk

from deeppavlov.core.common.registry import register


@register('concat_lists')
def concat_lists(list_a: List[List[Any]], list_b: List[List[Any]]):
    """Concatenate the lists of two batches pairwise.

    Args:
        list_a: batch of lists
        list_b: batch of lists appended to the corresponding lists of ``list_a``

    Returns:
        list with ``element_a + element_b`` for every pair; its length is that of the shorter batch
    """
    return [element_a + element_b for element_a, element_b in zip(list_a, list_b)]


def find_answer_sentence(answer_pos: int, context: str) -> str:
    """Return the sentence of the context that contains the given character position.

    Args:
        answer_pos: character offset of the answer start in ``context``
        context: text in which the answer was found

    Returns:
        the sentence covering ``answer_pos`` or an empty string if no sentence covers it
        (e.g. the position falls between sentences or outside of the context)
    """
    cursor = 0
    for sentence in nltk.sent_tokenize(context):
        # Locate the sentence in the context instead of assuming that sentences are separated by
        # exactly one character; that assumption shifts offsets after multiple spaces or newlines.
        start = context.find(sentence, cursor)
        if start < 0:
            # sentence text is not a literal substring of the context: continue from the previous end
            start = cursor
        end = start + len(sentence)
        # the lower bound is inclusive so that an answer starting a sentence is matched too
        if start <= answer_pos < end:
            return sentence
        cursor = end

    return ""


================================================
FILE: deeppavlov/models/embedders/__init__.py
================================================


================================================
FILE: deeppavlov/models/embedders/abstract_embedder.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABCMeta, abstractmethod
from logging import getLogger
from pathlib import Path
from typing import List, Union, Iterator

import numpy as np

from deeppavlov.core.data.utils import zero_pad
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.serializable import Serializable

log = getLogger(__name__)


class Embedder(Component, Serializable, metaclass=ABCMeta):
    """
    Abstract base class of token embedders backed by a pre-trained model.
    Subclasses implement loading of the model, iteration over its vocabulary and embedding of a single word.

    Args:
        load_path: path where to load pre-trained embedding model from
        pad_zero: whether to pad samples or not
        mean: whether to return one mean embedding vector per sample

    Attributes:
        model: model instance
        tok2emb: dictionary with already embedded tokens
        dim: dimension of embeddings
        pad_zero: whether to pad sequence of tokens with zeros or not
        mean: whether to return one mean embedding vector per sample
        load_path: path with pre-trained embedding model
    """

    def __init__(self, load_path: Union[str, Path], pad_zero: bool = False, mean: bool = False, **kwargs) -> None:
        """
        Store the settings and load the model with ``self.load()``
        """
        super().__init__(save_path=None, load_path=load_path)
        self.pad_zero = pad_zero
        self.mean = mean
        # filled lazily while tokens are being embedded
        self.tok2emb = {}
        # ``model`` and ``dim`` are expected to be set by ``load()`` of a subclass
        self.model = None
        self.dim = None
        self.load()

    def save(self) -> None:
        """
        Saving is not supported: the loaded model is not trained during usage
        """
        raise NotImplementedError

    def __call__(self, batch: List[List[str]], mean: bool = None) -> List[Union[list, np.ndarray]]:
        """
        Embed every sample of the batch

        Args:
            batch: list of tokenized text samples
            mean: whether to return mean embedding of tokens per sample

        Returns:
            embedded batch, passed through ``zero_pad`` when ``pad_zero`` is set
        """
        embedded = []
        for sample in batch:
            embedded.append(self._encode(sample, mean))
        return zero_pad(embedded) if self.pad_zero else embedded

    @abstractmethod
    def __iter__(self) -> Iterator[str]:
        """
        Iterate over all words from the model vocabulary

        Returns:
            iterator
        """

    @abstractmethod
    def _get_word_vector(self, w: str) -> np.ndarray:
        """
        Embed a word using ``self.model``

        Args:
            w: a word

        Returns:
            embedding vector
        """

    def _encode(self, tokens: List[str], mean: bool) -> Union[List[np.ndarray], np.ndarray]:
        """
        Embed one text sample

        Args:
            tokens: tokenized text sample
            mean: whether to return mean embedding of tokens per sample;
                ``None`` means the value of ``self.mean``

        Returns:
            list of embedded tokens or array of mean values
        """
        vectors = []
        for token in tokens:
            if token in self.tok2emb:
                vector = self.tok2emb[token]
            else:
                try:
                    vector = self._get_word_vector(token)
                except KeyError:
                    # the model does not know the token: use a zero vector
                    vector = np.zeros(self.dim, dtype=np.float32)
                self.tok2emb[token] = vector
            vectors.append(vector)

        use_mean = self.mean if mean is None else mean
        if not use_mean:
            return vectors

        # all-zero vectors are excluded from the average
        non_zero = [vector for vector in vectors if np.any(vector)]
        if not non_zero:
            return np.zeros(self.dim, dtype=np.float32)
        return np.mean(non_zero, axis=0)


================================================
FILE: deeppavlov/models/embedders/fasttext_embedder.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import Iterator

import fasttext

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.models.embedders.abstract_embedder import Embedder

log = getLogger(__name__)


@register('fasttext')
class FasttextEmbedder(Embedder):
    """
    Embedder backed by a pre-trained fastText binary model

    Args:
        load_path: path where to load pre-trained embedding model from
        pad_zero: whether to pad samples or not

    Attributes:
        model: fastText model instance
        tok2emb: dictionary with already embedded tokens
        dim: dimension of embeddings
        pad_zero: whether to pad sequence of tokens with zeros or not
        load_path: path with pre-trained fastText binary model
    """

    def _get_word_vector(self, w: str) -> np.ndarray:
        """
        Look up the vector of a single word in the loaded fastText model

        Args:
            w: word to embed

        Returns:
            embedding vector
        """
        vector = self.model.get_word_vector(w)
        return vector

    def load(self) -> None:
        """
        Read the fastText binary model found at ``self.load_path`` and remember its dimension
        """
        log.debug(f"[loading fastText embeddings from `{self.load_path}`]")
        model_file = str(self.load_path)
        self.model = fasttext.load_model(model_file)
        self.dim = self.model.get_dimension()

    def __iter__(self) -> Iterator[str]:
        """
        Walk through every word of the fastText model vocabulary

        Returns:
            iterator
        """
        for word in self.model.get_words():
            yield word


================================================
FILE: deeppavlov/models/embedders/tfidf_weighted_embedder.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import List, Union, Optional, Tuple

import numpy as np

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import zero_pad
from deeppavlov.core.models.component import Component

log = getLogger(__name__)


@register('tfidf_weighted')
class TfidfWeightedEmbedder(Component):
    """
    The class implements the functionality of embedding the sentence \
        as a weighted average by special coefficients of tokens embeddings. \
        Coefficients can be taken from the given TFIDF-vectorizer in ``vectorizer`` or \
        calculated as TFIDF from counter vocabulary given in ``counter_vocab_path``.
        Also one can give ``tags_vocab_path`` to the vocabulary with weights of tags. \
        In this case, batch with tags should be given as a second input in ``__call__`` method.

    Args:
        embedder: embedder instance
        tokenizer: tokenizer instance, should be able to detokenize sentence
        pad_zero: whether to pad samples or not
        mean: whether to return mean token embedding
        tags_vocab_path: optional path to vocabulary with tags weights
        vectorizer: vectorizer instance should be trained with ``analyzer="word"``
        counter_vocab_path: path to counter vocabulary
        idf_base_count: minimal count value (tokens that occurred fewer times are treated as having this count)
        log_base: logarithm base for TFIDF-coefficient calculation from counter vocabulary
        min_idf_weight: minimal idf weight

    Attributes:
        embedder: embedder instance
        tokenizer: tokenizer instance, should be able to detokenize sentence
        dim: dimension of embeddings
        pad_zero: whether to pad samples or not
        mean: whether to return mean token embedding
        tags_vocab: vocabulary with weights for tags
        vectorizer: vectorizer instance
        counter_vocab_path: path to counter vocabulary
        counter_vocab: counter vocabulary
        idf_base_count: minimal count value (tokens that occurred fewer times are treated as having this count)
        log_base: logarithm base for TFIDF-coefficient calculation from counter vocabulary
        min_idf_weight: minimal idf weight

    Examples:
        >>> from deeppavlov.models.embedders.tfidf_weighted_embedder import TfidfWeightedEmbedder
        >>> from deeppavlov.models.embedders.fasttext_embedder import FasttextEmbedder
        >>> fasttext_embedder = FasttextEmbedder('/data/embeddings/wiki.ru.bin')
        >>> fastTextTfidf = TfidfWeightedEmbedder(embedder=fasttext_embedder,
                counter_vocab_path='/data/vocabs/counts_wiki_lenta.txt')
        >>> fastTextTfidf([['большой', 'и', 'розовый', 'бегемот']])
        [array([ 1.99135890e-01, -7.14746421e-02,  8.01428872e-02, -5.32840924e-02,
                 5.05212297e-02,  2.76053832e-01, -2.53270134e-01, -9.34443950e-02,
                 ...
                 1.18385439e-02,  1.05643446e-01, -1.21904516e-03,  7.70555378e-02])]
    """

    def __init__(self,
                 embedder: Component,
                 tokenizer: Component = None,
                 pad_zero: bool = False,
                 mean: bool = False,
                 tags_vocab_path: str = None,
                 vectorizer: Component = None,
                 counter_vocab_path: str = None,
                 idf_base_count: int = 100,
                 log_base: int = 10,
                 min_idf_weight=0.0, **kwargs) -> None:
        self.embedder = embedder
        self.dim = self.embedder.dim
        self.mean = mean
        self.pad_zero = pad_zero
        self.tokenizer = tokenizer or self.space_detokenizer
        self.vectorizer = vectorizer

        if vectorizer and counter_vocab_path:
            raise ConfigError("TfidfWeightedEmbedder got vectorizer and counter_vocab_path simultaneously."
                              " Remove one of them, please")
        elif vectorizer:
            self.vectorizer = vectorizer
            self.vocabulary = np.array(self.vectorizer.model.get_feature_names())
        elif counter_vocab_path:
            self.counter_vocab_path = expand_path(counter_vocab_path)
            self.counter_vocab, self.min_count = self.load_counter_vocab(self.counter_vocab_path)
            self.idf_base_count = idf_base_count
            self.log_base = log_base
            self.min_idf_weight = min_idf_weight
        else:
            raise ConfigError("TfidfWeightedEmbedder did not get vectorizer or counter_vocab_path."
                              " Set one of them, please")

        if tags_vocab_path:
            self.tags_vocab = self.load_tags_vocab(expand_path(tags_vocab_path))
        else:
            self.tags_vocab = None

    @staticmethod
    def load_tags_vocab(load_path: str) -> dict:
        """
        Load tag vocabulary from the given path, each key of the vocabulary is a tag, \
            and the corresponding value of the item is a coefficient of words with such tags to be multiplied for.

        Args:
            load_path: path to the vocabulary to be load from

        Returns:
            vocabulary
        """
        tags_vocab = dict()
        with open(load_path, 'r') as f:
            lines = f.readlines()
            f.close()

        for line in lines:
            key, val = line[:-1].split(' ')  # "\t"
            tags_vocab[key] = val

        return tags_vocab

    @staticmethod
    def load_counter_vocab(load_path: str) -> Tuple[dict, int]:
        """
        Load counter vocabulary from the given path

        Args:
            load_path: path to the vocabulary to be load from

        Returns:
            vocabulary
        """
        counter_vocab = dict()
        with open(load_path, 'r') as f:
            lines = f.readlines()
            f.close()

        min_val = np.inf
        for line in lines:
            key, val = line[:-1].split('\t')
            val = int(val)
            counter_vocab[key] = val
            if val < min_val:
                min_val = val

        return counter_vocab, min_val

    @staticmethod
    def space_detokenizer(batch: List[List[str]]) -> List[str]:
        """
        Detokenizer by default. Linking tokens by space symbol

        Args:
            batch: batch of tokenized texts

        Returns:
            batch of detokenized texts
        """
        return [" ".join(tokens) for tokens in batch]

    def __call__(self, batch: List[List[str]], tags_batch: Optional[List[List[str]]] = None, mean: bool = None,
                 *args, **kwargs) -> List[Union[list, np.ndarray]]:
        """
        Infer on the given data

        Args:
            batch: tokenized text samples
            tags_batch: optional batch of corresponding tags
            mean: whether to return mean token embedding (does not depend on self.mean)
            *args: additional arguments
            **kwargs: additional arguments

        Returns:

        """

        if self.tags_vocab:
            if tags_batch is None:
                raise ConfigError("TfidfWeightedEmbedder got 'tags_vocab_path' but __call__ did not get tags_batch.")
            batch = [self._tags_encode(sample, tags_sample, mean=mean) for sample, tags_sample in
                     zip(batch, tags_batch)]
        else:
            if tags_batch:
                raise ConfigError("TfidfWeightedEmbedder got tags batch, but 'tags_vocab_path' is empty.")
            batch = [self._encode(sample, mean=mean) for sample in batch]

        if self.pad_zero:
            batch = zero_pad(batch)

        return batch

    def _encode(self, tokens: List[str], mean: bool) -> Union[List[np.ndarray], np.ndarray]:
        """
        Embed one text sample

        Args:
            tokens: tokenized text sample
            mean: whether to return mean token embedding (does not depend on self.mean)

        Returns:
            list of embedded tokens or array of mean values
        """
        if self.vectorizer:
            detokenized_sample = self.tokenizer([tokens])[0]  # str
            vectorized_sample = self.vectorizer([detokenized_sample])  # (voc_size,)

            weights = np.array([vectorized_sample[0, np.where(self.vocabulary == token)[0][0]]
                                if len(np.where(self.vocabulary == token)[0]) else 0.
                                for token in tokens])
        else:
            weights = np.array([self.get_weight(max(self.counter_vocab.get(token, 0), self.idf_base_count))
                                for token in tokens])

        if sum(weights) == 0:
            weights = np.ones(len(tokens))

        embedded_tokens = np.array(self.embedder([tokens]))[0, :, :]

        if mean is None:
            mean = self.mean

        if mean:
            embedded_tokens = np.average(embedded_tokens, weights=weights, axis=0)
        else:
            embedded_tokens = np.array([weights[i] * embedded_tokens[i] for i in range(len(tokens))])

        return embedded_tokens

    def get_weight(self, count: int) -> float:
        """
        Calculate the weight corresponding to the given count

        Args:
            count: the number of occurences of particular token

        Returns:
            weight
        """
        log_count = np.log(count) / np.log(self.log_base)
        log_base_count = np.log(self.idf_base_count) / np.log(self.log_base)
        weight = max(1.0 / (1.0 + log_count - log_base_count), self.min_idf_weight)
        return weight

    def _tags_encode(self, tokens: List[str], tags: List[str], mean: bool) -> Union[List[np.ndarray], np.ndarray]:
        """
        Embed one text sample

        Args:
            tokens: tokenized text sample
            tags: tokenized tags sample
            mean: whether to return mean token embedding (does not depend on self.mean)

        Returns:
            list of embedded tokens or array of mean values
        """

        embedded_tokens = np.array(self.embedder([tokens]))[0, :, :]

        tags_weights = np.array([self.tags_vocab.get(tag, 1.0) for tag in tags])

        detokenized_sample = self.tokenizer([tokens])[0]  # str
        vectorized_sample = self.vectorizer([detokenized_sample])  # (voc_size,)

        if self.vectorizer:
            weights = np.array([vectorized_sample[0, np.where(self.vocabulary == token)[0][0]]
                                if len(np.where(self.vocabulary == token)[0]) else 0.
                                for token in tokens])
        else:
            weights = np.array([self.get_weight(max(self.counter_vocab.get(token, 0), self.idf_base_count))
                                for token in tokens])

        weights = np.multiply(weights, tags_weights)
        if sum(weights) == 0:
            weights = np.ones(len(tokens))

        if mean is None:
            mean = self.mean

        if mean:
            embedded_tokens = np.average(embedded_tokens, weights=weights, axis=0)
        else:
            embedded_tokens = np.array([weights[i] * embedded_tokens[i] for i in range(len(tokens))])

        return embedded_tokens


================================================
FILE: deeppavlov/models/embedders/transformers_embedder.py
================================================
# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Union, Tuple, Collection

import torch
import transformers

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.serializable import Serializable


@register('transformers_bert_embedder')
class TransformersBertEmbedder(Serializable):
    """Transformers-based BERT model for embeddings tokens, subtokens and sentences

    Args:
        load_path: path to a pretrained BERT pytorch checkpoint
        bert_config_path: path to a BERT configuration file
        truncate: whether to remove zero-paddings from returned data

    """
    model: transformers.BertModel
    dim: int

    def __init__(self, load_path: Union[str, Path], bert_config_path: Union[str, Path] = None,
                 truncate: bool = False, **kwargs):
        super().__init__(save_path=None, load_path=load_path, **kwargs)
        if bert_config_path is not None:
            bert_config_path = expand_path(bert_config_path)
        self.config = bert_config_path
        self.truncate = truncate
        # Run on GPU whenever one is available.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.load()

    def save(self, *args, **kwargs):
        """Saving is not supported by this component."""
        raise NotImplementedError

    def load(self):
        """Load the pretrained model in evaluation mode onto ``self.device`` and record its hidden size."""
        self.model = transformers.BertModel.from_pretrained(self.load_path, config=self.config).eval().to(self.device)
        self.dim = self.model.config.hidden_size

    def __call__(self,
                 subtoken_ids_batch: Collection[Collection[int]],
                 startofwords_batch: Collection[Collection[int]],
                 attention_batch: Collection[Collection[int]]) -> Tuple[Collection[Collection[Collection[float]]],
                                                                        Collection[Collection[Collection[float]]],
                                                                        Collection[Collection[float]],
                                                                        Collection[Collection[float]],
                                                                        Collection[Collection[float]]]:
        """Predict embeddings values for a given batch

        Args:
            subtoken_ids_batch: padded indexes for every subtoken
            startofwords_batch: a mask matrix with ``1`` for every first subtoken init in a token and ``0``
                for every other subtoken
            attention_batch: a mask matrix with ``1`` for every significant subtoken and ``0`` for paddings

        Returns:
            a tuple ``(word_emb, subword_emb, max_emb, mean_emb, pooler_output)``: embeddings of the first
            subtoken of every word, embeddings of every subtoken (zeroed at paddings), max- and mean-pooled
            embeddings over significant subtokens, and the model's pooler output. With ``truncate`` enabled
            ``word_emb`` and ``subword_emb`` are lists with the padding rows cut off per sample.
        """
        ids_tensor = torch.tensor(subtoken_ids_batch, device=self.device, dtype=torch.long)
        startofwords_tensor = torch.tensor(startofwords_batch, device=self.device).bool()
        attention_tensor = torch.tensor(attention_batch, device=self.device)
        with torch.no_grad():
            output = self.model(ids_tensor, attention_tensor)
            last_hidden = output.last_hidden_state
            pooler_output = output.pooler_output
            # Number of significant subtokens per sample, taken before the mask gets its extra axis.
            subtokens_lengths = attention_tensor.sum(dim=1)
            attention_tensor = attention_tensor.unsqueeze(-1)
            # Push padded positions far down so that they can never win the max.
            max_emb = torch.max(last_hidden - 1e9 * (1 - attention_tensor), dim=1)[0]
            subword_emb = last_hidden * attention_tensor
            mean_emb = torch.sum(subword_emb, dim=1) / torch.sum(attention_tensor, dim=1)

            # Gather the embedding of every word-initial subtoken into a dense
            # (batch, max number of words, hidden) tensor, left-aligned per sample.
            tokens_lengths = startofwords_tensor.sum(dim=1)
            word_emb = torch.zeros((subword_emb.shape[0], tokens_lengths.max(), subword_emb.shape[2]),
                                   device=self.device, dtype=subword_emb.dtype)
            target_indexes = (torch.arange(word_emb.shape[1], device=self.device).expand(word_emb.shape[:-1]) <
                              tokens_lengths.unsqueeze(-1))
            word_emb[target_indexes] = subword_emb[startofwords_tensor]

        subword_emb = subword_emb.cpu().numpy()
        word_emb = word_emb.cpu().numpy()
        pooler_output = pooler_output.cpu().numpy()
        max_emb = max_emb.cpu().numpy()
        mean_emb = mean_emb.cpu().numpy()
        if self.truncate:
            # Lengths come from the tensors rather than from ``mask.sum()`` on the raw inputs,
            # so plain nested lists work as well as numpy arrays.
            subword_emb = [item[:int(length)] for item, length in zip(subword_emb, subtokens_lengths.tolist())]
            word_emb = [item[:int(length)] for item, length in zip(word_emb, tokens_lengths.tolist())]
        return word_emb, subword_emb, max_emb, mean_emb, pooler_output


================================================
FILE: deeppavlov/models/entity_extraction/__init__.py
================================================


================================================
FILE: deeppavlov/models/entity_extraction/entity_detection_parser.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from collections import defaultdict
from logging import getLogger
from string import punctuation
from typing import List, Tuple, Union, Any

import numpy as np
from nltk.corpus import stopwords

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

log = getLogger(__name__)
# '+' is removed from the punctuation set, so the boundary-stripping checks in
# EntityDetectionParser.add_entity do not treat a '+' token as punctuation.
punctuation = punctuation.replace('+', '')


@register('question_sign_checker')
class QuestionSignChecker:
    """Normalizes questions: guarantees a trailing question mark and tidies quotes and spacing.

    Args:
        delete_brackets: whether to replace every parenthesized fragment with a space
    """

    def __init__(self, delete_brackets: bool = False, **kwargs):
        # Applied in this order to every question after the optional bracket removal.
        self.replace_tokens = [(" '", ' "'), ("' ", '" '), (" ?", "?"), ("  ", " ")]
        self.delete_brackets = delete_brackets

    def __call__(self, questions: List[str]) -> List[str]:
        """Adds question sign if it is absent or replaces dots in the end with question sign."""
        return [self._normalize(question) for question in questions]

    def _normalize(self, question: str) -> str:
        """Clean up a single question string."""
        if not question.endswith('?'):
            question = f'{question.rstrip(".")}?'
        if self.delete_brackets:
            for fragment in re.findall(r"(\(.*?\))", question):
                question = question.replace(fragment, " ")
        for old_tok, new_tok in self.replace_tokens:
            question = question.replace(old_tok, new_tok)
        return question


@register('entity_type_split')
def entity_type_split(entities_batch: List[List[str]], tags_batch: List[List[str]]) -> Tuple[
    List[List[str]], List[List[str]], List[List[str]]]:
    """Separate substrings tagged ``"T"`` from all other tagged substrings.

    Args:
        entities_batch: batch of lists of substrings
        tags_batch: batch of lists of tags, aligned with ``entities_batch``

    Returns:
        batch of substrings whose tag is not ``"T"``, batch of their lowercased tags,
        batch of substrings whose tag is ``"T"``
    """
    kept_entities, kept_tags, type_substrings = [], [], []
    for entities, tags in zip(entities_batch, tags_batch):
        pairs = list(zip(entities, tags))
        kept_entities.append([entity for entity, tag in pairs if tag != "T"])
        kept_tags.append([tag.lower() for _, tag in pairs if tag != "T"])
        type_substrings.append([entity for entity, tag in pairs if tag == "T"])
    return kept_entities, kept_tags, type_substrings


@register('entity_detection_parser')
class EntityDetectionParser(Component):
    """This class parses probabilities of tokens to be a token from the entity substring."""

    def __init__(self, o_tag: str, tags_file: str, entity_tags: List[str] = None, ignore_points: bool = False,
                 thres_proba: float = 0.8, make_tags_from_probas: bool = False, lang: str = "en",
                 ignored_tags: List[str] = None, **kwargs):
        """
        Args:
            o_tag: tag for tokens which are neither entities nor types
            tags_file: filename with NER tags
            entity_tags: tags for entities
            ignore_points: whether to consider points as separate symbols
            thres_proba: if the probability of the tag is less than thres_proba, we assign the tag as 'O'
            make_tags_from_probas: whether to define token tags from confidences from sequence tagging model
            lang: language of texts
            ignored_tags: not used tags of entities

        Raises:
            ValueError: if ``lang`` is neither ``"en"`` nor ``"ru"``
        """
        self.entity_tags = entity_tags
        self.o_tag = o_tag
        self.ignore_points = ignore_points
        self.thres_proba = thres_proba
        self.tag_ind_dict = {}

        # The tag is the first tab-separated field of every line of the tags file.
        with open(str(expand_path(tags_file))) as fl:
            tags = [line.split('\t')[0] for line in fl]
        self.tags = tags

        if self.entity_tags is None:
            # Derive entity tags from the second dash-separated part of every composite tag.
            split_tags = (tag.split('-') for tag in tags)
            self.entity_tags = list({parts[1] for parts in split_tags if len(parts) > 1}.difference({self.o_tag}))

        # For every entity tag, the indices of all tags of the file that contain it.
        self.entity_prob_ind = {entity_tag: [i for i, tag in enumerate(tags) if entity_tag in tag]
                                for entity_tag in self.entity_tags}
        self.tags_ind = {tag: i for i, tag in enumerate(tags)}
        self.et_prob_ind = [i for indexes in self.entity_prob_ind.values() for i in indexes]
        self.tag_ind_dict.update((ind, entity_tag)
                                 for entity_tag, indexes in self.entity_prob_ind.items()
                                 for ind in indexes)
        self.tag_ind_dict[0] = self.o_tag

        self.make_tags_from_probas = make_tags_from_probas

        stopwords_corpus = {"en": "english", "ru": "russian"}.get(lang)
        if stopwords_corpus is None:
            raise ValueError(f'Unsupported lang value: "{lang}". Only "en" and "ru" are allowed.')
        self.stopwords = set(stopwords.words(stopwords_corpus))
        self.ignored_tags = ignored_tags or []

    def __call__(self, question_tokens_batch: List[List[str]], tokens_info_batch: List[List[List[float]]],
                 tokens_probas_batch: np.ndarray) -> \
            Tuple[List[dict], List[dict], List[dict]]:
        """
        Args:
            question_tokens_batch: tokenized questions
            tokens_info_batch: list of tags of question tokens
            tokens_probas_probas: list of probabilities of question tokens
        Returns:
            Batch of dicts where keys are tags and values are substrings corresponding to tags
            Batch of substrings which correspond to entity types
            Batch of lists of token indices in the text which correspond to entities
        """
        entities_batch = []
        positions_batch = []
        probas_batch = []
        for tokens, tags, probas in \
                zip(question_tokens_batch, tokens_info_batch, tokens_probas_batch):
            if self.make_tags_from_probas:
                tags, _ = self.tags_from_probas(tokens, probas)
            tags = self.correct_quotes(tokens, tags, probas)
            tags = self.correct_tags(tokens, tags)
            entities, positions, entities_probas = self.entities_from_tags(tokens, tags, probas)
            entities_batch.append(entities)
            positions_batch.append(positions)
            probas_batch.append(entities_probas)
        return entities_batch, positions_batch, probas_batch

    def tags_from_probas(self, tokens: List[str], probas: np.array) -> Tuple[List[Union[str, List[str]]], List[Any]]:
        """
        This method makes a list of tags from a list of probas for tags
        Args:
            tokens: text tokens list
            probas: probabilities for tokens to belong to particular tags
        Returns:
            list of tags for tokens
            list of probabilities of these tags
        """
        tags = []
        tag_probas = []
        for token, proba in zip(tokens, probas):
            if proba[0] < self.thres_proba:
                tag_num = np.argmax(proba[1:]) + 1
            else:
                tag_num = 0
            tags.append(self.tags[tag_num])
            tag_probas.append(proba[tag_num])

        return tags, tag_probas

    def correct_tags(self, tokens: List[str], tags: List[str]) -> List[str]:
        for i in range(len(tags) - 2):
            if len(tags[i]) > 1 and tags[i].startswith("B-"):
                tag = tags[i].split("-")[1]
                if tags[i + 2] == f"I-{tag}" and tags[i + 1] != f"I-{tag}":
                    tags[i + 1] = f"I-{tag}"
            if tokens[i + 1] in '«' and tags[i] != "O":
                tags[i] = "O"
                tags[i + 1] = "O"
            if len(tags[i]) > 1 and tags[i].split("-")[1] == "EVENT":
                found_n = -1
                for j in range(i + 1, i + 3):
                    if re.findall(r"[\d]{3,4}", tokens[j]):
                        found_n = j
                        break
                if found_n > 0:
                    for j in range(i + 1, found_n + 1):
                        tags[j] = "I-EVENT"
            if i < len(tokens) - 3 and len(tokens[i]) == 1 and tokens[i + 1] == "." and len(tokens[i + 2]) == 1 \
                    and tokens[i + 3] == "." and tags[i + 2].startswith("B-"):
                tag = tags[i + 2].split("-")[1]
                tags[i] = f"B-{tag}"
                tags[i + 1] = f"I-{tag}"
                tags[i + 2] = f"I-{tag}"
        return tags

    def correct_quotes(self, tokens: List[str], tags: List[str], probas: np.array) -> List[str]:
        quotes = {"«": "»", '"': '"'}
        for i in range(len(tokens)):
            if tokens[i] in {"«", '"'}:
                quote_start = tokens[i]
                end_pos = 0
                for j in range(i + 1, len(tokens)):
                    if tokens[j] == quotes[quote_start]:
                        end_pos = j
                        break
                if end_pos and end_pos != i + 1:
                    probas_sum = np.sum(probas[i + 1:end_pos], axis=0)
                    tags_probas = {}
                    for tag in self.entity_prob_ind:
                        for ind in self.entity_prob_ind[tag]:
                            if tag not in tags_probas:
                                tags_probas[tag] = probas_sum[ind]
                            else:
                                tags_probas[tag] += probas_sum[ind]
                    tags_probas = list(tags_probas.items())
                    tags_probas = sorted(tags_probas, key=lambda x: x[1], reverse=True)
                    found_tag = ""
                    for tag, _ in tags_probas:
                        if tag != "PERSON":
                            found_tag = tag
                            break
                    if found_tag:
                        tags[i + 1] = f"B-{found_tag}"
                        for j in range(i + 2, end_pos):
                            tags[j] = f"I-{found_tag}"
        return tags

    def add_entity(self, entity: str, c_tag: str) -> None:
        replace_tokens = [(' - ', '-'), ("'s", ''), (' .', '.'), ('{', ''), ('}', ''),
                          ('  ', ' '), ('"', "'"), ('(', ''), (')', ''), (' +', '+')]
        if entity and (entity[-1] in punctuation or entity[-1] == "»"):
            entity = entity[:-1]
            self.ent_pos_dict[c_tag] = self.ent_pos_dict[c_tag][:-1]
        if entity and (entity[0] in punctuation or entity[0] == "«"):
            entity = entity[1:]
            self.ent_pos_dict[c_tag] = self.ent_pos_dict[c_tag][1:]
        entity = ' '.join(entity)
        for old, new in replace_tokens:
            entity = entity.replace(old, new)
        if entity and entity.lower() not in self.stopwords:
            cur_probas = self.ent_probas_dict[c_tag]
            self.ents_pos_probas_dict[c_tag].append((entity, self.ent_pos_dict[c_tag],
                                                     round(sum(cur_probas) / len(cur_probas), 4)))
        self.ent_dict[c_tag] = []
        self.ent_pos_dict[c_tag] = []
        self.ent_probas_dict[c_tag] = []

    def entities_from_tags(self, tokens: List[str], tags: List[str],
                           tag_probas: List[List[float]]) -> Tuple[dict, dict, dict]:
        """
        This method groups tagged tokens into entities and collects, per entity tag,
        the entity substrings, their token positions and their confidences

        Args:
            tokens: list of tokens of the text
            tags: list of tags for tokens
            tag_probas: list of probabilities of tags
        Returns:
            dict of tags (keys) and lists of entity substrings (values)
            dict of tags (keys) and lists of token index lists of the entities (values)
            dict of tags (keys) and lists of entity probabilities (values)
            Tags listed in ``self.ignored_tags`` are left out of all three dicts.
        """
        self.ent_dict = defaultdict(list)
        self.ent_pos_dict = defaultdict(list)
        self.ent_probas_dict = defaultdict(list)
        self.ents_pos_probas_dict = defaultdict(list)

        def flush_open_entities() -> None:
            # Keys of ent_dict are already bare entity tags (the part after the last dash),
            # so they are passed on unchanged. add_entity() reassigns ent_dict entries,
            # hence the iteration over a snapshot.
            for open_tag, open_tokens in list(self.ent_dict.items()):
                self.add_entity(open_tokens, open_tag)

        for pos, (tok, tag, probas) in enumerate(zip(tokens, tags, tag_probas)):
            entity_tag = tag.split('-')[-1]
            if entity_tag in self.entity_tags:
                # A "B-" tag starts a new entity: close whatever has been collected so far.
                if tag.startswith("B-") and any(self.ent_dict.values()):
                    flush_open_entities()
                self.ent_dict[entity_tag].append(tok)
                self.ent_pos_dict[entity_tag].append(pos)
                self.ent_probas_dict[entity_tag].append(probas[self.tags_ind[tag]])
            elif any(self.ent_dict.values()):
                # A token outside of any entity ends the entities collected so far.
                flush_open_entities()
        if any(self.ent_dict.values()):
            flush_open_entities()

        self.ents_pos_probas_dict = {tag: elements for tag, elements in self.ents_pos_probas_dict.items()
                                     if tag not in self.ignored_tags}

        entities_dict = {tag: [ent[0] for ent in ents] for tag, ents in self.ents_pos_probas_dict.items()}
        entities_positions_dict = {tag: [ent[1] for ent in ents] for tag, ents in self.ents_pos_probas_dict.items()}
        entities_probas_dict = {tag: [ent[2] for ent in ents] for tag, ents in self.ents_pos_probas_dict.items()}
        log.debug(f"entities_dict {entities_dict}")

        return entities_dict, entities_positions_dict, entities_probas_dict


================================================
FILE: deeppavlov/models/entity_extraction/entity_linking.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import sqlite3
from logging import getLogger
from typing import List, Dict, Tuple, Any, Union
from collections import defaultdict

import nltk
import spacy
from hdt import HDTDocument
from nltk.corpus import stopwords
from rapidfuzz import fuzz

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.serializable import Serializable
from deeppavlov.models.entity_extraction.find_word import WordSearcher

# Module-level logger used by EntityLinker for debug/info messages.
log = getLogger(__name__)
# NOTE: runs at import time. Requests the NLTK "stopwords" corpus, which EntityLinker.__init__ reads
# through `stopwords.words(...)`.
nltk.download("stopwords")


@register("entity_linker")
class EntityLinker(Component, Serializable):
    """
    Class for linking of entity substrings in the document to entities in Wikidata
    """

    def __init__(
            self,
            load_path: str,
            entity_ranker=None,
            entities_database_filename: str = None,
            words_dict_filename: str = None,
            ngrams_matrix_filename: str = None,
            num_entities_for_bert_ranking: int = 50,
            num_entities_for_conn_ranking: int = 5,
            num_entities_to_return: int = 10,
            max_text_len: int = 300,
            max_paragraph_len: int = 150,
            lang: str = "ru",
            use_descriptions: bool = True,
            alias_coef: float = 1.1,
            use_tags: bool = False,
            lemmatize: bool = False,
            full_paragraph: bool = False,
            use_connections: bool = False,
            kb_filename: str = None,
            prefixes: Dict[str, Any] = None,
            **kwargs,
    ) -> None:
        """

        Args:
            load_path: path to folder with inverted index files
            entity_ranker: component deeppavlov.models.kbqa.rel_ranking_bert
            entities_database_filename: filename with database with entities index
            words_dict_filename: filename with words and corresponding tags
            ngrams_matrix_filename: filename with char tfidf matrix
            num_entities_for_bert_ranking: number of candidate entities for BERT ranking using description and context
            num_entities_for_conn_ranking: number of candidate entities for ranking using connections in the knowledge
                graph
            num_entities_to_return: number of candidate entities for the substring which are returned
            max_text_len: maximal length of entity context
            max_paragraph_len: maximal length of context paragraphs
            lang: russian or english
            use_description: whether to perform entity ranking by context and description
            alias_coef: coefficient which is multiplied by the substring matching confidence if the substring is the
                title of the entity
            use_tags: whether to filter candidate entities by tags
            lemmatize: whether to lemmatize tokens
            full_paragraph: whether to use full paragraph for entity context
            use_connections: whether to rank entities by connections in the knowledge graph
            kb_filename: filename with the knowledge base in HDT format
            prefixes: entity and title prefixes
            **kwargs:
        """
        super().__init__(save_path=None, load_path=load_path)
        self.lemmatize = lemmatize
        self.num_entities_for_bert_ranking = num_entities_for_bert_ranking
        self.num_entities_for_conn_ranking = num_entities_for_conn_ranking
        self.entity_ranker = entity_ranker
        self.entities_database_filename = entities_database_filename
        self.num_entities_to_return = num_entities_to_return
        self.max_text_len = max_text_len
        self.max_paragraph_len = max_paragraph_len
        self.lang = f"@{lang}"
        if self.lang == "@en":
            self.stopwords = set(stopwords.words("english"))
            self.nlp = spacy.load("en_core_web_sm")
        elif self.lang == "@ru":
            self.stopwords = set(stopwords.words("russian"))
            self.nlp = spacy.load("ru_core_news_sm")
        self.alias_coef = alias_coef
        self.use_descriptions = use_descriptions
        self.use_connections = use_connections
        self.use_tags = use_tags
        self.full_paragraph = full_paragraph
        self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.related_tags = {
            "loc": ["gpe", "country", "city", "us_state", "river"],
            "gpe": ["loc", "country", "city", "us_state"],
            "work_of_art": ["product", "law"],
            "product": ["work_of_art"],
            "law": ["work_of_art"],
            "org": ["fac", "business"],
            "business": ["org"]
        }
        self.word_searcher = None
        if words_dict_filename:
            self.word_searcher = WordSearcher(words_dict_filename, ngrams_matrix_filename, self.lang)
        self.kb_filename = kb_filename
        self.prefixes = prefixes
        self.load()

    def load(self) -> None:
        self.conn = sqlite3.connect(str(self.load_path / self.entities_database_filename))
        self.cur = self.conn.cursor()
        self.kb = None
        if self.kb_filename:
            self.kb = HDTDocument(str(expand_path(self.kb_filename)))

    def save(self) -> None:
        """No-op: this component writes nothing when asked to save."""
        return None

    def __call__(
            self,
            substr_batch: List[List[str]],
            tags_batch: List[List[str]] = None,
            probas_batch: List[List[float]] = None,
            sentences_batch: List[List[str]] = None,
            offsets_batch: List[List[List[int]]] = None,
            sentences_offsets_batch: List[List[Tuple[int, int]]] = None,
            entities_to_link_batch: List[List[int]] = None
    ):
        """
        Link every entity substring of every sample in the batch.

        Args:
            substr_batch: entity substrings, one list per sample
            tags_batch: tags of the substrings; when omitted every substring gets the tag "e", for which
                ``link_entities`` switches tag filtering off
            probas_batch: probabilities of the substrings; when omitted 1.0 is used
            sentences_batch: sentences of each sample; when omitted no context is available
            offsets_batch: [start, end] character offsets of the substrings; when omitted they are searched
                for in the lower-cased, space-joined sentences
            sentences_offsets_batch: [start, end] offsets of the sentences; when omitted they are derived from
                the sentence lengths assuming single-space joins
            entities_to_link_batch: per-substring flags; results are returned only for truthy flags
                (all substrings by default)

        Returns:
            tuple ``(ids_batch, conf_batch, pages_batch, labels_batch)`` with one entry per sample
        """
        if (not sentences_offsets_batch or sentences_offsets_batch[0] is None) and sentences_batch is not None:
            sentences_offsets_batch = []
            for sentences_list in sentences_batch:
                sentences_offsets_list = []
                start = 0
                for sentence in sentences_list:
                    end = start + len(sentence)
                    sentences_offsets_list.append([start, end])
                    # +1 accounts for the space inserted between sentences when they are joined below
                    start = end + 1
                sentences_offsets_batch.append(sentences_offsets_list)

        if sentences_batch is None:
            sentences_batch = [[] for _ in substr_batch]
            sentences_offsets_batch = [[] for _ in substr_batch]

        if not entities_to_link_batch or entities_to_link_batch[0] is None:
            entities_to_link_batch = [[1 for _ in substr_list] for substr_list in substr_batch]

        # The signature declares these two arguments optional, but zipping None below raised a TypeError.
        if tags_batch is None or (len(tags_batch) > 0 and tags_batch[0] is None):
            tags_batch = [["e" for _ in substr_list] for substr_list in substr_batch]
        if probas_batch is None or (len(probas_batch) > 0 and probas_batch[0] is None):
            probas_batch = [[1.0 for _ in substr_list] for substr_list in substr_batch]

        log.debug(f"substr: {substr_batch} --- sentences_batch: {sentences_batch} --- offsets: {offsets_batch}")
        # Always materialize the offsets when they are missing, so that an empty batch no longer leaves
        # `offsets_batch` as None.
        if not offsets_batch or offsets_batch[0] is None:
            offsets_batch = []
            for substr_list, sentences_list in zip(substr_batch, sentences_batch):
                text = " ".join(sentences_list).lower()
                log.debug(f"text {text}")
                offsets_list = []
                for substr in substr_list:
                    # str.find returns -1 when the substring is absent; such offsets match no sentence later on
                    st_offset = text.find(substr.lower())
                    end_offset = st_offset + len(substr)
                    offsets_list.append([st_offset, end_offset])
                offsets_batch.append(offsets_list)
        ids_batch, conf_batch, pages_batch, labels_batch = [], [], [], []
        for substr_list, offsets_list, tags_list, probas_list, sentences_list, sentences_offsets_list, \
            entities_to_link in zip(substr_batch, offsets_batch, tags_batch, probas_batch, sentences_batch,
                                    sentences_offsets_batch, entities_to_link_batch):
            ids_list, conf_list, pages_list, labels_list = \
                self.link_entities(substr_list, offsets_list, tags_list, probas_list, sentences_list,
                                   sentences_offsets_list, entities_to_link)
            log.debug(f"ids_list {ids_list} conf_list {conf_list}")
            if self.num_entities_to_return == 1:
                # A substring without candidates has no pages at all; return "" instead of raising IndexError.
                pages_list = [pages[0] if pages else "" for pages in pages_list]
            else:
                pages_list = [pages[: len(ids)] for pages, ids in zip(pages_list, ids_list)]
            ids_batch.append(ids_list)
            conf_batch.append(conf_list)
            pages_batch.append(pages_list)
            labels_batch.append(labels_list)
        return ids_batch, conf_batch, pages_batch, labels_batch

    def link_entities(
            self,
            substr_list: List[str],
            offsets_list: List[List[int]],
            tags_list: List[str],
            probas_list: List[float],
            sentences_list: List[str],
            sentences_offsets_list: List[List[int]],
            entities_to_link: List[int]
    ) -> Tuple[List[Any], List[Any], List[List[Union[str, Any]]], List[List[Union[str, Any]]]]:
        """
        Collect and rank candidate entities for every substring of a single sample.

        For each substring the candidates are gathered with a cascade of exact and fuzzy index lookups
        (original form, lemmatized form, related tags, word searcher, lookups without tag filtering),
        then all substrings are ranked together by ``rank_by_description``.

        Args:
            substr_list: entity substrings
            offsets_list: [start, end] offsets of the substrings
            tags_list: tag(s) of each substring: a string, a list of strings or a list of
                (tag, confidence) pairs
            probas_list: probabilities of the substrings (only logged and used to bound the iteration)
            sentences_list: sentences of the sample
            sentences_offsets_list: [start, end] offsets of the sentences
            entities_to_link: flags; results are kept only for substrings with a truthy flag

        Returns:
            tuple ``(ids, confidences, pages, labels)`` with one element per kept substring
        """
        log.debug(f"substr_list {substr_list} tags_list {tags_list} probas {probas_list} offsets_list {offsets_list}")
        ids_list, conf_list, pages_list, label_list, descr_list = [], [], [], [], []
        if substr_list:
            entities_scores_list = []
            for substr, tags, _ in zip(substr_list, tags_list, probas_list):
                # Strip punctuation before querying the index; the order of the replacements matters.
                for old_symb, new_symb in [("'s", ""), ("@", ""), ("  ", " "), (".", ""), (",", ""), ("-", " "),
                                           ("'", " "), ("!", ""), (":", ""), ("&", ""), ("/", " "), ('"', ""),
                                           ("  ", " ")]:
                    substr = substr.replace(old_symb, new_symb)
                substr = substr.strip()
                cand_ent_init = defaultdict(set)
                if len(substr) > 1:
                    if isinstance(tags, str):
                        tags = [tags]
                    # Normalize to lower-cased (tag, confidence) pairs. Plain string tags get confidence 1.0;
                    # pairs keep their confidence (previously `.lower()` was applied to the pair itself and failed).
                    tags = [(tag.lower(), 1.0) if isinstance(tag, str) else (tag[0].lower(), tag[1])
                            for tag in tags]
                    # An empty tag list is treated as "no leading tag" instead of raising IndexError.
                    main_tag = tags[0][0] if tags else ""
                    # The tag "e" switches tag filtering off.
                    use_tags_flag = main_tag != "e"
                    cand_ent_init = self.find_exact_match(substr, tags, use_tags=use_tags_flag)
                    # Glue two adjacent single lowercase letters ("j k" -> "jk") and retry the exact lookup.
                    new_substr = re.sub(r"\b([a-z]{1}) ([a-z]{1})\b", r"\1\2", substr)
                    if substr != new_substr:
                        new_cand_ent_init = self.find_exact_match(new_substr, tags, use_tags=use_tags_flag)
                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)

                    init_substr_split = substr.lower().split(" ")
                    if main_tag in {"person", "work_of_art"}:
                        # stopwords are kept for these two tags
                        substr_split = [word for word in substr.lower().split(" ") if len(word) > 0]
                    else:
                        substr_split = [word for word in substr.lower().split(" ")
                                        if word not in self.stopwords and len(word) > 0]

                    substr_split_lemm = [self.nlp(tok)[0].lemma_ for tok in substr_split]
                    substr_lemm = " ".join(substr_split_lemm)
                    if substr_split != substr_split_lemm \
                            or (main_tag == "work_of_art"
                                and len(substr_split) != len(init_substr_split)):
                        new_cand_ent_init = self.find_fuzzy_match(substr_split, tags, use_tags=use_tags_flag)
                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                    if substr_split != substr_split_lemm:
                        new_cand_ent_init = self.find_exact_match(substr_lemm, tags, use_tags=use_tags_flag)
                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                        new_cand_ent_init = self.find_fuzzy_match(substr_split_lemm, tags, use_tags=use_tags_flag)
                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)

                    all_low_conf = self.define_all_low_conf(cand_ent_init, 1.0)
                    clean_tags, corr_tags, corr_clean_tags = self.correct_tags(tags)
                    log.debug(f"substr: {substr} --- lemm: {substr_split_lemm} --- tags: {tags} --- corr_tags: "
                              f"{corr_tags} --- all_low_conf: {all_low_conf} --- cand_ent_init: {len(cand_ent_init)}")

                    # Nothing confident found with the original tags: retry with the related tags.
                    if (not cand_ent_init or all_low_conf) and corr_tags:
                        corr_cand_ent_init = self.find_exact_match(substr, corr_tags, use_tags=use_tags_flag)
                        cand_ent_init = self.unite_dicts(cand_ent_init, corr_cand_ent_init)
                        if substr_split != substr_split_lemm:
                            new_cand_ent_init = self.find_exact_match(substr_lemm, corr_tags, use_tags=use_tags_flag)
                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                            new_cand_ent_init = self.find_fuzzy_match(substr_split_lemm, corr_tags,
                                                                      use_tags=use_tags_flag)
                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)

                    # Still nothing for a single word: ask the word searcher for a replacement word.
                    if not cand_ent_init and len(substr_split) == 1 and self.word_searcher:
                        corr_words = self.word_searcher(substr_split[0], set(clean_tags + corr_clean_tags))
                        if corr_words:
                            cand_ent_init = self.find_exact_match(corr_words[0], tags + corr_tags,
                                                                  use_tags=use_tags_flag)

                    if not cand_ent_init and len(substr_split) > 1:
                        cand_ent_init = self.find_fuzzy_match(substr_split, tags)

                    # Last resort (except for tag "t"): repeat the lookups without tag filtering.
                    all_low_conf = self.define_all_low_conf(cand_ent_init, 0.85)
                    if (not cand_ent_init or all_low_conf) and main_tag != "t":
                        use_tags_flag = False
                        new_cand_ent_init = self.find_exact_match(substr, tags, use_tags=use_tags_flag)
                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                        if substr_split != substr_split_lemm and (main_tag == "e" or not cand_ent_init):
                            new_cand_ent_init = self.find_fuzzy_match(substr_split, tags, use_tags=use_tags_flag)
                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                            new_cand_ent_init = self.find_fuzzy_match(substr_split_lemm, tags, use_tags=use_tags_flag)
                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)

                # Keep the best info tuple of every candidate. Tuples are
                # (substr_score, rels, tag_conf, page, label, descr), so the sort key is
                # (substr_score, tag_conf, rels).
                cand_ent_scores = []
                for entity in cand_ent_init:
                    entities_scores = list(cand_ent_init[entity])
                    entities_scores = sorted(entities_scores, key=lambda x: (x[0], x[2], x[1]), reverse=True)
                    cand_ent_scores.append(([entity] + list(entities_scores[0])))

                # After prepending the entity id every index is shifted by one.
                cand_ent_scores = sorted(cand_ent_scores, key=lambda x: (x[1], x[3], x[2]), reverse=True)
                cand_ent_scores = cand_ent_scores[: self.num_entities_for_bert_ranking]
                entity_ids = [elem[0] for elem in cand_ent_scores]
                scores = [elem[1:4] for elem in cand_ent_scores]
                conf_list.append(scores)
                entities_scores_list.append(
                    {entity_id: entity_scores for entity_id, entity_scores in zip(entity_ids, scores)}
                )
                ids_list.append(entity_ids)
                pages = [elem[4] for elem in cand_ent_scores]
                entity_labels = [elem[5] for elem in cand_ent_scores]
                pages_list.append({entity_id: page for entity_id, page in zip(entity_ids, pages)})
                label_list.append(
                    {entity_id: entity_label for entity_id, entity_label in zip(entity_ids, entity_labels)})
                descr_list.append([elem[6] for elem in cand_ent_scores])

            scores_dict = {}
            if self.use_connections and self.kb:
                scores_dict = self.rank_by_connections(ids_list)

            substr_lens = [len(entity_substr.split()) for entity_substr in substr_list]
            ids_list, conf_list = self.rank_by_description(substr_list, tags_list, offsets_list, ids_list,
                                                           descr_list, entities_scores_list, sentences_list,
                                                           sentences_offsets_list, substr_lens, scores_dict)

        def as_id_list(entity_ids):
            # rank_by_description appends a single id instead of a list when num_entities_to_return == 1;
            # iterating such a string would look up its characters one by one.
            return [entity_ids] if isinstance(entity_ids, str) else entity_ids

        label_list = [[label_dict.get(entity_id, "") for entity_id in as_id_list(entity_ids)]
                      for entity_ids, label_dict in zip(ids_list, label_list)]
        pages_list = [[pages_dict.get(entity_id, "") for entity_id in as_id_list(entity_ids)]
                      for entity_ids, pages_dict in zip(ids_list, pages_list)]

        # Drop the results of substrings which were not requested for linking.
        f_ids_list, f_conf_list, f_pages_list, f_label_list = [], [], [], []
        for ids, confs, pages, labels, add_flag in \
                zip(ids_list, conf_list, pages_list, label_list, entities_to_link):
            if add_flag:
                f_ids_list.append(ids)
                f_conf_list.append(confs)
                f_pages_list.append(pages)
                f_label_list.append(labels)
        return f_ids_list, f_conf_list, f_pages_list, f_label_list

    def define_all_low_conf(self, cand_ent_init, thres):
        """
        Tell whether no candidate reaches the confidence threshold.

        Args:
            cand_ent_init: mapping from entity id to a collection of info tuples whose first element is
                the substring matching score
            thres: minimal score counted as confident

        Returns:
            True if every info tuple has a score below ``thres`` (also for an empty mapping), else False
        """
        return not any(
            entity_info[0] >= thres
            for entity_infos in cand_ent_init.values()
            for entity_info in entity_infos
        )

    def correct_tags(self, tags):
        clean_tags = [tag for tag, conf in tags]
        corr_tags, corr_clean_tags = [], []
        for tag, conf in tags:
            if tag in self.related_tags:
                corr_tag_list = self.related_tags[tag]
                for corr_tag in corr_tag_list:
                    if corr_tag not in clean_tags and corr_tag not in corr_clean_tags:
                        corr_tags.append([corr_tag, conf])
                        corr_clean_tags.append(corr_tag)
        return clean_tags, corr_tags, corr_clean_tags

    def unite_dicts(self, cand_ent_init, new_cand_ent_init):
        """
        Merge ``new_cand_ent_init`` into ``cand_ent_init`` in place and return ``cand_ent_init``.

        Info tuples of entities present in both mappings are united; for entities seen for the first
        time the set object of ``new_cand_ent_init`` itself is stored (it is not copied).
        """
        for entity_id, entity_infos in new_cand_ent_init.items():
            if entity_id not in cand_ent_init:
                cand_ent_init[entity_id] = entity_infos
            else:
                cand_ent_init[entity_id].update(entity_infos)
        return cand_ent_init

    def process_cand_ent(self, cand_ent_init, entities_and_ids, substr_split, tag, tag_conf, use_tags):
        for title, entity_id, rels, ent_tag, page, label, descr in entities_and_ids:
            if (ent_tag == tag and use_tags) or not use_tags:
                substr_score = self.calc_substr_score(title, substr_split, tag, ent_tag, label)
                cand_ent_init[entity_id].add((substr_score, rels, tag_conf, page, label, descr))
        return cand_ent_init

    def sanitize_substr(self, entity_substr, tag):
        """
        Reduce a "person" substring to its last word when that word follows a one-character word.

        For example "j smith" becomes "smith". Substrings with any other tag, with a single word, or
        without that pattern are returned unchanged.
        """
        if tag != "person":
            return entity_substr
        words = entity_substr.split()
        if len(words) < 2:
            return entity_substr
        before_last, last = words[-2], words[-1]
        if len(last) > 1 and len(before_last) == 1:
            return last
        return entity_substr

    def find_exact_match(self, entity_substr, tags, use_tags=True):
        entity_substr = entity_substr.lower()
        entity_substr_split = entity_substr.split()
        cand_ent_init = defaultdict(set)
        for tag, tag_conf in tags:
            entity_substr = self.sanitize_substr(entity_substr, tag)
            query = "SELECT * FROM inverted_index WHERE title MATCH ?;"
            entities_and_ids = []
            try:
                res = self.cur.execute(query, (entity_substr,))
                entities_and_ids = res.fetchall()
            except:
                log.info(f"error in query execute {query}")
            if entities_and_ids:
                cand_ent_init = self.process_cand_ent(
                    cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf, use_tags)
        return cand_ent_init

    def find_fuzzy_match(self, entity_substr_split, tags, use_tags=True):
        cand_ent_init = defaultdict(set)
        for tag, tag_conf in tags:
            if len(entity_substr_split) > 3:
                entity_substr_split = [" ".join(entity_substr_split[i:i + 2])
                                       for i in range(len(entity_substr_split) - 1)]
            for word in entity_substr_split:
                if len(word) > 1 and word not in self.stopwords:
                    query = "SELECT * FROM inverted_index WHERE title MATCH ?;"
                    part_entities_and_ids = []
                    try:
                        res = self.cur.execute(query, (word,))
                        part_entities_and_ids = res.fetchall()
                    except:
                        log.info(f"error in query execute {query}")
                    if part_entities_and_ids:
                        cand_ent_init = self.process_cand_ent(
                            cand_ent_init, part_entities_and_ids, entity_substr_split, tag, tag_conf, use_tags)
        return cand_ent_init

    def match_tokens(self, entity_substr_split, label_tokens):
        """
        Score how well the substring tokens match the label tokens.

        Every substring token adds 1.0 for an exact match with some label token; otherwise it adds the
        fuzz ratio (scaled to 0..1) of the first label token which shares its two-character prefix and
        is similar enough. The sum is divided by the length of the longer token list and rounded to
        three digits. Nothing is counted when both lists have several tokens, differ as sets and the
        first/last tokens coincide crosswise. A single token compared with a two-token label gets the
        fixed score 0.5 (it equals the second label token) or 0.3 (it equals the first one).
        """
        num_ent_toks = len(entity_substr_split)
        num_label_toks = len(label_tokens)
        crosswise = (
            num_ent_toks > 1 and num_label_toks > 1
            and set(entity_substr_split) != set(label_tokens)
            and label_tokens[0] != label_tokens[-1]
            and (entity_substr_split[0] == label_tokens[-1] or entity_substr_split[-1] == label_tokens[0])
        )

        matched = 0.0
        if not crosswise:
            for ent_tok in entity_substr_split:
                if any(label_tok == ent_tok for label_tok in label_tokens):
                    matched += 1.0
                    continue
                for label_tok in label_tokens:
                    if label_tok[:2] != ent_tok[:2]:
                        continue
                    ratio = fuzz.ratio(label_tok, ent_tok)
                    long_prefix = len(label_tok) >= 8 and label_tok[:6] == ent_tok[:6] and ratio > 70.0
                    short_prefix = num_label_toks > 2 and len(label_tok) > 3 and label_tok[:4] == ent_tok[:4]
                    if ratio >= 75.0 or long_prefix or short_prefix:
                        matched += ratio * 0.01
                        break

        score = round(matched / max(num_label_toks, num_ent_toks), 3)
        if num_label_toks == 2 and num_ent_toks == 1:
            single_tok = entity_substr_split[0]
            if single_tok == label_tokens[1]:
                score = 0.5
            elif single_tok == label_tokens[0]:
                score = 0.3
        return score

    def correct_substr_score(self, entity_substr_split, label_tokens, substr_score):
        if sum([len(tok) == 1 for tok in entity_substr_split]) == 2 and len(label_tokens) >= 2 \
                and any([(len(tok) == 2 and re.findall(r"[a-z]{2}", tok)) for tok in label_tokens]):
            new_label_tokens = []
            for tok in label_tokens:
                if len(tok) == 2 and re.findall(r"[a-z]{2}", tok):
                    new_label_tokens.append(tok[0])
                    new_label_tokens.append(tok[1])
                else:
                    new_label_tokens.append(tok)
            label_tokens = new_label_tokens
        if any([re.findall(r"[\d]{4}", tok) for tok in entity_substr_split]) \
                and any([re.findall(r"[\d]{4}–[\d]{2}", tok) for tok in label_tokens]):
            new_label_tokens = []
            for tok in label_tokens:
                if re.findall(r"[\d]{4}–[\d]{2}", tok):
                    new_label_tokens.append(tok[:4])
                    new_label_tokens.append(tok[5:])
                else:
                    new_label_tokens.append(tok)
            label_tokens = new_label_tokens
        new_substr_score = self.match_tokens(entity_substr_split, label_tokens)
        substr_score = max(substr_score, new_substr_score)
        return substr_score

    def calc_substr_score(self, entity_title, entity_substr_split, tag, ent_tag, entity_label):
        if self.lang == "@ru":
            entity_title = entity_title.replace("ё", "е")
        label_tokens = entity_title.split()
        substr_score = self.match_tokens(entity_substr_split, label_tokens)
        substr_score = self.correct_substr_score(entity_substr_split, label_tokens, substr_score)
        if re.findall(r" \(.*\)", entity_label):
            entity_label_split = entity_label.replace("(", "").replace(")", "").lower().split()
            lbl_substr_score = self.match_tokens(entity_substr_split, entity_label_split)
            substr_score = max(substr_score, lbl_substr_score)
        if tag == ent_tag and tag.lower() == "person" and len(entity_substr_split) > 1 \
                and len(entity_substr_split[-1]) > 1 and len(entity_substr_split[-2]) == 1 \
                and len(label_tokens) == len(entity_substr_split):
            cnt = 0.0
            for j in range(len(label_tokens) - 1):
                if label_tokens[j][0] == entity_substr_split[j][0]:
                    cnt += 1.0
            if label_tokens[-1] == entity_substr_split[-1]:
                cnt += 1.0
            new_substr_score = cnt / len(label_tokens)
            substr_score = max(substr_score, new_substr_score)

        if entity_title.lower() == entity_label.lower() and substr_score == 1.0:
            substr_score = substr_score * self.alias_coef
        return substr_score

    def rank_by_description(
            self,
            entity_substr_list: List[str],
            tags_list: List[str],
            entity_offsets_list: List[List[int]],
            cand_ent_list: List[List[str]],
            cand_ent_descr_list: List[List[str]],
            entities_scores_list: List[Dict[str, Tuple[int, float]]],
            sentences_list: List[str],
            sentences_offsets_list: List[Tuple[int, int]],
            substr_lens: List[int],
            scores_dict: Dict[str, int] = None
    ) -> Tuple[List[Union[Union[float, List[Any], List[Union[float, Any]]], Any]], List[
        Union[Union[tuple, List[tuple], List[Any], List[Tuple[Union[float, Any], ...]]], Any]]]:
        entity_ids_list = []
        conf_list = []
        contexts = []
        for entity_offset in entity_offsets_list:
            context, sentence = "", ""
            if len(entity_offset) == 2:
                entity_start_offset, entity_end_offset = entity_offset
                rel_start_offset = 0
                rel_end_offset = 0
                found_sentence_num = 0
                for num, (sent, (sent_start_offset, sent_end_offset)) in enumerate(
                        zip(sentences_list, sentences_offsets_list)
                ):
                    if entity_start_offset >= sent_start_offset and entity_end_offset <= sent_end_offset:
                        sentence = sent
                        found_sentence_num = num
                        rel_start_offset = entity_start_offset - sent_start_offset
                        rel_end_offset = entity_end_offset - sent_start_offset
                        break
            if sentence:
                start_of_sentence = 0
                end_of_sentence = len(sentence)
                if len(sentence) > self.max_text_len:
                    start_of_sentence = max(rel_start_offset - self.max_text_len // 2, 0)
                    end_of_sentence = min(rel_end_offset + self.max_text_len // 2, len(sentence))
                text_before = sentence[start_of_sentence:rel_start_offset]
                text_after = sentence[rel_end_offset:end_of_sentence]
                context = text_before + "[ENT]" + text_after
                if self.full_paragraph:
                    cur_sent_len = len(re.findall(self.re_tokenizer, context))
                    first_sentence_num = found_sentence_num
                    last_sentence_num = found_sentence_num
                    context = [context]
                    while True:
                        added = False
                        if last_sentence_num < len(sentences_list) - 1:
                            sentence_tokens = re.findall(self.re_tokenizer, sentences_list[last_sentence_num + 1])
                            last_sentence_len = len(sentence_tokens)
                            if cur_sent_len + last_sentence_len < self.max_paragraph_len:
                                context.append(sentences_list[last_sentence_num + 1])
                                cur_sent_len += last_sentence_len
                                last_sentence_num += 1
                                added = True
                        if first_sentence_num > 0:
                            sentence_tokens = re.findall(self.re_tokenizer, sentences_list[first_sentence_num - 1])
                            first_sentence_len = len(sentence_tokens)
                            if cur_sent_len + first_sentence_len < self.max_paragraph_len:
                                context = [sentences_list[first_sentence_num - 1]] + context
                                cur_sent_len += first_sentence_len
                                first_sentence_num -= 1
                                added = True
                        if not added:
                            break
                    context = " ".join(context)

            log.debug(f"rank, context: {context}")
            contexts.append(context)

        if self.use_descriptions:
            scores_list = self.entity_ranker(contexts, cand_ent_list, cand_ent_descr_list)
        else:
            scores_list = [[(entity_id, 1.0) for entity_id in cand_ent] for cand_ent in cand_ent_list]

        for entity_substr, tag, context, candidate_entities, substr_len, entities_scores, scores in zip(
                entity_substr_list, tags_list, contexts, cand_ent_list, substr_lens, entities_scores_list, scores_list
        ):
            entities_with_scores = []
            max_conn_score = 0
            if scores_dict and scores:
                max_conn_score = max([scores_dict.get(entity, 0) for entity, _ in scores])
            for entity, score in scores:
                substr_score = round(entities_scores.get(entity, (0.0, 0))[0], 2)
                num_rels = entities_scores.get(entity, (0.0, 0))[1]
                if len(context.split()) < 4:
                    score = 0.95
                elif scores_dict and 0 < max_conn_score == scores_dict.get(entity, 0):
                    score = 1.0
                    num_rels = 200
                entities_with_scores.append((entity, substr_score, num_rels, float(score)))

            if tag == "t":
                entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[2], x[3]), reverse=True)
            else:
                entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[3], x[2]), reverse=True)
            log.debug(f"{entity_substr} --- tag: {tag} --- entities_with_scores: {entities_with_scores}")

            if not entities_with_scores:
                top_entities = []
                top_conf = []
            elif entities_with_scores and substr_len == 1 and entities_with_scores[0][1] < 1.0:
                top_entities = []
                top_conf = []
            elif entities_with_scores and (
                    entities_with_scores[0][1] < 0.3
                    or (entities_with_scores[0][3] < 0.13 and entities_with_scores[0][2] < 20)
                    or (entities_with_scores[0][3] < 0.3 and entities_with_scores[0][2] < 4)
                    or entities_with_scores[0][1] < 0.6
            ):
                top_entities = []
                top_conf = []
            else:
                top_entities = [score[0] for score in entities_with_scores]
                top_conf = [score[1:] for score in entities_with_scores]

            high_conf_entities = []
            high_conf_nums = []
            for elem_num, (entity, conf) in enumerate(zip(top_entities, top_conf)):
                if len(conf) == 3 and conf[0] >= 1.0 and conf[1] > 50 and conf[2] > 0.3:
                    new_conf = list(conf)
                    if new_conf[1] > 55:
                        new_conf[2] = 1.0
                    new_conf = tuple(new_conf)
                    high_conf_entities.append((entity,) + new_conf)
                    high_conf_nums.append(elem_num)

            high_conf_entities = sorted(high_conf_entities, key=lambda x: (x[1], x[3], x[2]), reverse=True)
            log.debug(f"high_conf_entities: {high_conf_entities}")
            for n, elem_num in enumerate(high_conf_nums):
                if 0 <= elem_num - n < len(top_entities):
                    del top_entities[elem_num - n]
                    del top_conf[elem_num - n]

            top_entities = [elem[0] for elem in high_conf_entities] + top_entities
            top_conf = [elem[1:] for elem in high_conf_entities] + top_conf

            if not top_entities:
                entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[2], x[3]), reverse=True)
                top_entities = [score[0] for score in entities_with_scores]
                top_conf = [score[1:] for score in entities_with_scores]

            if self.num_entities_to_return == 1 and top_entities:
                entity_ids_list.append(top_entities[0])
                conf_list.append([round(cnf, 2) for cnf in top_conf[0]])
            elif self.num_entities_to_return == "max":
                if top_conf:
                    max_conf = top_conf[0][0]
                    max_rank_conf = top_conf[0][2]
                    entity_ids, confs = [], []
                    for entity_id, conf in zip(top_entities, top_conf):
                        if (conf[0] >= max_conf * 0.9 and max_rank_conf <= 1.0) \
                                or (max_rank_conf == 1.0 and conf[2] == 1.0):
                            entity_ids.append(entity_id)
                            confs.append([round(cnf, 2) for cnf in conf])
                    entity_ids_list.append(entity_ids)
                    conf_list.append(confs)
                else:
                    entity_ids_list.append([])
                    conf_list.append([])
            else:
                entity_ids_list.append(top_entities[: self.num_entities_to_return])
                conf_list.append([[round(cnf, 2) for cnf in conf] for conf in top_conf[: self.num_entities_to_return]])
            log.debug(f"{entity_substr} --- top entities {entity_ids_list[-1]} --- top_conf {conf_list[-1]}")
        return entity_ids_list, conf_list

    def sort_out_low_conf(self, entity_substr, top_entities, top_conf):
        """Drop low-confidence candidates for multi-word entity substrings.

        The cut-off depends on the first score of the best candidate (``top_conf[0][0]``):

        * best score >= 1.0 -- keep candidates whose first score is above 0.9;
        * best score >= 0.9 -- keep candidates whose first score is above 0.8;
        * otherwise -- no candidate is kept.

        Args:
            entity_substr: entity mention text; filtering is applied only if it has more than one
                whitespace-separated word
            top_entities: candidate entity ids, best candidate first
            top_conf: per-candidate score sequences aligned with ``top_entities``; only element 0 is used here

        Returns:
            tuple ``(entities, confidences)``: the filtered lists, or the inputs unchanged for single-word
            substrings and for empty ``top_conf``
        """
        if len(entity_substr.split()) <= 1 or not top_conf:
            return top_entities, top_conf

        best_conf = top_conf[0][0]
        for top_conf_thres, conf_thres in [(1.0, 0.9), (0.9, 0.8)]:
            if best_conf >= top_conf_thres:
                # Apply only the first matching tier. A best score >= 1.0 also satisfies the second tier,
                # and evaluating both would emit every candidate above 0.9 twice.
                kept = [(ent, conf) for ent, conf in zip(top_entities, top_conf) if conf[0] > conf_thres]
                return [ent for ent, _ in kept], [conf for _, conf in kept]
        return [], []

    def rank_by_connections(self, ids_list):
        """Count, for every candidate entity, the candidates of other mentions it is linked to in the KB.

        For the first ``self.num_entities_for_conn_ranking`` candidates of each mention the triples
        returned by ``self.kb.search_triples`` are scanned. Relations whose last path segment is
        "P31" or "P279" are ignored (presumably Wikidata "instance of" / "subclass of" -- confirm).
        Objects with an entity prefix are recorded as neighbours; for relations with the "no_type"
        prefix the object's own triples are scanned too, and entity objects reached through
        "statement" or "qualifier" relations are recorded as well.

        Args:
            ids_list: list of candidate entity id lists, one list per mention

        Returns:
            dict mapping every candidate id from ``ids_list`` to the number of distinct candidates
            of other mentions it is connected to (0 for candidates that were not inspected)
        """
        # Every candidate gets an entry up front, including those beyond the inspected prefix.
        conn_dict = {entity_id: set() for ids in ids_list for entity_id in ids}
        objects_sets_dict = {}

        for ids in ids_list:
            for entity_id in ids[:self.num_entities_for_conn_ranking]:
                neighbours = set()
                for prefix in self.prefixes["entity"]:
                    triples, _ = self.kb.search_triples(f"{prefix}/{entity_id}", "", "")
                    for _subj, rel, obj in triples:
                        if rel.split("/")[-1] in {"P31", "P279"}:
                            continue
                        if any(obj.startswith(pr) for pr in self.prefixes["entity"]):
                            neighbours.add(obj.split("/")[-1])
                        if not rel.startswith(self.prefixes["rels"]["no_type"]):
                            continue
                        # Follow the object one hop further and collect entities attached to it.
                        nested_triples, _ = self.kb.search_triples(obj, "", "")
                        for _stmt, rel2, obj2 in nested_triples:
                            via_statement_or_qualifier = (
                                rel2.startswith(self.prefixes["rels"]["statement"])
                                or rel2.startswith(self.prefixes["rels"]["qualifier"])
                            )
                            if via_statement_or_qualifier \
                                    and any(obj2.startswith(pr) for pr in self.prefixes["entity"]):
                                neighbours.add(obj2.split("/")[-1])
                objects_sets_dict[entity_id] = neighbours
                # Register the reverse direction as well.
                for neighbour in neighbours:
                    objects_sets_dict.setdefault(neighbour, set()).add(entity_id)

        # Connect candidates that belong to different mentions.
        for i, first_ids in enumerate(ids_list):
            for j, second_ids in enumerate(ids_list):
                if i == j:
                    continue
                for entity_id1 in first_ids[:self.num_entities_for_conn_ranking]:
                    for entity_id2 in second_ids[:self.num_entities_for_conn_ranking]:
                        if entity_id1 in objects_sets_dict[entity_id2]:
                            conn_dict[entity_id1].add(entity_id2)
                            conn_dict[entity_id2].add(entity_id1)

        return {entity_id: len(linked) for entity_id, linked in conn_dict.items()}


================================================
FILE: deeppavlov/models/entity_extraction/find_word.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import pickle
from collections import Counter

import numpy as np
import scipy as sp

from deeppavlov.core.commands.utils import expand_path

# Import the submodule explicitly: on older SciPy releases a bare ``import scipy`` does not load
# ``scipy.sparse``, so ``sp.sparse`` is only available if something else imported it first.
from scipy.sparse import csr_matrix as Sparse


class WordSearcher:
    """Fuzzy search of vocabulary words by overlap of character bigrams and trigrams.

    The vocabulary (``words_dict``: word -> collection of tags) is unpickled from
    ``words_dict_filename``; ``ngrams_matrix_filename`` holds a sparse matrix in CSR parts
    ("data", "indices", "indptr", "shape") that is multiplied by the n-gram indicator row of a query.
    The matrix is presumably laid out as (n-gram id x word index in the sorted vocabulary) -- this is
    what the lookup in ``__call__`` relies on.
    """

    def __init__(self, words_dict_filename: str, ngrams_matrix_filename: str, lang: str = "@en", thresh: int = 1000):
        """
        Args:
            words_dict_filename: path to the pickled dict of words
            ngrams_matrix_filename: path to the archive with the sparse n-gram matrix
            lang: alphabet selector, "@en" or "@ru"
            thresh: maximal number of best-scoring words considered for a query

        Raises:
            ValueError: if ``lang`` is neither "@en" nor "@ru"
        """
        self.words_dict_filename = words_dict_filename
        self.ngrams_matrix_filename = ngrams_matrix_filename
        if lang == "@en":
            self.letters = "abcdefghijklmnopqrstuvwxyz"
        elif lang == "@ru":
            self.letters = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
        else:
            raise ValueError(f'Unexpected lang value: "{lang}"')
        self.thresh = thresh
        self.load()
        self.make_ngrams_dicts()

    def load(self):
        """Read the vocabulary and the n-gram matrix from disk."""
        # NOTE(security): pickle.load and np.load(allow_pickle=True) can execute code stored in the
        # files, so both paths must point to trusted data only.
        with open(str(expand_path(self.words_dict_filename)), "rb") as fl:
            self.words_dict = pickle.load(fl)
        self.words_list = sorted(self.words_dict.keys())

        # The context manager closes the underlying archive once the arrays have been read.
        with np.load(str(expand_path(self.ngrams_matrix_filename)), allow_pickle=True) as loader:
            self.count_matrix = Sparse((loader["data"], loader["indices"], loader["indptr"]), shape=loader["shape"])

    def make_ngrams_dicts(self):
        """Enumerate all bigrams and trigrams over ``self.letters``; trigram ids follow the bigram ids."""
        bigram_combs = ["".join(comb) for comb in itertools.product(self.letters, repeat=2)]
        trigram_combs = ["".join(comb) for comb in itertools.product(self.letters, repeat=3)]
        self.bigrams_dict = {bigram: cnt for cnt, bigram in enumerate(bigram_combs)}
        self.trigrams_dict = {trigram: cnt + len(bigram_combs) for cnt, trigram in enumerate(trigram_combs)}

    def __call__(self, query, tags):
        """Find vocabulary words similar to ``query``.

        Args:
            query: the string to look up
            tags: iterable of tags; a word is returned only if its tags intersect with it

        Returns:
            list of words ordered by decreasing n-gram overlap, restricted to words which start with the
            first character of ``query``, differ from it in length by less than 3 and share a tag with
            ``tags``; an empty list for an empty query or an empty vocabulary
        """
        if not query:
            # Nothing to match; this also protects the ``query[0]`` lookup in the final filter.
            return []

        ngrams_list = []
        for size, ngrams_dict in ((2, self.bigrams_dict), (3, self.trigrams_dict)):
            for i in range(len(query) - size + 1):
                ngram_id = ngrams_dict.get(query[i:i + size].lower())
                if ngram_id is not None:
                    ngrams_list.append(ngram_id)
        # Each distinct n-gram contributes once, regardless of how often it occurs in the query.
        ngram_ids = [elem[0] for elem in Counter(ngrams_list).most_common()]
        ngram_cnts = [1 for _ in ngram_ids]

        indptr = np.array([0, len(ngram_cnts)])
        query_matrix = Sparse(
            (ngram_cnts, ngram_ids, indptr), shape=(1, len(self.bigrams_dict) + len(self.trigrams_dict))
        )

        # ravel() keeps the result one-dimensional even when the vocabulary holds a single word
        # (np.squeeze would collapse it to a 0-d array that has no len()).
        scores = (query_matrix * self.count_matrix).toarray().ravel()
        if scores.size == 0:
            return []

        # argpartition needs kth < len(scores); when thresh covers everything take all the words.
        kth = min(self.thresh, len(scores) - 1)
        o = np.argpartition(-scores, kth)[0:self.thresh]
        o_sort = o[np.argsort(-scores[o])].tolist()

        found_words = [self.words_list[n] for n in o_sort]
        return [
            word
            for word in found_words
            if (
                word.startswith(query[0])
                and abs(len(word) - len(query)) < 3
                and self.words_dict[word].intersection(tags)
            )
        ]


================================================
FILE: deeppavlov/models/entity_extraction/ner_chunker.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from logging import getLogger
from string import punctuation
from typing import List, Tuple, Union, Any

from nltk import sent_tokenize
from transformers import AutoTokenizer

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.common.chainer import Chainer
from deeppavlov.models.entity_extraction.entity_detection_parser import EntityDetectionParser

# Module-level logger named after this module.
log = getLogger(__name__)


@register('ner_chunker')
class NerChunker(Component):
    """
        Class to split documents into chunks whose length, counted in subtokens of the transformer tokenizer,
        stays below max_seq_len, so that a chunk does not exceed the maximal sequence length to feed into BERT
    """

    def __init__(self, vocab_file: str, max_seq_len: int = 400, lowercase: bool = False, batch_size: int = 2, **kwargs):
        """
        Args:
            vocab_file: vocab file of pretrained transformer model (passed to AutoTokenizer.from_pretrained)
            max_seq_len: exclusive upper bound on the number of subtokens accumulated in one chunk
            lowercase: whether to lowercase text
            batch_size: how many chunks are in batch
        """
        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        # A token is a run of word characters/apostrophes or a single non-space, non-word character.
        self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.tokenizer = AutoTokenizer.from_pretrained(vocab_file,
                                                       do_lower_case=True)
        # Characters that sanitize() may strip from the end of a document piece.
        self.punct_ext = punctuation + " " + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        self.russian_letters = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
        self.lowercase = lowercase

    def _count_subtokens(self, text: str) -> int:
        """Return the number of tokenizer subtokens in ``text``, encoding each regex token separately."""
        return sum(len(self.tokenizer.encode_plus(token, add_special_tokens=False)["input_ids"])
                   for token in self.re_tokenizer.findall(text))

    def __call__(self, docs_batch: List[str]) -> Tuple[List[List[str]], List[List[int]],
                                                       List[List[List[Tuple[int, int]]]], List[List[List[str]]]]:
        """
        This method splits each document in the batch into chunks with the maximal length of max_seq_len
        and groups the chunks into batches of batch_size elements

        Args:
            docs_batch: batch of documents
        Returns:
            batches of document chunks
            batches of numbers of documents which correspond to chunks
            batches of lists of (start, end) character offsets of sentences inside each chunk
            batches of lists of sentences which make up each chunk
        """
        text_batch_list, nums_batch_list, sentences_offsets_batch_list, sentences_batch_list = [], [], [], []
        text_batch, nums_batch, sentences_offsets_batch, sentences_batch = [], [], [], []
        for n, doc in enumerate(docs_batch):
            if self.lowercase:
                doc = doc.lower()
            start = 0
            text = ""
            sentences_list = []
            sentences_offsets_list = []
            cur_len = 0
            doc_pieces = doc.split("\n")
            doc_pieces = [self.sanitize(doc_piece) for doc_piece in doc_pieces]
            doc_pieces = [doc_piece for doc_piece in doc_pieces if len(doc_piece) > 1]
            if doc_pieces:
                sentences = []
                for doc_piece in doc_pieces:
                    sentences += sent_tokenize(doc_piece)
                for sentence in sentences:
                    sentence_len = self._count_subtokens(sentence)
                    if cur_len + sentence_len < self.max_seq_len:
                        # The sentence still fits into the current chunk.
                        text += f"{sentence} "
                        cur_len += sentence_len
                        end = start + len(sentence)
                        sentences_offsets_list.append((start, end))
                        sentences_list.append(sentence)
                        start = end + 1
                    else:
                        # The current chunk is full: emit it before handling the sentence.
                        text = text.strip()
                        if text:
                            text_batch.append(text)
                            sentences_offsets_batch.append(sentences_offsets_list)
                            sentences_batch.append(sentences_list)
                            nums_batch.append(n)

                        if sentence_len < self.max_seq_len:
                            # The sentence opens a new chunk.
                            text = f"{sentence} "
                            cur_len = sentence_len
                            start = 0
                            end = start + len(sentence)
                            sentences_offsets_list = [(start, end)]
                            sentences_list = [sentence]
                            start = end + 1
                        else:
                            # The sentence alone is too long, so it is split on spaces. Start from a clean
                            # state: the lists emitted above must not be extended in place, and the length
                            # and offsets of the finished chunk must not leak into the new one.
                            text = ""
                            cur_len = 0
                            start = 0
                            sentences_offsets_list = []
                            sentences_list = []
                            for chunk in sentence.split(" "):
                                chunk_len = self._count_subtokens(chunk)
                                if cur_len + chunk_len < self.max_seq_len:
                                    text += f"{chunk} "
                                    cur_len += chunk_len + 1
                                    end = start + len(chunk)
                                    sentences_offsets_list.append((start, end))
                                    sentences_list.append(chunk)
                                    start = end + 1
                                else:
                                    text = text.strip()
                                    if text:
                                        text_batch.append(text)
                                        sentences_offsets_batch.append(sentences_offsets_list)
                                        sentences_batch.append(sentences_list)
                                        nums_batch.append(n)

                                    text = f"{chunk} "
                                    cur_len = chunk_len
                                    start = 0
                                    end = start + len(chunk)
                                    sentences_offsets_list = [(start, end)]
                                    sentences_list = [chunk]
                                    start = end + 1

                # Emit what is left of the document.
                text = text.strip().strip(",")
                if text:
                    text_batch.append(text)
                    nums_batch.append(n)
                    sentences_offsets_batch.append(sentences_offsets_list)
                    sentences_batch.append(sentences_list)
            else:
                # Nothing usable is left after sanitizing: emit a one-letter placeholder chunk so that
                # the document still has an entry in the output.
                text_batch.append("а")
                nums_batch.append(n)
                sentences_offsets_batch.append([(0, len(doc))])
                sentences_batch.append([doc])

        # Group the flat lists of chunks into batches of batch_size elements (the last one may be shorter).
        num_batches = len(text_batch) // self.batch_size + int(len(text_batch) % self.batch_size > 0)
        for jj in range(num_batches):
            text_batch_list.append(text_batch[jj * self.batch_size:(jj + 1) * self.batch_size])
            nums_batch_list.append(nums_batch[jj * self.batch_size:(jj + 1) * self.batch_size])
            sentences_offsets_batch_list.append(
                sentences_offsets_batch[jj * self.batch_size:(jj + 1) * self.batch_size])
            sentences_batch_list.append(sentences_batch[jj * self.batch_size:(jj + 1) * self.batch_size])

        return text_batch_list, nums_batch_list, sentences_offsets_batch_list, sentences_batch_list

    def sanitize(self, text):
        """
        Strips the tail of the text which follows the last finished sentence and collapses whitespace

        If the text does not end with ".", "!" or "?", characters from ``self.punct_ext`` (punctuation, space,
        latin letters, digits) are dropped from the end until a character outside of this set is met, or a
        sentence-ending mark preceded by a russian letter (optionally with a '"' in between) is found.
        Note that a text which consists only of such characters is cut down to its first character.

        Args:
            text: piece of the document
        Returns:
            sanitized text
        """
        text_len = len(text)

        if text_len > 0 and text[text_len - 1] not in {'.', '!', '?'}:
            i = text_len - 1
            while text[i] in self.punct_ext and i > 0:
                i -= 1
                if (text[i] in {'.', '!', '?'} and text[i - 1].lower() in self.russian_letters) or \
                        (i > 1 and text[i] in {'.', '!', '?'} and text[i - 1] in '"' and text[
                            i - 2].lower() in self.russian_letters):
                    break

            text = text[:i + 1]
        text = re.sub(r'\s+', ' ', text)
        return text


@register('ner_chunk_model')
class NerChunkModel(Component):
    """
        Class for detection of entity substrings in batches of document chunks (optionally with an ensemble
        of two NER models) and for merging of the results of the chunks back into per-document lists
    """

    def __init__(self, ner: Chainer,
                 ner_parser: EntityDetectionParser,
                 ner2: Chainer = None,
                 ner_parser2: EntityDetectionParser = None,
                 **kwargs) -> None:
        """
        Args:
            ner: config for entity detection
            ner_parser: component deeppavlov.models.entity_extraction.entity_detection_parser
            ner2: config of additional entity detection model (ensemble of ner and ner2 models gives better
                entity detection quality than single ner model)
            ner_parser2: component deeppavlov.models.entity_extraction.entity_detection_parser
            **kwargs:
        """
        self.ner = ner
        self.ner_parser = ner_parser
        self.ner2 = ner2
        self.ner_parser2 = ner_parser2

    def __call__(self, text_batch_list: List[List[str]],
                 nums_batch_list: List[List[int]],
                 sentences_offsets_batch_list: List[List[List[Tuple[int, int]]]],
                 sentences_batch_list: List[List[List[str]]]
                 ):
        """
        Args:
            text_batch_list: list of document chunks
            nums_batch_list: nums of documents
            sentences_offsets_batch_list: indices of start and end symbols of sentences in text
            sentences_batch_list: list of sentences from texts
        Returns:
            doc_entity_substr_batch: entity substrings (lowercased)
            doc_entity_offsets_batch: indices of start and end symbols of entities in text
            doc_entity_positions_batch: indices of the tokens which make up each entity
            doc_tags_batch: entity tags
            doc_sentences_offsets_batch: indices of start and end symbols of sentences in text
            doc_sentences_batch: list of sentences from texts
            doc_probas_batch: entity probabilities returned by the parser
        """
        entity_substr_batch_list, entity_offsets_batch_list, entity_positions_batch_list, tags_batch_list, \
        entity_probas_batch_list, text_len_batch_list, text_tokens_len_batch_list = [], [], [], [], [], [], []
        for text_batch, sentences_offsets_batch, sentences_batch in \
                zip(text_batch_list, sentences_offsets_batch_list, sentences_batch_list):
            # Soft hyphens are replaced with spaces before the text is passed to the models.
            text_batch = [text.replace("\xad", " ") for text in text_batch]

            ner_tokens_batch, ner_tokens_offsets_batch, ner_probas_batch, probas_batch = self.ner(text_batch)
            entity_substr_batch, entity_positions_batch, entity_probas_batch = \
                self.ner_parser(ner_tokens_batch, ner_probas_batch, probas_batch)
            if self.ner2:
                ner_tokens_batch2, ner_tokens_offsets_batch2, ner_probas_batch2, probas_batch2 = self.ner2(text_batch)
                entity_substr_batch2, entity_positions_batch2, entity_probas_batch2 = \
                    self.ner_parser2(ner_tokens_batch2, ner_probas_batch2, probas_batch2)
                entity_substr_batch, entity_positions_batch, entity_probas_batch = \
                    self.merge_annotations(entity_substr_batch, entity_positions_batch, entity_probas_batch,
                                           entity_substr_batch2, entity_positions_batch2, entity_probas_batch2)

            # Flatten the per-tag dicts of every chunk into one list of (substr, positions, tag, proba).
            entity_pos_tags_probas_batch = [[(entity_substr.lower(), entity_substr_positions, tag, entity_proba)
                                             for tag, entity_substr_list in entity_substr_dict.items()
                                             for entity_substr, entity_substr_positions, entity_proba in
                                             zip(entity_substr_list, entity_positions_dict[tag],
                                                 entity_probas_dict[tag])]
                                            for entity_substr_dict, entity_positions_dict, entity_probas_dict in
                                            zip(entity_substr_batch, entity_positions_batch, entity_probas_batch)]

            entity_substr_batch, entity_offsets_batch, entity_positions_batch, tags_batch, \
            probas_batch = [], [], [], [], []
            for entity_pos_tags_probas, ner_tokens_offsets_list in \
                    zip(entity_pos_tags_probas_batch, ner_tokens_offsets_batch):
                if entity_pos_tags_probas:
                    entity_offsets_list = []
                    entity_substr_list, entity_positions_list, tags_list, probas_list = zip(*entity_pos_tags_probas)
                    # Character span of an entity: from the start of its first token to the end of its last.
                    for entity_positions in entity_positions_list:
                        start_offset = ner_tokens_offsets_list[entity_positions[0]][0]
                        end_offset = ner_tokens_offsets_list[entity_positions[-1]][1]
                        entity_offsets_list.append((start_offset, end_offset))
                else:
                    entity_substr_list, entity_offsets_list, entity_positions_list = [], [], []
                    tags_list, probas_list = [], []
                entity_substr_batch.append(list(entity_substr_list))
                entity_offsets_batch.append(list(entity_offsets_list))
                entity_positions_batch.append(list(entity_positions_list))
                tags_batch.append(list(tags_list))
                probas_batch.append(list(probas_list))

            entity_substr_batch_list.append(entity_substr_batch)
            tags_batch_list.append(tags_batch)
            entity_offsets_batch_list.append(entity_offsets_batch)
            entity_positions_batch_list.append(entity_positions_batch)
            entity_probas_batch_list.append(probas_batch)
            text_len_batch_list.append([len(text) for text in text_batch])
            text_tokens_len_batch_list.append([len(ner_tokens) for ner_tokens in ner_tokens_batch])

        # Merge the results of the chunks into per-document lists. Consecutive chunks with the same document
        # number are concatenated; offsets and token positions are shifted by the accumulated lengths.
        doc_entity_substr_batch, doc_tags_batch, doc_entity_offsets_batch, doc_probas_batch = [], [], [], []
        doc_entity_positions_batch, doc_sentences_offsets_batch, doc_sentences_batch = [], [], []
        doc_entity_substr, doc_tags, doc_probas, doc_entity_offsets, doc_entity_positions = [], [], [], [], []
        doc_sentences_offsets, doc_sentences = [], []
        cur_doc_num = 0
        text_len_sum = 0
        text_tokens_len_sum = 0
        for entity_substr_batch, tags_batch, probas_batch, entity_offsets_batch, entity_positions_batch, \
            sentences_offsets_batch, sentences_batch, text_len_batch, text_tokens_len_batch, nums_batch in \
                zip(entity_substr_batch_list, tags_batch_list, entity_probas_batch_list, entity_offsets_batch_list,
                    entity_positions_batch_list, sentences_offsets_batch_list, sentences_batch_list,
                    text_len_batch_list, text_tokens_len_batch_list, nums_batch_list):
            for entity_substr_list, tag_list, probas_list, entity_offsets_list, entity_positions_list, \
                sentences_offsets_list, sentences_list, text_len, text_tokens_len, doc_num in \
                    zip(entity_substr_batch, tags_batch, probas_batch, entity_offsets_batch, entity_positions_batch,
                        sentences_offsets_batch, sentences_batch, text_len_batch, text_tokens_len_batch, nums_batch):
                if doc_num == cur_doc_num:
                    doc_entity_substr += entity_substr_list
                    doc_tags += tag_list
                    doc_probas += probas_list
                    doc_entity_offsets += [(start_offset + text_len_sum, end_offset + text_len_sum)
                                           for start_offset, end_offset in entity_offsets_list]
                    doc_sentences_offsets += [(start_offset + text_len_sum, end_offset + text_len_sum)
                                              for start_offset, end_offset in sentences_offsets_list]
                    doc_entity_positions += [[pos + text_tokens_len_sum for pos in positions]
                                             for positions in entity_positions_list]
                    doc_sentences += sentences_list
                    # One extra symbol per chunk (presumably the separator between joined chunks).
                    text_len_sum += text_len + 1
                    text_tokens_len_sum += text_tokens_len
                else:
                    # A chunk of the next document: store the finished document and start a new one.
                    doc_entity_substr_batch.append(doc_entity_substr)
                    doc_tags_batch.append(doc_tags)
                    doc_probas_batch.append(doc_probas)
                    doc_entity_offsets_batch.append(doc_entity_offsets)
                    doc_entity_positions_batch.append(doc_entity_positions)
                    doc_sentences_offsets_batch.append(doc_sentences_offsets)
                    doc_sentences_batch.append(doc_sentences)
                    # Every accumulator is rebound to a fresh copy. The accumulators are extended in place
                    # afterwards, so reusing a list (in particular doc_entity_positions, which has just been
                    # stored for the previous document) or aliasing the input lists would corrupt them.
                    doc_entity_substr = list(entity_substr_list)
                    doc_tags = list(tag_list)
                    doc_probas = list(probas_list)
                    doc_entity_offsets = list(entity_offsets_list)
                    doc_entity_positions = [list(positions) for positions in entity_positions_list]
                    doc_sentences_offsets = list(sentences_offsets_list)
                    doc_sentences = list(sentences_list)
                    cur_doc_num = doc_num
                    text_len_sum = text_len + 1
                    text_tokens_len_sum = text_tokens_len

        # Store the last document.
        doc_entity_substr_batch.append(doc_entity_substr)
        doc_tags_batch.append(doc_tags)
        doc_probas_batch.append(doc_probas)
        doc_entity_offsets_batch.append(doc_entity_offsets)
        doc_entity_positions_batch.append(doc_entity_positions)
        doc_sentences_offsets_batch.append(doc_sentences_offsets)
        doc_sentences_batch.append(doc_sentences)

        return doc_entity_substr_batch, doc_entity_offsets_batch, doc_entity_positions_batch, doc_tags_batch, \
               doc_sentences_offsets_batch, doc_sentences_batch, doc_probas_batch

    def merge_annotations(self, substr_batch, pos_batch, probas_batch, substr_batch2, pos_batch2, probas_batch2):
        """
        Merges annotations of the second NER model into annotations of the first one (modified in place)

        Args:
            substr_batch: for each text, dict {tag: list of entity substrings} of the first model
            pos_batch: for each text, dict {tag: list of entity token positions} of the first model
            probas_batch: for each text, dict {tag: list of entity probabilities} of the first model
            substr_batch2: entity substrings of the second model, same structure
            pos_batch2: entity token positions of the second model, same structure
            probas_batch2: entity probabilities of the second model, same structure
        Returns:
            substr_batch, pos_batch, probas_batch with the merged annotations
        """
        log.debug(f"ner_chunker, substr2: {substr_batch2} --- pos2: {pos_batch2} --- probas2: {probas_batch2} --- "
                  f"substr: {substr_batch} --- pos: {pos_batch} --- probas: {probas_batch}")
        # First pass: add entities of the second model whose first and last tokens do not fall inside
        # any entity which is already present.
        for i in range(len(substr_batch)):
            for key2 in substr_batch2[i]:
                substr_list2 = substr_batch2[i][key2]
                pos_list2 = pos_batch2[i][key2]
                probas_list2 = probas_batch2[i][key2]
                for substr2, pos2, probas2 in zip(substr_list2, pos_list2, probas_list2):
                    found = False
                    for key in substr_batch[i]:
                        pos_list = pos_batch[i][key]
                        for pos in pos_list:
                            if pos[0] <= pos2[0] <= pos[-1] or pos[0] <= pos2[-1] <= pos[-1]:
                                found = True
                    if not found:
                        if key2 not in substr_batch[i]:
                            substr_batch[i][key2] = []
                            pos_batch[i][key2] = []
                            probas_batch[i][key2] = []
                        substr_batch[i][key2].append(substr2)
                        pos_batch[i][key2].append(pos2)
                        probas_batch[i][key2].append(probas2)
        # Second pass: replace fragments with the longer entity of the second model which covers them.
        for i in range(len(substr_batch)):
            for key2 in substr_batch2[i]:
                substr_list2 = substr_batch2[i][key2]
                pos_list2 = pos_batch2[i][key2]
                probas_list2 = probas_batch2[i][key2]
                for substr2, pos2, probas2 in zip(substr_list2, pos_list2, probas_list2):
                    for key in substr_batch[i]:
                        # Indices of the entities with tag `key` to be replaced by (substr2, pos2).
                        inds = []
                        substr_list = substr_batch[i][key]
                        pos_list = pos_batch[i][key]
                        probas_list = probas_batch[i][key]
                        for n, (substr, pos, probas) in enumerate(zip(substr_list, pos_list, probas_list)):
                            # The entity is a proper prefix or a proper suffix of the second-model entity.
                            if (pos[0] == pos2[0] and pos[-1] < pos2[-1]) or (pos[0] > pos2[0] and pos[-1] == pos2[-1]):
                                inds.append(n)
                            elif key == "EVENT" and ((pos[0] >= pos2[0] and pos[-1] <= pos2[-1])
                                                     or (len(substr.split()) == 1 and pos2[0] <= pos[0])):
                                inds.append(n)

                        # Several fragments are required for the replacement (one is enough for WORK_OF_ART
                        # and EVENT); PERSON entities are not glued when the longer substring contains " и "
                        # (presumably the Russian conjunction "and", i.e. two different persons).
                        if (len(inds) > 1 or (len(inds) == 1 and key in {"WORK_OF_ART", "EVENT"})) \
                                and not (key == "PERSON" and " и " in substr2):
                            # Delete from the end so that the remaining indices stay valid.
                            inds = sorted(inds, reverse=True)
                            for ind in inds:
                                del substr_batch[i][key][ind]
                                del pos_batch[i][key][ind]
                                del probas_batch[i][key][ind]
                            substr_batch[i][key].append(substr2)
                            pos_batch[i][key].append(pos2)
                            probas_batch[i][key].append(probas2)
        return substr_batch, pos_batch, probas_batch


================================================
FILE: deeppavlov/models/kbqa/__init__.py
================================================


================================================
FILE: deeppavlov/models/kbqa/query_generator.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import itertools
import re
from collections import defaultdict
from logging import getLogger
from typing import Tuple, List, Optional, Union, Dict, Any, Set

import nltk
import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.models.kbqa.query_generator_base import QueryGeneratorBase
from deeppavlov.models.kbqa.rel_ranking_infer import RelRankerInfer
from deeppavlov.models.kbqa.utils import extract_year, extract_number, make_combs, fill_query, find_query_features, \
    make_sparql_query, merge_sparql_query
from deeppavlov.models.kbqa.wiki_parser import WikiParser

log = getLogger(__name__)


@register('query_generator')
class QueryGenerator(QueryGeneratorBase):
    """
        Class for query generation using Wikidata hdt file

        The methods defined here fill SPARQL query templates with combinations of candidate entities,
        types and relations, pass the filled queries to ``wiki_parser`` and regroup what it returns.
    """

    def __init__(self, wiki_parser: WikiParser,
                 rel_ranker: RelRankerInfer,
                 entities_to_leave: int = 5,
                 types_to_leave: int = 2,
                 rels_to_leave: int = 7,
                 max_comb_num: int = 10000,
                 gold_query_info: Dict[str, str] = None,
                 map_query_str_to_kb: List[Tuple[str, str]] = None,
                 return_answers: bool = True, *args, **kwargs) -> None:
        """

        Args:
            wiki_parser: component deeppavlov.models.kbqa.wiki_parser
            rel_ranker: component deeppavlov.models.kbqa.rel_ranking_infer
            entities_to_leave: how many entities to leave after entity linking
            types_to_leave: how many types to leave after entity linking
            rels_to_leave: how many relations to leave after relation ranking
            max_comb_num: the maximum number of combinations of candidate entities and relations
            gold_query_info: dict of variable names used for formatting output sparql queries
            map_query_str_to_kb: mapping of knowledge base prefixes to full https
            return_answers: whether to return answers or candidate relations and answers for further ranking
            **kwargs:
        """
        self.wiki_parser = wiki_parser
        self.rel_ranker = rel_ranker
        self.entities_to_leave = entities_to_leave
        self.types_to_leave = types_to_leave
        self.rels_to_leave = rels_to_leave
        self.max_comb_num = max_comb_num
        self.gold_query_info = gold_query_info
        self.map_query_str_to_kb = map_query_str_to_kb
        self.return_answers = return_answers
        # Query templates are lower-cased before parsing; these pairs restore the upper-case "P"
        # of the property prefixes afterwards.
        self.replace_tokens = [("wdt:p", "wdt:P"), ("pq:p", "pq:P")]
        super().__init__(wiki_parser=self.wiki_parser, rel_ranker=self.rel_ranker,
                         entities_to_leave=self.entities_to_leave, rels_to_leave=self.rels_to_leave,
                         *args, **kwargs)

    def __call__(self, question_batch: List[str],
                 question_san_batch: List[str],
                 template_type_batch: Union[List[List[str]], List[str]],
                 entities_from_ner_batch: List[List[str]],
                 types_from_ner_batch: List[List[str]],
                 entity_tags_batch: List[List[str]],
                 probas_batch: List[List[float]],
                 answer_types_batch: List[Set[str]] = None,
                 entities_to_link_batch: List[List[int]] = None) -> Union[List[Any], Tuple[List[Any], List[Any]]]:
        """Find candidate answers for every question of the batch.

        Args:
            question_batch: batch of questions
            question_san_batch: batch of sanitized questions
            template_type_batch: query template type of every question; the value "-1" is replaced by "7"
            entities_from_ner_batch: entity substrings of every question
            types_from_ner_batch: type substrings of every question
            entity_tags_batch: tags of the entity substrings
            probas_batch: probabilities of the entity substrings
            answer_types_batch: answer types of every question; an empty list per question is used when omitted
            entities_to_link_batch: per-substring flags; a flag of 1 per substring is used when omitted

        Returns:
            the output of ``rel_ranker`` (or "Not Found" per question when it is empty) if ``return_answers``
            is set, otherwise a tuple of candidate outputs batch and template answers batch
        """
        candidate_outputs_batch, template_answers_batch = [], []
        if not answer_types_batch or answer_types_batch[0] is None:
            answer_types_batch = [[] for _ in question_batch]
        if not entities_to_link_batch or entities_to_link_batch[0] is None:
            entities_to_link_batch = [[1 for _ in substr_list] for substr_list in entities_from_ner_batch]
        log.debug(f"kbqa inputs {question_batch} {question_san_batch} template_type_batch: {template_type_batch} --- "
                  f"entities_from_ner: {entities_from_ner_batch} --- types_from_ner: {types_from_ner_batch} --- "
                  f"entity_tags_batch: {entity_tags_batch} --- answer_types_batch: "
                  f"{[list(elem)[:3] for elem in answer_types_batch]}")
        for question, question_sanitized, template_type, entities_from_ner, types_from_ner, entity_tags_list, \
            probas, entities_to_link, answer_types in zip(question_batch, question_san_batch, template_type_batch,
                                                          entities_from_ner_batch, types_from_ner_batch,
                                                          entity_tags_batch, probas_batch, entities_to_link_batch,
                                                          answer_types_batch):
            if template_type == "-1":
                template_type = "7"
            candidate_outputs, template_answer = \
                self.find_candidate_answers(question, question_sanitized, template_type, entities_from_ner,
                                            types_from_ner, entity_tags_list, probas, entities_to_link, answer_types)
            candidate_outputs_batch.append(candidate_outputs)
            template_answers_batch.append(template_answer)

        if self.return_answers:
            answers = self.rel_ranker(question_batch, template_type_batch, candidate_outputs_batch,
                                      entities_from_ner_batch, template_answers_batch)
            log.debug(f"(__call__)answers: {answers}")
            if not answers:
                answers = ["Not Found" for _ in question_batch]
            return answers
        else:
            return candidate_outputs_batch, template_answers_batch

    def parse_queries_info(self, question, queries_info, entity_ids, type_ids, rels_from_template):
        """Parse query templates and build all combinations of entities, types and relations for them.

        Args:
            question: text of the question
            queries_info: iterable of dicts describing query templates (keys "query_template",
                "entities_and_types_select", "rank_rels", "rel_types", "n_hops", "query_sequence",
                "return_if_found" and optionally "unk_rels" are read)
            entity_ids: lists of candidate entity ids, only the first ``entities_to_leave`` of each are used
            type_ids: lists of candidate type ids, only the first ``types_to_leave`` of each are used
            rels_from_template: lists of relations to use with score 1.0 instead of ranked relations,
                or None to find relations with ``find_top_rels``

        Returns:
            a list with one dict of parsed information per query template and a dict of relation scores
            merged from the ``find_top_rels`` calls
        """
        parsed_queries_info = []
        question_tokens = nltk.word_tokenize(question)
        rels_scores_dict = {}
        # The truncation does not depend on the query template, so it is done once for all templates.
        entity_ids = [entity[:self.entities_to_leave] for entity in entity_ids]
        for query_info in queries_info:
            query = query_info["query_template"].lower()
            for old_tok, new_tok in self.replace_tokens:
                query = query.replace(old_tok, new_tok)
            log.debug(f"\n_______________________________\nquery: {query}\n_______________________________\n")
            entities_and_types_select = query_info["entities_and_types_select"]
            rels_for_search = query_info["rank_rels"]
            rel_types = query_info["rel_types"]
            n_hops = query_info["n_hops"]
            unk_rels = query_info.get("unk_rels", [])
            query_seq_num = query_info["query_sequence"]
            return_if_found = query_info["return_if_found"]
            log.debug(f"(query_parser)query: {query}, rels_for_search {rels_for_search}, rel_types {rel_types} "
                      f"n_hops {n_hops}, {query_seq_num}, {return_if_found}")
            # Triplets are taken from the first "{ ... }" group of the query and separated by " . ".
            query_triplets = re.findall("{[ ]?(.*?)[ ]?}", query)[0].split(' . ')
            log.debug(f"(query_parser)query_triplets: {query_triplets}")
            query_triplets_split = [triplet.split(' ')[:3] for triplet in query_triplets]
            # Variables in the predicate position whose relation type is "qualifier".
            property_types = {}
            for rel_type, query_triplet in zip(rel_types, query_triplets_split):
                if query_triplet[1].startswith("?") and rel_type == "qualifier":
                    property_types[query_triplet[1]] = rel_type
            # "query_sequence" of the template holds 1-based triplet numbers in the order of execution.
            query_sequence_dict = {num + 1: triplet for num, triplet in enumerate(query_triplets_split)}
            query_sequence = []
            for i in query_seq_num:
                query_sequence.append(query_sequence_dict[i])
            # Direction is "forw" when the object of the triplet is a variable, "backw" otherwise.
            triplet_info_list = [("forw" if triplet[2].startswith('?') else "backw", search_source, rel_type, n_hop)
                                 for search_source, triplet, rel_type, n_hop in
                                 zip(rels_for_search, query_sequence, rel_types, n_hops)
                                 if search_source != "do_not_rank"]
            log.debug(f"(query_parser)query_sequence_dict: {query_sequence_dict} --- rel_directions: "
                      f"{triplet_info_list} --- query_sequence: {query_sequence}")
            rels, entities_rel_conn = [], set()
            if rels_from_template is not None:
                rels = [[(rel, 1.0) for rel in rel_list] for rel_list in rels_from_template]
            else:
                for triplet_info in triplet_info_list:
                    ex_rels, cur_rels_scores_dict, entity_rel_conn = self.find_top_rels(question, entity_ids,
                                                                                        triplet_info)
                    rels.append(ex_rels)
                    rels_scores_dict = {**rels_scores_dict, **cur_rels_scores_dict}
                    entities_rel_conn = entities_rel_conn.union(entity_rel_conn)
            log.debug(f"(query_parser)rels: {rels}")
            rels_from_query = [triplet[1] for triplet in query_triplets_split if triplet[1].startswith('?')]
            qualifier_rels = [triplet[1] for triplet in query_triplets_split if triplet[1].startswith("pq:P")]

            answer_ent, order_info, filter_from_query = find_query_features(query, qualifier_rels, question)
            log.debug(f"(query_parser) filter_from_query: {filter_from_query} --- order_info: {order_info}")

            year = extract_year(question_tokens, question)
            number = extract_number(question_tokens, question)
            log.debug(f"year {year}, number {number}")
            # The "n" in the filter values is substituted with the year or the number found in the question;
            # when neither is found, filters whose value is exactly "n" are dropped.
            if year:
                filter_info = [(elem[0], elem[1].replace("n", year)) for elem in filter_from_query]
            elif number:
                filter_info = [(elem[0], elem[1].replace("n", number)) for elem in filter_from_query]
            else:
                filter_info = [elem for elem in filter_from_query if elem[1] != "n"]
            for unk_prop, prop_type in property_types.items():
                filter_info.append((unk_prop, prop_type))
            log.debug(f"(query_parser)filter_from_query: {filter_from_query}")
            rel_combs = make_combs(rels, permut=False)

            # "entities_and_types_select" has the form "<entity positions> <type positions>" with positions
            # joined by "_"; positions are 1-based and non-positive values are ignored.
            entity_positions, type_positions = [elem.split('_') for elem in entities_and_types_select.split(' ')]
            log.debug(f"entity_positions {entity_positions}, type_positions {type_positions}")
            selected_entity_ids, selected_type_ids = [], []
            if len(entity_ids) > 1 and len(entity_positions) == 1:
                # Several entity lists for a single slot: interleave them rank by rank into one list.
                for j in range(max([len(elem) for elem in entity_ids])):
                    for elem in entity_ids:
                        if j < len(elem):
                            selected_entity_ids.append(elem[j])
                selected_entity_ids = [selected_entity_ids]
            elif entity_ids:
                selected_entity_ids = [entity_ids[int(pos) - 1] for pos in entity_positions if int(pos) > 0]
            if type_ids:
                selected_type_ids = [type_ids[int(pos) - 1][:self.types_to_leave]
                                     for pos in type_positions if int(pos) > 0]
            entity_combs = make_combs(selected_entity_ids, permut=True)
            type_combs = make_combs(selected_type_ids, permut=False)
            log.debug(f"(query_parser)entity_combs: {entity_combs[:3]}, type_combs: {type_combs[:3]},"
                      f" rel_combs: {rel_combs[:3]}")

            all_combs_list = list(itertools.product(entity_combs, type_combs, rel_combs))
            # Order by the sum of the last elements of the three combinations, then by the last element
            # of the entity combination (presumably rank-based values produced by make_combs - confirm).
            all_combs_list = sorted(all_combs_list, key=lambda x: (sum([elem[-1] for elem in x]), x[0][-1]))
            parsed_queries_info.append({"query_triplets": query_triplets,
                                        "query_sequence": query_sequence,
                                        "rels_from_query": rels_from_query,
                                        "answer_ent": answer_ent,
                                        "filter_info": filter_info,
                                        "order_info": order_info,
                                        "rel_types": rel_types,
                                        "unk_rels": unk_rels,
                                        "return_if_found": return_if_found,
                                        "selected_entity_ids": selected_entity_ids,
                                        "selected_type_ids": selected_type_ids,
                                        "rels": rels,
                                        "entities_rel_conn": entities_rel_conn,
                                        "entity_combs": entity_combs,
                                        "type_combs": type_combs,
                                        "rel_combs": rel_combs,
                                        "all_combs_list": all_combs_list})
        return parsed_queries_info, rels_scores_dict

    def check_valid_query(self, entities_rel_conn, query_hdt_seq):
        """Check that every (entity, relation) pair of the filled query is a known connection.

        Args:
            entities_rel_conn: set of (entity, relation) pairs; when it is empty the query is always valid
            query_hdt_seq: filled query triplets

        Returns:
            False if some triplet with a variable subject or object contains an entity and a relation
            (outside of ``kb_prefixes["type_rels"]``) which are not in ``entities_rel_conn``, True otherwise
        """
        entity_rel_valid = True
        if entities_rel_conn:
            for query_hdt_elem in query_hdt_seq:
                entity, rel = "", ""
                if len(query_hdt_elem) == 3 and any([query_hdt_elem[i].startswith("?") for i in [0, 2]]):
                    # Triplets whose predicate has the "statement" prefix are not checked.
                    if "statement" in self.kb_prefixes and query_hdt_elem[1].startswith(self.kb_prefixes["statement"]):
                        continue
                    else:
                        # Ids are compared without their url prefix (the part after the last "/").
                        if not query_hdt_elem[0].startswith("?"):
                            entity = query_hdt_elem[0].split("/")[-1]
                        elif not query_hdt_elem[2].startswith("?"):
                            entity = query_hdt_elem[2].split("/")[-1]
                        if not query_hdt_elem[1].startswith("?"):
                            rel = query_hdt_elem[1].split("/")[-1]
                        if entity and rel and rel not in self.kb_prefixes["type_rels"] \
                                and (entity, rel) not in entities_rel_conn:
                            entity_rel_valid = False
        return entity_rel_valid

    def query_parser(self, question: str,
                     queries_info: List[Dict[str, Any]],
                     entity_ids: List[List[str]],
                     type_ids: List[List[str]],
                     answer_types: Set[str],
                     rels_from_template: Optional[List[Tuple[str]]] = None) -> Union[List[Dict[str, Any]], list]:
        """Fill the query templates with candidate combinations, execute the queries and parse the outputs.

        Combinations are taken in rounds: combination number ``comb_num`` of every template is processed
        before combination ``comb_num + 1`` of any template, at most ``max_comb_num`` per template.

        Args:
            question: text of the question
            queries_info: descriptions of the query templates, see ``parse_queries_info``
            entity_ids: lists of candidate entity ids
            type_ids: lists of candidate type ids
            answer_types: answer types, passed to ``wiki_parser`` with every query
            rels_from_template: relations to use instead of ranked relations

        Returns:
            the result of ``parse_outputs`` for the executed queries
        """
        parsed_queries_info, rels_scores_dict = self.parse_queries_info(question, queries_info, entity_ids, type_ids,
                                                                        rels_from_template)
        # queries_list, parser_info_list, new_combs_list, query_info_list and entity_conf_list are parallel:
        # exactly one element is appended to each of them per generated query.
        queries_list, parser_info_list, entity_conf_list = [], [], []
        new_combs_list, query_info_list = [], []
        max_comb_nums = max((len(parsed_query_info["all_combs_list"]) for parsed_query_info in parsed_queries_info),
                            default=0)
        for comb_num in range(max_comb_nums):
            for parsed_query_info in parsed_queries_info:
                if comb_num >= min(len(parsed_query_info["all_combs_list"]), self.max_comb_num):
                    continue
                query_triplets = parsed_query_info["query_triplets"]
                query_sequence = parsed_query_info["query_sequence"]
                rels_from_query = parsed_query_info["rels_from_query"]
                answer_ent = parsed_query_info["answer_ent"]
                filter_info = parsed_query_info["filter_info"]
                order_info = parsed_query_info["order_info"]
                rel_types = parsed_query_info["rel_types"]
                unk_rels = parsed_query_info["unk_rels"]
                return_if_found = parsed_query_info["return_if_found"]
                entities_rel_conn = parsed_query_info["entities_rel_conn"]
                combs = parsed_query_info["all_combs_list"][comb_num]
                # Full confidence only when the last element of the entity combination is 0.
                entity_conf = 1.0 if combs[0][-1] == 0 else 0.9
                query_hdt_seq = [fill_query(query_hdt_elem, combs[0], combs[1], combs[2],
                                            self.map_query_str_to_kb)
                                 for query_hdt_elem in query_sequence]
                if comb_num == 0:
                    log.debug(f"\n______________________\nfilled query: {query_hdt_seq}\n______________________\n")

                entity_rel_valid = self.check_valid_query(entities_rel_conn, query_hdt_seq)
                if entity_rel_valid:
                    new_combs_list.append(combs)
                    queries_list.append((answer_ent, rels_from_query, query_hdt_seq, filter_info, order_info,
                                         answer_types, rel_types, return_if_found))
                    query_info_list.append((query_triplets, query_hdt_seq, answer_ent, filter_info, order_info))
                    parser_info_list.append("query_execute")
                    entity_conf_list.append(entity_conf)
                if comb_num < 3 and unk_rels:
                    # For the first three combinations also try the query with the relations listed in
                    # "unk_rels" replaced by variables. Everything modified below is a copy: the relation
                    # combination is shared with the element appended above and with other elements of
                    # all_combs_list, so it must not be changed in place.
                    unk_query_sequence = copy.deepcopy(query_sequence)
                    unk_rels_from_query = copy.deepcopy(rels_from_query)
                    unk_combs = copy.deepcopy(combs)
                    for unk_rel, rel_var in zip(unk_rels, ["?p", "?p2"]):
                        unk_query_sequence[int(unk_rel) - 1][1] = rel_var
                        unk_combs[-1][int(unk_rel) - 1] = (rel_var, 1.0)
                        if rel_var not in rels_from_query:
                            unk_rels_from_query.append(rel_var)
                    query_hdt_seq = [
                        fill_query(query_hdt_elem, unk_combs[0], unk_combs[1], unk_combs[2],
                                   self.map_query_str_to_kb)
                        for query_hdt_elem in unk_query_sequence]
                    new_combs_list.append(unk_combs)
                    queries_list.append((answer_ent, unk_rels_from_query, query_hdt_seq, filter_info, order_info,
                                         answer_types, rel_types, return_if_found))
                    query_info_list.append((query_triplets, query_hdt_seq, answer_ent, filter_info, order_info))
                    parser_info_list.append("query_execute")
                    entity_conf_list.append(entity_conf)

        outputs_list = self.wiki_parser(parser_info_list, queries_list)
        outputs = self.parse_outputs(outputs_list, new_combs_list, query_info_list, entity_conf_list, rels_scores_dict)
        return outputs

    def parse_outputs(self, outputs_list, combs_list, query_info_list, entity_conf_list, rels_scores_dict):
        """Group the answers returned by ``wiki_parser`` by the entities and relations which produced them.

        Args:
            outputs_list: ``wiki_parser`` outputs, one (answers list, found relations list, found
                combinations list) triple per query
            combs_list: (entity combination, type combination, relation combination) per query
            query_info_list: (query triplets, filled triplets, answer entity, filter info, order info) per query
            entity_conf_list: entity confidence per query
            rels_scores_dict: scores of relations, used for relations found in place of "?p" and "?p2"

        Returns:
            list of dicts with keys "entities", "types", "relations", "answers", "output_conf",
            "sparql_query" and "triplets"; an empty list if ``outputs_list`` is empty or not a list
        """
        outputs = []
        if isinstance(outputs_list, list) and outputs_list:
            outputs_len = len(outputs_list)
            combs_list = combs_list[:outputs_len]
            entity_conf_list = entity_conf_list[:outputs_len]
            for combs, query_info, entity_conf, (answers_list, found_rels_list, found_combs_list) in \
                    zip(combs_list, query_info_list, entity_conf_list, outputs_list):
                for answers, found_rels, found_comb in zip(answers_list, found_rels_list, found_combs_list):
                    found_rels = [found_rel.split("/")[-1] for found_rel in found_rels]
                    new_combs = list(copy.deepcopy(combs))
                    found_unk_rel = False
                    for j, rel_var in enumerate(["?p", "?p2"]):
                        # The length check protects against relation combinations shorter than two elements.
                        if j < len(new_combs[2]) and isinstance(new_combs[2][j], tuple) \
                                and new_combs[2][j][0] == rel_var:
                            # Replace the variable with the relation found by the query, or zero the score
                            # of the variable if nothing was found.
                            if found_rels:
                                new_combs[2][j] = (found_rels[j], rels_scores_dict.get(found_rels[j], 1.0))
                            else:
                                new_combs[2][j] = (new_combs[2][j][0], 0.0)
                            found_unk_rel = True
                    if found_rels and not found_unk_rel:
                        # Insert the first found relation with score 1.0 before the last element.
                        new_combs[2] = new_combs[2][:-1] + [(found_rels[0], 1.0), new_combs[2][-1]]
                    # The last element of the relation combination is not a (relation, score) pair.
                    confidence = np.prod([score for rel, score in new_combs[2][:-1]])
                    if answers:
                        outputs.append([new_combs[0], new_combs[1]] + [rel for rel, score in new_combs[2][:-1]] +
                                       answers + [(confidence, entity_conf), found_comb, query_info, new_combs[2]])
            outputs_dict = defaultdict(list)
            types_dict = defaultdict(list)
            for output in outputs:
                # The last five elements are taken as answer, confidences, found combination, query info and
                # relation combination (this assumes a single answer per output - TODO confirm); the
                # relations are everything between the type combination and these five elements.
                key = (tuple(output[0]), tuple([rel.split("/")[-1] for rel in output[2:-5]]))
                if key not in outputs_dict or output[-5:] not in outputs_dict[key]:
                    outputs_dict[key].append(output[-5:])
                    types_dict[key].append(tuple(output[1]))
            outputs = []
            for (entity_comb, rel_comb), output in outputs_dict.items():
                type_comb = types_dict[(entity_comb, rel_comb)]
                output_conf = [elem[1] for elem in output]
                output_conf = sorted(output_conf, key=lambda x: x[0] * x[1], reverse=True)
                found_combs = [elem[2] for elem in output]
                queries = [elem[3] for elem in output]
                rel_combs = [elem[4] for elem in output]
                cur_rel_comb = rel_combs[0]
                cur_rel_comb = [rel for rel, score in cur_rel_comb[:-1]]
                sparql_query = make_sparql_query(queries[0], entity_comb, rel_combs[0], type_comb[0],
                                                 self.gold_query_info)
                parser_info_list = ["fill_triplets"]
                parser_query_list = [(queries[0][1], queries[0][2], found_combs[0])]
                filled_triplets = self.wiki_parser(parser_info_list, parser_query_list)
                outputs.append({"entities": entity_comb, "types": type_comb, "relations": list(cur_rel_comb),
                                "answers": tuple([ans for ans, *_ in output]), "output_conf": output_conf[0],
                                "sparql_query": sparql_query, "triplets": filled_triplets[0]})
        return outputs


@register('query_formatter')
class QueryFormatter(Component):
    """
        Component which splits sparql queries into triplets, answer entity, filter and order information
        and merges these parts back into a query with ``merge_sparql_query``.
    """

    def __init__(self, query_info: Dict[str, str], replace_prefixes: Dict[str, str] = None, **kwargs):
        """

        Args:
            query_info: passed as the second argument to ``merge_sparql_query`` for every query
            replace_prefixes: substrings to replace in every triplet (old substring -> new substring);
                nothing is replaced when omitted or empty
            **kwargs:
        """
        self.replace_prefixes = replace_prefixes
        self.query_info = query_info

    def __call__(self, queries_batch):
        """Format every query of the batch.

        Args:
            queries_batch: batch of sparql query strings

        Returns:
            list with the result of ``merge_sparql_query`` for every input query
        """
        formatted_batch = []
        for raw_query in queries_batch:
            # Only the first "{ ... }" group of the query is used; a query without one gives no triplets.
            bodies = re.findall("{[ ]?(.*?)[ ]?}", raw_query)
            raw_triplets = bodies[0].split('. ') if bodies else []
            triplets = [self._clean_triplet(raw_triplet) for raw_triplet in raw_triplets]
            answer_ent, order_info, filter_from_query = find_query_features(raw_query, order_from_query=True)
            merged_query = merge_sparql_query((triplets, answer_ent, filter_from_query, order_info),
                                              self.query_info)
            formatted_batch.append(merged_query)
        return formatted_batch

    def _clean_triplet(self, triplet):
        """Strip angle brackets from every element of the triplet and apply ``replace_prefixes``."""
        cleaned = " ".join(element.strip("<>") for element in triplet.strip().split())
        if self.replace_prefixes:
            for old_prefix, new_prefix in self.replace_prefixes.items():
                cleaned = cleaned.replace(old_prefix, new_prefix)
        return cleaned


================================================
FILE: deeppavlov/models/kbqa/query_generator_base.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import json
from logging import getLogger
from typing import Tuple, List, Dict, Optional, Union, Any, Set

from bs4 import BeautifulSoup
from whapi import search, get_html

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.file import read_json
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.serializable import Serializable
from deeppavlov.models.entity_extraction.entity_linking import EntityLinker
from deeppavlov.models.kbqa.rel_ranking_infer import RelRankerInfer
from deeppavlov.models.kbqa.template_matcher import TemplateMatcher
from deeppavlov.models.kbqa.utils import preprocess_template_queries

log = getLogger(__name__)


class QueryGeneratorBase(Component, Serializable):
    """
        This class takes as input entity substrings, defines the template of the query and
        fills the slots of the template with candidate entities and relations.
    """

    def __init__(self, template_matcher: TemplateMatcher,
                 rel_ranker: RelRankerInfer,
                 load_path: str,
                 sparql_queries_filename: str,
                 entity_linker: EntityLinker,
                 rels_in_ranking_queries_fname: str = None,
                 wiki_parser=None,
                 entities_to_leave: int = 5,
                 rels_to_leave: int = 7,
                 syntax_structure_known: bool = False,
                 use_wp_api_requester: bool = False,
                 use_el_api_requester: bool = False,
                 use_alt_templates: bool = True,
                 delete_rel_prefix: bool = True,
                 kb_prefixes: Dict[str, str] = None, *args, **kwargs) -> None:
        """Remember the collaborating components and settings, then load the query templates.

        Args:
            template_matcher: component deeppavlov.models.kbqa.template_matcher
            rel_ranker: component deeppavlov.models.kbqa.rel_ranking_infer
            load_path: path to folder with wikidata files
            sparql_queries_filename: file with sparql query templates
            entity_linker: component deeppavlov.models.entity_extraction.entity_linking used to link entities
            rels_in_ranking_queries_fname: file with list of rels in queries for questions with ranking
            wiki_parser: component deeppavlov.models.kbqa.wiki_parser
            entities_to_leave: how many entities to leave after entity linking
            rels_to_leave: how many relations to leave after relation ranking
            syntax_structure_known: if syntax tree parser was used to define query template type
            use_wp_api_requester: whether deeppavlov.models.api_requester.api_requester component will be used for
                Wiki Parser
            use_el_api_requester: whether deeppavlov.models.api_requester.api_requester component will be used for
                Entity Linking
            use_alt_templates: whether to use alternative templates if no answer was found for default query template
            delete_rel_prefix: whether to delete prefix in relations
            kb_prefixes: prefixes for entities, relations and types in the knowledge base
        """
        super().__init__(save_path=None, load_path=load_path)

        # Components this generator delegates to.
        self.template_matcher = template_matcher
        self.rel_ranker = rel_ranker
        self.entity_linker = entity_linker
        self.wiki_parser = wiki_parser

        # Files read by load().
        self.sparql_queries_filename = sparql_queries_filename
        self.rels_in_ranking_queries_fname = rels_in_ranking_queries_fname
        self.rels_in_ranking_queries = {}

        # Limits and switches.
        self.entities_to_leave = entities_to_leave
        self.rels_to_leave = rels_to_leave
        self.syntax_structure_known = syntax_structure_known
        self.use_wp_api_requester = use_wp_api_requester
        self.use_el_api_requester = use_el_api_requester
        self.use_alt_templates = use_alt_templates
        self.delete_rel_prefix = delete_rel_prefix
        self.kb_prefixes = kb_prefixes

        self.load()

    def load(self) -> None:
        """Read the optional ranking-relations file and the SPARQL query templates."""
        ranking_rels_fname = self.rels_in_ranking_queries_fname
        if ranking_rels_fname is not None:
            self.rels_in_ranking_queries = read_json(self.load_path / ranking_rels_fname)

        templates_path = str(expand_path(self.sparql_queries_filename))
        raw_templates = read_json(templates_path)
        self.template_queries = preprocess_template_queries(raw_templates, self.kb_prefixes)

    def save(self) -> None:
        pass

    def find_candidate_answers(self, question: str,
                               question_sanitized: str,
                               template_types: Union[List[str], str],
                               entities_from_ner: List[str],
                               types_from_ner: List[str],
                               entity_tags: List[str],
                               probas: List[float],
                               entities_to_link: List[int],
                               answer_types: Set[str]) -> Tuple[Union[List[Dict[str, Any]], list], str]:
        candidate_outputs = []
        self.template_nums = [template_types]

        replace_tokens = [(' - ', '-'), (' .', ''), ('{', ''), ('}', ''), ('  ', ' '), ('"', "'"), ('(', ''),
                          (')', ''), ('–', '-')]
        for old, new in replace_tokens:
            question = question.replace(old, new)

        entities_from_template, types_from_template, rels_from_template, rel_dirs_from_template, query_type_template, \
        entity_types, template_answer, template_answer_types, template_found = self.template_matcher(
            question_sanitized, entities_from_ner)
        if query_type_template:
            self.template_nums = [query_type_template]

        log.debug(
            f"question: {question} entities_from_template {entities_from_template} template_type {self.template_nums} "
            f"types from template {types_from_template} rels_from_template {rels_from_template} entities_from_ner "
            f"{entities_from_ner} types_from_ner {types_from_ner} answer_types {list(answer_types)[:3]}")

        if entities_from_template or types_from_template:
            if rels_from_template[0][0] == "PHOW":
                how_to_content = self.find_answer_wikihow(entities_from_template[0])
                candidate_outputs = [["PHOW", how_to_content, 1.0]]
            else:
                entity_ids = self.get_entity_ids(entities_from_template, entity_tags, probas, question,
                                                 entities_to_link)
                type_ids = self.get_entity_ids(types_from_template, ["t" for _ in types_from_template],
                                               [1.0 for _ in types_from_template], question)
                log.debug(f"entities_from_template: {entities_from_template} --- entity_types: {entity_types} --- "
                          f"types_from_template: {types_from_template} --- rels_from_template: {rels_from_template} "
                          f"--- answer_types: {template_answer_types} --- entity_ids: {entity_ids}")
                candidate_outputs = self.sparql_template_parser(question_sanitized, entity_ids, type_ids,
                                                                template_answer_types, rels_from_template,
                                                                rel_dirs_from_template)
        if not candidate_outputs and (entities_from_ner or types_from_ner):
            log.debug(f"(__call__)entities_from_ner: {entities_from_ner}")
            entity_ids = self.get_entity_ids(entities_from_ner, entity_tags, probas, question)
            type_ids = self.get_entity_ids(types_from_ner, ["t" for _ in types_from_ner],
                                           [1.0 for _ in types_from_ner], question)
            log.debug(f"(__call__)entity_ids: {entity_ids} type_ids {type_ids}")
            self.template_nums = template_types
            log.debug(f"(__call__)self.template_nums: {self.template_nums}")
            if not self.syntax_structure_known:
                entity_ids = entity_ids[:3]
            candidate_outputs = self.sparql_template_parser(question_sanitized, entity_ids, type_ids, answer_types)
        return candidate_outputs, template_answer

    def get_entity_ids(self, entities: List[str], tags: List[str], probas: List[float], question: str,
                       entities_to_link: List[int] = None) -> List[List[str]]:
        entity_ids, el_output = [], []
        try:
            el_output = self.entity_linker([entities], [tags], [probas], [[question]], [None], [None],
                                           [entities_to_link])
        except json.decoder.JSONDecodeError:
            log.warning("not received output from entity linking")
        if el_output:
            if self.use_el_api_requester:
                el_output = el_output[0]
            if el_output:
                if isinstance(el_output[0], dict):
                    entity_ids = [entity_info.get("entity_ids", []) for entity_info in el_output]
                if isinstance(el_output[0], list):
                    entity_ids, *_ = el_output
            if not self.use_el_api_requester and entity_ids:
                entity_ids = entity_ids[0]

        return entity_ids

    def sparql_template_parser(self, question: str,
                               entity_ids: List[List[str]],
                               type_ids: List[List[str]],
                               answer_types: Set[str],
                               rels_from_template: Optional[List[Tuple[str]]] = None,
                               rel_dirs_from_template: Optional[List[str]] = None) -> Union[List[Dict[str, Any]], list]:
        candidate_outputs = []
        if isinstance(self.template_nums, str):
            self.template_nums = [self.template_nums]
        template_log_list = [str([elem["query_template"], elem["template_num"]])
                             for elem in self.template_queries.values() if elem["template_num"] in self.template_nums]
        log.debug(f"(find_candidate_answers)self.template_nums: {' --- '.join(template_log_list)}")
        init_templates = []
        for template_num in self.template_nums:
            for num, template in self.template_queries.items():
                if (num == template_num and self.syntax_structure_known) or \
                        (template["template_num"] == template_num and not self.syntax_structure_known):
                    init_templates.append(template)
        templates = [template for template in init_templates if
                     (not self.syntax_structure_known and [len(entity_ids), len(type_ids)] == template[
                         "entities_and_types_num"])
                     or self.syntax_structure_known]
        if not templates:
            templates = [template for template in init_templates if
                         (not self.syntax_structure_known and [len(entity_ids), 0] == template[
                             "entities_and_types_num"])
                         or self.syntax_structure_known]
        if not templates:
            return candidate_outputs
        if rels_from_template is not None:
            query_template = {}
            for template in templates:
                if template["rel_dirs"] == rel_dirs_from_template:
                    query_template = template
            if query_template:
                candidate_outputs = self.query_parser(question, [query_template], entity_ids, type_ids, answer_types,
                                                      rels_from_template)
        else:
            candidate_outputs = []
            for priority in range(1, 3):
                pr_templates = [template for template in templates if template["priority"] == priority]
                candidate_outputs = self.query_parser(question, pr_templates, entity_ids, type_ids, answer_types,
                                                      rels_from_template)
                if candidate_outputs:
                    return candidate_outputs

            if not candidate_outputs:
                alt_template_nums = templates[0].get("alternative_templates", [])
                log.debug(f"Using alternative templates {alt_template_nums}")
                alt_templates = [self.template_queries[num] for num in alt_template_nums]
                candidate_outputs = self.query_parser(question, alt_templates, entity_ids, type_ids, answer_types,
                                                      rels_from_template)
                if candidate_outputs:
                    return candidate_outputs

        log.debug("candidate_rels_and_answers:\n" + '\n'.join([str(output) for output in candidate_outputs[:5]]))
        return candidate_outputs

    def find_top_rels(self, question: str, entity_ids: List[List[str]], triplet_info: Tuple) -> \
            Tuple[List[Tuple[str, float]], Dict[str, float], Set[Tuple[str, str]]]:
        ex_rels, entity_rel_conn = [], set()
        direction, source, rel_type, n_hop = triplet_info
        if source == "wiki":
            queries_list = list({(entity, direction, rel_type) for entity_id in entity_ids
                                 for entity in entity_id[:self.entities_to_leave]})
            entity_ids_list = [elem[0] for elem in queries_list]
            parser_info_list = ["find_rels" for i in range(len(queries_list))]
            ex_rels = self.wiki_parser(parser_info_list, queries_list)
            for ex_rels_elem, entity_id in zip(ex_rels, entity_ids_list):
                for rel in ex_rels_elem:
                    entity_rel_conn.add((entity_id, rel.split("/")[-1]))
            if self.use_wp_api_requester and ex_rels:
                ex_rels = [rel[0] for rel in ex_rels]
            ex_rels = list(set(itertools.chain.from_iterable(ex_rels)))
            if n_hop in {"1-of-2-hop", "2-hop"}:
                queries_list = list({(entity, "backw", rel_type) for entity_id in entity_ids
                                     for entity in entity_id[:self.entities_to_leave]})
                entity_ids_list = [elem[0] for elem in queries_list]
                parser_info_list = ["find_rels" for i in range(len(queries_list))]
                ex_rels_backw = self.wiki_parser(parser_info_list, queries_list)
                for ex_rels_elem, entity_id in zip(ex_rels_backw, entity_ids_list):
                    for rel in ex_rels_elem:
                        entity_rel_conn.add((entity_id, rel.split("/")[-1]))
                ex_rels_backw = list(set(itertools.chain.from_iterable(ex_rels_backw)))
                ex_rels += ex_rels_backw
            if self.delete_rel_prefix:
                ex_rels = [rel.split('/')[-1] for rel in ex_rels]
        elif source in {"rank_list_1", "rel_list_1"}:
            ex_rels = self.rels_in_ranking_queries.get("one_rel_in_query", [])
        elif source in {"rank_list_2", "rel_list_2"}:
            ex_rels = self.rels_in_ranking_queries.get("two_rels_in_query", [])

        ex_rels = [rel for rel in ex_rels if not any([rel.endswith(t_rel) for t_rel in self.kb_prefixes["type_rels"]])]
        rels_with_scores = self.rel_ranker.rank_rels(question, ex_rels)
        if n_hop == "2-hop" and rels_with_scores and entity_ids and entity_ids[0]:
            rels_1hop = [rel for rel, score in rels_with_scores]
            queries_list = [(entity_ids[0], rels_1hop[:5])]
            parser_info_list = ["find_rels_2hop"]
            ex_rels_2hop = self.wiki_parser(parser_info_list, queries_list)
            if self.delete_rel_prefix:
                ex_rels_2hop = [rel.split('/')[-1] for rel in ex_rels_2hop]
            rels_with_scores = self.rel_ranker.rank_rels(question, ex_rels_2hop)

        rels_with_scores = list(set(rels_with_scores))
        rels_with_scores = sorted(rels_with_scores, key=lambda x: x[1], reverse=True)
        rels_scores_dict = {rel: score for rel, score in rels_with_scores}

        return rels_with_scores[:self.rels_to_leave], rels_scores_dict, entity_rel_conn

    def find_answer_wikihow(self, howto_sentence: str) -> str:
        """Return the first paragraph of the top wikiHow search hit for the sentence.

        Args:
            howto_sentence: text used as the wikiHow search query

        Returns:
            the stripped text of the first ``<p>`` tag of the article followed by ``@en``,
            or ``"Not Found"`` when the search gives no results or the page has no paragraphs
        """
        not_found = "Not Found"
        hits = search(howto_sentence, 5)
        if not hits:
            return not_found

        article_html = get_html(hits[0]["article_id"])
        paragraphs = list(BeautifulSoup(article_html, 'lxml').find_all(['p']))
        if not paragraphs:
            return not_found
        return f"{paragraphs[0].text.strip()}@en"

    def query_parser(self, question, query_templates, entity_ids, type_ids, answer_types, rels_from_template):
        raise NotImplementedError


================================================
FILE: deeppavlov/models/kbqa/rel_ranking_infer.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import Tuple, List, Any, Optional

from scipy.special import softmax

from deeppavlov.core.common.chainer import Chainer
from deeppavlov.core.common.file import load_pickle, read_json
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.serializable import Serializable
from deeppavlov.models.kbqa.sentence_answer import sentence_answer
from deeppavlov.models.kbqa.wiki_parser import WikiParser

log = getLogger(__name__)


@register('rel_ranking_infer')
class RelRankerInfer(Component, Serializable):
    """Class for ranking of paths in subgraph"""

    def __init__(self, load_path: str,
                 rel_q2name_filename: str,
                 return_elements: List[str] = None,
                 ranker: Chainer = None,
                 wiki_parser: Optional[WikiParser] = None,
                 batch_size: int = 32,
                 softmax: bool = False,
                 use_api_requester: bool = False,
                 rank: bool = True,
                 nll_rel_ranking: bool = False,
                 nll_path_ranking: bool = False,
                 top_possible_answers: int = -1,
                 top_n: int = 1,
                 pos_class_num: int = 1,
                 rel_thres: float = 0.0,
                 type_rels: List[str] = None, **kwargs):
        """Store the ranking settings and load the relation-id-to-name mapping.

        Args:
            load_path: path to folder with wikidata files
            rel_q2name_filename: name of file which maps relation id to name
            return_elements: what elements return in output
            ranker: component deeppavlov.models.ranking.rel_ranker
            wiki_parser: component deeppavlov.models.wiki_parser
            batch_size: infering batch size
            softmax: whether to process relation scores with softmax function
            use_api_requester: whether wiki parser will be used as external api
            rank: whether to rank relations or simple copy input
            nll_rel_ranking: whether use components trained with nll loss for relation ranking
            nll_path_ranking: whether use components trained with nll loss for relation path ranking
            top_possible_answers: number of answers returned for a question in each list of candidate answers
            top_n: number of lists of candidate answers returned for a question
            pos_class_num: index of positive class in the output of relation ranking model
            rel_thres: threshold of relation confidence
            type_rels: list of relations in the knowledge base which connect an entity and its type
            **kwargs:
        """
        super().__init__(save_path=None, load_path=load_path)

        # Collaborating components and the mapping file read by load().
        self.ranker = ranker
        self.wiki_parser = wiki_parser
        self.rel_q2name_filename = rel_q2name_filename
        self.use_api_requester = use_api_requester

        # Scoring behaviour.
        self.rank = rank
        self.batch_size = batch_size
        self.softmax = softmax
        self.nll_rel_ranking = nll_rel_ranking
        self.nll_path_ranking = nll_path_ranking
        self.pos_class_num = pos_class_num
        self.rel_thres = rel_thres
        self.type_rels = type_rels or set()

        # Shape of the output.
        self.return_elements = return_elements or []
        self.top_possible_answers = top_possible_answers
        self.top_n = top_n

        self.load()

    def load(self) -> None:
        if self.rel_q2name_filename.endswith("pickle"):
            self.rel_q2name = load_pickle(self.load_path / self.rel_q2name_filename)
        elif self.rel_q2name_filename.endswith("json"):
            self.rel_q2name = read_json(self.load_path / self.rel_q2name_filename)

    def save(self) -> None:
        pass

    def __call__(self, questions_batch: List[str],
                 template_type_batch: List[str],
                 raw_answers_batch: List[List[Tuple[str]]],
                 entity_substr_batch: List[List[str]],
                 template_answers_batch: List[str]) -> Tuple[List[Any], ...]:
        """Score the candidate answers of every question and turn the best ones into text answers.

        Args:
            questions_batch: question texts
            template_type_batch: query template type per question; ``"simple_boolean"`` questions without any
                surviving candidate are answered with "No"
            raw_answers_batch: per question, the candidates handed to ``preprocess_ranking_input``
                (it reads them as mappings with keys such as "relations", "answers" and "output_conf")
            entity_substr_batch: entity substrings per question, used only for sentence answers
            template_answers_batch: answer templates per question, used only for sentence answers

        Returns:
            a tuple whose first element is the batch of answers; batches of confidences, answer ids,
            entities and relations, queries and triplets follow, each only if its name is listed in
            ``return_elements``. With ``top_n == 1`` every batch item is a single value, otherwise a list of
            at most ``top_n`` values.
        """
        answers_batch, outp_confidences_batch, answer_ids_batch = [], [], []
        entities_and_rels_batch, queries_batch, triplets_batch = [], [], []
        for question, template_type, raw_answers, entities, template_answer in \
                zip(questions_batch, template_type_batch, raw_answers_batch, entity_substr_batch,
                    template_answers_batch):
            answers_with_scores = []
            # Parallel lists, one entry per candidate that has at least one known relation label.
            l_questions, l_rels, l_rels_labels, l_cur_answers, l_entities, l_types, l_sparql_queries, l_triplets, \
            l_confs = self.preprocess_ranking_input(question, raw_answers)

            # Number of batches, rounded up.
            n_batches = len(l_questions) // self.batch_size + int(len(l_questions) % self.batch_size > 0)
            for i in range(n_batches):
                if self.rank:
                    if self.nll_path_ranking:
                        # One question against the whole batch of relation-label lists; first output row is used.
                        probas = self.ranker([l_questions[0]],
                                             [l_rels_labels[self.batch_size * i:self.batch_size * (i + 1)]])
                        probas = probas[0]
                    else:
                        # One (question, relation labels) pair per candidate; element 0 of each output is the score.
                        probas = self.ranker(l_questions[self.batch_size * i:self.batch_size * (i + 1)],
                                             l_rels_labels[self.batch_size * i:self.batch_size * (i + 1)])
                        probas = [proba[0] for proba in probas]
                else:
                    # No ranking: reuse the relation confidence that came with the candidate.
                    probas = [rel_conf for rel_conf, entity_conf in
                              l_confs[self.batch_size * i:self.batch_size * (i + 1)]]
                for j in range(self.batch_size * i, self.batch_size * (i + 1)):
                    # Keep a candidate whose score exceeds the threshold, or whose path has several relations
                    # none of which is a type relation.
                    if j < len(l_cur_answers) and (probas[j - self.batch_size * i] > self.rel_thres or
                                                   (len(l_rels[j]) > 1 and not set(l_rels[j]).intersection(
                                                       self.type_rels))):
                        answers_with_scores.append((l_cur_answers[j], l_sparql_queries[j], l_triplets[j],
                                                    l_entities[j], l_types[j], l_rels_labels[j], l_rels[j],
                                                    round(probas[j - self.batch_size * i], 3),
                                                    round(l_confs[j][0], 3), l_confs[j][1]))
            # Order by entity confidence (last item) times path score (third from the end).
            answers_with_scores = sorted(answers_with_scores, key=lambda x: x[-1] * x[-3], reverse=True)
            if template_type == "simple_boolean" and not answers_with_scores:
                answers_with_scores = [(["No"], "", [], [], [], [], [], 1.0, 1.0, 1.0)]
            res_answers_list, res_answer_ids_list, res_confidences_list, res_entities_and_rels_list = [], [], [], []
            res_queries_list, res_triplets_list = [], []
            for n, ans_sc_elem in enumerate(answers_with_scores):
                init_answer_ids, query, triplets, q_entities, q_types, _, q_rels, p_conf, r_conf, e_conf = ans_sc_elem
                # De-duplicate the answer ids (order kept) after removing the "@en" marker and surrounding quotes.
                answer_ids = []
                for answer_id in init_answer_ids:
                    answer_id = str(answer_id).replace("@en", "").strip('"')
                    if answer_id not in answer_ids:
                        answer_ids.append(answer_id)

                if self.top_possible_answers > 0:
                    answer_ids = answer_ids[:self.top_possible_answers]
                # Labels are requested with the full ids; the returned ids keep only the part after the last "/".
                answer_ids_input = [(answer_id, question) for answer_id in answer_ids]
                answer_ids = [str(answer_id).split("/")[-1] for answer_id in answer_ids]
                parser_info_list = ["find_label" for _ in answer_ids_input]
                init_answer_labels = self.wiki_parser(parser_info_list, answer_ids_input)
                # Only the first seven candidates are written to the debug log.
                if n < 7:
                    log.debug(f"answers: {init_answer_ids[:3]} --- query {query} --- entities {q_entities} --- "
                              f"types {q_types[:3]} --- q_rels {q_rels} --- {ans_sc_elem[5:]} --- "
                              f"answer_labels {init_answer_labels[:3]}")
                # De-duplicate the labels, drop empty / "Not Found" ones and keep at most five.
                answer_labels = []
                for label in init_answer_labels:
                    if label not in answer_labels:
                        answer_labels.append(label)
                answer_labels = [label for label in answer_labels if (label and label != "Not Found")][:5]
                answer_labels = [str(label) for label in answer_labels]
                # Three or more labels read as "a, b and c"; one or two are joined with a comma.
                if len(answer_labels) > 2:
                    answer = f"{', '.join(answer_labels[:-1])} and {answer_labels[-1]}"
                else:
                    answer = ', '.join(answer_labels)

                if "sentence_answer" in self.return_elements:
                    try:
                        answer = sentence_answer(question, answer, entities, template_answer)
                    except ValueError as e:
                        # The plain label answer is kept when the sentence answer cannot be built.
                        log.warning(f"Error in sentence answer, {e}")

                res_answers_list.append(answer)
                res_answer_ids_list.append(answer_ids)
                if "several_confidences" in self.return_elements:
                    res_confidences_list.append((p_conf, r_conf, e_conf))
                else:
                    res_confidences_list.append(p_conf)
                # NOTE(review): the last element of q_entities is dropped here; the reason is not visible in this file.
                res_entities_and_rels_list.append([q_entities[:-1], q_rels])
                res_queries_list.append(query)
                res_triplets_list.append(triplets)

            if self.top_n == 1:
                # Single best candidate: emit plain values, with "Not Found" placeholders when nothing survived.
                if answers_with_scores:
                    answers_batch.append(res_answers_list[0])
                    outp_confidences_batch.append(res_confidences_list[0])
                    answer_ids_batch.append(res_answer_ids_list[0])
                    entities_and_rels_batch.append(res_entities_and_rels_list[0])
                    queries_batch.append(res_queries_list[0])
                    triplets_batch.append(res_triplets_list[0])
                else:
                    answers_batch.append("Not Found")
                    outp_confidences_batch.append(0.0)
                    answer_ids_batch.append([])
                    entities_and_rels_batch.append([])
                    queries_batch.append([])
                    triplets_batch.append([])
            else:
                # Several candidates: emit lists of at most top_n items (possibly empty).
                answers_batch.append(res_answers_list[:self.top_n])
                outp_confidences_batch.append(res_confidences_list[:self.top_n])
                answer_ids_batch.append(res_answer_ids_list[:self.top_n])
                entities_and_rels_batch.append(res_entities_and_rels_list[:self.top_n])
                queries_batch.append(res_queries_list[:self.top_n])
                triplets_batch.append(res_triplets_list[:self.top_n])

        # The answers always come first; the remaining batches are appended in this fixed order on request.
        answer_tuple = (answers_batch,)
        if "confidences" in self.return_elements:
            answer_tuple += (outp_confidences_batch,)
        if "answer_ids" in self.return_elements:
            answer_tuple += (answer_ids_batch,)
        if "entities_and_rels" in self.return_elements:
            answer_tuple += (entities_and_rels_batch,)
        if "queries" in self.return_elements:
            answer_tuple += (queries_batch,)
        if "triplets" in self.return_elements:
            answer_tuple += (triplets_batch,)

        return answer_tuple

    def preprocess_ranking_input(self, question, answers):
        l_questions, l_rels, l_rels_labels, l_cur_answers = [], [], [], []
        l_entities, l_types, l_sparql_queries, l_triplets, l_confs = [], [], [], [], []
        for ans_and_rels in answers:
            answer, sparql_query, confidence = "", "", []
            entities, types, rels, rels_labels, triplets = [], [], [], [], []
            if ans_and_rels:
                rels = [rel.split('/')[-1] for rel in ans_and_rels["relations"]]
                answer = ans_and_rels["answers"]
                entities = ans_and_rels["entities"]
                types = ans_and_rels["types"]
                sparql_query = ans_and_rels["sparql_query"]
                triplets = ans_and_rels["triplets"]
                confidence = ans_and_rels["output_conf"]
                rels_labels = []
                for rel in rels:
                    if rel in self.rel_q2name:
                        label = self.rel_q2name[rel]
                        if isinstance(label, list):
                            label = label[0]
                        rels_labels.append(label.lower())
            if rels_labels:
                l_questions.append(question)
                l_rels.append(rels)
                l_rels_labels.append(rels_labels)
                l_cur_answers.append(answer)
                l_entities.append(entities)
                l_types.append(types)
                l_sparql_queries.append(sparql_query)
                l_triplets.append(triplets)
                l_confs.append(confidence)
        return l_questions, l_rels, l_rels_labels, l_cur_answers, l_entities, l_types, l_sparql_queries, l_triplets, \
               l_confs

    def rank_rels(self, question: str, candidate_rels: List[str]) -> List[Tuple[str, Any]]:
        rels_with_scores = []
        if question is not None:
            questions, rels_labels, rels = [], [], []
            for candidate_rel in candidate_rels:
                if candidate_rel in self.rel_q2name:
                    cur_rels_labels = self.rel_q2name[candidate_rel]
                    if isinstance(cur_rels_labels, str):
                        cur_rels_labels = [cur_rels_labels]
                    for cur_rel in cur_rels_labels:
                        questions.append(question)
                        rels.append(candidate_rel)
                        rels_labels.append(cur_rel)
            if questions:
                n_batches = len(rels) // self.batch_size + int(len(rels) % self.batch_size > 0)
                for i in range(n_batches):
                    if self.nll_rel_ranking:
                        probas = self.ranker([questions[0]],
                                             [rels_labels[i * self.batch_size:(i + 1) * self.batch_size]])
                        probas = probas[0]
                    else:
                        probas = self.ranker(questions[i * self.batch_size:(i + 1) * self.batch_size],
                                             rels_labels[i * self.batch_size:(i + 1) * self.batch_size])
                        probas = [proba[self.pos_class_num] for proba in probas]
                    for j, rel in enumerate(rels[i * self.batch_size:(i + 1) * self.batch_size]):
                        rels_with_scores.append((rel, probas[j]))
            if self.softmax:
                scores = [score for rel, score in rels_with_scores]
                softmax_scores = softmax(scores)
                rels_with_scores = [(rel, softmax_score) for (rel, score), softmax_score in
                                    zip(rels_with_scores, softmax_scores)]
            rels_with_scores_dict = {}
            for rel, score in rels_with_scores:
                if rel not in rels_with_scores_dict:
                    rels_with_scores_dict[rel] = []
                rels_with_scores_dict[rel].append(score)
            rels_with_scores = [(rel, max(scores)) for rel, scores in rels_with_scores_dict.items()]
            rels_with_scores = sorted(rels_with_scores, key=lambda x: x[1], reverse=True)
        return rels_with_scores


================================================
FILE: deeppavlov/models/kbqa/ru_adj_to_noun.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from collections import defaultdict
from logging import getLogger
from typing import List

import numpy as np
import spacy
from scipy.sparse import csr_matrix

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register

log = getLogger(__name__)


@register('ru_adj_to_noun')
class RuAdjToNoun:
    """
        Class for converting an adjective in Russian to the corresponding noun, for example:
        "московский" -> "Москва", "африканский" -> "Африка"

        Every word is encoded as a sparse vector with one non-zero entry per (position, letter) pair, weighted
        so that earlier letters count more. Candidate nouns are ranked by the dot product of their vectors with
        the vector of the lemmatized adjective.
    """

    def __init__(self, freq_dict_filename: str, candidate_nouns: int = 10, freq_thres: float = 4.5,
                 score_thres: float = 2.8, **kwargs):
        """

        Args:
            freq_dict_filename: file with the dictionary of Russian words with the corresponding frequencies
                (tab-separated columns: word, part-of-speech tag, frequency)
            candidate_nouns: how many candidate nouns to leave after search
            freq_thres: the best candidate is returned only if its frequency is above this value
            score_thres: the best candidate is returned only if its similarity score is above this value
            **kwargs:
        """
        self.candidate_nouns = candidate_nouns
        self.freq_thres = freq_thres
        self.score_thres = score_thres
        alphabet = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя-"
        self.alphabet_length = len(alphabet)
        self.max_word_length = 24
        self.letter_nums = {letter: num for num, letter in enumerate(alphabet)}
        # The dictionary holds Cyrillic text, so the encoding must not depend on the platform default.
        with open(str(expand_path(freq_dict_filename)), 'r', encoding="utf8") as fl:
            lines = fl.readlines()
        pos_freq_dict = defaultdict(list)
        for line in lines:
            line_split = line.strip('\n').split('\t')
            # Rows without a decimal frequency in the third column (header, blank or truncated lines) are skipped.
            if len(line_split) > 2 and re.match(r"[\d]+\.[\d]+", line_split[2]):
                pos_freq_dict[line_split[1]].append((line_split[0], float(line_split[2])))
        self.nouns_with_freq = pos_freq_dict["s.PROP"]
        self.adj_set = {word for word, freq in pos_freq_dict["a"]}
        self.nouns = [noun[0] for noun in self.nouns_with_freq]
        # Transposed once so that a query row vector can be multiplied by it directly.
        self.matrix = self.make_sparse_matrix(self.nouns).transpose()
        self.nlp = spacy.load("ru_core_news_sm")

    def search(self, word: str):
        """Find the noun which corresponds to the given adjective.

        Args:
            word: adjective (only the first token of the string is used, after lemmatization)

        Returns:
            the best matching noun or an empty string if the word is not a known adjective or no candidate
            passes the frequency and score thresholds
        """
        doc = self.nlp(word)
        if len(doc) == 0:
            # Nothing to lemmatize (e.g. empty string).
            return ""
        word = doc[0].lemma_
        if word in self.adj_set:
            q_matrix = self.make_sparse_matrix([word])
            # The product has shape (1, number of nouns); taking row 0 keeps the result 1-D even for one noun.
            scores = (q_matrix * self.matrix).toarray()[0]
            indices = np.argsort(-scores)[:self.candidate_nouns]
            scores = list(scores[indices])
            candidates = [self.nouns_with_freq[indices[i]] + (scores[i],) for i in range(len(indices))]
            # Keep only nouns which share the first three letters with the adjective.
            candidates = [cand for cand in candidates if cand[0][:3].lower() == word[:3].lower()]
            candidates = sorted(candidates, key=lambda x: (x[2], x[1]), reverse=True)
            log.debug(f"AdjToNoun, found nouns: {candidates}")
            if candidates and candidates[0][1] > self.freq_thres and candidates[0][2] > self.score_thres:
                return candidates[0][0]
        return ""

    def make_sparse_matrix(self, words: List[str]):
        """Encode words as rows of a sparse (position, letter) indicator matrix.

        The letter at position ``cnt`` gets the weight ``1.0 - 0.05 * cnt`` (not less than zero). Letters which
        are absent from the alphabet and letters beyond ``max_word_length`` are ignored, because they have no
        column in the matrix.

        Args:
            words: list of words

        Returns:
            csr_matrix of shape (len(words), max_word_length * alphabet_length)
        """
        indptr = []
        indices = []
        data = []

        for word in words:
            indptr.append(len(indices))
            for cnt, letter in enumerate(word.lower()[:self.max_word_length]):
                letter_num = self.letter_nums.get(letter)
                if letter_num is None:
                    continue
                indices.append(self.alphabet_length * cnt + letter_num)
                data.append(max(1.0 - cnt * 0.05, 0.0))

        indptr.append(len(indices))

        # Explicit dtypes keep the index arrays integral even when they are empty.
        data = np.array(data, dtype=float)
        indptr = np.array(indptr, dtype=int)
        indices = np.array(indices, dtype=int)

        matrix = csr_matrix((data, indices, indptr), shape=(len(words), self.max_word_length * self.alphabet_length))

        return matrix


================================================
FILE: deeppavlov/models/kbqa/sentence_answer.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import re
from logging import getLogger

import pkg_resources
import spacy

log = getLogger(__name__)

# en_core_web_sm is installed and used by test_inferring_pretrained_model in the same interpreter session during tests.
# Spacy checks en_core_web_sm package presence with pkg_resources, but pkg_resources is initialized with the
# interpreter, so it doesn't see en_core_web_sm installed after interpreter initialization. That is why
# importlib.reload is used below.

if 'en-core-web-sm' not in pkg_resources.working_set.by_key.keys():
    importlib.reload(pkg_resources)

# TODO: move nlp to sentence_answer, sentence_answer to rel_ranking_infer and revise en_core_web_sm requirement,
# TODO: make proper downloading with spacy.cli.download
# The model is loaded once at import time and shared by all functions of this module.
nlp = spacy.load('en_core_web_sm')

# Lowercased question words which mark the token to be replaced with the answer.
pronouns = ["who", "what", "when", "where", "how"]


def find_tokens(tokens, node, not_inc_node):
    """Collect texts of the subtree rooted at ``node`` in pre-order, leaving out the ``not_inc_node`` subtree.

    Args:
        tokens: list the texts are appended to (modified in place)
        node: root of the subtree to traverse
        not_inc_node: node which is skipped together with all its descendants

    Returns:
        the same ``tokens`` list
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if current != not_inc_node:
            tokens.append(current.text)
            # Children are pushed in reverse so that they are popped in their original order.
            pending.extend(reversed(list(current.children)))
    return tokens


def find_inflect_dict(sent_nodes):
    """Build replacements which move the tense of an auxiliary verb onto its head verb.

    For an auxiliary tagged "VBD" whose head is tagged "VBP" or "VB", and for an auxiliary tagged "VBZ" whose
    head is tagged "VB", the head text is mapped to the head inflected with the auxiliary's tag and the
    auxiliary text is mapped to an empty string.

    Args:
        sent_nodes: parsed tokens of the question

    Returns:
        dictionary {token text: replacement text}
    """
    # auxiliary tag -> head tags for which the head is inflected
    head_tags_for_aux = {"VBD": ("VBP", "VB"), "VBZ": ("VB",)}
    replacements = {}
    for token in sent_nodes:
        if token.dep_ != "aux":
            continue
        allowed_head_tags = head_tags_for_aux.get(token.tag_)
        if allowed_head_tags is None or token.head.tag_ not in allowed_head_tags:
            continue
        replacements[token.head.text] = token.head._.inflect(token.tag_)
        replacements[token.text] = ""
    return replacements


def find_wh_node(sent_nodes):
    """Find the first question word of the sentence and the heads related to it.

    Args:
        sent_nodes: parsed tokens of the question

    Returns:
        tuple (wh_node, wh_node_head, main_head): the first token whose lowercased text is in ``pronouns``,
        its head, and the head of that head if the latter has the "ccomp" dependency. Every element which
        was not found is an empty string.
    """
    wh_node = next((token for token in sent_nodes if token.text.lower() in pronouns), "")
    wh_node_head = ""
    main_head = ""

    if wh_node:
        wh_node_head = wh_node.head
        if wh_node_head.dep_ == "ccomp":
            main_head = wh_node_head.head

    return wh_node, wh_node_head, main_head


def _first_contiguous_run(tokens, allowed):
    """Return the first unbroken run of ``tokens`` whose items are all in ``allowed``, joined with spaces."""
    run = []
    for token in tokens:
        if token in allowed:
            run.append(token)
        elif run:
            break
    return ' '.join(run)


def find_tokens_to_replace(wh_node_head, main_head, question_tokens, question):
    """Find the substrings of the question which should be removed or replaced to build the answer sentence.

    Args:
        wh_node_head: head of the question word, or an empty string if the question has no question word
        main_head: head of ``wh_node_head`` (set only for the "ccomp" dependency), or an empty string
        question_tokens: texts of the question tokens
        question: the question string

    Returns:
        tuple (redundant_replace_substr, question_replace_substr): the first contiguous run of question
        tokens considered redundant and the first contiguous run of tokens to be replaced with the answer;
        each is an empty string if nothing was found
    """
    redundant_tokens_to_replace = []
    if main_head:
        redundant_tokens_to_replace = find_tokens([], main_head, wh_node_head)
    what_tokens_fnd = re.findall("what (.*) (is|was|does|did) (.*)", question, re.IGNORECASE)
    if what_tokens_fnd:
        what_tokens = what_tokens_fnd[0][0].split()
        if len(what_tokens) <= 2:
            redundant_tokens_to_replace += what_tokens

    question_tokens_to_replace = set()
    # find_wh_node returns an empty string instead of a token when the question has no question word;
    # in that case there are no children to inspect.
    if wh_node_head:
        wh_node_head_desc = [node for node in wh_node_head.children if node.text != "?"]
        wh_node_head_dep = [node.dep_ for node in wh_node_head_desc
                            if node.dep_ not in ["aux", "prep"] and node.text.lower() not in pronouns]
        for node in wh_node_head_desc:
            if node.dep_ == "nsubj" and len(wh_node_head_dep) > 1 or node.text.lower() in pronouns \
                    or node.dep_ == "aux":
                question_tokens_to_replace.add(node.text)
                question_tokens_to_replace.update(elem.text for elem in node.subtree)

    redundant_replace_substr = _first_contiguous_run(question_tokens, redundant_tokens_to_replace)
    question_replace_substr = _first_contiguous_run(question_tokens, question_tokens_to_replace)

    return redundant_replace_substr, question_replace_substr


def sentence_answer(question, entity_title, entities=None, template_answer=None):
    """Turn a question and its short answer into a full answer sentence.

    Args:
        question: the question string
        entity_title: the answer to insert into the sentence
        entities: entities mentioned in the question (only the first one is used)
        template_answer: answer template in which "[ent]" is replaced with the first entity and "[ans]"
            with ``entity_title``; used only if ``entities`` is not empty

    Returns:
        the answer sentence ending with a full stop
    """
    log.debug(f"question {question} entity_title {entity_title} entities {entities} template_answer {template_answer}")
    sent_nodes = nlp(question)
    # The token before the last one is inspected (the last one is normally "?"); shorter inputs have none.
    reverse = len(sent_nodes) > 1 and sent_nodes[-2].tag_ == "IN"
    question_tokens = [elem.text for elem in sent_nodes]
    log.debug(f"spacy tags: {[(elem.text, elem.tag_, elem.dep_, elem.head.text) for elem in sent_nodes]}")

    inflect_dict = find_inflect_dict(sent_nodes)
    wh_node, wh_node_head, main_head = find_wh_node(sent_nodes)
    redundant_replace_substr, question_replace_substr = find_tokens_to_replace(wh_node_head, main_head,
                                                                               question_tokens, question)
    log.debug(f"redundant_replace_substr {redundant_replace_substr} question_replace_substr {question_replace_substr}")
    if redundant_replace_substr:
        answer = question.replace(redundant_replace_substr, '')
    else:
        answer = question

    if answer.endswith('?'):
        answer = answer.replace('?', '').strip()

    # The entity is matched literally: names like "Rocky (film)" must not be interpreted as a regexp.
    entity_regexp = re.escape(entities[0]) if entities else ""

    if question_replace_substr:
        if template_answer and entities:
            answer = template_answer.replace("[ent]", entities[0]).replace("[ans]", entity_title)
        elif wh_node.text.lower() in ["what", "who", "how"]:
            fnd_date = re.findall(r"what (day|year) (.*)\?", question, re.IGNORECASE)
            fnd_wh = re.findall(r"what (is|was) the name of (.*) (which|that) (.*)\?", question, re.IGNORECASE)
            fnd_name = re.findall(r"what (is|was) the name (.*)\?", question, re.IGNORECASE)
            if fnd_date:
                # Without entities the entity-specific pattern cannot be built, the generic form is used.
                fnd_date_aux = []
                if entities:
                    fnd_date_aux = re.findall(rf"what (day|year) (is|was) ({entity_regexp}) (.*)\?", question,
                                              re.IGNORECASE)
                if fnd_date_aux:
                    answer = f"{entities[0]} {fnd_date_aux[0][1]} {fnd_date_aux[0][3]} on {entity_title}"
                else:
                    answer = f"{fnd_date[0][1]} on {entity_title}"
            elif fnd_wh:
                answer = f"{entity_title} {fnd_wh[0][3]}"
            elif fnd_name:
                aux_verb, sent_cut = fnd_name[0]
                if sent_cut.startswith("of "):
                    sent_cut = sent_cut[3:]
                answer = f"{entity_title} {aux_verb} {sent_cut}"
            else:
                if reverse:
                    answer = answer.replace(question_replace_substr, '')
                    answer = f"{answer} {entity_title}"
                else:
                    answer = answer.replace(question_replace_substr, entity_title)
        elif wh_node.text.lower() in ["when", "where"] and entities:
            sent_cut = re.findall(rf"(when|where) (was|is) {entity_regexp} (.*)\?", question, re.IGNORECASE)
            if sent_cut:
                if sent_cut[0][0].lower() == "when":
                    answer = f"{entities[0]} {sent_cut[0][1]} {sent_cut[0][2]} on {entity_title}"
                else:
                    answer = f"{entities[0]} {sent_cut[0][1]} {sent_cut[0][2]} in {entity_title}"
            else:
                answer = answer.replace(question_replace_substr, '')
                answer = f"{answer} in {entity_title}"

    for old_tok, new_tok in inflect_dict.items():
        # str.replace accepts only strings, so entries without an inflected form are left untouched.
        if new_tok is None:
            continue
        answer = answer.replace(old_tok, new_tok)
    answer = re.sub(r"\s+", " ", answer).strip()

    answer = answer + '.'

    return answer


================================================
FILE: deeppavlov/models/kbqa/template_matcher.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import json
import multiprocessing as mp
import re
from logging import getLogger
from typing import Any, Tuple, List, Union

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.serializable import Serializable

log = getLogger(__name__)


class RegexpMatcher:
    """Callable which matches one fixed question against template regular expressions."""

    def __init__(self, question):
        self.question = question

    def __call__(self, template):
        """Apply the template regexp to the question.

        Args:
            template: dictionary with the "template_regexp" key

        Returns:
            list with the single pair (first match, template) or an empty list if nothing matched
        """
        matches = re.findall(template["template_regexp"], self.question)
        return [(matches[0], template)] if matches else []


@register('template_matcher')
class TemplateMatcher(Serializable):
    """
        This class matches the question with one of the templates
        to extract entity substrings and define which relations
        corresponds to the question
    """

    def __init__(self, load_path: str, templates_filename: str,
                 num_processors: int = None, **kwargs) -> None:
        """

        Args:
            load_path: path to folder with file with templates
            templates_filename: file with templates
            num_processors: number of worker processes used for matching (all CPU cores if None)
            **kwargs:
        """
        super().__init__(save_path=None, load_path=load_path)
        self.templates_filename = templates_filename
        self.num_processors = mp.cpu_count() if num_processors is None else num_processors
        self.pool = mp.Pool(self.num_processors)
        self.load()

    def load(self) -> None:
        """Read the list of templates from the JSON file."""
        log.debug(f"(load)self.load_path / self.templates_filename: {self.load_path / self.templates_filename}")
        with open(self.load_path / self.templates_filename, encoding="utf8") as fl:
            self.templates = json.load(fl)

    def save(self) -> None:
        raise NotImplementedError

    def __call__(self, question: str, entities_from_ner: List[str]) -> \
            Tuple[Union[List[str], list], list, Union[list, Any], Union[list, Any], Union[str, Any], Union[list, Any],
                  Union[str, Any], Union[list, Any], Union[str, Any]]:
        """Match the question with the templates and extract entities, types and relations.

        Args:
            question: the question
            entities_from_ner: entity substrings found by NER, used for checking template entities

        Returns:
            tuple (entities, types, relations, relation_dirs, query_type, entity_types, template_answer,
            answer_types, template_found); all elements are empty if no template was selected, except
            ``template_found`` which holds the template string of the last regexp match that was examined
        """
        question = question.lower()
        question = self.sanitize(question)
        question_length = len(question)
        entities, types, relations, relation_dirs = [], [], [], []
        query_type = ""
        template_found = ""
        entity_types = []
        template_answer = ""
        answer_types = []
        results = self.pool.map(RegexpMatcher(question), self.templates)
        # Flatten the per-template lists; unlike reduce without an initial value this works with no templates.
        results = [found for template_results in results for found in template_results]
        replace_tokens = [("the uk", "united kingdom"), ("the us", "united states")]
        if results:
            # Among the suitable templates the one with the shortest entity and type substrings is chosen.
            min_length = 100
            for result in results:
                found_ent, template = result
                positions_entity_tokens = template["positions_entity_tokens"]
                positions_type_tokens = template["positions_type_tokens"]
                positions_unuseful_tokens = template["positions_unuseful_tokens"]
                template_len = template["template_len"]
                template_found = template["template"]
                entities_cand = [found_ent[pos].replace('?', '') for pos in positions_entity_tokens]
                types_cand = [found_ent[pos].replace('?', '').split(',')[0] for pos in positions_type_tokens]
                unuseful_tokens = [found_ent[pos].replace('?', '') for pos in positions_unuseful_tokens]
                entity_lengths = [len(entity) for entity in entities_cand]
                entity_num_tokens = all([len(entity.split(' ')) < 6 for entity in entities_cand])
                type_lengths = [len(entity_type) for entity_type in types_cand]
                unuseful_tokens_len = sum([len(unuseful_tok) for unuseful_tok in unuseful_tokens])
                log.debug(f"found template: {template}, {found_ent}")
                match, entities_cand = self.match_template_and_ner(entities_cand, entities_from_ner, template_found)
                # NOTE(review): "and" binds tighter than "or", so entity_num_tokens restricts only the
                # type_lengths alternative - confirm that this is intended.
                if match and (0 not in entity_lengths or 0 not in type_lengths and entity_num_tokens):
                    cur_len = sum(entity_lengths) + sum(type_lengths)
                    log.debug(f"lengths: entity+type {cur_len}, question {question_length}, "
                              f"template {template_len}, unuseful tokens {unuseful_tokens_len}")
                    # The template must cover the whole question.
                    if cur_len < min_length and unuseful_tokens_len + template_len + cur_len == question_length:
                        entities = entities_cand
                        for old_token, new_token in replace_tokens:
                            entities = [entity.replace(old_token, new_token) for entity in entities]
                        types = types_cand
                        relations = template["relations"]
                        relation_dirs = template["rel_dirs"]
                        query_type = template["template_type"]
                        entity_types = template.get("entity_types", [])
                        template_answer = template.get("template_answer", "")
                        answer_types = template.get("answer_types", [])
                        min_length = cur_len

        return entities, types, relations, relation_dirs, query_type, entity_types, template_answer, answer_types, \
            template_found

    def sanitize(self, question: str) -> str:
        """Remove the leading article and the first "YYYY-YYYY" interval and collapse double spaces."""
        question = re.sub(r"^(a |the )", '', question)
        date_interval = re.findall(r"([\d]{4}-[\d]{4})", question)
        if date_interval:
            question = question.replace(date_interval[0], '')
        question = question.replace('  ', ' ')
        return question

    def match_template_and_ner(self, entities_cand: List[str], entities_from_ner: List[str], template: str):
        """Check that the entities extracted with the template agree with the entities found by NER.

        Args:
            entities_cand: entity substrings extracted with the template
            entities_from_ner: entity substrings found by NER
            template: the template string

        Returns:
            tuple (match, entities_cand): ``match`` is True if both sets of entities are equal after removing
            leading articles, if NER found nothing or if the template is "how to xxx?"; ``entities_cand`` are
            the candidates without leading articles and surrounding whitespace
        """
        entities_from_ner = [entity.lower() for entity in entities_from_ner]
        entities_from_ner = [re.sub(r"^(a |the )", '', entity) for entity in entities_from_ner]
        entities_cand = [re.sub(r"^(a |the )", '', entity) for entity in entities_cand]
        entities_cand = [entity.strip() for entity in entities_cand]
        log.debug(f"entities_cand {entities_cand} entities_from_ner {entities_from_ner}")
        match = set(entities_cand) == set(entities_from_ner) or not entities_from_ner or template == "how to xxx?"
        return match, entities_cand


================================================
FILE: deeppavlov/models/kbqa/tree_to_sparql.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import re
from collections import defaultdict
from io import StringIO
from logging import getLogger
from typing import Any, List, Tuple, Dict, Union

import spacy
from navec import Navec
from razdel import tokenize
from slovnet import Syntax
from udapi.block.read.conllu import Conllu
from udapi.core.node import Node

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.file import read_json
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.serializable import Serializable
from deeppavlov.models.kbqa.ru_adj_to_noun import RuAdjToNoun
from deeppavlov.models.kbqa.utils import preprocess_template_queries

log = getLogger(__name__)


@register('slovnet_syntax_parser')
class SlovnetSyntaxParser(Component, Serializable):
    """Class for syntax parsing using Slovnet library"""

    def __init__(self, load_path: str, navec_filename: str, syntax_parser_filename: str, tree_patterns_filename: str,
                 **kwargs):
        """

        Args:
            load_path: passed to the Serializable constructor
            navec_filename: file with Navec embeddings
            syntax_parser_filename: file with the Slovnet syntax model
            tree_patterns_filename: json file with patterns used in correct_markup for fixing parser output
            **kwargs:
        """
        super().__init__(save_path=None, load_path=load_path)
        self.navec_filename = expand_path(navec_filename)
        self.syntax_parser_filename = expand_path(syntax_parser_filename)
        self.tree_patterns = read_json(expand_path(tree_patterns_filename))
        self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.pronouns = {"q_pronouns": {"какой", "какая", "какое", "каком", "каким", "какую", "кто", "что", "как",
                                        "когда", "где", "чем", "сколько"},
                         "how_many": {"сколько"}}
        self.first_tokens = {"первый", "первая", "первое"}
        self.nlp = spacy.load("ru_core_news_sm")
        self.load()

    def load(self) -> None:
        navec = Navec.load(self.navec_filename)
        self.syntax = Syntax.load(self.syntax_parser_filename)
        self.syntax.navec(navec)

    def save(self) -> None:
        pass

    def preprocess_sentences(self, sentences, entity_offsets_batch):
        """Simplify sentences before parsing by replacing multi-token substrings with a single token.

        The following substrings are replaced: initials followed by a word of three or more characters
        (with that word), multi-word substrings in quotes (with their first noun or first token) and triples
        of tokens tagged DET DET NOUN (with the noun). Fully lowercased sentences get the first letter of the
        sentence and of every entity capitalized.

        Args:
            sentences: batch of sentences
            entity_offsets_batch: for every sentence, list of (start, end) offsets of entities

        Returns:
            batch of token lists and batch of dictionaries {replacing token: (replaced substring, type)},
            where type is "name" or "adj"
        """
        sentences_tokens_batch, replace_dict_batch = [], []
        for sentence, entity_offsets in zip(sentences, entity_offsets_batch):
            if sentence.islower():
                for start, end in entity_offsets:
                    entity_old = sentence[start:end]
                    if entity_old:
                        entity_new = f"{entity_old[0].upper()}{entity_old[1:]}"
                        sentence = sentence.replace(entity_old, entity_new)
                sentence = f"{sentence[0].upper()}{sentence[1:]}"
            # two initials and a word, for example "X. Y. Word"
            names3 = re.findall(r"([\w]{1}\.)([ ]?)([\w]{1}\.)([ ])([\w]{3,})", sentence)
            replace_dict = {}
            for name in names3:
                names_str = "".join(name)
                replace_dict[name[-1]] = (names_str, "name")
                sentence = sentence.replace(names_str, name[-1])
            # one initial and a word, for example "X. Word"
            names2 = re.findall(r"([\w]{1}\.)([ ])([\w]{3,})", sentence)
            for name in names2:
                names_str = "".join(name)
                replace_dict[name[-1]] = (names_str, "name")
                sentence = sentence.replace(names_str, name[-1])
            works_of_art = re.findall(r'(["«])(.*?)(["»])', sentence)
            for symb_start, work_of_art, symb_end in works_of_art:
                work_of_art_tokens = re.findall(self.re_tokenizer, work_of_art)
                if len(work_of_art.split()) > 1:
                    short_substr = ""
                    for tok in work_of_art_tokens:
                        if self.nlp(tok)[0].pos_ == "NOUN":
                            short_substr = tok
                            break
                    if not short_substr:
                        short_substr = work_of_art_tokens[0]
                    replace_dict[short_substr] = (work_of_art, "name")
                    sentence = sentence.replace(work_of_art, short_substr)
            # Replacing changes the tokens, so the search is restarted after every replacement.
            while True:
                tokens = sentence.split()
                found_substr = False
                for i in range(len(tokens) - 2):
                    found = True
                    for j in range(i, i + 3):
                        if len(tokens[j]) < 2 or tokens[j][0] in '("' or tokens[j][-1] in '"),.?':
                            found = False
                    if found and i > 0:
                        token_tags = [self.nlp(tokens[j])[0].pos_ for j in range(i, i + 3)]
                        lemm_tokens = {self.nlp(tok)[0].lemma_ for tok in tokens[i:i + 3]}
                        if token_tags == ["DET", "DET", "NOUN"] and not lemm_tokens & self.first_tokens:
                            long_substr = " ".join(tokens[i:i + 3])
                            replace_dict[tokens[i + 2]] = (long_substr, "adj")
                            sentence = sentence.replace(long_substr, tokens[i + 2])
                            found_substr = True
                    if found_substr:
                        break
                if not found_substr:
                    break
            sentence_tokens = [tok.text for tok in tokenize(sentence)]
            sentences_tokens_batch.append(sentence_tokens)
            log.debug(f"replace_dict: {replace_dict} --- sentence: {sentence_tokens}")
            replace_dict_batch.append(replace_dict)
        return sentences_tokens_batch, replace_dict_batch

    def get_markup(self, proc_syntax_batch, replace_dict_batch):
        """Convert parser output to lists of token dictionaries and restore the substrings replaced in
        preprocess_sentences.

        Args:
            proc_syntax_batch: batch of parser outputs (objects with the ``tokens`` attribute)
            replace_dict_batch: batch of dictionaries {replacing token: (replaced substring, type)}

        Returns:
            batch of lists of dictionaries with the keys "id", "text", "head_id" (string) and "rel"
        """
        markup_batch = []
        for proc_syntax, replace_dict in zip(proc_syntax_batch, replace_dict_batch):
            markup_list = []
            for elem in proc_syntax.tokens:
                markup_list.append({"id": elem.id, "text": elem.text, "head_id": int(elem.head_id), "rel": elem.rel})
            ids, words, head_ids, rels = self.get_elements(markup_list)
            head_ids, markup_list = self.correct_cycle(ids, head_ids, rels, markup_list)
            for substr in replace_dict:
                substr_full, substr_type = replace_dict[substr]
                # index of the last token equal to the replacing token
                found_n = -1
                for n, markup_elem in enumerate(markup_list):
                    if markup_elem["text"] == substr:
                        found_n = n
                if found_n > -1:
                    before_markup_list = copy.deepcopy(markup_list[:found_n])
                    after_markup_list = copy.deepcopy(markup_list[found_n + 1:])
                    substr_tokens = [tok.text for tok in tokenize(substr_full)]
                    new_markup_list = []
                    # head_id values are ints or numeric strings at this point (earlier replacements store
                    # strings), so they are converted with int() before every comparison.
                    if substr_type == "name":
                        # the first token takes the place of the replaced token, the others are attached to it
                        for j in range(len(substr_tokens)):
                            new_markup_elem = {"id": str(found_n + j + 1), "text": substr_tokens[j]}
                            if j == 0:
                                new_markup_elem["rel"] = markup_list[found_n]["rel"]
                                if int(markup_list[found_n]["head_id"]) < found_n + 1:
                                    new_markup_elem["head_id"] = markup_list[found_n]["head_id"]
                                else:
                                    new_markup_elem["head_id"] = str(int(markup_list[found_n]["head_id"]) + len(
                                        substr_tokens) - 1)
                            else:
                                new_markup_elem["rel"] = "flat:name"
                                new_markup_elem["head_id"] = str(found_n + 1)
                            new_markup_list.append(new_markup_elem)
                    elif substr_type == "adj":
                        # the last token takes the place of the replaced token, the others are attached to it
                        for j in range(len(substr_tokens)):
                            new_elem = {"id": str(found_n + j + 1), "text": substr_tokens[j]}
                            if j == len(substr_tokens) - 1:
                                new_elem["rel"] = markup_list[found_n]["rel"]
                                if int(markup_list[found_n]["head_id"]) < found_n + 1:
                                    new_elem["head_id"] = markup_list[found_n]["head_id"]
                                else:
                                    new_elem["head_id"] = int(markup_list[found_n]["head_id"]) + \
                                                          len(substr_tokens) - 1
                            else:
                                new_elem["rel"] = "amod"
                                new_elem["head_id"] = str(found_n + len(substr_tokens))
                            new_markup_list.append(new_elem)

                    # Heads which point after the replaced token are shifted by the number of inserted tokens.
                    for j in range(len(before_markup_list)):
                        if int(before_markup_list[j]["head_id"]) > found_n + 1:
                            before_markup_list[j]["head_id"] = int(before_markup_list[j]["head_id"]) + \
                                                               len(substr_tokens) - 1
                        if int(before_markup_list[j]["head_id"]) == found_n + 1 and substr_type == "adj":
                            before_markup_list[j]["head_id"] = found_n + len(substr_tokens)
                    for j in range(len(after_markup_list)):
                        after_markup_list[j]["id"] = str(int(after_markup_list[j]["id"]) + len(substr_tokens) - 1)
                        if int(after_markup_list[j]["head_id"]) > found_n + 1:
                            after_markup_list[j]["head_id"] = int(after_markup_list[j]["head_id"]) + \
                                                              len(substr_tokens) - 1
                        if int(after_markup_list[j]["head_id"]) == found_n + 1 and substr_type == "adj":
                            after_markup_list[j]["head_id"] = found_n + len(substr_tokens)

                    markup_list = before_markup_list + new_markup_list + after_markup_list
            for j in range(len(markup_list)):
                markup_list[j]["head_id"] = str(markup_list[j]["head_id"])
            markup_batch.append(markup_list)
        return markup_batch

    def find_cycle(self, ids, head_ids):
        """Return the 1-based number of the first token which forms a two-token cycle with a later token
        (each is the head of the other), or -1 if there is no such pair."""
        for i in range(len(ids)):
            for j in range(len(ids)):
                if i < j and head_ids[j] == str(i + 1) and head_ids[i] == str(j + 1):
                    return i + 1
        return -1

    def correct_markup(self, words, head_ids, rels, root_n):
        """Fix heads and relations with the first matching tree pattern (sentences longer than three tokens).

        Args:
            words: token texts
            head_ids: heads of the tokens (modified in place)
            rels: dependency relations of the tokens (modified in place)
            root_n: current root number

        Returns:
            head_ids, rels and the root number (taken from the pattern if one matched)
        """
        if len(words) > 3:
            pos = [self.nlp(words[i])[0].pos_ for i in range(len(words))]
            for tree_pattern in self.tree_patterns:
                first_word = tree_pattern.get("first_word", "")
                (r_start, r_end), rel_info = tree_pattern.get("rels", [[0, 0], ""])
                (p_start, p_end), pos_info = tree_pattern.get("pos", [[0, 0], ""])
                if (not first_word or words[0].lower() in self.pronouns[first_word]) \
                        and (not rel_info or rels[r_start:r_end] == rel_info) \
                        and (not pos_info or pos[p_start:p_end] == pos_info):
                    for ind, deprel in tree_pattern.get("rel_ids", {}).items():
                        rels[int(ind)] = deprel
                    for ind, head_id in tree_pattern.get("head_ids", {}).items():
                        head_ids[int(ind)] = head_id
                    root_n = tree_pattern["root_n"]
                    break
            if words[0].lower() in {"какой", "какая", "какое"} and rels[:3] == ["det", "obj", "root"] \
                    and pos[1:3] == ["NOUN", "VERB"] and "nsubj" not in rels:
                rels[1] = "nsubj"
        return head_ids, rels, root_n

    def find_root(self, rels):
        """Return the 1-based number of the first token with the "root" relation or -1."""
        root_n = -1
        for n in range(len(rels)):
            if rels[n] == "root":
                root_n = n + 1
                break
        return root_n

    def get_elements(self, markup_elem):
        """Split a list of token dictionaries into parallel lists of ids, texts, heads and relations."""
        ids, words, head_ids, rels = [], [], [], []
        for elem in markup_elem:
            ids.append(elem["id"])
            words.append(elem["text"])
            head_ids.append(elem["head_id"])
            rels.append(elem["rel"])
        return ids, words, head_ids, rels

    def correct_cycle(self, ids, head_ids, rels, markup_elem):
        """Attach a token which is its own head to the root.

        If several tokens are their own heads, only the last one is corrected. Nothing is changed when there
        is no such token or no root.

        Args:
            ids: token ids
            head_ids: heads of the tokens (modified in place)
            rels: dependency relations of the tokens
            markup_elem: list of token dictionaries (modified in place)

        Returns:
            head_ids and markup_elem
        """
        cycle_num = -1
        for n, (elem_id, head_id) in enumerate(zip(ids, head_ids)):
            if str(head_id) == str(elem_id):
                cycle_num = n
        root_n = self.find_root(rels)
        # Both structures are updated only when a self-loop was really found: with cycle_num == -1 an
        # unguarded write would overwrite the head of the last token.
        if cycle_num > -1 and root_n > -1:
            head_ids[cycle_num] = root_n
            markup_elem[cycle_num]["head_id"] = root_n
        return head_ids, markup_elem

    def process_markup(self, markup_batch):
        """Make sure every tree has a root, apply corrections and serialize the trees.

        Args:
            markup_batch: batch of lists of token dictionaries

        Returns:
            batch of strings with one tab-separated line of ten columns per token (id, text, head and
            relation are filled in, the other columns are "_")
        """
        processed_markup_batch = []
        for markup_elem in markup_batch:
            processed_markup = []
            ids, words, head_ids, rels = self.get_elements(markup_elem)
            if "root" not in {rel.lower() for rel in rels}:
                # The root is chosen among tokens which are their own heads, then subjects, then verbs.
                found_root = False
                for n, (elem_id, head_id) in enumerate(zip(ids, head_ids)):
                    if elem_id == head_id:
                        rels[n] = "root"
                        head_ids[n] = 0
                        found_root = True
                if not found_root:
                    for n in range(len(ids)):
                        if rels[n] == "nsubj":
                            rels[n] = "root"
                            head_ids[n] = 0
                            found_root = True
                if not found_root:
                    for n in range(len(ids)):
                        if self.nlp(words[n])[0].pos_ == "VERB":
                            rels[n] = "root"
                            head_ids[n] = 0

            root_n = self.find_root(rels)
            head_ids, rels, root_n = self.correct_markup(words, head_ids, rels, root_n)
            # the final question mark is attached to the root
            if words[-1] == "?" and -1 < root_n != head_ids[-1]:
                head_ids[-1] = root_n

            head_ids, markup_elem = self.correct_cycle(ids, head_ids, rels, markup_elem)
            i = self.find_cycle(ids, head_ids)
            if i == 1 and root_n > -1:
                head_ids[i - 1] = root_n
            for elem_id, word, head_id, rel in zip(ids, words, head_ids, rels):
                processed_markup.append(f"{elem_id}\t{word}\t_\t_\t_\t_\t{head_id}\t{rel}\t_\t_")
            processed_markup_batch.append("\n".join(processed_markup))
        return processed_markup_batch

    def __call__(self, sentences, entity_offsets_batch):
        """Parse a batch of sentences.

        Args:
            sentences: batch of sentences
            entity_offsets_batch: for every sentence, list of (start, end) offsets of entities

        Returns:
            batch of serialized dependency trees (see process_markup)
        """
        sentences_tokens_batch, substr_dict_batch = self.preprocess_sentences(sentences, entity_offsets_batch)
        proc_syntax_batch = list(self.syntax.map(sentences_tokens_batch))
        markup_batch = self.get_markup(proc_syntax_batch, substr_dict_batch)
        processed_markup_batch = self.process_markup(markup_batch)
        return processed_markup_batch


@register('tree_to_sparql')
class TreeToSparql(Component):
    """
        Class for building of sparql query template using syntax parser
    """

    def __init__(self, sparql_queries_filename: str, syntax_parser: Component, kb_prefixes: Dict[str, str],
                 adj_to_noun: RuAdjToNoun = None, **kwargs):
        """Store the injected components, the Russian keyword sets and the preprocessed query templates.

        Args:
            sparql_queries_filename: file with sparql query templates
            syntax_parser: component for syntactic parsing of the input question
            kb_prefixes: prefixes for entities, relations and types in the knowledge base
            adj_to_noun: component deeppavlov.models.kbqa.tree_to_sparql:RuAdjToNoun
            **kwargs: ignored
        """
        # Injected components
        self.syntax_parser = syntax_parser
        self.adj_to_noun = adj_to_noun

        # Question words; "сколько" ("how many") additionally switches on counting
        self.how_many = "сколько"
        self.q_pronouns = {"какой", "какая", "какое", "каком", "каким", "какую", "кто", "что", "как", "когда",
                           "где", "чем", "сколько"}
        # Two-word question openings for which the first child of the root is used as the root
        self.change_root_tokens = {"каким был", "какой была"}

        # Lemmas that mark temporal ordering, ranking and date-like types
        self.first_tokens = {"первый", "первая", "первое"}
        self.last_tokens = {"последний"}
        self.begin_tokens = {"начинать", "начать"}
        self.end_tokens = {"завершить", "завершать", "закончить"}
        self.ranking_tokens = {"самый"}
        self.date_tokens = {"год", "месяц"}

        # spaCy pipeline used for lemmatisation, and the regex used to count tokens of hyphenated substrings
        self.nlp = spacy.load("ru_core_news_sm")
        self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")

        # Query templates are read once and preprocessed with the knowledge base prefixes
        self.sparql_queries_filename = expand_path(sparql_queries_filename)
        self.template_queries = preprocess_template_queries(read_json(self.sparql_queries_filename), kb_prefixes)

    def __call__(self, questions_batch: List[str], substr_batch: List[List[str]], tags_batch: List[List[str]],
                 offsets_batch: List[List[List[int]]], positions_batch: List[List[List[int]]],
                 probas_batch: List[List[float]]) -> Tuple[
        List[Union[str, Any]], List[Union[List[str], List[Union[str, Any]]]], List[Union[List[str], Any]], List[
            Union[List[Union[str, Any]], Any]], List[Union[List[Union[float, Any]], Any]], List[List[int]], List[
            Union[List[str], List[Any]]]]:
        """Choose SPARQL template numbers and the entity substrings to link for every question of the batch.

        The annotations of each question are first sorted by token position. The question is parsed with
        ``self.syntax_parse``; when a node for the unknown item is found, ``self.build_query`` selects the
        templates, otherwise (or when it yields nothing) templates are matched by entity counts only.

        Args:
            questions_batch: input questions
            substr_batch: entity substrings found in every question
            tags_batch: tags of the entity substrings
            offsets_batch: offsets of the entity substrings, forwarded to the syntax parser
            positions_batch: token indices of every entity substring
            probas_batch: values associated with the substrings, returned re-ordered alongside them

        Returns:
            seven parallel lists with one element per question: sanitized questions, template numbers,
            entity substrings, their tags, their probas, flags (1 - substring chosen for the query,
            0 - substring only appended from the input) and type substrings
        """
        substr_batch, tags_batch, offsets_batch, positions_batch, probas_batch = \
            self.sort_substr(substr_batch, tags_batch, offsets_batch, positions_batch, probas_batch)
        log.debug(f"substr: {substr_batch} tags: {tags_batch} positions: {positions_batch}")
        query_nums_batch, s_substr_batch, s_tags_batch, s_probas_batch, types_batch = [], [], [], [], []
        entities_to_link_batch = []
        clean_questions_batch = []
        for question, substr_list, tags_list, offsets_list, probas_list, positions in \
                zip(questions_batch, substr_batch, tags_batch, offsets_batch, probas_batch, positions_batch):
            # The counting flag belongs to one question; it must not leak into the next one of the batch
            count = False
            entities_dict, probas_dict = {}, {}
            for substr, tag, proba in zip(substr_list, tags_list, probas_list):
                entities_dict[substr.lower()] = tag
                probas_dict[substr.lower()] = proba
            # Hyphenated substrings: keep one position per ``tokenize`` token and shift the positions of all
            # following substrings (presumably because the parser keeps such words as one token - TODO confirm)
            for i, substr in enumerate(substr_list):
                if len(substr) > 2 and ("-" in substr or f"{substr}-" in question) and " - " not in substr:
                    if "-" in substr:
                        length = len(re.findall(self.re_tokenizer, substr))
                    else:
                        length = 3
                    substr_tokens = list(tokenize(substr))
                    positions[i] = [positions[i][j] for j in range(len(substr_tokens))]
                    for j in range(i + 1, len(substr_list)):
                        positions[j] = [ind - length + 1 for ind in positions[j]]

            root, tree, tree_desc, unknown_node, unknown_branch = self.syntax_parse(question, offsets_list)
            # Defaults used when no unknown node is found in the syntax tree
            query_nums = ["7"]
            s_substr_list = substr_list
            s_tags_list = tags_list
            s_probas_list = probas_list
            types_list = []
            if unknown_node:
                log.debug(f"syntax tree info 1, unknown node: {unknown_node.form}, unkn branch: {unknown_branch.form}")
                log.debug(f"wh_leaf: {self.wh_leaf}")
                clause_node, clause_branch = self.find_clause_node(root, unknown_branch)
                log.debug(f"clause node: {clause_node}")
                tok_and_ord = {node.ord: node for node in tree.descendants}
                appos_token_nums = sorted(self.find_appos_tokens(root, tok_and_ord, []))
                appos_tokens = [elem.form for elem in tree_desc if elem.ord in appos_token_nums]
                clause_token_nums = sorted(self.find_clause_tokens(root, tok_and_ord, clause_node))
                clause_tokens = [elem.form for elem in tree_desc if elem.ord in clause_token_nums]
                log.debug(f"appos tokens: {appos_tokens}")
                log.debug(f"clause_tokens: {clause_tokens}")
                question, ranking_tokens = self.sanitize_question(tree, root, appos_token_nums, clause_token_nums)
                # Tokens were removed from the question, so the tree has to be rebuilt
                if appos_token_nums or clause_token_nums:
                    root, tree, tree_desc, unknown_node, unknown_branch = self.syntax_parse(question, offsets_list)
                    log.debug(f"syntax tree info 2, unknown node: {unknown_node}, unkn branch: {unknown_branch}")

                if unknown_node:
                    modifiers, clause_modifiers = self.find_modifiers_of_unknown(unknown_node)
                    log.debug(f"modifiers: {modifiers} --- clause modifiers: {[nd.form for nd in clause_modifiers]}")
                    if f"{tree_desc[0].form.lower()} {tree_desc[1].form.lower()}" in self.change_root_tokens:
                        new_root = root.children[0]
                    else:
                        new_root = root
                    # Children of the root grouped by dependency relation: the unknown branch plus every
                    # branch that contains an entity or (for obl/nummod) a year or a number
                    root_desc = defaultdict(list)
                    for node in new_root.children:
                        if node.deprel not in ["punct", "advmod", "cop", "mark"]:
                            if node == unknown_branch:
                                root_desc[node.deprel].append(node)
                            else:
                                if self.find_entities(node, positions) or \
                                        (self.find_year_or_number(node) and node.deprel in ["obl", "nummod"]):
                                    root_desc[node.deprel].append(node)

                    if root.form.lower() == self.how_many or ("nsubj" in root_desc.keys() and
                                                              self.how_many in [nd.form.lower() for nd in
                                                                                root_desc["nsubj"]]):
                        count = True
                    log.debug(f"root_desc {root_desc.keys()}")
                    self.root_entity = False
                    # NOTE(review): ``positions`` holds lists of token indices, so this membership test of an
                    # int looks like it can never succeed; kept as is to preserve behaviour - confirm the intent
                    if root.ord - 1 in positions:
                        self.root_entity = True

                    temporal_order = self.find_first_last(new_root)
                    new_root_nf = self.nlp(new_root.form)[0].lemma_
                    if new_root_nf in self.begin_tokens or new_root_nf in self.end_tokens:
                        temporal_order = new_root_nf
                    query_nums, s_substr_list, types_list = self.build_query(new_root, unknown_branch, root_desc,
                                                                             unknown_node, modifiers, clause_modifiers,
                                                                             clause_node, positions, entities_dict,
                                                                             count, temporal_order, ranking_tokens)
                    s_tags_list, s_probas_list = [], []
                    for substr in s_substr_list:
                        substr = substr.replace(" - ", "-")
                        s_tags_list.append(entities_dict.get(substr.lower(), "E"))
                        s_probas_list.append(probas_dict.get(substr.lower(), 1.0))
            clean_questions_batch.append(question)
            if query_nums and s_substr_list:
                # Substrings chosen for the query are marked with 1, the remaining input substrings with 0
                entities_to_link = [1 for _ in s_substr_list]
                s_substr_list_lower = [s.lower() for s in s_substr_list]
                for substr, tag, proba in zip(substr_list, tags_list, probas_list):
                    if substr.lower() not in s_substr_list_lower:
                        s_substr_list.append(substr)
                        s_tags_list.append(tag)
                        s_probas_list.append(proba)
                        entities_to_link.append(0)
                s_substr_batch.append(s_substr_list)
                s_tags_batch.append(s_tags_list)
                s_probas_batch.append(s_probas_list)
                entities_to_link_batch.append(entities_to_link)
            else:
                # Fallback: match templates using only the number of entities
                mod_len = 0
                gr_len = 1
                if all([tags_list[i] == tags_list[0] for i in range(len(tags_list))]):
                    gr_len = len(substr_list)
                elif len(substr_list) > 1:
                    mod_len = 1
                for num, template in self.template_queries.items():
                    syntax_info = [gr_len, 0, mod_len, 0, False, False, False]
                    if syntax_info == list(template["syntax_structure"].values()):
                        query_nums.append(num)
                # The original input substrings are returned here, so the flags must match their number
                entities_to_link = [1 for _ in substr_list]
                s_substr_batch.append(substr_list)
                s_tags_batch.append(tags_list)
                s_probas_batch.append(probas_list)
                entities_to_link_batch.append(entities_to_link)
            query_nums_batch.append(query_nums)
            types_batch.append(types_list)
        log.debug(f"clean_questions: {clean_questions_batch} --- substr: {s_substr_batch} --- tags: {s_tags_batch} "
                  f"--- entities_to_link {entities_to_link_batch} --- types: {types_batch}")
        return clean_questions_batch, query_nums_batch, s_substr_batch, s_tags_batch, s_probas_batch, \
               entities_to_link_batch, types_batch

    def sort_substr(self, substr_batch: List[List[str]], tags_batch: List[List[str]],
                    offsets_batch: List[List[List[int]]], positions_batch: List[List[List[int]]],
                    probas_batch: List[List[float]]) -> Tuple[
        List[List[str]], List[List[str]], List[List[List[int]]], List[List[List[int]]], List[List[float]]]:
        """Reorder the annotations of every question by the first token position of each substring.

        Args:
            substr_batch: entity substrings per question
            tags_batch: tags of the substrings
            offsets_batch: offsets of the substrings
            positions_batch: token positions of the substrings; the first position is the sort key
            probas_batch: values associated with the substrings

        Returns:
            the five batches in the same order as the arguments, each question sorted consistently
        """
        sorted_batches = ([], [], [], [], [])
        for question_columns in zip(substr_batch, tags_batch, offsets_batch, positions_batch, probas_batch):
            # One row per substring: (substr, tag, offsets, positions, proba)
            rows = sorted(zip(*question_columns), key=lambda row: row[3][0])
            for column_idx, sorted_batch in enumerate(sorted_batches):
                sorted_batch.append([row[column_idx] for row in rows])
        return sorted_batches

    def syntax_parse(self, question: str, entity_offsets_list: List[List[int]]) -> Tuple[
        Union[str, Any], Union[str, Any], Union[str, Any], str, str]:
        """Parse the question and locate the root and the branch holding the unknown item.

        Args:
            question: question text
            entity_offsets_list: offsets of the entities in the question, forwarded to the syntax parser

        Returns:
            root node, tree, descendants of the tree, unknown node and unknown branch; elements that could
            not be obtained are left as empty strings (the root is whatever ``self.find_root`` returned)
        """
        conllu_markup = self.syntax_parser([question], [entity_offsets_list])[0]
        log.debug(f"syntax tree: \n{conllu_markup}")
        root = tree = tree_desc = ""
        try:
            tree = Conllu(filehandle=StringIO(conllu_markup)).read_tree()
            root = self.find_root(tree)
            tree_desc = tree.descendants
        except ValueError as e:
            log.warning(f"error in parsing syntax tree, {e}")
        if not root:
            # Nothing to search in without a root
            return root, tree, tree_desc, "", ""
        unknown_node, unknown_branch = self.find_branch_with_unknown(root)
        log.debug(f"syntax tree info, root: {root.form} unk_node: {unknown_node} unk_branch: {unknown_branch}")
        return root, tree, tree_desc, unknown_node, unknown_branch

    def sanitize_question(self, tree: Node, root: Node, appos_token_nums: List[int], clause_token_nums: List[int]) -> \
            Tuple[str, list]:
        """Rebuild the question without the apposition and clause tokens.

        When ranking tokens are found, the ranking tokens themselves and the question pronouns are replaced
        by their lemmas.

        Args:
            tree: syntax tree of the question
            root: root node of the tree
            appos_token_nums: ``ord`` values of apposition tokens to drop
            clause_token_nums: ``ord`` values of clause tokens to drop

        Returns:
            the sanitized question (tokens joined with spaces) and the list of ranking token ``ord`` values
        """
        ranking_tokens = self.find_ranking_tokens(root, appos_token_nums, clause_token_nums)
        dropped_ords = appos_token_nums + clause_token_nums
        kept_tokens = []
        for node in tree.descendants:
            if node.ord in dropped_ords:
                continue
            use_lemma = ranking_tokens and (node.ord in ranking_tokens or node.form.lower() in self.q_pronouns)
            kept_tokens.append(self.nlp(node.form)[0].lemma_ if use_lemma else node.form)
        question = " ".join(kept_tokens)
        log.debug(f"sanitized question: {question}")
        return question, ranking_tokens

    def find_root(self, tree: Node) -> Node:
        for node in tree.descendants:
            if node.deprel == "root" and node.children:
                return node

    def find_branch_with_unknown(self, root: Node) -> Tuple[str, str]:
        self.wh_leaf = False
        self.one_chain = False
        if root.form.lower() in self.q_pronouns:
            if "nsubj" in [node.deprel for node in root.children] or root.form.lower() in self.how_many:
                self.one_chain = True
            else:
                for node in root.children:
                    if node.deprel == "nsubj":
                        return node, node
        if not self.one_chain:
            for node in root.children:
                if node.form.lower() in self.q_pronouns:
                    if node.children:
                        for child in node.children:
                            if child.deprel in ["nmod", "obl"]:
                                return child, node
                    else:
                        self.wh_leaf = True
                else:
                    for child in node.descendants:
                        if child.form.lower() in self.q_pronouns:
                            return child.parent, node
        if self.wh_leaf or self.one_chain:
            for node in root.children:
                if node.deprel in ["nsubj", "obl", "obj", "nmod", "xcomp"] and node.form.lower() not in self.q_pronouns:
                    return node, node

        return "", ""

    def find_modifiers_of_unknown(self, node: Node) -> Tuple[List[Union[str, Any]], list]:
        modifiers = []
        clause_modifiers = []
        for mod in node.children:
            if mod.deprel in ["amod", "nmod"] or (mod.deprel == "appos" and mod.children):
                noun_mod = ""
                if self.adj_to_noun:
                    noun_mod = self.adj_to_noun.search(mod.form)
                if noun_mod:
                    modifiers.append(noun_mod)
                else:
                    modifiers.append(mod)
            if mod.deprel == "acl":
                clause_modifiers.append(mod)
        return modifiers, clause_modifiers

    def find_clause_node(self, root: Node, unknown_branch: Node) -> Tuple[str, str]:
        for node in root.children:
            if node.deprel == "obl" and node != unknown_branch:
                for elem in node.children:
                    if elem.deprel == "acl":
                        return elem, node
        return "", ""

    def find_entities(self, node: Node, positions: List[List[int]]) -> List[str]:
        """Return the entities whose tokens all lie inside the subtree of ``node``.

        Args:
            node: node whose subtree (the node and its descendants) is searched
            positions: token indices of every entity; index ``i`` corresponds to the node with ``ord == i + 1``

        Returns:
            entity strings built from the forms of the matched nodes joined with spaces (with the space
            before a dot removed); entities with an empty position list are skipped
        """
        node_desc = [(node.form, node.ord, node.parent)] + \
                    [(elem.form, elem.ord, elem.parent) for elem in node.descendants]
        node_desc = sorted(node_desc, key=lambda x: x[1])
        # First form seen for every ord, so that each token index is resolved with one lookup
        form_by_ord = {}
        for form, ord_num, _ in node_desc:
            form_by_ord.setdefault(ord_num, form)
        entities_list = []
        for pos_elem in positions:
            if not pos_elem:
                # An empty span can not describe an entity (it used to raise IndexError)
                continue
            entity = [form_by_ord[ind + 1] for ind in pos_elem if ind + 1 in form_by_ord]
            # Keep the entity only when every token of it was found in the subtree
            if len(entity) == len(pos_elem):
                entities_list.append(" ".join(entity).replace(" .", "."))
        log.debug(f"node: {node.form} --- found_entities: {entities_list} --- node_desc: {node_desc} --- "
                  f"positions: {positions}")
        return entities_list

    def find_year_or_number(self, node: Node) -> bool:
        found = False
        for elem in node.descendants:
            if elem.deprel == "nummod" or re.findall(r"[\d]{4}", elem.form):
                return True
        return found

    def find_year_constraint(self, node: Node) -> list:
        node_desc = [(node.form, node.ord)] + [(elem.form, elem.ord) for elem in node.descendants]
        node_desc = sorted(node_desc, key=lambda x: x[1])
        desc_text = " ".join([elem[0] for elem in node_desc])
        for symb in ".,:;)":
            desc_text = desc_text.replace(f" {symb}", symb)
        for pattern in [r"в ([\d]{3,4}) году", r"с ([\d]{3,4}) по ([\d]{3,4})"]:
            fnd = re.findall(pattern, desc_text)
            if fnd:
                return fnd
        return []

    def find_appos_tokens(self, node: Node, tok_and_ord: List[Tuple[Node, int]],
                          appos_token_nums: List[int]) -> List[int]:
        for elem in node.children:
            e_desc = elem.descendants
            if elem.deprel == "appos" and elem.ord > 1 and tok_and_ord[elem.ord - 1].deprel == "punct" \
                    and not all([nd.deprel in {"appos", "flat:name"} for nd in e_desc]) \
                    and not ({"«", '"', '``', '('} & {nd.form for nd in e_desc}):
                appos_token_nums.append(elem.ord)
                for desc in elem.descendants:
                    appos_token_nums.append(desc.ord)
            else:
                appos_token_nums = self.find_appos_tokens(elem, tok_and_ord, appos_token_nums)
        return appos_token_nums

    def find_clause_tokens(self, node: Node, tok_and_ord: Dict[int, Node], clause_node: Node) -> List[int]:
        clause_token_nums = []
        for elem in node.children:
            if elem != clause_node and elem.deprel == "acl":
                clause_token_nums.append(elem.ord)
                for desc in elem.descendants:
                    clause_token_nums.append(desc.ord)
            else:
                clause_token_nums = self.find_appos_tokens(elem, tok_and_ord, clause_token_nums)
        return clause_token_nums

    def find_first_last(self, node: Node) -> str:
        """Search the tree level by level for a node with an ordinal ``amod`` child and an ``nmod`` child.

        Args:
            node: node from which the search starts

        Returns:
            the matching lemma(s) from ``self.first_tokens``/``self.last_tokens`` joined with a space for
            the first node that has both an ``nmod`` child and such an ``amod`` child; an empty string
            when no node matches
        """
        ordinal_lemmas = self.first_tokens | self.last_tokens
        level = [node]
        while level:
            for current in level:
                # Lemmas of the children grouped by their dependency relation
                node_desc = defaultdict(set)
                for child in current.children:
                    node_desc[child.deprel].add(self.nlp(child.form.lower())[0].lemma_)
                log.debug(f"find_first_last {node_desc}")
                if "amod" in node_desc and "nmod" in node_desc:
                    matched = node_desc["amod"] & ordinal_lemmas
                    if matched:
                        return ' '.join(matched)
            level = [child for current in level for child in current.children]
        return ""

    def find_ranking_tokens(self, node: Node, appos_token_nums: List[int], clause_token_nums: List[int]) -> list:
        ranking_tokens = []
        for elem in node.descendants:
            if self.nlp(elem.form)[0].lemma_ in self.ranking_tokens \
                    and elem.ord not in appos_token_nums + clause_token_nums:
                ranking_tokens.append(elem.ord)
                ranking_tokens.append(elem.parent.ord)
                return ranking_tokens
        return ranking_tokens

    @staticmethod
    def choose_grounded_entity(grounded_entities: List[str], entities_dict: Dict[str, str]):
        tags = [entities_dict.get(entity.lower(), "") for entity in grounded_entities]
        if len(grounded_entities) > 1:
            if not all([tags[i] == tags[0] for i in range(1, len(tags))]):
                for f_tag in ["WORK_OF_ART", "FAC", "PERSON", "GPE"]:
                    for entity, tag in zip(grounded_entities, tags):
                        if tag == f_tag:
                            return [entity]
            elif not all([entity[0].islower() for entity in grounded_entities]):
                for entity in grounded_entities:
                    if entity[0].isupper():
                        return [entity]
        return grounded_entities

    def build_query(self, root: Node, unknown_branch: Node, root_desc: Dict[str, List[Node]], unknown_node: Node,
                    unknown_modifiers: List[Node], clause_modifiers: List[Node], clause_node: Node,
                    positions: List[List[int]], entities_dict: Dict[str, str], count: bool = False,
                    temporal_order: str = "", ranking_tokens: List[str] = None) -> Tuple[
        List[str], List[str], List[str]]:
        """Choose query template numbers and the entities/types that fill them from the syntactic structure.

        The sorted list of dependency relations of the root's children (``root_desc_deprels``) is compared
        with a number of known patterns; each pattern defines which branch provides grounded, qualifier and
        modifier entities. Templates are then either set directly (year, year range, first/last) or matched
        against ``self.template_queries`` by the counts of the collected items.

        Args:
            root: root of the syntax tree (possibly replaced by its first child by the caller)
            unknown_branch: child of the root whose subtree contains the unknown item
            root_desc: children of the root grouped by dependency relation
            unknown_node: node of the unknown item; its form is used as a type
            unknown_modifiers: modifiers of the unknown node (nodes or noun strings)
            clause_modifiers: ``acl`` children of the unknown node
            clause_node: clause node found by ``find_clause_node`` or an empty string
            positions: token indices of the entities
            entities_dict: mapping from lowercased entity substring to tag
            count: whether the question asks "how many"
            temporal_order: lemma marking first/last/begin/end or an empty string
            ranking_tokens: ranking tokens, used for logging only

        Returns:
            template numbers, entities (grounded, then qualifier, then modifier ones) and types
        """
        query_nums = []
        grounded_entities_list, types_list, modifiers_list, qualifier_entities_list = [], [], [], []
        found_year_or_number = False
        order = False
        # One entry per node for every relation that takes part in pattern matching.
        # NOTE(review): "nummod" is not in this set, so the ["nmod", "nsubj", "nummod"] pattern below
        # looks unreachable - confirm whether it should be added
        counted_deprels = {"nsubj", "obj", "obl", "iobj", "acl", "nmod", "xcomp", "cop"}
        root_desc_deprels = sorted(key for key, nodes in root_desc.items() if key in counted_deprels for _ in nodes)
        log.debug(f"build_query: root_desc.keys, {root_desc_deprels}, positions {positions}, wh_leaf {self.wh_leaf}, "
                  f"one_chain {self.one_chain}, temporal order {temporal_order}, ranking tokens {ranking_tokens}")
        if root_desc_deprels in [["nsubj", "obl"],
                                 ["nsubj", "obj"],
                                 ["nsubj", "xcomp"],
                                 ["obj", "xcomp"],
                                 ["nmod", "nsubj"],
                                 ["obj", "obl"],
                                 ["iobj", "nsubj"],
                                 ["acl", "nsubj"],
                                 ["cop", "nsubj", "obl"],
                                 ["obj"],
                                 ["obl"],
                                 ["nmod"],
                                 ["xcomp"],
                                 ["nsubj"]]:
            if self.wh_leaf or self.one_chain:
                if root_desc_deprels == ["nsubj", "obl"]:
                    grounded_entities_list = self.find_entities(root_desc["nsubj"][0], positions)
                    if not grounded_entities_list:
                        grounded_entities_list = self.find_entities(root_desc["obl"][0], positions)
                else:
                    for nodes in root_desc.values():
                        if nodes[0].form not in self.q_pronouns:
                            grounded_entities_list = self.find_entities(nodes[0], positions)
                            if grounded_entities_list:
                                break
            else:
                if self.root_entity:
                    grounded_entities_list = [root.form]
                # The first branch other than the unknown one that holds entities grounds the query,
                # the unknown node provides the type
                for nodes in root_desc.values():
                    if nodes[0] != unknown_branch:
                        grounded_entities_list = self.find_entities(nodes[0], positions)
                        if grounded_entities_list:
                            type_entity = unknown_node.form
                            types_list.append(type_entity)
                            break

                if unknown_modifiers:
                    for modifier in unknown_modifiers:
                        if isinstance(modifier, str):
                            modifiers_list.append(modifier)
                        else:
                            modifier_entities = self.find_entities(modifier, positions)
                            if modifier_entities:
                                modifiers_list += modifier_entities
                if clause_modifiers:
                    found_year_or_number = self.find_year_or_number(clause_modifiers[0])
                    if found_year_or_number:
                        query_nums.append("0")
                    qualifier_entities_list = self.find_entities(clause_modifiers[0], positions)

        if root_desc_deprels == ["nsubj", "obl", "obl"]:
            grounded_entities_list = self.find_entities(root_desc["nsubj"][0], positions)
            for node in root_desc["obl"]:
                if node == unknown_branch:
                    types_list.append(node.form)
                else:
                    grounded_entities_list += self.find_entities(node, positions)

        if root_desc_deprels == ["nsubj", "obj", "obj"]:
            obj_desc = root_desc["obj"]
            qualifier_entities_list = self.find_entities(obj_desc[0], positions)
            grounded_entities_list = self.find_entities(obj_desc[1], positions)

        year_constraint = self.find_year_constraint(root)
        if root_desc_deprels == ["nmod", "nsubj"] and year_constraint:
            # ``re.findall`` yields tuples for the two-group range pattern and plain strings for the
            # one-group single-year pattern, so the element type (not its length) tells them apart
            if isinstance(year_constraint[0], tuple):
                query_nums.append("24")
            else:
                query_nums.append("0")

        if root_desc_deprels == ["obj", "xcomp"]:
            grounded_entities_list = self.find_entities(root_desc["xcomp"][0], positions)

        if (self.wh_leaf and root_desc_deprels in [["nsubj", "obj", "obl"], ["obj", "obl"]]) \
                or (root_desc_deprels in [["nsubj", "obj", "obl"], ["obl", "xcomp"]]
                    and self.find_year_or_number(root_desc["obl"][0])):
            found_year_or_number = self.find_year_or_number(root_desc["obl"][0])
            nsubj_ent_list, obj_ent_list = [], []
            if "nsubj" in root_desc_deprels:
                nsubj_ent_list = self.find_entities(root_desc["nsubj"][0], positions)
            if "obj" in root_desc:
                obj_ent_list = self.find_entities(root_desc["obj"][0], positions)
            obl_ent_list = self.find_entities(root_desc["obl"][0], positions)
            log.debug(f"nsubj_ent: {nsubj_ent_list} --- obj_ent: {obj_ent_list} obl_ent: {obl_ent_list}")
            if self.wh_leaf:
                grounded_entities_list = obl_ent_list
                qualifier_entities_list = obj_ent_list
            elif not found_year_or_number and nsubj_ent_list and obl_ent_list:
                grounded_entities_list = nsubj_ent_list
                modifiers_list = obl_ent_list
            else:
                grounded_entities_list = obj_ent_list
            if found_year_or_number:
                query_nums.append("0")
            if not grounded_entities_list:
                grounded_entities_list = self.find_entities(root, positions)
                grounded_entities_list = self.choose_grounded_entity(grounded_entities_list, entities_dict)

        if clause_node:
            for node in clause_node.children:
                if node.deprel == "obj":
                    grounded_entities_list = self.find_entities(node, positions)
                if self.find_year_or_number(node):
                    query_nums.append("0")

            if not self.wh_leaf:
                type_entity = unknown_node.form
                types_list.append(type_entity)

        if root_desc_deprels == ["nmod", "nmod"]:
            grounded_entities_list = self.find_entities(root_desc["nmod"][0], positions)
            modifiers_list = self.find_entities(root_desc["nmod"][1], positions)

        if root_desc_deprels == ["nmod", "nsubj", "nummod"]:
            if not self.wh_leaf:
                grounded_entities_list = self.find_entities(root_desc["nmod"][0], positions)
                found_year_or_number = self.find_year_or_number(root_desc["nummod"][0])

        if temporal_order and not query_nums:
            # First/last questions: ground the query with the first branch that holds entities
            for deprel in root_desc:
                for node in root_desc[deprel]:
                    entities = self.find_entities(node, positions)
                    if entities:
                        grounded_entities_list = entities
                        break
                if grounded_entities_list:
                    break
            if temporal_order in self.first_tokens | self.begin_tokens:
                query_nums += ["22"]
            if temporal_order in self.last_tokens | self.end_tokens:
                query_nums += ["23"]
        log.debug(f"query_nums: {query_nums} --- year_constraint: {year_constraint}")

        if count:
            grounded_entities_list = self.find_entities(root, positions)

        grounded_entities_list = self.choose_grounded_entity(grounded_entities_list, entities_dict)
        entities_list = grounded_entities_list + qualifier_entities_list + modifiers_list
        # Single-word types whose lemma is a date word are dropped
        types_list = [tp for tp in types_list
                      if not (len(tp.split()) == 1 and self.nlp(tp)[0].lemma_ in self.date_tokens)]

        gr_len = len(grounded_entities_list)
        types_len = len(types_list)
        mod_len = len(modifiers_list)
        qua_len = len(qualifier_entities_list)
        if qua_len or count:
            types_len = 0

        if not temporal_order and not query_nums:
            for num, template in self.template_queries.items():
                template_structure = list(template["syntax_structure"].values())
                syntax_info = [gr_len, types_len, mod_len, qua_len, found_year_or_number, count, order]
                if syntax_info == template_structure:
                    query_nums.append(num)
                elif mod_len:
                    # With modifiers present, templates without types are accepted as well; the ``elif``
                    # keeps a template from being added twice when there are no types to begin with
                    syntax_info[1] = 0
                    if syntax_info == template_structure:
                        query_nums.append(num)

        log.debug(f"tree_to_sparql, grounded entities: {grounded_entities_list} --- types: {types_list} --- "
                  f"modifier entities: {modifiers_list} --- qualifier entities: {qualifier_entities_list} --- "
                  f"year_or_number {found_year_or_number} --- count: {count} --- order: {order} --- "
                  f"query nums: {query_nums}")

        return query_nums, entities_list, types_list


================================================
FILE: deeppavlov/models/kbqa/type_define.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle
from typing import List

import spacy
from nltk.corpus import stopwords

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register


@register('answer_types_extractor')
class AnswerTypesExtractor:
    """Heuristically defines the expected answer types for a question.

    The question is parsed with spaCy and matched against a few interrogative prefixes
    ("who", "where", "when", ... and their Russian counterparts) to pick a set of answer
    types. If no entity substrings were supplied for a question, the first ``nsubj`` token
    of the question is used as a fallback entity.
    """

    def __init__(self, lang: str, types_filename: str, types_sets_filename: str,
                 num_types_to_return: int = 15, **kwargs):
        """

        Args:
            lang: language tag; ``"@en"`` and ``"@ru"`` are the values handled here
            types_filename: filename with dictionary where keys are type ids and values are type labels
            types_sets_filename: filename with dictionary where keys are NER tags and values are Wikidata types
                corresponding to tags
            num_types_to_return: how many answer types to return for each question
            **kwargs:
        """
        self.lang = lang
        self.types_filename = str(expand_path(types_filename))
        self.types_sets_filename = str(expand_path(types_sets_filename))
        self.num_types_to_return = num_types_to_return
        # NOTE(review): for any other ``lang`` value ``stopwords``, ``nlp`` and ``pronouns`` stay unset,
        # so ``__call__`` would fail with AttributeError — confirm whether this should raise here instead.
        if self.lang == "@en":
            self.stopwords = set(stopwords.words("english"))
            self.nlp = spacy.load("en_core_web_sm")
            self.pronouns = ["what"]
        elif self.lang == "@ru":
            self.stopwords = set(stopwords.words("russian"))
            self.nlp = spacy.load("ru_core_news_sm")
            self.pronouns = ["какой", "каком"]
        # NOTE: pickle.load can execute arbitrary code; these files must come from a trusted source.
        with open(self.types_filename, 'rb') as fl:
            self.types_dict = pickle.load(fl)
        with open(self.types_sets_filename, 'rb') as fl:
            self.types_sets = pickle.load(fl)

    def __call__(self, questions_batch: List[str], entity_substr_batch: List[List[str]],
                 tags_batch: List[List[str]], types_substr_batch: List[List[str]] = None):
        """Define answer types and fallback entities for a batch of questions.

        Args:
            questions_batch: question texts
            entity_substr_batch: for every question, the entity substrings found in it
            tags_batch: for every question, the tags of the entity substrings
            types_substr_batch: optional precomputed type substrings; when ``None`` they are
                derived from the dependency parse of each question

        Returns:
            a tuple of three batch-aligned lists:
            the answer types chosen for each question (an empty set when no prefix rule fires),
            the entity substrings (replaced by the first ``nsubj`` token when the input list was empty),
            and the matching tags (``"MISC"`` for the fallback entity).
        """
        # NOTE(review): ``types_substr_batch`` is built here but is not read again anywhere below in this
        # method and is not part of the returned value — confirm whether it was meant to be returned.
        if types_substr_batch is None:
            types_substr_batch = []
            for question, entity_substr_list in zip(questions_batch, entity_substr_batch):
                types_substr = []
                type_noun = ""
                doc = self.nlp(question)
                # Token text -> position in the question; a repeated token text keeps its last position.
                token_pos_dict = {}
                for n, token in enumerate(doc):
                    token_pos_dict[token.text] = n
                # Take the head of the first interrogative pronoun whose head has dependency "attr"/"nsubj".
                for token in doc:
                    if token.text.lower() in self.pronouns and token.head.dep_ in ["attr", "nsubj"]:
                        type_noun = token.head.text
                        # The noun is kept only if it is not a part of an already found entity substring.
                        if not any([type_noun in entity_substr.lower() for entity_substr in entity_substr_list]):
                            types_substr.append(type_noun)
                        break
                if type_noun:
                    # Collect modifiers attached to the type noun: the first "amod"/"compound" token
                    # (the loop stops there) or "prep" tokens with exactly one child seen before it.
                    for token in doc:
                        if token.head.text == type_noun and token.dep_ in ["amod", "compound"]:
                            type_adj = token.text
                            if not any([type_adj.lower() in entity_substr.lower() for entity_substr in
                                        entity_substr_list]):
                                types_substr.append(type_adj)
                            break
                        elif token.head.text == type_noun and token.dep_ == "prep":
                            if len(list(token.children)) == 1 \
                                    and not any([list(token.children)[0].text in entity_substr.lower()
                                                 for entity_substr in entity_substr_list]):
                                types_substr += [token.text, list(token.children)[0].text]
                # Case-sensitive substring check on the raw question text.
                elif any([word in question for word in self.pronouns]):
                    for token in doc:
                        if token.dep_ == "nsubj" and not any([token.text in entity_substr.lower()
                                                              for entity_substr in entity_substr_list]):
                            types_substr.append(token.text)
                # Restore the order in which the collected words appear in the question.
                types_substr = [(token, token_pos_dict[token]) for token in types_substr]
                types_substr = sorted(types_substr, key=lambda x: x[1])
                types_substr = " ".join([elem[0] for elem in types_substr])
                types_substr_batch.append(types_substr)
        types_sets_batch = [set() for _ in questions_batch]
        for n, (question, types_sets) in enumerate(zip(questions_batch, types_sets_batch)):
            question = question.lower()
            # Every set was just created empty, so this condition holds for each question.
            if not types_sets:
                if self.lang == "@ru":
                    if question.startswith("кто"):
                        types_sets_batch[n] = self.types_sets["PER"]
                    elif question.startswith("где"):
                        types_sets_batch[n] = self.types_sets["LOC"]
                    elif any([question.startswith(elem) for elem in ["когда", "в каком году", "в каком месяце"]]):
                        types_sets_batch[n] = {"date"}
                    elif len(question.split()) > 1 and (any([question.startswith(elem) for elem in ["кем ", "как"]]) \
                                                        or question.split()[1].startswith("как")):
                        types_sets_batch[n] = {"not_date"}
                elif self.lang == "@en":
                    if question.startswith("who"):
                        types_sets_batch[n] = self.types_sets["PER"]
                    elif question.startswith("where"):
                        types_sets_batch[n] = self.types_sets["LOC"]
                    elif any([question.startswith(elem) for elem in ["when", "what year", "what month"]]):
                        types_sets_batch[n] = {"date"}

        # NOTE(review): ``new_entity_offsets_batch`` is never filled or returned.
        new_entity_substr_batch, new_entity_offsets_batch, new_tags_batch = [], [], []
        for question, entity_substr_list, tags_list in zip(questions_batch, entity_substr_batch, tags_batch):
            new_entity_substr, new_tags = [], []
            if not entity_substr_list:
                # No entities were detected: fall back to the first "nsubj" token of the question, if any.
                doc = self.nlp(question)
                for token in doc:
                    if token.dep_ == "nsubj":
                        new_entity_substr.append(token.text)
                        new_tags.append("MISC")
                        break
                new_entity_substr_batch.append(new_entity_substr)
                new_tags_batch.append(new_tags)
            else:
                new_entity_substr_batch.append(entity_substr_list)
                new_tags_batch.append(tags_list)

        return types_sets_batch, new_entity_substr_batch, new_tags_batch


================================================
FILE: deeppavlov/models/kbqa/utils.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import re
from collections import namedtuple
from typing import List, Tuple, Dict, Any


def find_query_features(query, qualifier_rels=None, question=None, order_from_query=None):
    """Extract the answer variable, the ordering and the ``contains`` filters from a SPARQL query.

    Args:
        query: SPARQL query text; it is lower-cased and ``select distinct`` is reduced to ``select``
            before matching
        qualifier_rels: optional sequence; if its first element has more than one item after the
            first four, the sorting order written in the query is used
        question: text of the question, used to guess the sorting order when the order written
            in the query is not trusted
        order_from_query: if truthy, the sorting order written in the query is used

    Returns:
        a tuple ``(answer_ent, order_info, filter_from_query)`` where ``answer_ent`` is the list of
        selected expressions (e.g. ``["?ent"]`` or ``["count(?obj)"]``), ``order_info`` is a
        namedtuple ``(variable, sorting_order)`` (both ``None`` without ``order by``) and
        ``filter_from_query`` is a list of ``(variable, value)`` pairs from ``contains(...)`` filters.
    """
    query = query.lower().replace("select distinct", "select")
    # Raw strings: "\(", "\?" and "\w" are invalid escapes in ordinary string literals.
    answer_ent = re.findall(r"select [\(]?([\S]+) ", query)
    order_info_nt = namedtuple("order_info", ["variable", "sorting_order"])
    order_variable = re.findall(r"order by (asc|desc)\((.*)\)", query)
    if order_variable:
        if (qualifier_rels and len(qualifier_rels[0][4:]) > 1) or order_from_query:
            answers_sorting_order = order_variable[0][0]
        elif question:
            answers_sorting_order = order_of_answers_sorting(question)
        else:
            # No question text (the default): use the order the heuristic yields for an empty question
            # instead of failing on ``None.lower()``.
            answers_sorting_order = "asc"
        order_info = order_info_nt(order_variable[0][1], answers_sorting_order)
    else:
        order_info = order_info_nt(None, None)
    filter_from_query = re.findall(r"contains\((\?\w), (.+?)\)", query)
    return answer_ent, order_info, filter_from_query


def extract_year(question_tokens: List[str], question: str) -> str:
    """Find a year mentioned in the question.

    The search goes through three stages and stops at the first hit:
    a full date in the question text, a "from ... to ..." / "с ... по ..." range
    (the first bound is returned), and finally four digits inside any single token.

    Args:
        question_tokens: tokens of the question
        question: text of the question

    Returns:
        the year as a string, or an empty string if nothing was found
    """
    full_date_patterns = [r'.*\d{1,2}/\d{1,2}/(\d{4}).*', r'.*\d{1,2}-\d{1,2}-(\d{4}).*', r'.*(\d{4})-\d{1,2}-\d{1,2}.*']
    range_patterns = [r"from ([\d]{3,4}) to [\d]{3,4}", r"с ([\d]{3,4}) по [\d]{3,4}"]
    single_token_patterns = [r'(\d{4})', r'^(\d{4})-.*', r'.*-(\d{4})$']

    # Stage 1: a complete date somewhere in the question text.
    for date_pattern in full_date_patterns:
        date_match = re.search(date_pattern, question)
        if date_match is not None:
            return date_match.group(1)

    # Stage 2: a range of years, the lower bound wins.
    for range_pattern in range_patterns:
        range_bounds = re.findall(range_pattern, question)
        if range_bounds:
            return range_bounds[0]

    # Stage 3: token by token, patterns tried in order for every token.
    token_matches = (re.search(token_pattern, token)
                     for token in question_tokens
                     for token_pattern in single_token_patterns)
    return next((found.group(1) for found in token_matches if found is not None), "")


def extract_number(question_tokens: List[str], question: str) -> str:
    """Find a number mentioned in the question.

    A number in scientific notation (e.g. ``1.5e+10``) found in the question text has priority;
    otherwise the first token starting with a digit is taken. The ordinals ``1st``, ``2nd`` and
    ``3rd`` are reduced to digits, a sentence-final dot is removed and an insignificant fractional
    part is dropped (``"5.0"`` -> ``"5"``, ``"1.50"`` -> ``"1.5"``).

    Args:
        question_tokens: tokens of the question
        question: text of the question

    Returns:
        the number as a string, or an empty string if nothing was found
    """
    number = ""
    fnd = re.search(r'.*(\d\.\d+e\+\d+)\D*', question)
    if fnd is not None:
        number = fnd.group(1)
    else:
        for tok in question_tokens:
            # Slicing keeps an empty token from raising IndexError.
            if tok[:1].isdigit():
                number = tok
                break

    number = number.replace('1st', '1').replace('2nd', '2').replace('3rd', '3')
    # ``str.strip(".0")`` removes every leading/trailing "." and "0" character, which turned
    # "10" into "1" and "2000" into "2"; only the insignificant tail must be removed.
    number = number.rstrip(".")
    number = re.sub(r'\.0+$', '', number)
    number = re.sub(r'(\.\d*[1-9])0+$', r'\1', number)

    return number


def order_of_answers_sorting(question: str) -> str:
    """Guess the sorting order of answers from the wording of the question.

    Args:
        question: text of the question

    Returns:
        ``"desc"`` if the lower-cased question contains one of the marker substrings
        (matched as plain substrings, not whole words), ``"asc"`` otherwise
    """
    descending_markers = ("maximum", "highest", "max ", "greatest", "most", "longest", "biggest", "deepest",
                          "завершил", "закончил", "завершает")
    lowered = question.lower()
    if any(marker in lowered for marker in descending_markers):
        return "desc"
    return "asc"


def make_combs(entity_ids: List[List[str]], permut: bool) -> List[List[str]]:
    """Build combinations of candidates, one candidate per slot, ordered by total candidate rank.

    Args:
        entity_ids: for every slot, the list of its candidates (the position in the list is the rank)
        permut: whether every combination is additionally expanded into all its permutations

    Returns:
        list of combinations; each one holds the chosen candidates followed by the sum of their ranks,
        and combinations with a smaller sum come first
    """
    ranked_slots = [[(candidate, rank) for rank, candidate in enumerate(candidates)]
                    for candidates in entity_ids]

    def is_kept(comb) -> bool:
        # NOTE(review): the check looks at ``candidate[0]`` — the first character when candidates are
        # strings — exactly as before; confirm whether whole ids were meant to be compared.
        heads = [item[0][0] for item in comb]
        same_suffix = all([head.split("/")[-1] == heads[0].split("/")[-1] for head in heads])
        return not (same_suffix and not all([head == heads[0] for head in heads]))

    kept_combs = [comb for comb in itertools.product(*ranked_slots) if is_kept(comb)]
    if permut:
        expanded = [perm for comb in kept_combs for perm in itertools.permutations(comb)]
    else:
        expanded = kept_combs

    def rank_sum(comb) -> int:
        return sum([rank for _, rank in comb])

    # ``sorted`` is stable, so combinations with equal rank sums keep their generation order.
    ordered = sorted(expanded, key=rank_sum)
    return [[candidate for candidate, _ in comb] + [rank_sum(comb)] for comb in ordered]


def fill_slots(query: str, entity_comb: List[str], type_comb: List[str], rel_comb: List[Tuple[str, float]],
               delete_rel_prefix: bool = False) -> str:
    """Substitute entity, type and relation slots of a query with concrete values.

    The slots are the substrings ``e1, e2, ...``, ``t1, t2, ...`` and ``r1, r2, ...``. The last element
    of every combination is skipped; the remaining ones are matched to the slots by position.

    Args:
        query: query text with slots
        entity_comb: values for the ``e`` slots plus one trailing element that is ignored
        type_comb: values for the ``t`` slots plus one trailing element that is ignored
        rel_comb: ``(relation, score)`` pairs for the ``r`` slots plus one trailing element that is ignored;
            relations starting with ``?`` are variables and are not substituted
        delete_rel_prefix: whether only the part of a relation after the last ``/`` is inserted

    Returns:
        the query with the slots replaced
    """
    for slot_num, entity in enumerate(entity_comb[:-1], start=1):
        query = query.replace(f"e{slot_num}", entity)
    for slot_num, entity_type in enumerate(type_comb[:-1], start=1):
        query = query.replace(f"t{slot_num}", entity_type)
    for slot_num, (rel, _score) in enumerate(rel_comb[:-1], start=1):
        if rel.startswith("?"):
            continue
        rel_value = rel.split("/")[-1] if delete_rel_prefix else rel
        query = query.replace(f"r{slot_num}", rel_value)
    return query


def correct_variables(query_triplets: List[str], answer_ent: List[str], query_info: Dict[str, str]):
    """Rename the variables of query triplets to the variable names given in ``query_info``.

    For every triplet and every answer variable, the first ``?``-variable of the triplet is inspected:
    if it is not a substring of the answer variable it becomes ``query_info["mid_var"]``, otherwise
    (unless it already is one of the two target names) it becomes ``query_info["unk_var"]``.
    Afterwards every occurrence of the answer variable in the triplet is replaced with
    ``query_info["unk_var"]``.

    Args:
        query_triplets: triplets as whitespace-separated strings; the list is modified in place
        answer_ent: answer variables of the query
        query_info: dictionary with the keys ``"mid_var"`` and ``"unk_var"``

    Returns:
        the same ``query_triplets`` list with renamed variables
    """
    for triplet_idx in range(len(query_triplets)):
        for ent_var in answer_ent:
            # Re-read the triplet: a previous answer variable may already have changed it.
            parts = query_triplets[triplet_idx].split()
            for part_idx, part in enumerate(parts):
                if not part.startswith("?"):
                    continue
                if part not in ent_var:
                    parts[part_idx] = query_info["mid_var"]
                    break
                if part not in [query_info["mid_var"], query_info["unk_var"]]:
                    parts[part_idx] = query_info["unk_var"]
                    break
            rebuilt = " ".join(parts)
            query_triplets[triplet_idx] = rebuilt.replace(ent_var, query_info["unk_var"])
    return query_triplets


def query_from_triplets(query_triplets: List[str], answer_ent: List[str], query_info: Dict[str, str]) -> str:
    """Assemble a SPARQL ``SELECT`` query from triplets.

    Args:
        query_triplets: triplets of the ``WHERE`` clause, they are joined with ``" . "``
        answer_ent: answer expressions of the source query; if the first one starts with ``count``
            (case-insensitive), a ``COUNT`` query is produced
        query_info: dictionary with the key ``"unk_var"`` — the variable to select

    Returns:
        text of the query
    """
    where_body = " . ".join(query_triplets)
    is_count_query = bool(answer_ent) and answer_ent[0].lower().startswith("count")
    if is_count_query:
        select_target = f"COUNT({query_info['unk_var']})"
    else:
        select_target = f"{query_info['unk_var']}"
    assembled = f"SELECT {select_target} WHERE {{ {where_body}. }}"
    # A triplet that already ends with " ." would otherwise produce a doubled dot.
    return assembled.replace(" ..", ".")


def fill_query(query: List[str], entity_comb: List[str], type_comb: List[str], rel_comb: List[Tuple[str, float]],
               map_query_str_to_kb) -> List[str]:
    """Turn a query triplet with slots into a triplet of knowledge base identifiers.

    The triplet is joined into a string, the short prefixes are replaced according to
    ``map_query_str_to_kb``, the slots are filled with ``fill_slots`` and the relations ``P00`` / ``P0``
    under ``http://wpd/`` are mapped to ``http://wl`` / ``http://wd``.

    Example of the arguments:
        query: ["wd:E1", "p:R1", "?s"]
        entity_comb: ["Q159"]
        type_comb: []
        rel_comb: ["P17"]
        map_query_str_to_kb = [("P0", "http://wd"),
                               ("P00", "http://wl"),
                               ("wd:", "http://we/"),
                               ("wdt:", "http://wpd/"),
                               (" p:", " http://wp/"),
                               ("ps:", "http://wps/"),
                               ("pq:", "http://wpq/")]

    Args:
        query: elements of a query triplet
        entity_comb: entities for the slots, passed to ``fill_slots``
        type_comb: types for the slots, passed to ``fill_slots``
        rel_comb: relations for the slots, passed to ``fill_slots``
        map_query_str_to_kb: pairs ``(substring of the query, replacement)``, applied in the given order

    Returns:
        the filled query split by single spaces
    """
    query = " ".join(query)

    for query_str, wikidata_str in map_query_str_to_kb:
        query = query.replace(query_str, wikidata_str)
    query = fill_slots(query, entity_comb, type_comb, rel_comb)
    # "http://wpd/P0" is a prefix of "http://wpd/P00": the longer key must be replaced first,
    # otherwise it is rewritten to "http://wd0" and the second replacement never matches.
    query = query.replace("http://wpd/P00", "http://wl")
    query = query.replace("http://wpd/P0", "http://wd")
    query = query.split(' ')
    return query


def make_sparql_query(query_info: Tuple[List[str], List[str], List[str], Dict[str, Any], Dict[str, Any]],
                      entities: List[str], rels: List[Tuple[str, float]], types: List[str],
                      query_info_dict: Dict[str, str]) -> List[str]:
    """Build the texts of SPARQL queries for a filled query template.

    Args:
        query_info: tuple ``(query_triplets, filled_triplets, answer_ent, filter_info, order_info)``;
            only the triplets, the answer expressions and the filter info are used here
        entities: entities for the slots of the triplets
        rels: relations for the slots of the triplets
        types: types for the slots of the triplets
        query_info_dict: names of the variables, passed to ``correct_variables`` and ``query_from_triplets``

    Returns:
        a single query if any element of the filter info contains ``"qualifier"``,
        otherwise one query for every ordering of the triplets
    """
    raw_triplets, _filled_triplets, answer_ent, filter_info, _order_info = query_info
    slotted_triplets = [fill_slots(triplet, entities, types, rels, delete_rel_prefix=True)
                        for triplet in raw_triplets]
    slotted_triplets = correct_variables(slotted_triplets, answer_ent, query_info_dict)

    qualifier_flags = ["qualifier" in filter_elem for filter_elem in filter_info]
    if any(qualifier_flags):
        return [query_from_triplets(slotted_triplets, answer_ent, query_info_dict)]
    return [query_from_triplets(triplets_order, answer_ent, query_info_dict)
            for triplets_order in itertools.permutations(slotted_triplets)]


def merge_sparql_query(query_info: Tuple[List[str], List[str], Dict[str, Any], Dict[str, Any]],
                       query_info_dict: Dict[str, str]) -> str:
    """Build the text of a SPARQL query from already prepared triplets.

    Args:
        query_info: tuple ``(query_triplets, answer_ent, filter_info, order_info)``;
            the filter and order info are not used here
        query_info_dict: names of the variables, passed to ``query_from_triplets``

    Returns:
        text of the query
    """
    triplets, answer_ent, _filter_info, _order_info = query_info
    return query_from_triplets(triplets, answer_ent, query_info_dict)


def preprocess_template_queries(template_queries: Dict[str, Any], kb_prefixes: Dict[str, str]) -> Dict[str, Any]:
    """Enrich query templates with features derived from the text of their SPARQL queries.

    For every template the first ``{...}`` group of ``"query_template"`` is split into triplets and
    the following keys are filled: ``rel_types`` and ``query_sequence`` (only if absent), ``rel_dirs``,
    ``n_hops``, ``entities_and_types_num``, ``entities_and_types_select``, ``syntax_structure``,
    ``return_if_found`` and ``priority`` (the last two only if absent).

    Args:
        template_queries: dictionary of templates, it is modified in place
        kb_prefixes: prefixes used in the templates, with the keys ``"entity"``, ``"type"``,
            ``"type_rel"`` and ``"rel"``

    Returns:
        the same ``template_queries`` dictionary
    """

    def has_prefix(token: str, prefix_key: str) -> bool:
        return token.startswith(kb_prefixes[prefix_key])

    def is_variable(token: str) -> bool:
        return token.startswith("?")

    for template_num, template in template_queries.items():
        query = template["query_template"]
        where_clause = re.findall("{[ ]?(.*?)[ ]?}", query)[0]
        q_triplets = [chunk.split(' ')[:3] for chunk in where_clause.split(' . ')]
        if "rel_types" not in template:
            template["rel_types"] = ["direct"] * len(q_triplets)
        rel_types = template["rel_types"]

        rel_dirs, n_hops = [], []
        entities, types = set(), set()
        gr_ent, mod_ent, q_ent = set(), set(), set()

        for n, (triplet, rel_type) in enumerate(zip(q_triplets, rel_types)):
            # Direction is recorded for every relation except the type relation.
            if not has_prefix(triplet[1], "type_rel"):
                rel_dirs.append("forw" if is_variable(triplet[2]) else "backw")

            for ind in (0, 2):
                if has_prefix(triplet[ind], "entity"):
                    entities.add(triplet[ind])
                elif has_prefix(triplet[ind], "type"):
                    types.add(triplet[ind])

            # Entities are split into qualifier, grounded (subject) and modifier (object) ones.
            if rel_type in {"qualifier", "statement"}:
                if has_prefix(triplet[2], "entity"):
                    q_ent.add(triplet[2])
            elif has_prefix(triplet[0], "entity"):
                gr_ent.add(triplet[0])
            elif has_prefix(triplet[2], "entity"):
                mod_ent.add(triplet[2])

            if has_prefix(triplet[1], "rel") and is_variable(triplet[0]) and is_variable(triplet[2]):
                hop_kind = "2-hop"
            elif n == 0 and len(q_triplets) == 2 and has_prefix(q_triplets[1][1], "rel") \
                    and is_variable(q_triplets[1][0]) and is_variable(q_triplets[1][2]):
                hop_kind = "1-of-2-hop"
            else:
                hop_kind = "1-hop"
            n_hops.append(hop_kind)

        lowered_query = query.lower()
        # The order of the keys matters: the values are compared as a list elsewhere.
        syntax_structure = {"gr_ent": len(gr_ent), "types": len(types), "mod_ent": len(mod_ent),
                            "q_ent": len(q_ent),
                            "year_or_number": "filter" in lowered_query,
                            "count": "count" in lowered_query,
                            "order": "order" in lowered_query}

        if "query_sequence" not in template:
            template["query_sequence"] = list(range(1, len(q_triplets) + 1))
        template["rel_dirs"] = rel_dirs
        template["n_hops"] = n_hops
        template["entities_and_types_num"] = [len(entities), len(types)]
        entities_str = '_'.join(str(num) for num in range(1, len(entities) + 1)) if entities else "0"
        types_str = '_'.join(str(num) for num in range(1, len(types) + 1)) if types else "0"
        template["entities_and_types_select"] = f"{entities_str} {types_str}"
        template["syntax_structure"] = syntax_structure
        template.setdefault("return_if_found", False)
        template.setdefault("priority", 1)
        template_queries[template_num] = template
    return template_queries


================================================
FILE: deeppavlov/models/kbqa/wiki_parser.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import re
from collections import namedtuple
from logging import getLogger
from typing import List, Tuple, Dict, Any, Union

from hdt import HDTDocument

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.file import load_pickle, read_json
from deeppavlov.core.common.registry import register

log = getLogger(__name__)


@register('wiki_parser')
class WikiParser:
    """This class extract relations, objects or triplets from Wikidata HDT file."""

    def __init__(self, wiki_filename: str,
                 file_format: str = "hdt",
                 prefixes: Dict[str, Union[str, Dict[str, str]]] = None,
                 rel_q2name_filename: str = None,
                 max_comb_num: int = 1e6,
                 lang: str = "@en", **kwargs) -> None:
        """

        Args:
            wiki_filename: file with Wikidata
            file_format: format of Wikidata file, "hdt" or "pickle"
            prefixes: prefixes of entities, labels, aliases, descriptions, relations and statements
                in the knowledge base; a built-in set is used if not given
            rel_q2name_filename: optional json or pickle file, its content is stored in ``rel_q2name``
            max_comb_num: stored in ``max_comb_num``
            lang: Russian or English language
            **kwargs:

        Raises:
            ValueError: if ``file_format`` or the extension of ``rel_q2name_filename`` is not supported
        """

        if prefixes is None:
            rel_prefixes = {
                "direct": "http://wpd",
                "no_type": "http://wp",
                "statement": "http://wps",
                "qualifier": "http://wpq",
                "type": "http://wpd/P31"
            }
            prefixes = {
                "entity": "http://we",
                "label": "http://wl",
                "alias": "http://wal",
                "description": "http://wd",
                "rels": rel_prefixes,
                "statement": "http://ws"
            }
        self.prefixes = prefixes
        self.file_format = file_format
        self.wiki_filename = str(expand_path(wiki_filename))

        # Open the knowledge base in the requested format.
        if file_format == "hdt":
            self.document = HDTDocument(self.wiki_filename)
        elif file_format == "pickle":
            self.document = load_pickle(self.wiki_filename)
            self.parsed_document = {}
        else:
            raise ValueError("Unsupported file format")

        self.used_rels = set()
        self.rel_q2name = {}
        if rel_q2name_filename:
            # The reader is chosen by the ending of the file name.
            if rel_q2name_filename.endswith("json"):
                read_rel_names = read_json
            elif rel_q2name_filename.endswith("pickle"):
                read_rel_names = load_pickle
            else:
                raise ValueError(f"Unsupported file format: {rel_q2name_filename}")
            self.rel_q2name = read_rel_names(str(expand_path(rel_q2name_filename)))

        self.max_comb_num = max_comb_num
        self.lang = lang
        self.replace_tokens = [('"', ''), (lang, " "), ('$', ' '), ('  ', ' ')]

    def __call__(self, parser_info_list: List[str], queries_list: List[Any]) -> List[Any]:
        """Execute a batch of queries, see ``execute_queries_list``.

        Args:
            parser_info_list: types of the queries
            queries_list: arguments of the queries, aligned with ``parser_info_list``

        Returns:
            results of the queries
        """
        return self.execute_queries_list(parser_info_list, queries_list)

    def execute_queries_list(self, parser_info_list: List[str], queries_list: List[Any]):
        """Execute queries of different types against the knowledge base.

        Every query is dispatched by its type from ``parser_info_list``. A failure of a single query
        is logged as a warning and the default (empty) result of that query type is returned for it.

        Supported query types: "query_execute", "find_rels", "find_rels_2hop", "find_object",
        "check_triplet", "find_label", "find_types", "fill_triplets", "find_triplets",
        "find_triplets_for_rel" and, for the "pickle" file format only, "parse_triplets".

        Args:
            parser_info_list: types of the queries
            queries_list: arguments of the queries, aligned with ``parser_info_list``

        Returns:
            list with the results of the queries

        Raises:
            ValueError: if a query type is not supported
        """
        wiki_parser_output = []
        # Answer types of the last "query_execute" query that supplied them; they are reused
        # by the following "query_execute" queries of the batch.
        query_answer_types = []
        for parser_info, query in zip(parser_info_list, queries_list):
            if parser_info == "query_execute":
                answers, found_rels, found_combs = [], [], []
                try:
                    what_return, rels_from_query, query_seq, filter_info, order_info, answer_types, rel_types, \
                    return_if_found = query
                    if answer_types:
                        query_answer_types = answer_types
                    answers, found_rels, found_combs = \
                        self.execute(what_return, rels_from_query, query_seq, filter_info, order_info,
                                     query_answer_types, rel_types)
                except ValueError:
                    log.warning("Wrong arguments are passed to wiki_parser")
                wiki_parser_output.append([answers, found_rels, found_combs])
            elif parser_info == "find_rels":
                rels = []
                try:
                    rels = self.find_rels(*query)
                # ``Exception`` instead of a bare ``except`` (here and below), so that
                # KeyboardInterrupt and SystemExit are not swallowed.
                except Exception:
                    log.warning("Wrong arguments are passed to wiki_parser")
                wiki_parser_output.append(rels)
            elif parser_info == "find_rels_2hop":
                rels = []
                try:
                    rels = self.find_rels_2hop(*query)
                except ValueError:
                    log.warning("Wrong arguments are passed to wiki_parser")
                # NOTE(review): unlike the other branches the result extends the output instead of being
                # appended as one element — confirm that callers rely on this.
                wiki_parser_output += rels
            elif parser_info == "find_object":
                objects = []
                try:
                    objects = self.find_object(*query)
                except Exception:
                    log.warning("Wrong arguments are passed to wiki_parser")
                wiki_parser_output.append(objects)
            elif parser_info == "check_triplet":
                check_res = False
                try:
                    check_res = self.check_triplet(*query)
                except Exception:
                    log.warning("Wrong arguments are passed to wiki_parser")
                wiki_parser_output.append(check_res)
            elif parser_info == "find_label":
                label = ""
                try:
                    label = self.find_label(*query)
                except Exception:
                    log.warning("Wrong arguments are passed to wiki_parser")
                wiki_parser_output.append(label)
            elif parser_info == "find_types":
                types = []
                try:
                    types = self.find_types(query)
                except Exception:
                    log.warning("Wrong arguments are passed to wiki_parser")
                wiki_parser_output.append(types)
            elif parser_info == "fill_triplets":
                filled_triplets = []
                try:
                    filled_triplets = self.fill_triplets(*query)
                except ValueError:
                    log.warning("Wrong arguments are passed to wiki_parser")
                wiki_parser_output.append(filled_triplets)
            elif parser_info == "find_triplets":
                if self.file_format == "hdt":
                    triplets = []
                    try:
                        # Triplets with the entity as the subject and as the object;
                        # statement nodes on the opposite side are filtered out.
                        triplets_forw, _ = self.document.search_triples(f"{self.prefixes['entity']}/{query}", "", "")
                        triplets.extend([triplet for triplet in triplets_forw
                                         if not triplet[2].startswith(self.prefixes["statement"])])
                        triplets_backw, _ = self.document.search_triples("", "", f"{self.prefixes['entity']}/{query}")
                        triplets.extend([triplet for triplet in triplets_backw
                                         if not triplet[0].startswith(self.prefixes["statement"])])
                    except Exception:
                        log.warning("Wrong arguments are passed to wiki_parser")
                    wiki_parser_output.append(list(triplets))
                else:
                    triplets = {}
                    try:
                        triplets = self.document.get(query, {})
                    except Exception:
                        log.warning("Wrong arguments are passed to wiki_parser")
                    uncompressed_triplets = {}
                    if triplets:
                        if "forw" in triplets:
                            uncompressed_triplets["forw"] = self.uncompress(triplets["forw"])
                        if "backw" in triplets:
                            uncompressed_triplets["backw"] = self.uncompress(triplets["backw"])
                    wiki_parser_output.append(uncompressed_triplets)
            elif parser_info == "find_triplets_for_rel":
                found_triplets = []
                try:
                    found_triplets, _ = \
                        self.document.search_triples("", f"{self.prefixes['rels']['direct']}/{query}", "")
                except Exception:
                    log.warning("Wrong arguments are passed to wiki_parser")
                wiki_parser_output.append(list(found_triplets))
            elif parser_info == "parse_triplets" and self.file_format == "pickle":
                for entity in query:
                    self.parse_triplets(entity)
                wiki_parser_output.append("ok")
            else:
                raise ValueError("Unsupported query type")

        return wiki_parser_output

    def execute(self, what_return: List[str],
                rels_from_query: List[str],
                query_seq: List[List[str]],
                filter_info: List[Tuple[str]] = None,
                order_info: namedtuple = None,
                answer_types: List[str] = None,
                rel_types: List[str] = None):
        """
            Let us consider an example of the question "What is the deepest lake in Russia?"
            with the corresponding SPARQL query            
            "SELECT ?ent WHERE { ?ent wdt:P31 wd:T1 . ?ent wdt:R1 ?obj . ?ent wdt:R2 wd:E1 } ORDER BY ASC(?obj) LIMIT 5"

            arguments:
                what_return: ["?obj"]
                query_seq: [["?ent", "http://www.wikidata.org/prop/direct/P17", "http://www.wikidata.org/entity/Q159"]
                            ["?ent", "http://www.wikidata.org/prop/direct/P31", "http://www.wikidata.org/entity/Q23397"],
                            ["?ent", "http://www.wikidata.org/prop/direct/P4511", "?obj"]]
                filter_info: []
                order_info: order_info(variable='?obj', sorting_order='asc')
        """
        extended_combs = []
        answers, found_rels, found_combs = [], [], []

        for n, (query, rel_type) in enumerate(zip(query_seq, rel_types)):
            unknown_elem_positions = [(pos, elem) for pos, elem in enumerate(query) if elem.startswith('?')]
            """
                n = 0, query = ["?ent", "http://www.wikidata.org/prop/direct/P17",
                                "http://www.wikidata.org/entity/Q159"]
                       unknown_elem_positions = ["?ent"]
                n = 1, query = ["?ent", "http://www.wikidata.org/prop/direct/P31",
                                "http://www.wikidata.org/entity/Q23397"]
                       unknown_elem_positions = [(0, "?ent")]
                n = 2, query = ["?ent", "http://www.wikidata.org/prop/direct/P4511", "?obj"]
                       unknown_elem_positions = [(0, "?ent"), (2, "?obj")]
            """
            if n == 0:
                combs, triplets = self.search(query, unknown_elem_positions, rel_type)
                # combs = [{"?ent": "http://www.wikidata.org/entity/Q5513"}, ...]
            else:
                if combs:
                    known_elements = []
                    extended_combs = []
                    if query[0].startswith("?"):
                        for elem in query:
                            if elem in combs[0].keys():
                                known_elements.append(elem)
                        for comb in combs:
                            """
                                n = 1
                                query = ["?ent", "http://www.wikidata.org/prop/direct/P31",
                                                                            "http://www.wikidata.org/entity/Q23397"]
                                comb = {"?ent": "http://www.wikidata.org/entity/Q5513"}
                                known_elements = ["?ent"], known_values = ["http://www.wikidata.org/entity/Q5513"]
                                filled_query = ["http://www.wikidata.org/entity/Q5513", 
                                                "http://www.wikidata.org/prop/direct/P31", 
                                                "http://www.wikidata.org/entity/Q23397"]
                                new_combs = [["http://www.wikidata.org/entity/Q5513", 
                                              "http://www.wikidata.org/prop/direct/P31", 
                                              "http://www.wikidata.org/entity/Q23397"], ...]
                                extended_combs = [{"?ent": "http://www.wikidata.org/entity/Q5513"}, ...]
                            """
                            if comb:
                                known_values = [comb[known_elem] for known_elem in known_elements]
                                for known_elem, known_value in zip(known_elements, known_values):
                                    filled_query = [elem.replace(known_elem, known_value) for elem in query]
                                    new_combs, triplets = self.search(filled_query, unknown_elem_positions, rel_type)
                                    for new_comb in new_combs:
                                        extended_combs.append(self.merge_combs(comb, new_comb))
                    else:
                        new_combs, triplets = self.search(query, unknown_elem_positions, rel_type)
                        for comb in combs:
                            for new_comb in new_combs:
                                extended_combs.append(self.merge_combs(comb, new_comb))
                combs = extended_combs

        is_boolean = self.define_is_boolean(query_seq)
        if combs or is_boolean:
            if filter_info:
                for filter_elem, filter_value in filter_info:
                    if filter_value == "qualifier":
                        filter_value = "wpq/"
                    combs = [comb for comb in combs if filter_value in comb[filter_elem]]

            if order_info and not isinstance(order_info, list) and order_info.variable is not None:
                reverse = True if order_info.sorting_order == "desc" else False
                sort_elem = order_info.variable
                if combs and "?p" in combs[0]:
                    rel_combs = {}
                    for comb in combs:
                        if comb["?p"] not in rel_combs:
                            rel_combs[comb["?p"]] = []
                        rel_combs[comb["?p"]].append(comb)
                    rel_combs_list = rel_combs.values()
                else:
                    rel_combs_list = [combs]
                new_rel_combs_list = []
                for rel_combs in rel_combs_list:
                    new_rel_combs = []
                    for rel_comb in rel_combs:
                        value_str = rel_comb[sort_elem].split('^^')[0].strip('"+')
                        fnd_date = re.findall(r"[\d]{3,4}-[\d]{1,2}-[\d]{1,2}", value_str)
                        fnd_num = re.findall(r"([\d]+)\.([\d]+)", value_str)
                        if fnd_date:
                            rel_comb[sort_elem] = fnd_date[0]
                        elif fnd_num or value_str.isdigit():
                            rel_comb[sort_elem] = float(value_str)
                        new_rel_combs.append(rel_comb)
                    new_rel_combs = [(elem, n) for n, elem in enumerate(new_rel_combs)]
                    new_rel_combs = sorted(new_rel_combs, key=lambda x: (x[0][sort_elem], x[1]), reverse=reverse)
                    new_rel_combs = [elem[0] for elem in new_rel_combs]
                    new_rel_combs_list.append(new_rel_combs)
                combs = [new_rel_combs[0] for new_rel_combs in new_rel_combs_list]

            if what_return and what_return[-1].startswith("count"):
                answers = [[len(combs)]]
            else:
                answers = [[elem[key] for key in what_return if key in elem] for elem in combs]

            if answer_types:
                if list(answer_types) == ["date"]:
                    answers = [[entity for entity in answer
                                if re.findall(r"[\d]{3,4}-[\d]{1,2}-[\d]{1,2}", entity)] for answer in answers]
                elif list(answer_types) == ["not_date"]:
                    answers = [[entity for entity in answer
                                if not re.findall(r"[\d]{3,4}-[\d]{1,2}-[\d]{1,2}", entity)] for answer in answers]
                else:
                    answer_types = set(answer_types)
                    answers = [[entity for entity in answer
                                if answer_types.intersection(self.find_types(entity))] for answer in answers]
            if is_boolean:
                answers = [["Yes" if len(triplets) > 0 else "No"]]
            found_rels = [[elem[key] for key in rels_from_query if key in elem] for elem in combs]
            ans_rels_combs = [(answer, rel, comb) for answer, rel, comb in zip(answers, found_rels, combs)
                              if any([entity for entity in answer])]
            answers = [elem[0] for elem in ans_rels_combs]
            found_rels = [elem[1] for elem in ans_rels_combs]
            found_combs = [elem[2] for elem in ans_rels_combs]

        return answers, found_rels, found_combs

    @staticmethod
    def define_is_boolean(query_hdt_seq):
        """Tell whether a query is a yes/no (boolean) question.

        Args:
            query_hdt_seq: list of query triplets; each triplet is indexable and holds strings
                at positions 0 (subject) and 2 (object)

        Returns:
            True when the query consists of exactly one triplet whose subject and object
            are both fixed values (neither starts with "?"), False otherwise
        """
        if len(query_hdt_seq) != 1:
            return False
        triplet = query_hdt_seq[0]
        subj_is_variable = triplet[0].startswith("?")
        obj_is_variable = triplet[2].startswith("?")
        return not (subj_is_variable or obj_is_variable)

    @staticmethod
    def merge_combs(comb1, comb2):
        """Merge two variable-binding dicts, dropping variables bound to conflicting values.

        Args:
            comb1: first mapping from variable name to value
            comb2: second mapping from variable name to value

        Returns:
            a new dict with every key that occurs in only one of the inputs, plus every shared key
            whose values compare equal; shared keys with different values are left out
        """
        merged = {}
        for key, value in comb1.items():
            if key not in comb2 or value == comb2[key]:
                merged[key] = value
        for key, value in comb2.items():
            if key not in comb1 or value == comb1[key]:
                merged[key] = value
        return merged

    def search(self, query: List[str], unknown_elem_positions: List[Tuple[int, str]], rel_type):
        """Look up the triplets matching a single query triplet and bind its variables.

        Args:
            query: [subject, relation, object]; elements starting with "?" are variables and are
                treated as wildcards
            unknown_elem_positions: (position in the triplet, variable name) pairs telling which
                triplet positions should be extracted into the bindings
            rel_type: key into ``self.prefixes["rels"]`` selecting the allowed relation prefix(es);
                only used for the "hdt" file format

        Returns:
            a tuple (combs, triplets): combs is a list of dicts mapping variable names to the values
            found, triplets are the triplets that were considered. In "hdt" mode, when the number of
            hits is not below ``self.max_comb_num``, combs is empty and triplets is returned exactly
            as the document produced it (not converted to a list).
        """
        subj, rel, obj = ["" if elem.startswith('?') else elem for elem in query]

        def bind_variables(triplet):
            # pick the requested positions of the triplet under their variable names
            return {elem: triplet[pos] for pos, elem in unknown_elem_positions}

        if self.file_format == "hdt":
            triplets, cnt = self.document.search_triples(subj, rel, obj)
            if not cnt < self.max_comb_num:
                log.debug("max comb num exceeds")
                return [], triplets

            triplets = list(triplets)
            if rel == self.prefixes["description"] or rel == self.prefixes["label"]:
                # labels and descriptions: keep only literals in the configured language
                triplets = [triplet for triplet in triplets if triplet[2].endswith(self.lang)]
                return [bind_variables(triplet) for triplet in triplets], triplets

            # the configured prefix may be a single string or a collection of strings
            rel_prefixes = self.prefixes["rels"][rel_type]
            if isinstance(rel_prefixes, str):
                rel_prefixes = [rel_prefixes]
            combs = [bind_variables(triplet) for triplet in triplets
                     if (any(triplet[1].startswith(tp) for tp in rel_prefixes)
                         or triplet[1].startswith(self.prefixes["rels"]["type"]))]
            return combs, triplets

        triplets = []
        if subj:
            subj, forw_triplets = self.find_triplets(subj, "forw")
            triplets = [[subj, found[0], value] for found in forw_triplets for value in found[1:]]
        if obj:
            # when the object is fixed, its backward triplets replace whatever was found for the subject
            obj, backw_triplets = self.find_triplets(obj, "backw")
            triplets = [[value, found[0], obj] for found in backw_triplets for value in found[1:]]
        if rel:
            if rel == self.prefixes["description"]:
                triplets = [triplet for triplet in triplets if triplet[1] == "descr_en"]
            else:
                rel = rel.split('/')[-1]
                triplets = [triplet for triplet in triplets if triplet[1] == rel]
        return [bind_variables(triplet) for triplet in triplets], triplets

    def find_label(self, entity: str, question: str = "") -> str:
        """Turn an entity id, URI or literal into a human-readable string.

        Args:
            entity: entity id (e.g. "Q5513"), full entity URI, or a literal value; double quotes
                are removed from its string form before processing
            question: question text, passed on to :meth:`format_date` when a date is formatted

        Returns:
            the label or formatted literal, or "Not Found" when nothing could be produced
        """
        entity = str(entity).replace('"', '')
        if self.file_format == "hdt":
            if entity.startswith(("Q", "P")):
                # example: "Q5513" -> "http://www.wikidata.org/entity/Q5513"
                entity = f"{self.prefixes['entity']}/{entity}"

            if entity.startswith(self.prefixes["entity"]):
                labels, _ = self.document.search_triples(entity, self.prefixes["label"], "")
                # labels = [["http://www.wikidata.org/entity/Q5513", "http://www.w3.org/2000/01/rdf-schema#label",
                #                                                    '"Lake Baikal"@en'], ...]
                for label in labels:
                    label_text = label[2]
                    if not label_text.endswith(self.lang):
                        continue
                    # first label in the configured language wins
                    found_label = label_text.strip(self.lang)
                    for old_tok, new_tok in self.replace_tokens:
                        found_label = found_label.replace(old_tok, new_tok)
                    return found_label.strip()

            elif entity.endswith(self.lang):
                # entity: '"Lake Baikal"@en' -> cut the three-character language tag
                return entity[:-3].replace('$', ' ').replace('  ', ' ')

            elif "^^" in entity:
                # typed literals, examples:
                #     '"1799-06-06T00:00:00Z"^^<http://www.w3.org/2001/XMLSchema#dateTime>' (date)
                #     '"+1642"^^<http://www.w3.org/2001/XMLSchema#decimal>' (number)
                value = entity.split("^^")[0]
                value = value.replace("T00:00:00Z", '').replace("+", '')
                return self.format_date(value, question).replace('$', '')

            elif re.findall(r"[\d]{3,4}-[\d]{2}-[\d]{2}", entity):
                return self.format_date(entity, question).replace('$', '')

            elif entity in ["Yes", "No"]:
                return entity

            elif entity.isdigit():
                return entity.replace('.', ',')

        if self.file_format == "pickle":
            if entity:
                if not entity.startswith(("Q", "P")):
                    return self.format_date(entity, question)
                forw_triplets = self.uncompress(self.document.get(entity, {}).get("forw", []))
                for triplet in forw_triplets:
                    if triplet[0] == "name_en":
                        return triplet[1]

        return "Not Found"

    def format_date(self, entity, question):
        """Format a date literal according to what the question asks for.

        If ``entity`` contains a ``YYYY-MM-DD``-like date, the answer is, depending on the question:
        the age in years relative to the current year ("how old" / "сколько лет"), the year
        ("в каком году"), the month ("в каком месяце"), otherwise the full date as "DD Month YYYY"
        (or only the year if the day is zero or the date cannot be parsed). For ``self.lang == "@ru"``
        English month names are replaced with Russian ones. Strings without a date are returned
        with leading "+" and "-" characters stripped.

        Args:
            entity: string possibly containing a date
            question: question text used to choose the output form

        Returns:
            the formatted string
        """
        dates_dict = {"January": "января", "February": "февраля", "March": "марта", "April": "апреля", "May": "мая",
                      "June": "июня", "July": "июля", "August": "августа", "September": "сентября",
                      "October": "октября",
                      "November": "ноября", "December": "декабря"}
        date_info = re.findall(r"([\d]{3,4})-([\d]{1,2})-([\d]{1,2})", entity)
        if not date_info:
            return entity.lstrip('+-')

        year, month, day = date_info[0]
        question_lower = question.lower()
        if "how old" in question_lower or "сколько лет" in question_lower:
            # stringify right away: the month replacement below needs a str
            entity = str(datetime.datetime.now().year - int(year))
        elif "в каком году" in question_lower:
            entity = year
        elif "в каком месяце" in question_lower:
            entity = month
        elif day not in {"00", "0"}:
            try:
                date = datetime.datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d")
            except ValueError:
                # impossible dates (e.g. month "00") or years strptime cannot parse: fall back to the year
                entity = year
            else:
                entity = date.strftime("%d %B %Y")
        else:
            entity = year
        if self.lang == "@ru":
            for mnth, mnth_replace in dates_dict.items():
                entity = entity.replace(mnth, mnth_replace)
        return str(entity)

    def find_alias(self, entity: str) -> List[str]:
        """Collect the aliases of an entity in the configured language.

        Args:
            entity: full entity URI; anything not starting with ``self.prefixes["entity"]``
                yields an empty list

        Returns:
            alias strings with the characters of the language tag and the surrounding double quotes stripped
        """
        if not entity.startswith(self.prefixes["entity"]):
            return []
        labels, _ = self.document.search_triples(entity, self.prefixes["alias"], "")
        aliases = []
        for label in labels:
            literal = label[2]
            if literal.endswith(self.lang):
                aliases.append(literal.strip(self.lang).strip('"'))
        return aliases

    def find_rels(self, entity: str, direction: str, rel_type: str = "no_type") -> List[str]:
        """List the distinct relations attached to an entity (only for the "hdt" file format).

        Args:
            entity: entity id, appended to ``self.prefixes["entity"]``
            direction: "forw" to treat the entity as subject, anything else to treat it as object
            rel_type: key into ``self.prefixes["rels"]``; a falsy value means "direct"

        Returns:
            relation URIs matching the configured prefix(es), restricted to ``self.used_rels``
            when that collection is non-empty; an empty list for other file formats
        """
        if self.file_format != "hdt":
            return []

        rel_type = rel_type or "direct"
        entity_uri = f"{self.prefixes['entity']}/{entity}"
        query = [entity_uri, "", ""] if direction == "forw" else ["", "", entity_uri]
        found, _ = self.document.search_triples(*query)
        found = list(found)

        rel_prefixes = self.prefixes['rels'][rel_type]
        if isinstance(rel_prefixes, str):
            # a single prefix: the relation must be a property ("<prefix>/P...")
            start_str = f"{rel_prefixes}/P"
            unique_rels = {triplet[1] for triplet in found if triplet[1].startswith(start_str)}
        else:
            unique_rels = {triplet[1] for triplet in found
                           if any([triplet[1].startswith(tp) for tp in rel_prefixes])}

        rels = list(unique_rels)
        if self.used_rels:
            rels = [rel for rel in rels if rel.split("/")[-1] in self.used_rels]
        return rels

    def find_rels_2hop(self, entity_ids, rels_1hop):
        """Collect relations reachable in two hops from the given entities.

        For every entity and every 1-hop relation, the neighbouring entities (in both directions) are
        looked up; for the first five neighbours all their relations, outgoing and incoming, are
        gathered, except the 1-hop relation itself.

        Args:
            entity_ids: entity ids, each appended to ``self.prefixes['entity']``
            rels_1hop: relation URIs used for the first hop

        Returns:
            list of distinct relation URIs, restricted to ``self.used_rels`` when it is non-empty
        """
        rels = []
        for entity_id in entity_ids:
            for rel_1hop in rels_1hop:
                entity_uri = f"{self.prefixes['entity']}/{entity_id}"
                forw_triplets, _ = self.document.search_triples(entity_uri, rel_1hop, "")
                neighbours = [triplet[2].split("/")[-1] for triplet in forw_triplets
                              if triplet[2].startswith(self.prefixes['entity'])]
                backw_triplets, _ = self.document.search_triples("", rel_1hop, entity_uri)
                neighbours += [triplet[0].split("/")[-1] for triplet in backw_triplets
                               if triplet[0].startswith(self.prefixes['entity'])]

                # only the first five neighbours are expanded
                for neighbour in neighbours[:5]:
                    neighbour_uri = f"{self.prefixes['entity']}/{neighbour}"
                    # first the neighbour as subject, then as object
                    for pattern in ((neighbour_uri, "", ""), ("", "", neighbour_uri)):
                        tr_2hop, _ = self.document.search_triples(*pattern)
                        rels_2hop = [elem[1] for elem in tr_2hop if elem[1] != rel_1hop]
                        if self.used_rels:
                            rels_2hop = [elem for elem in rels_2hop if elem.split("/")[-1] in self.used_rels]
                        rels += rels_2hop
        return list(set(rels))

    def find_object(self, entity: str, rel: str, direction: str) -> List[str]:
        """Find the entities connected to ``entity`` by relation ``rel``.

        Args:
            entity: entity id or URI (only the part after the last "/" is used)
            rel: relation id; for the "hdt" format it is appended to ``self.prefixes['rels']['direct']``,
                otherwise only the part after the last "/" is used
            direction: "forw" to look for objects of the entity, any other non-empty value to look for
                subjects pointing to it; a falsy value means "forw"

        Returns:
            ids of the connected entities. In "hdt" forward mode nothing is returned when the number of
            hits is not below ``self.max_comb_num``.
        """
        objects = []
        if not direction:
            direction = "forw"
        if self.file_format == "hdt":
            entity = f"{self.prefixes['entity']}/{entity.split('/')[-1]}"
            rel = f"{self.prefixes['rels']['direct']}/{rel}"
            if direction == "forw":
                triplets, cnt = self.document.search_triples(entity, rel, "")
                if cnt < self.max_comb_num:
                    objects.extend([triplet[2].split('/')[-1] for triplet in triplets])
            else:
                triplets, cnt = self.document.search_triples("", rel, entity)
                objects.extend([triplet[0].split('/')[-1] for triplet in triplets])
        else:
            entity = entity.split('/')[-1]
            rel = rel.split('/')[-1]
            triplets = self.document.get(entity, {}).get(direction, [])
            triplets = self.uncompress(triplets)
            # the loop target must not reuse the name of the result list,
            # otherwise the accumulated objects are overwritten on every iteration
            for found_rel, *found_objects in triplets:
                if rel == found_rel:
                    objects.extend(found_objects)
        return objects

    def check_triplet(self, subj: str, rel: str, obj: str) -> bool:
        """Check whether the triplet (subj, rel, obj) is present in the knowledge base.

        Args:
            subj: subject entity id (for the non-"hdt" format a URI is also accepted,
                only the part after the last "/" is used)
            rel: relation id, handled like ``subj``
            obj: object entity id, handled like ``subj``

        Returns:
            True if the triplet exists, False otherwise
        """
        if self.file_format == "hdt":
            subj_uri = f"{self.prefixes['entity']}/{subj}"
            rel_uri = f"{self.prefixes['rels']['direct']}/{rel}"
            obj_uri = f"{self.prefixes['entity']}/{obj}"
            _, cnt = self.document.search_triples(subj_uri, rel_uri, obj_uri)
            return cnt > 0

        subj_id = subj.split('/')[-1]
        rel_id = rel.split('/')[-1]
        obj_id = obj.split('/')[-1]
        forw_triplets = self.uncompress(self.document.get(subj_id, {}).get("forw", []))
        return any(found_obj == obj_id
                   for found_rel, *found_objects in forw_triplets if found_rel == rel_id
                   for found_obj in found_objects)

    def find_types(self, entity: str):
        """Return the set of type-like values of an entity.

        For the "hdt" format the objects of relations P31, P106 and P21 are collected;
        for the "pickle" format the values of the last "P31" forward triplet are used.

        Args:
            entity: entity id or URI

        Returns:
            a set of ids (empty when nothing is found or the file format is neither "hdt" nor "pickle")
        """
        types = []
        if self.file_format == "hdt":
            if not entity.startswith("http"):
                entity = f"{self.prefixes['entity']}/{entity}"
            for rel in ("P31", "P106", "P21"):
                found, _ = self.document.search_triples(entity, f"{self.prefixes['rels']['direct']}/{rel}", "")
                types.extend(triplet[2].split('/')[-1] for triplet in found)

        if self.file_format == "pickle":
            entity_id = entity.split('/')[-1]
            forw_triplets = self.uncompress(self.document.get(entity_id, {}).get("forw", []))
            for rel, *values in forw_triplets:
                if rel == "P31":
                    # a later P31 triplet replaces an earlier one
                    types = values
        return set(types)

    def find_subclasses(self, entity: str):
        """Return the set of values of relation P279 for an entity.

        Args:
            entity: entity id or URI

        Returns:
            a set of ids: for the "hdt" format the objects of all P279 triplets, for the "pickle"
            format the values of the last "P279" forward triplet; empty when nothing is found
        """
        types = []
        if self.file_format == "hdt":
            if not entity.startswith("http"):
                entity = f"{self.prefixes['entity']}/{entity}"
            found, _ = self.document.search_triples(entity, f"{self.prefixes['rels']['direct']}/P279", "")
            types = [triplet[2].split('/')[-1] for triplet in found]
        if self.file_format == "pickle":
            entity_id = entity.split('/')[-1]
            forw_triplets = self.uncompress(self.document.get(entity_id, {}).get("forw", []))
            for rel, *values in forw_triplets:
                if rel == "P279":
                    # a later P279 triplet replaces an earlier one
                    types = values
        return set(types)

    def uncompress(self, triplets: Union[str, List[List[str]]]) -> List[List[str]]:
        """Expand triplets stored as one string into a list of lists.

        Args:
            triplets: either an already expanded list (returned unchanged) or a string in which
                triplets are separated by tabs and the elements of a triplet by two spaces

        Returns:
            list of triplets, each a list of strings
        """
        if not isinstance(triplets, str):
            return triplets
        return [chunk.strip().split("  ") for chunk in triplets.split('\t')]

    def parse_triplets(self, entity):
        """Uncompress the triplets of an entity and cache them in ``self.parsed_document``.

        Both the "forw" and the "backw" triplets are processed when present in ``self.document``;
        directions that are missing are left untouched.

        Args:
            entity: key of the entity in ``self.document``
        """
        entity_triplets = self.document.get(entity, {})
        for direction in ("forw", "backw"):
            if direction not in entity_triplets:
                continue
            expanded = self.uncompress(entity_triplets[direction])
            if entity not in self.parsed_document:
                self.parsed_document[entity] = {direction: expanded}
            else:
                self.parsed_document[entity][direction] = expanded

    def find_triplets(self, subj: str, direction: str) -> Tuple[str, List[List[str]]]:
        """Fetch the triplets of an entity in the given direction.

        The cache ``self.parsed_document`` is consulted first; otherwise the triplets are read
        from ``self.document`` and uncompressed.

        Args:
            subj: entity id or URI (only the part after the last "/" is used)
            direction: key of the triplet group, "forw" or "backw"

        Returns:
            a tuple of the bare entity id and its list of triplets (empty if nothing is stored)
        """
        entity_id = subj.split('/')[-1]
        if entity_id in self.parsed_document:
            return entity_id, self.parsed_document.get(entity_id, {}).get(direction, [])
        stored = self.document.get(entity_id, {}).get(direction, [])
        return entity_id, self.uncompress(stored)

    def fill_triplets(self, init_triplets, what_to_return, comb):
        """Substitute found values into query triplets and replace ids with labels.

        Variables bound to statement nodes (values starting with ``self.prefixes["statement"]``, when
        that prefix is configured) are not shown: a triplet whose subject is such a variable and whose
        relation is fixed is skipped if its object is the returned variable or the returned value
        contains a 3-4 digit number; otherwise the statement variable is replaced either with the object
        of the second query triplet (when the returned value is a non-URI containing a 3-4 digit number)
        or with the returned variable itself.

        Values in ``comb`` are converted to strings before any string check, because they are not
        guaranteed to be strings (e.g. numbers produced while sorting answers).

        Args:
            init_triplets: query triplets as (subject, relation, object) sequences
            what_to_return: names of the returned variables; only the first one is used
            comb: mapping from variable names to the values found for them

        Returns:
            list of [subject label, relation label, object label] lists
        """
        number_pattern = r"[\d]{3,4}"

        def is_statement_var(elem):
            # a variable whose found value is a statement node
            return "statement" in self.prefixes and elem.startswith("?") \
                and str(comb.get(elem, "")).startswith(self.prefixes["statement"])

        def answer_value():
            return str(comb.get(what_to_return[0], ""))

        def answer_is_literal_number():
            value = answer_value()
            return not value.startswith("http") and bool(re.findall(number_pattern, value))

        filled_triplets = []
        for subj, rel, obj in init_triplets:
            if is_statement_var(subj) and not rel.startswith("?") \
                    and (obj == what_to_return[0] or re.findall(number_pattern, answer_value())):
                continue

            if is_statement_var(subj):
                subj = init_triplets[1][2] if answer_is_literal_number() else what_to_return[0]
            if is_statement_var(obj):
                obj = init_triplets[1][2] if answer_is_literal_number() else what_to_return[0]

            subj, obj = str(subj), str(obj)
            if subj.startswith("?"):
                subj = comb.get(subj, "")
            if obj.startswith("?"):
                obj = comb.get(obj, "")
            if rel.startswith("?"):
                rel = comb.get(rel, "")

            subj_label = self.find_label(subj)
            obj_label = self.find_label(obj)
            if rel in self.rel_q2name:
                rel_label = self.rel_q2name[rel]
            elif rel.split("/")[-1] in self.rel_q2name:
                rel_label = self.rel_q2name[rel.split("/")[-1]]
            else:
                rel_label = self.find_label(rel)
            # a relation may have several names: take the first one
            if isinstance(rel_label, list) and rel_label:
                rel_label = rel_label[0]
            filled_triplets.append([subj_label, rel_label, obj_label])
        return filled_triplets


================================================
FILE: deeppavlov/models/morpho_syntax_parser/__init__.py
================================================


================================================
FILE: deeppavlov/models/morpho_syntax_parser/dependency_decoding.py
================================================
# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

import numpy as np
from ufal.chu_liu_edmonds import chu_liu_edmonds

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register('chu_liu_edmonds_transformer')
class ChuLiuEdmonds(Component):
    """
    A wrapper for Chu-Liu-Edmonds algorithm for maximum spanning tree

    Args:
        min_edge_prob: lower bound applied to every probability before taking the logarithm
    """

    def __init__(self, min_edge_prob=1e-6, **kwargs):
        self.min_edge_prob = min_edge_prob

    def __call__(self, probs: List[np.ndarray]) -> List[List[int]]:
        """Applies Chu-Liu-Edmonds algorithm to the matrix of head probabilities.

        Args:
            probs: a batch of 2D arrays of head probabilities, each of shape L*(L+1)

        Returns:
            for every sentence, the heads returned by the algorithm without the entry of the added first row

        Raises:
            ValueError: if an array has shape m*n with n != m + 1
        """
        answer = []
        for elem in probs:
            m, n = elem.shape
            if n != m + 1:
                raise ValueError("First and second axis lengths m, n of probs should satisfy the condition n == m + 1")
            log_floor = np.log10(self.min_edge_prob)
            # clipped log-probabilities shifted so that the smallest possible score is zero
            scores = np.log10(np.maximum(self.min_edge_prob, elem)) - log_floor
            # prepend a zero row so that the score matrix becomes square
            zero_row = np.zeros_like(scores[:1, :])
            scores = np.concatenate([zero_row, scores], axis=0)
            # it makes impossible to create multiple edges 0->i
            scores[1:, 0] += log_floor * len(scores)
            heads, _ = chu_liu_edmonds(scores.astype("float64"))
            answer.append(heads[1:])
        return answer


================================================
FILE: deeppavlov/models/morpho_syntax_parser/joint.py
================================================
# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union, List

from deeppavlov.core.common.chainer import Chainer
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

# Field name -> zero-based column index in a tab-separated UD token line.
# Used to build the per-word dict for the `json`/`dict` output formats; column 4 is not exposed.
UD_COLUMN_FEAT_MAPPING = {"id": 0, "word": 1, "lemma": 2, "upos": 3, "feats": 5, "head": 6, "deprel": 7}


@register("joint_tagger_parser")
class JointTaggerParser(Component):
    """
    A class to perform joint morphological and syntactic parsing.
    It is just a wrapper that calls the models for tagging and parsing
    and comprises their results in a single output.
    Args:
        tagger: the morphological tagger model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)
        parser: the syntactic parser model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)
        output_format: the output format, it may be either `ud` (alias: `conllu`) or `json` (alias: `dict`).
            An unknown value triggers a warning and falls back to `ud`.
    Attributes:
        tagger: a morphological tagger model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)
        parser: a syntactic parser model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)
        output_format: the validated output format
    """

    def __init__(self, tagger: Chainer, parser: Chainer,
                 output_format: str = "ud", *args, **kwargs):
        import warnings

        if output_format not in ["ud", "conllu", "json", "dict"]:
            # actually emit the warning: merely constructing a UserWarning instance does nothing
            warnings.warn("JointTaggerParser output_format can be only `ud`, `conllu` or `json`. "
                          "Unknown format: {}, setting the output_format to `ud`.".format(output_format),
                          UserWarning)
            output_format = "ud"
        self.output_format = output_format
        self.tagger = tagger
        self.parser = parser

    def __call__(self, data: Union[List[str], List[List[str]]]) \
            -> Union[List[List[dict]], List[str], List[List[str]]]:
        """Tag and parse the sentences and merge both analyses.

        Args:
            data: a batch of sentences, passed unchanged to both the tagger and the parser

        Returns:
            one string per sentence, with one line per word: in `ud`/`conllu` mode the tab-separated
            tagger columns with columns 6 and 7 (head and dependency label) taken from the parser,
            in `json`/`dict` mode the string representation of the dict of the fields listed
            in ``UD_COLUMN_FEAT_MAPPING``
        """
        tagger_output = self.tagger(data)
        parser_output = self.parser(data)
        answer = []
        for tagger_sent, parser_sent in zip(tagger_output, parser_output):
            curr_sent_answer = []
            for j, curr_word_tagger_output in enumerate(tagger_sent):
                curr_word_tagger_output = curr_word_tagger_output.split("\t")
                curr_word_parser_output = parser_sent[j].split("\t")
                curr_word_answer = curr_word_tagger_output[:]
                # setting parser output
                curr_word_answer[6:8] = curr_word_parser_output[6:8]
                if self.output_format in ["json", "dict"]:
                    curr_word_answer = str({key: curr_word_answer[index]
                                            for key, index in UD_COLUMN_FEAT_MAPPING.items()})
                else:
                    # only the column list is tab-joined; joining the stringified dict
                    # would put a tab between every character
                    curr_word_answer = "\t".join(curr_word_answer)
                curr_sent_answer.append(curr_word_answer)
            curr_sent_answer = "\n".join(str(x) for x in curr_sent_answer)
            answer.append(curr_sent_answer)
        return answer


================================================
FILE: deeppavlov/models/morpho_syntax_parser/spacy_lemmatizer.py
================================================
# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

import spacy

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register('spacy_lemmatizer')
class SpacyLemmatizer(Component):
    """Lemmatizes words one by one with a spaCy pipeline.

    Args:
        model: name or path of the spaCy model, passed to ``spacy.load``
    """

    def __init__(self, model: str, **kwargs):
        self.nlp = spacy.load(model)

    def __call__(self, words_batch: List[List[str]]):
        """Return the lemma of the first spaCy token of every word.

        Args:
            words_batch: a batch of tokenized sentences

        Returns:
            a batch of lists of lemmas, aligned with the input words
        """
        lemmas_batch = []
        for words_list in words_batch:
            # each word is processed independently, without sentence context
            lemmas_batch.append([self.nlp(word)[0].lemma_ for word in words_list])
        return lemmas_batch


================================================
FILE: deeppavlov/models/morpho_syntax_parser/syntax_parsing.py
================================================
# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple, Union

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


def make_pos_and_tag(tag: str, sep: str = ",",
                     return_mode: Optional[str] = None) -> Tuple[str, Union[str, list, dict, tuple]]:
    """Split a full morphological tag into the part-of-speech label and grammatical features.

    Args:
        tag: the full tag: part-of-speech label optionally followed by `sep` and
            "|"-separated `key=value` features; a trailing " _" is dropped
        sep: the separator between part-of-speech tag and grammatical features
        return_mode: the type of the returned features: None (raw string, "_" if there are
            no features), "list" (list of `key=value` strings), "dict" (feature dict)
            or "sorted_items" (tuple of (key, value) pairs sorted by key)
    Returns:
        the part-of-speech label and grammatical features in required format
    """
    if tag.endswith(" _"):
        tag = tag[:-2]

    if sep in tag:
        pos, feats = tag.split(sep, maxsplit=1)
    elif return_mode is None:
        pos, feats = tag, "_"
    else:
        pos, feats = tag, ""

    if return_mode not in ("dict", "list", "sorted_items"):
        return pos, feats

    feats = feats.split("|") if feats != "" else []
    if return_mode == "list":
        return pos, feats

    feats = dict(tuple(item.split("=")) for item in feats)
    if return_mode == "sorted_items":
        feats = tuple(sorted(feats.items()))
    return pos, feats


class OutputPrettifier(Component):
    """Base class for formatting the output of dependency parser and morphotagger

    Args:
        return_string: stored flag for subclasses: whether the prettified output should be a single string
        begin: a string to append in the beginning
        end: a string to append in the end
        sep: separator between word analyses
    """

    def __init__(self, return_string: bool = True, begin: str = "", end: str = "\n", sep: str = "\n",
                 **kwargs) -> None:
        self.sep = sep
        self.end = end
        self.begin = begin
        self.return_string = return_string

    def prettify(self, tokens: List[str], heads: List[int], deps: List[str]) -> Union[List[str], str]:
        """Format the analysis of one sentence; must be implemented by subclasses."""
        raise NotImplementedError

    def __call__(self, X: List[List[str]], Y: List[List[int]], Z: List[List[str]]) -> List[Union[List[str], str]]:
        """Calls the :meth:`~prettify` function for each input sentence.
        Args:
            X: a list of input sentences
            Y: a list of lists of head positions for sentence words
            Z: a list of lists of dependency labels for sentence words
        Returns:
            a list of prettified UD outputs
        """
        prettified = []
        for tokens, heads, deps in zip(X, Y, Z):
            prettified.append(self.prettify(tokens, heads, deps))
        return prettified


@register('dependency_output_prettifier')
class DependencyOutputPrettifier(OutputPrettifier):
    """Class which prettifies dependency parser output
    to 10-column (Universal Dependencies) format.
    Args:
        return_string: whether to join the word lines into a single string
        begin: a string to append in the beginning
        end: a string to append in the end
        sep: separator between word analyses
    """

    def __init__(self, return_string: bool = True, begin: str = "", end: str = "\n", sep: str = "\n",
                 **kwargs) -> None:
        super().__init__(return_string, begin, end, sep, **kwargs)
        # columns: id, word, four unfilled columns, head, dependency label, two unfilled columns
        self.format_string = "{}\t{}\t_\t_\t_\t_\t{}\t{}\t_\t_"

    def prettify(self, tokens: List[str], heads: List[int], deps: List[str]) -> Union[List[str], str]:
        """Prettifies output of dependency parser.
        Args:
            tokens: tokenized source sentence
            heads: list of head positions, the output of the parser
            deps: list of dependency labels, the output of the parser
        Returns:
            the prettified output of the parser: a list of word lines (numbered from 1), or,
            if `return_string` is set, these lines joined by `sep` and wrapped into `begin` and `end`
        """
        rows = [self.format_string.format(index, word, head, dep)
                for index, (word, head, dep) in enumerate(zip(tokens, heads, deps), start=1)]
        if not self.return_string:
            return rows
        return self.begin + self.sep.join(rows) + self.end


@register('lemmatized_output_prettifier')
class LemmatizedOutputPrettifier(OutputPrettifier):
    """Renders morphological tagger output in the 10-column
    (Universal Dependencies) format:
    id, word, lemma, pos, xpos, feats, head, deprel, deps, misc
    (see http://universaldependencies.org/format.html for details).

    Only the id, word, lemma, pos and feats columns are filled in,
    the other columns are written as ``_``.

    Args:
        return_string: whether to join the rows of a sentence into a single string
        begin: a string to append in the beginning
        end: a string to append in the end
        sep: separator between word analyses

    Note:
        No ``format_mode`` option is read by this class: the row layout is fixed
        by ``format_string``, and extra keyword arguments are only forwarded to
        the base class.
    """

    def __init__(self, return_string: bool = True, begin: str = "", end: str = "\n", sep: str = "\n",
                 **kwargs) -> None:
        super().__init__(return_string, begin, end, sep, **kwargs)
        # Placeholders, in order: id, word, lemma, pos, feats.
        self.format_string = "{}\t{}\t{}\t{}\t_\t{}\t_\t_\t_\t_"

    def prettify(self, tokens: List[str], tags: List[str], lemmas: List[str]) -> Union[List[str], str]:
        """Prettifies the tagger output for one sentence.

        Args:
            tokens: tokenized source sentence
            tags: list of tags, the output of a tagger; each tag is split into
                the part-of-speech label and the features at the first ``,``
            lemmas: list of lemmas, the output of a lemmatizer

        Returns:
            the prettified output of the tagger: a single string when
            ``return_string`` is set, otherwise the list of rows.

        Examples:
            For ``tokens = ["John", "likes", "pizza"]``,
            ``tags = ["PROPN,Number=Sing", "VERB,Mood=Ind|Number=Sing", "NOUN"]`` and
            ``lemmas = ["John", "like", "pizza"]`` the rows are (columns are
            separated by tab characters, shown here as spaces)::

                1 John  John  PROPN _ Number=Sing          _ _ _ _
                2 likes like  VERB  _ Mood=Ind|Number=Sing _ _ _ _
                3 pizza pizza NOUN  _ _                    _ _ _ _
        """
        rows = []
        for index, (word, raw_tag, lemma) in enumerate(zip(tokens, tags, lemmas), start=1):
            pos, feats = make_pos_and_tag(raw_tag, sep=",")
            rows.append(self.format_string.format(index, word, lemma, pos, feats))
        if not self.return_string:
            return rows
        return self.begin + self.sep.join(rows) + self.end


================================================
FILE: deeppavlov/models/preprocessors/__init__.py
================================================


================================================
FILE: deeppavlov/models/preprocessors/dirty_comments_preprocessor.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import string
from typing import List

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register('dirty_comments_preprocessor')
class DirtyCommentsPreprocessor(Component):
    """
    Class implements preprocessing of english texts with low level of literacy such as comments.

    Every text is lower-cased and then passed through a fixed, ordered chain of
    substitutions (markup removal, contraction expansion, crude suffix stripping,
    slang expansion, digit masking, squeezing of repeated characters and,
    optionally, punctuation removal). The order matters: later rules see the
    output of earlier ones.

    Args:
        remove_punctuation: whether to replace punctuation characters with spaces
    """

    # All patterns are raw strings compiled once at class creation; the previous
    # non-raw "\S" / "\s" literals are invalid escape sequences in modern Python.
    _MARKUP = re.compile(r"<\S*>")
    _WHITESPACE = re.compile(r"\s+")
    _DIGITS = re.compile(r"[0-9]+")
    _SPACES = re.compile(r" +")
    # A character repeated 4+ times in a row is squeezed to two occurrences.
    _REPEATS = re.compile(r'([' + string.printable + r'])\1{3,}')
    # NOTE(review): the punctuation is interpolated unescaped, so the ``\]`` pair
    # inside it is read as an escaped bracket and a lone backslash is not part of
    # the class. Kept as is to preserve the existing output.
    _PUNCTUATION = re.compile(r'([' + string.punctuation + '])')

    # Literal (old, new) replacements, applied in this exact order.
    _CONTRACTIONS = (
        ("won't", "will not"),
        ("can't", "cannot"),
        ("i'm", "i am"),
        (" im ", " i am "),
        ("'re", " are"),
        ("ain't", "is not"),
        ("'ll", " will"),
        ("n't", " not"),
        ("'ve", " have"),
        ("'s", " is"),
        ("'d", " would"),
    )
    # Crude suffix stripping for tokens followed by a space or the end of the text.
    _SUFFIXES = (
        (re.compile(r"ies( |$)"), "y "),
        (re.compile(r"s( |$)"), " "),
        (re.compile(r"ing( |$)"), " "),
    )
    _SLANG = (
        (" u ", " you "),
        (" em ", " them "),
        (" da ", " the "),
        (" yo ", " you "),
        (" ur ", " your "),
        (" u r ", " you are "),
        (" urs ", " yours "),
        ("y'all", "you all"),
        (" r u ", " are you "),
        (" r you", " are you"),
        (" are u ", " are you "),
    )
    # Escape sequences that survived as literal backslash + letter text.
    _ESCAPED = (
        ("\\n", " "),
        ("\\t", " "),
        ("\\xa0", " "),
        ("\\xc2", " "),
    )

    def __init__(self, remove_punctuation: bool = True, *args, **kwargs):
        self.remove_punctuation = remove_punctuation

    def __call__(self, batch: List[str], **kwargs) -> List[str]:
        """
        Preprocess given batch

        Args:
            batch: list of text samples
            **kwargs: additional arguments

        Returns:
            list of preprocessed text samples
        """
        return [self._clean(text) for text in batch]

    def _clean(self, text: str) -> str:
        """Run the whole substitution chain on a single text sample."""
        text = text.lower()
        text = self._MARKUP.sub(" ", text)
        text = self._WHITESPACE.sub(" ", text)

        for old, new in self._CONTRACTIONS:
            text = text.replace(old, new)
        for pattern, replacement in self._SUFFIXES:
            text = pattern.sub(replacement, text)
        for old, new in self._SLANG:
            text = text.replace(old, new)
        for old, new in self._ESCAPED:
            text = text.replace(old, new)

        # Every run of digits becomes a single placeholder token.
        text = self._DIGITS.sub(" 0 ", text)
        text = self._REPEATS.sub(r'\1\1', text).strip()

        if self.remove_punctuation:
            text = self._PUNCTUATION.sub(' ', text)

        return self._SPACES.sub(' ', text)


================================================
FILE: deeppavlov/models/preprocessors/dnnc_preprocessor.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import List, Tuple

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

log = getLogger(__name__)


@register('dnnc_pair_generator')
class PairGenerator(Component):
    """
    Pairs every input text with every example of the support dataset.

    Args:
        bidirectional: adds pairs in reverse order
    """

    def __init__(self, bidirectional: bool = False, **kwargs) -> None:
        self.bidirectional = bidirectional

    def __call__(self,
                 texts: List[str],
                 dataset: List[List[str]],
                ) -> Tuple[List[str], List[str], List[str], List[str]]:
        """Build the pair batches.

        Args:
            texts: input texts, used as premises
            dataset: support examples, each unpacked as ``[hypothesis, hypothesis_labels]``

        Returns:
            the unchanged ``texts``, the hypothesis batch, the premise batch and
            the hypothesis labels batch, in this order. Pairs are grouped by
            support example: all texts are paired with the first example, then
            with the second one, and so on.
        """
        premise_batch, hypotesis_batch, hypotesis_labels_batch = [], [], []

        # The texts are tiled while each support row is repeated consecutively,
        # so zipping the two enumerates the full cross product.
        tiled_texts = texts * len(dataset)
        repeated_support = np.repeat(dataset, len(texts), axis=0)

        for premise, support_row in zip(tiled_texts, repeated_support):
            hypotesis, hypotesis_labels = support_row
            if self.bidirectional:
                # The reversed pair follows the direct one and shares its label.
                premise_batch += [premise, hypotesis]
                hypotesis_batch += [hypotesis, premise]
                hypotesis_labels_batch += [hypotesis_labels, hypotesis_labels]
            else:
                premise_batch.append(premise)
                hypotesis_batch.append(hypotesis)
                hypotesis_labels_batch.append(hypotesis_labels)
        return texts, hypotesis_batch, premise_batch, hypotesis_labels_batch


================================================
FILE: deeppavlov/models/preprocessors/mask.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register('mask')
class Mask(Component):
    """Takes a batch of tokens and returns the masks of corresponding length"""
    def __init__(self, *args, **kwargs):
        pass

    @staticmethod
    def __call__(tokens_batch, **kwargs):
        """Build a 0/1 mask marking the real (non-padding) positions.

        Args:
            tokens_batch: a batch of token sequences
            **kwargs: additional arguments, not used

        Returns:
            a ``float32`` array of shape ``(batch_size, max_len)`` where row ``n``
            holds ones in its first ``len(tokens_batch[n])`` positions and zeros
            elsewhere; an empty batch gives an array of shape ``(0, 0)``
        """
        batch_size = len(tokens_batch)
        # ``default`` keeps an empty batch from raising ValueError in max().
        max_len = max((len(utt) for utt in tokens_batch), default=0)
        mask = np.zeros([batch_size, max_len], dtype=np.float32)
        for n, utterance in enumerate(tokens_batch):
            mask[n, :len(utterance)] = 1

        return mask


================================================
FILE: deeppavlov/models/preprocessors/multitask_preprocessor.py
================================================
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Iterable
from logging import getLogger

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.models.preprocessors.torch_transformers_preprocessor import *

log = getLogger(__name__)


@register('multitask_pipeline_preprocessor')
class MultiTaskPipelinePreprocessor(Component):
    """
    Applies a tokenizing preprocessor to the input batch of every task.

    Each positional argument of ``__call__`` is treated as the batch of one task.
    The batch is split into first and (optionally) second texts and handed over
    to the preprocessor of that task.

    Params:

    vocab_file(str): vocabulary file for tokenization
    do_lower_case(bool): if True, tokenization is lower-cased. Default: True
    preprocessor(str): name of DeepPavlov class that is used for tokenization.
    Default: TorchTransformersPreprocessor
    preprocessors(List[str]): list of names of DeepPavlov classes that are used for tokenization.
    Overrides preprocessor . The length of list must be equal to the number of tasks
    max_seq_length(int): Maximum sequence length for tokenizer. Default: 512
    strict(bool): if True, an exception is raised for a batch whose layout is not recognized.
    If False, such a batch is passed on without splitting. Default: False
    print_first_example(bool): if True, the first processed input and its preprocessed form
    are printed once, during the first call that produces an output. Default: False
    """

    def __init__(self,
                 vocab_file,
                 do_lower_case: bool = True,
                 preprocessor: str = 'TorchTransformersPreprocessor',
                 preprocessors: List[str] = None,
                 max_seq_length: int = 512,
                 strict=False,
                 print_first_example=False,
                 *args, **kwargs):
        self.strict = strict
        # Set to True once the first example has been printed.
        self.printed = False
        self.print_first_example = print_first_example
        # Text prepended to every first text in __call__; empty here, so the
        # prefix branches are inactive unless the attribute is changed later.
        self.prefix = ''
        # NOTE(review): class names are resolved with eval() against the names
        # brought in by the star import of torch_transformers_preprocessor, so
        # the configuration values must come from a trusted source.
        if preprocessors is None:
            log.info(
                f'Assuming the same preprocessor name for all : {preprocessor}')
            self.preprocessor = eval(preprocessor)(vocab_file, do_lower_case,
                                                   max_seq_length, *args, **kwargs)
            # The per-task list is built lazily in __call__, once the number
            # of tasks is known.
            self.preprocessors = None
        else:
            # The names are replaced by the resolved classes in place, i.e. the
            # list passed by the caller is modified.
            for i in range(len(preprocessors)):
                preprocessors[i] = eval(preprocessors[i])
            self.n_task = len(preprocessors)
            self.preprocessors = [preprocessors[i](vocab_file=vocab_file, do_lower_case=do_lower_case,
                                                   max_seq_length=max_seq_length,
                                                   *args, **kwargs) for i in range(len(preprocessors))]

    def split(self, features):
        """
        Splits the batch of one task into first and second texts.

        Args:
            features: the batch of one task

        Returns:
            a pair ``(texts_a, texts_b)``: for a batch of strings (or of Nones) and for
            a batch of lists ``texts_b`` is None; for a batch of 2-element tuples the
            tuples are unzipped into two lists. A batch of any other layout raises
            an exception in ``strict`` mode and is returned unsplit otherwise.
        """
        if all([isinstance(k, str) for k in features]) or all([k is None for k in features]):
            # single sentence classification
            log.debug('Assuming single sentence classification')
            texts_a, texts_b = features, None
        elif all([isinstance(k, tuple) and len(k) == 2 for k in features]):
            log.debug(
                'Assuming sentence pair classification or classification for multichoice')
            texts_a, texts_b = [], []
            for feature in features:
                text_a, text_b = feature
                texts_a.append(text_a)
                texts_b.append(text_b)
        elif all([isinstance(k, list) for k in features]):
            log.debug('Assuming ner classification')
            texts_a, texts_b = list(features), None
        else:
            if self.strict:
                raise Exception(f'Unsupported task data {features}')
            else:
                log.warning('Data not split.Going without splitting')
                texts_a, texts_b = features, None
        return texts_a, texts_b

    def __call__(self, *args):
        """
        Preprocesses the batches of all tasks.

        Args:
            *args: one batch per task, in the order of the task preprocessors

        Returns:
            A list with one element per task: the output of the task preprocessor,
            or an empty list when the batch of the task consists of Nones only

        Raises:
            Exception: if the result is exactly ``[[]]``, i.e. a single task
                was given and it produced no output
        """
        self.n_task = len(args)
        if self.preprocessors is None:
            # The number of tasks is only known at call time, so the single
            # preprocessor instance is shared between all tasks here.
            self.preprocessors = [self.preprocessor
                                  for _ in range(self.n_task)]
        answer = []
        for i in range(len(args)):
            if all([j is None for j in args[i]]):
                log.debug('All nones received')
                answer.append([])
            else:
                texts_a, texts_b = self.split(args[i])
                #log.debug(f'Preprocessor {self.preprocessors[i]}')
                if all([j is None for j in texts_a]):
                    log.debug('All nones')
                    answer.append([])
                else:
                    # A preprocessor whose string representation contains 'choice'
                    # gets the first text repeated once per element of the
                    # corresponding second-text list.
                    if 'choice' in str(self.preprocessors[i]):
                        if isinstance(texts_a[0], str) and isinstance(texts_b[0],list):
                            for j in range(len(texts_b)):
                                texts_a[j] = [texts_a[j] for _ in range(len(texts_b[j]))]
                        if self.prefix:
                            for j in range(len(texts_a)):
                                 texts_a[j] = [' '.join([self.prefix, text]) for text in texts_a[j]]
                    else:
                        if self.prefix:
                            texts_a = [' '.join([self.prefix, text]) for text in texts_a]
                    answer.append(self.preprocessors[i](texts_a, texts_b))
                    if not self.printed and self.print_first_example:
                        print((texts_a, texts_b))
                        print(answer[-1])
                        self.printed = True
        if answer == [[]]:
            raise Exception('Empty answer')
        return answer


================================================
FILE: deeppavlov/models/preprocessors/ner_preprocessor.py
================================================
import errno
import os
from logging import getLogger
from typing import List

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.estimator import Estimator

log = getLogger(__name__)


@register("ner_vocab")
class NerVocab(Estimator):
    """ Implementation of the NER vocabulary

    The vocabulary is a pair of mappings, token -> index and index -> token.
    It is stored on disk as a plain text file with one token per line, the
    line number being the index of the token.

    Params:
        word_file_path: the path to a ready vocabulary file (e.g. the word list of
            a pre-trained embedding model); when given, ``fit`` does nothing
        save_path: the file path to save the vocabulary to
        load_path: the file path from which the vocabulary is loaded
        char_level: the flag arg indicating the character vocabulary
    """

    def __init__(self,
                 word_file_path=None,
                 save_path=None,
                 load_path=None,
                 char_level=False,
                 **kwargs):

        super().__init__(save_path=save_path, load_path=load_path, **kwargs)

        self.word_file_path = word_file_path
        self.char_level = char_level

        if word_file_path is not None:
            self.load_from_file(word_file_path)
            if self.save_path is not None:
                self.save_to_file(self.save_path)
        elif self.load_path is not None:
            self.load_from_file(self.load_path)

    def load_from_file(self, filename):
        """Read the vocabulary from ``filename``; a missing file is silently skipped."""
        if filename is None or not os.path.exists(filename):
            return

        # The context manager closes the handle, which the bare open() did not.
        with open(file=filename, mode="r", encoding="utf-8") as fin:
            words = [line.strip() for line in fin]

        # For a repeated word the last occurrence defines its index.
        self._t2i = {word: i for i, word in enumerate(words)}
        self._i2t = dict(enumerate(words))

    def save_to_file(self, filename):
        """Write the vocabulary to ``filename``, one token per line, in index order."""
        if filename is None:
            return

        dir_name = os.path.dirname(filename)
        # An empty dir_name means the current directory: nothing to create,
        # and os.makedirs('') would raise.
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        with open(file=filename, mode="w", encoding="utf-8") as fo:
            fo.writelines("{}\n".format(word) for word in self._t2i)

    def fit(self, sents: List[List[str]], *args):
        """Build the vocabulary from tokenized sentences and save it.

        Does nothing when the vocabulary comes from ``word_file_path``.
        ``<UNK>`` and ``<PAD>`` always get the indices 0 and 1.
        """
        if self.word_file_path is not None:
            return

        if self.char_level:
            items = {char for sent in sents for word in sent for char in word}
        else:
            items = {word for sent in sents for word in sent}
        items = ["<UNK>", "<PAD>"] + list(items)

        self._t2i = {k: v for v, k in enumerate(items)}
        self._i2t = {k: v for k, v in enumerate(items)}

        self.save_to_file(self.save_path)

    def pad_batch(self, tokens: List[List[int]]):
        """ Create padded batch of words, tags, chunk pos, even batch of characters

        Params:
            tokens: list of index sequences (or, for the character vocabulary,
                list of sequences of index sequences)

        Returns:
            the batch as an array filled with the ``<PAD>`` index up to the longest
            sequence (and the longest sub-sequence on character level)
        """

        batch_size = len(tokens)
        pad_index = self._t2i["<PAD>"]

        # ``default=0`` covers an empty batch and, on character level,
        # a batch without any words.
        max_len_seq = max((len(seq) for seq in tokens), default=0)

        if not self.char_level:
            padded_batch = np.full((batch_size, max_len_seq), pad_index)
            for i, seq in enumerate(tokens):
                padded_batch[i, :len(seq)] = seq
        else:
            max_len_sub_seq = max((len(sub_seq) for seq in tokens for sub_seq in seq), default=0)
            padded_batch = np.full((batch_size, max_len_seq, max_len_sub_seq), pad_index)
            for i, seq in enumerate(tokens):
                for j, sub_seq in enumerate(seq):
                    padded_batch[i, j, :len(sub_seq)] = sub_seq
        return padded_batch

    def __call__(self, sents, **kwargs):
        """Convert tokenized sentences to a padded batch of indices.

        Items missing from the vocabulary are mapped to the index 0.
        """
        if not self.char_level:
            sents_ind = [[self._t2i.get(word, 0) for word in sent] for sent in sents]
        else:
            sents_ind = [[[self._t2i.get(char, 0) for char in word] for word in sent]
                         for sent in sents]
        return self.pad_batch(sents_ind)

    def load(self, *args, **kwargs):
        """Load the vocabulary from ``load_path`` if it is set."""
        log.debug("[loading vocabulary from {}]".format(self.load_path))
        if self.load_path is not None:
            self.load_from_file(self.load_path)

    def save(self, *args, **kwargs):
        """Save the vocabulary to ``save_path``, creating its directory if needed."""
        log.info("[saving vocabulary to {}]".format(self.save_path))
        # save_to_file creates the parent directory itself.
        self.save_to_file(self.save_path)

    @property
    def len(self):
        return len(self._t2i)

    @property
    def t2i(self):
        return self._t2i

    @property
    def i2t(self):
        return self._i2t


================================================
FILE: deeppavlov/models/preprocessors/odqa_preprocessors.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from itertools import chain
from logging import getLogger
from typing import List, Callable, Union, Tuple, Optional

from nltk import sent_tokenize

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

logger = getLogger(__name__)


@register('document_chunker')
class DocumentChunker(Component):
    """Make chunks from a document or a list of documents. Don't tear up sentences if needed.

    Args:
        sentencize_fn: a function for sentence segmentation
        keep_sentences: whether to keep sentences intact when forming chunks
        tokens_limit: a number of tokens in a single chunk (usually this number corresponds to the squad model limit)
        flatten_result: whether to flatten the resulting list of lists of chunks
        paragraphs: whether to split document by paragraphs; if set to True, tokens_limit is ignored
        number_of_paragraphs: how many leading paragraphs to keep in the paragraph mode,
            -1 keeps all of them

    Attributes:
        keep_sentences: whether to keep sentences intact when forming chunks
        tokens_limit: a number of tokens in a single chunk
        flatten_result: whether to flatten the resulting list of lists of chunks
        paragraphs: whether to split document by paragraphs; if set to True, tokens_limit is ignored
        number_of_paragraphs: how many leading paragraphs to keep in the paragraph mode

    """

    def __init__(self, sentencize_fn: Callable = sent_tokenize, keep_sentences: bool = True,
                 tokens_limit: int = 400, flatten_result: bool = False,
                 paragraphs: bool = False, number_of_paragraphs: int = -1, *args, **kwargs) -> None:
        self._sentencize_fn = sentencize_fn
        self.keep_sentences = keep_sentences
        self.tokens_limit = tokens_limit
        self.flatten_result = flatten_result
        self.paragraphs = paragraphs
        self.number_of_paragraphs = number_of_paragraphs

    def __call__(self, batch_docs: List[Union[str, List[str]]],
                 batch_docs_ids: Optional[List[Union[str, List[str]]]] = None) -> \
            Union[Tuple[Union[List[str], List[List[str]]], Union[List[str], List[List[str]]]],
                  Union[List[str], List[List[str]]]]:
        """Make chunks from a batch of documents. There can be several documents in each batch.
        Args:
            batch_docs: a batch of documents / a batch of lists of documents
            batch_docs_ids (optional) : a batch of documents ids / a batch of lists of documents ids
        Returns:
            chunks of docs, flattened or not and
            chunks of docs ids, flattened or not if batch_docs_ids were passed
        """
        ids_missing = not batch_docs_ids
        if ids_missing:
            # Placeholder ids keep the loop below uniform; they are never returned.
            batch_docs_ids = [[] if isinstance(docs, str) else [[] for _ in docs]
                              for docs in batch_docs]

        result = []
        result_ids = []
        for ids, docs in zip(batch_docs_ids, batch_docs):
            if isinstance(docs, str):
                docs, ids = [docs], [ids]

            batch_chunks = []
            batch_chunks_ids = []
            for doc_id, doc in zip(ids, docs):
                doc_chunks = self._chunk_document(doc)
                batch_chunks.append(doc_chunks)
                # Every chunk inherits the id of the document it comes from.
                batch_chunks_ids.append([doc_id] * len(doc_chunks))
            result.append(batch_chunks)
            result_ids.append(batch_chunks_ids)

        if self.flatten_result:
            # Each batch item is a list of per-document chunk lists, so it can
            # always be flattened; no need to probe result[0][0], which failed
            # on empty input.
            result = [list(chain.from_iterable(item)) for item in result]
            result_ids = [list(chain.from_iterable(item)) for item in result_ids]

        if ids_missing:
            return result

        return result, result_ids

    def _chunk_document(self, doc: str) -> list:
        """Split a single document according to the configured mode."""
        if self.paragraphs:
            return self._split_into_paragraphs(doc)
        if self.keep_sentences:
            return self._pack_sentences(doc)
        return self._split_by_tokens(doc)

    def _split_into_paragraphs(self, doc: str) -> List[str]:
        """Split on blank lines, dropping paragraphs of 40 characters or less."""
        paragraphs = [part.strip() for part in doc.split('\n\n')]
        paragraphs = [part for part in paragraphs if len(part) > 40]
        if self.number_of_paragraphs != -1:
            paragraphs = paragraphs[:self.number_of_paragraphs]
        return paragraphs

    def _pack_sentences(self, doc: str) -> List[str]:
        """Greedily pack whole sentences into chunks of at most ``tokens_limit`` tokens.

        A single sentence longer than the limit forms a chunk of its own.
        """
        # Use the configured segmenter (it used to be stored but ignored);
        # fall back to the default one if a non-callable value was configured.
        sentencize = self._sentencize_fn if callable(self._sentencize_fn) else sent_tokenize

        chunks = []
        current = []
        n_tokens = 0
        for sentence in sentencize(doc):
            sentence_len = len(sentence.split())
            if current and n_tokens + sentence_len > self.tokens_limit:
                chunks.append(' '.join(current))
                current = []
                n_tokens = 0
            current.append(sentence)
            # The sentence opening a new chunk is counted too; otherwise the
            # chunk could grow past the limit by the length of that sentence.
            n_tokens += sentence_len
        if current:
            chunks.append(' '.join(current))
        return chunks

    def _split_by_tokens(self, doc: str) -> List[List[str]]:
        """Cut the whitespace-tokenized document into windows of ``tokens_limit`` tokens.

        NOTE(review): unlike the other modes the chunks here are lists of tokens,
        not strings; kept as is because callers may rely on it.
        """
        tokens = doc.split()
        return [tokens[start:start + self.tokens_limit]
                for start in range(0, len(tokens), self.tokens_limit)]


@register('string_multiplier')
class StringMultiplier(Component):
    """Repeat every string of a batch as many times as the matching reference item is long.

    """

    def __init__(self, **kwargs):
        pass

    def __call__(self, batch_s: List[str], ref: List[str]) -> List[List[str]]:
        """ Multiply each string in a provided batch of strings.

        Args:
            batch_s: a batch of strings to be multiplied
            ref: a batch of reference items; the length of each item sets the
                number of copies of the matching string

        Returns:
            for every (string, reference) pair, a list holding ``len(reference)``
            copies of the string

        """
        return [[s] * len(r) for s, r in zip(batch_s, ref)]


================================================
FILE: deeppavlov/models/preprocessors/one_hotter.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Union, Iterable

import numpy as np

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import zero_pad
from deeppavlov.core.models.component import Component


@register('one_hotter')
class OneHotter(Component):
    """
    One-hot featurizer with zero-padding.
    If ``single_vector``, return the only vector per sample which can have several elements equal to ``1``.

    Parameters:
        depth: the depth for one-hotting
        pad_zeros: whether to pad elements of batch with zeros
        single_vector: whether to return one vector for the sample (sum of each one-hotted vectors)

    Raises:
        ConfigError: if both ``pad_zeros`` and ``single_vector`` are requested
    """

    def __init__(self, depth: int, pad_zeros: bool = False,
                 single_vector=False, *args, **kwargs):
        self._depth = depth
        self._pad_zeros = pad_zeros
        self.single_vector = single_vector
        if self._pad_zeros and self.single_vector:
            raise ConfigError("Cannot perform ``single_vector`` with zero padding for OneHotter")

    def __call__(self, batch: List[List[int]], **kwargs) -> Union[List[List[np.ndarray]], List[np.ndarray]]:
        """
        Convert given batch of list of labels to one-hot representation of the batch.

        Args:
            batch: list of samples, where each sample is a list of integer labels
                or a single integer label (Python or NumPy integer).
            **kwargs: additional arguments

        Returns:
            if ``single_vector``, list of one-hot representations of each sample,
            otherwise, list of lists of one-hot representations of each label in a sample

        Raises:
            TypeError: if a sample is neither an iterable nor an integer
        """
        one_hotted_batch = []

        for utt in batch:
            if isinstance(utt, Iterable):
                one_hotted_utt = self._to_one_hot(utt, self._depth)
            elif isinstance(utt, (int, np.integer)):
                # NumPy integers are not instances of ``int``; without accepting
                # them here the sample fell through both branches.
                one_hotted_utt = self._to_one_hot([utt], self._depth)
                if not (self._pad_zeros or self.single_vector):
                    # A lone label is returned as a flat vector.
                    one_hotted_utt = one_hotted_utt.reshape(-1)
            else:
                # Fail loudly instead of reusing the previous sample's value
                # (or hitting an unbound local on the first sample).
                raise TypeError(f"OneHotter expects an iterable of labels or an integer label, "
                                f"got {type(utt).__name__}")

            if self.single_vector:
                one_hotted_utt = np.sum(one_hotted_utt, axis=0)

            one_hotted_batch.append(one_hotted_utt)

        if self._pad_zeros:
            one_hotted_batch = zero_pad(one_hotted_batch)
        return one_hotted_batch

    @staticmethod
    def _to_one_hot(x, n):
        """Return a ``(len(x), n)`` float32 matrix with a single 1 per row, at column ``int(x[row])``."""
        b = np.zeros([len(x), n], dtype=np.float32)
        for q, tok in enumerate(x):
            b[q, int(tok)] = 1
        return b


================================================
FILE: deeppavlov/models/preprocessors/re_preprocessor.py
================================================
# Copyright 2021 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from logging import getLogger
from pathlib import Path
from typing import Tuple, List, Union

import numpy as np
from transformers import BertTokenizer

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.file import read_json
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

log = getLogger(__name__)


@register('re_preprocessor')
class REPreprocessor(Component):
    """Convert tokenized documents with two annotated entities into BERT inputs for relation extraction.

    Every mention of both entities is surrounded by ``special_token`` markers, the document is split into
    wordpieces and the mention boundaries are recalculated in wordpiece offsets.
    """

    def __init__(
            self,
            vocab_file: str,
            special_token: str = '<ENT>',
            ner_tags=None,
            max_seq_length: int = 512,
            do_lower_case: bool = False,
            default_tag: str = None,
            **kwargs
    ):
        """
        Args:
            vocab_file: path to vocabulary / name of vocabulary for tokenizer initialization
            special_token: an additional token that will be used for marking the entities in the document
            ner_tags: list of known NER tags, the index of a tag is its position in the one-hot encoding;
                defaults to ``['ORG', 'TIME', 'MISC', 'LOC', 'PER', 'NUM']``
            max_seq_length: length to which every encoded document is truncated and padded
            do_lower_case: set True if lowercasing is needed
            default_tag: used for test purposes to create a valid input: the NER tag assigned to both entities
                of a fake sample, it has to be one of ``ner_tags``
        """

        self.special_token = special_token
        self.special_tokens_dict = {'additional_special_tokens': [self.special_token]}
        self.default_tag = default_tag

        if ner_tags is None:
            ner_tags = ['ORG', 'TIME', 'MISC', 'LOC', 'PER', 'NUM']
        self.ner2id = {tag: tag_id for tag_id, tag in enumerate(ner_tags)}
        self.max_seq_length = max_seq_length

        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
        else:
            self.tokenizer = BertTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)

    def __call__(
            self, tokens: Union[Tuple, List[List[str]]], entity_pos: Union[Tuple, List[List[Tuple]]],
            entity_tags: Union[Tuple, List[List[str]]],
    ) -> Tuple[List, List, List, List, List]:
        """
        Tokenize and create masks; recalculate the entity positions regarding the document borders.
        Args:
            tokens: List of tokens of each document: List[List[tokens in doc]]
            entity_pos: start and end positions of the entities' mentions
            entity_tags: NER tag of the entities
        Return:
            input_ids: List[List[int]],
            attention_mask: List[List[int]],
            entity_poss: List[
                            List[
                                List[(entity1_mention1_start_id, entity1_mention1_end_id), ...],
                                List[(entity2_mention1_start_id, entity2_mention1_end_id), ...]
                            ]
                        ]
            entity_tags: List[List[int]]
            nf_samples: List[int] - contains the information about whether the corresponding sample is real sample or
                fake (for testing): 0 means the sample is real, 1 - it is fake.
        """

        # registering the marker on every call is kept from the original implementation
        self.tokenizer.add_special_tokens(self.special_tokens_dict)

        input_ids, attention_mask, upd_entity_pos, upd_entity_tags, nf_samples = [], [], [], [], []

        # this workaround is for proper testing: for an unknown reason during test in test_quick_start.py
        # each input list is transformed into a tuple, e.g., tokens -> tuple(tokens, ).
        # todo: refactoring
        if isinstance(tokens, tuple) and isinstance(entity_pos, tuple) and isinstance(entity_tags, tuple):
            tokens, entity_pos, entity_tags = tokens[0], entity_pos[0], entity_tags[0]

        for doc, ent_pos, ent_tags in zip(tokens, entity_pos, entity_tags):

            # valid scenario: positions of the mentions of exactly two entities are given
            if isinstance(ent_pos, list) and len(ent_pos) == 2:
                doc_wordpiece_tokens, marked_entity_pos = self._mark_entities(doc, ent_pos)
                enc_entity_tags = self.encode_ner_tag(ent_tags)
                # the wordpieces are cut before encoding, as well as by the tokenizer itself
                encoding = self._encode(doc_wordpiece_tokens[:self.max_seq_length])
                upd_entity_pos.append(marked_entity_pos)
                nf_samples.append(0)

            # api test scenario: dummy values of entity positions and entity tags
            else:
                encoding = self._encode(doc)
                upd_entity_pos.append([[(0, 1)], [(0, 1)]])
                enc_entity_tags = self.encode_ner_tag([self.default_tag] * 2)
                nf_samples.append(1)

            input_ids.append(encoding['input_ids'])
            attention_mask.append(encoding['attention_mask'])
            upd_entity_tags.append(enc_entity_tags)

        return input_ids, attention_mask, upd_entity_pos, upd_entity_tags, nf_samples

    @staticmethod
    def _mention_bounds(mentions: List[Tuple]) -> Tuple[set, set]:
        """Split mentions ``[(start, end), ...]`` into a set of start and a set of end token positions."""
        starts = {mention[0] for mention in mentions}
        ends = {mention[1] for mention in mentions}
        return starts, ends

    def _mark_entities(self, doc: List[str], ent_pos: List[List[Tuple]]) -> Tuple[List[str], List[List[Tuple]]]:
        """Split ``doc`` into wordpieces and wrap every entity mention into ``special_token`` markers.

        Args:
            doc: tokens of the document
            ent_pos: for each entity, list of ``(start, end)`` token positions of its mentions;
                ``end`` is the index of the first token after the mention
        Return:
            wordpiece tokens of the document with the inserted markers and, for each entity,
            list of ``(start, end)`` wordpiece positions: ``start`` is the index of the opening marker,
            ``end`` is the index right after the closing marker
        """
        bounds = [self._mention_bounds(mentions) for mentions in ent_pos]
        wordpieces = []
        upd_starts = [[] for _ in bounds]
        upd_ends = [[] for _ in bounds]

        for n, token in enumerate(doc):
            # markers are inserted before the n-th token, first for entity 1 and then for entity 2
            for ent_id, (starts, ends) in enumerate(bounds):
                if n in starts:
                    upd_starts[ent_id].append(len(wordpieces))
                    wordpieces.append(self.special_token)
                if n in ends:
                    wordpieces.append(self.special_token)
                    upd_ends[ent_id].append(len(wordpieces))
            wordpieces += self.tokenizer.tokenize(token)

        # special case when the entity is the last in the doc: only the closing marker is added
        # (the last token of the document must not be appended once more after the marker)
        for ent_id, (_, ends) in enumerate(bounds):
            if len(doc) in ends:
                wordpieces.append(self.special_token)
                upd_ends[ent_id].append(len(wordpieces))

        marked_pos = [list(zip(starts, ends)) for starts, ends in zip(upd_starts, upd_ends)]
        return wordpieces, marked_pos

    def _encode(self, doc_tokens: List[str]):
        """Encode tokens with the tokenizer, truncating and padding the result to ``max_seq_length``."""
        return self.tokenizer.encode_plus(
            doc_tokens,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_seq_length,
            padding='max_length',  # replaces the deprecated ``pad_to_max_length=True``
            return_attention_mask=True
        )

    def encode_ner_tag(self, ner_tags: List) -> List:
        """ Encode NER tags with one hot encodings; a tag missing in ``ner2id`` raises ``KeyError`` """
        enc_ner_tags = []
        for ner_tag in ner_tags:
            ner_tag_one_hot = [0] * len(self.ner2id)
            ner_tag_one_hot[self.ner2id[ner_tag]] = 1
            enc_ner_tags.append(ner_tag_one_hot)
        return enc_ner_tags


@register('re_postprocessor')
class REPostprocessor:
    """Map the output of the relation extraction model to relation ids and relation names."""

    def __init__(self, rel2id_path: str, rel2label_path: str, **kwargs):
        """
        Args:
            rel2id_path: path to the json file with the mapping from relation to its index
            rel2label_path: path to the json file with the mapping from relation to its name
        """
        self.rel2id_path = rel2id_path
        self.rel2label_path = rel2label_path
        self.rel2id = read_json(str(expand_path(self.rel2id_path)))
        # inverted mapping: index -> relation
        self.id2rel = dict((rel_id, rel) for rel, rel_id in self.rel2id.items())
        self.rel2label = read_json(str(expand_path(self.rel2label_path)))

    def __call__(self, model_output: List, nf_samples: List) -> Tuple[List[str], List[str]]:
        """
        Transform the model output to the relation ids and relation names
        Args:
            model_output: List of probability vectors
            nf_samples: flags of fake samples (0 - true sample, its relations are included to the output,
                1 - fake sample, a placeholder is included instead)
        Return:
            wikidata_relation_id: List of wiki ids of found relations
            relation_name: List of names of found relations
        """
        found_ids: List[str] = []
        found_names: List[str] = []

        for sample_predictions, is_fake in zip(model_output, nf_samples):
            if is_fake:
                found_ids.append("-")
                found_names.append("-")
                continue

            # every non-zero position of the vector is a predicted relation
            for rel_index in np.nonzero(sample_predictions)[0]:
                if rel_index == 0:
                    # index 0 is reported as the absence of a relation
                    found_ids.append("-")
                    found_names.append("no relation")
                else:
                    relation = self.id2rel[rel_index]
                    found_ids.append(relation)
                    found_names.append(self.rel2label.get(relation, "-"))

        return found_ids, found_names


================================================
FILE: deeppavlov/models/preprocessors/response_base_loader.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from logging import getLogger

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.serializable import Serializable

logger = getLogger(__name__)


@register('response_base_loader')
class ResponseBaseLoader(Serializable):
    """Loader of a base of text responses and contexts together with their vector representations.

    The texts are read from ``responses.csv`` and ``contexts.csv`` of the ``load_path`` directory,
    the vectors from ``resp_vecs.npy`` and ``cont_vecs.npy`` when these files exist.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.resps = self.resp_vecs = None
        self.conts = self.cont_vecs = None
        self.load()

    def load(self):
        """Read the texts and the vectors; exit the process if one of the csv files is missing."""
        if self.load_path is None:
            return

        responses_path = self.load_path / "responses.csv"
        if not responses_path.exists():
            logger.error("Please provide responses.csv file to the {} directory".format(self.load_path))
            sys.exit(1)
        with open(responses_path) as fin:
            # '#' and line break characters are removed from both ends of each line
            self.resps = [line.strip('#\n') for line in fin.readlines()]

        response_vectors_path = self.load_path / "resp_vecs.npy"
        if response_vectors_path.exists():
            self.resp_vecs = np.load(response_vectors_path)

        contexts_path = self.load_path / "contexts.csv"
        if not contexts_path.exists():
            logger.error("Please add contexts.csv file to the {} directory".format(self.load_path))
            sys.exit(1)
        with open(contexts_path) as fin:
            self.conts = [line.strip('#\n') for line in fin.readlines()]

        context_vectors_path = self.load_path / "cont_vecs.npy"
        if context_vectors_path.exists():
            self.cont_vecs = np.load(context_vectors_path)

    def save(self):
        """Saving is not supported: only an error message is logged."""
        logger.error("The method save of the {} class is not used.".format(self.__class__))


================================================
FILE: deeppavlov/models/preprocessors/sanitizer.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import sys
import unicodedata

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register('sanitizer')
class Sanitizer(Component):
    """Remove all combining characters like diacritical marks from tokens

    Args:
        diacritical: whether to remove diacritical signs or not
            diacritical signs are something like hats and stress marks
        nums: whether to replace all digits with 1 or not

    Attributes:
        diacritical: whether diacritical signs are removed by ``__call__``
        nums: whether digits are replaced with ``1`` by ``__call__``
        combining_characters: translation table that maps the code point of every
            combining character to ``None``
    """

    def __init__(self,
                 diacritical: bool = True,
                 nums: bool = False,
                 *args, **kwargs) -> None:
        self.diacritical = diacritical
        self.nums = nums
        # ``sys.maxunicode`` is itself a valid code point, hence ``+ 1``
        self.combining_characters = dict.fromkeys(c for c in range(sys.maxunicode + 1)
                                                  if unicodedata.combining(chr(c)))

    def filter_diacritical(self, tokens_batch):
        """Takes batch of tokens and returns the batch with sanitized tokens"""
        sanitized_batch = []
        for utterance in tokens_batch:
            sanitized_utterance = []
            for token in utterance:
                # NFD splits composed characters into a base character and combining marks,
                # the marks are then dropped by the translation table
                token = unicodedata.normalize('NFD', token)
                sanitized_utterance.append(token.translate(self.combining_characters))
            sanitized_batch.append(sanitized_utterance)
        return sanitized_batch

    def replace_nums(self, tokens_batch):
        """Takes batch of tokens and returns the batch where every ASCII digit is replaced with ``1``"""
        return [[re.sub('[0-9]', '1', token) for token in utterance] for utterance in tokens_batch]

    def __call__(self, tokens_batch, **kwargs):
        """Apply the enabled sanitizing steps to the batch of tokenized utterances

        Args:
            tokens_batch: batch of utterances, where each utterance is a list of tokens

        Returns:
            batch of the same structure with sanitized tokens
        """
        # the flag has to be checked here: ``self.filter_diacritical`` is a bound method and is always truthy
        if self.diacritical:
            tokens_batch = self.filter_diacritical(tokens_batch)
        if self.nums:
            tokens_batch = self.replace_nums(tokens_batch)
        return tokens_batch


================================================
FILE: deeppavlov/models/preprocessors/sentseg_preprocessor.py
================================================
from typing import List

from deeppavlov.core.common.registry import register


@register("sentseg_restore_sent")
def SentSegRestoreSent(batch_words: List[List[str]], batch_tags: List[List[str]]) -> List[str]:
    """Join words into texts, placing punctuation marks taken from the tags.

    A tag different from ``"O"`` closes the preceding segment with the mark of the tag
    that opened it; the last character of the tag is the mark of the new segment.

    Args:
        batch_words: batch of lists of words
        batch_tags: batch of lists of tags, one tag for every word

    Returns:
        list of restored texts, an empty string for a sample without tags
    """
    restored = []
    for words, tags in zip(batch_words, batch_tags):
        if len(tags) < 1:
            restored.append("")
            continue

        pieces = [words[0]]
        pending_punct = tags[0][-1] if tags[0] != "O" else ""
        for word, tag in zip(words[1:], tags[1:]):
            if tag != "O":
                # a new segment begins: the previous one gets its punctuation mark
                pieces.append(pending_punct)
                pending_punct = tag[-1]
            pieces.append(" " + word)
        pieces.append(pending_punct)
        restored.append("".join(pieces))

    return restored


================================================
FILE: deeppavlov/models/preprocessors/squad_preprocessor.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import bisect
from logging import getLogger
from typing import List, Dict

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

logger = getLogger(__name__)


@register('squad_bert_mapping')
class SquadBertMappingPreprocessor(Component):
    """Align BERT subtokens of a context with character positions of the context, in both directions.

        Args:
            do_lower_case: set True if lowercasing is needed
    """

    def __init__(self, do_lower_case: bool = True, *args, **kwargs):
        self.do_lower_case = do_lower_case

    def __call__(self, contexts_batch, bert_features_batch, subtokens_batch, **kwargs):
        """Build the mappings for every context of every sample of the batch.

        Returns:
            batch of lists of ``{subtoken index: position of its first character}`` and
            batch of lists of ``{character position: subtoken index}``
        """
        subtok2chars_batch: List[List[Dict[int, int]]] = []
        char2subtoks_batch: List[List[Dict[int, int]]] = []

        for context_list, features_list, subtokens_list in zip(contexts_batch, bert_features_batch, subtokens_batch):
            sample_subtok2chars, sample_char2subtoks = [], []
            for context, _, subtokens in zip(context_list, features_list, subtokens_list):
                text = context.lower() if self.do_lower_case else context
                # the context subtokens follow the first [SEP]; the last subtoken is left out
                first_context_subtok = subtokens.index('[SEP]') + 1
                cursor = 0
                subtok2char: Dict[int, int] = {}
                char2subtok: Dict[int, int] = {}
                for subtok_id, piece in enumerate(subtokens[first_context_subtok:-1], start=first_context_subtok):
                    if piece.startswith('##'):
                        piece = piece[2:]
                    offset = text[cursor:].find(piece)
                    if offset == -1:
                        # it could be UNK: step over one character, its length was at least one
                        cursor += 1
                        continue
                    cursor += offset
                    subtok2char[subtok_id] = cursor
                    for char_pos in range(cursor, cursor + len(piece)):
                        char2subtok[char_pos] = subtok_id
                    cursor += len(piece)
                sample_subtok2chars.append(subtok2char)
                sample_char2subtoks.append(char2subtok)
            subtok2chars_batch.append(sample_subtok2chars)
            char2subtoks_batch.append(sample_char2subtoks)
        return subtok2chars_batch, char2subtoks_batch


@register('squad_bert_ans_preprocessor')
class SquadBertAnsPreprocessor(Component):
    """Convert character positions of the answers to positions in subtokens.

        Args:
            do_lower_case: set True if lowercasing is needed
    """

    def __init__(self, do_lower_case: bool = True, *args, **kwargs):
        self.do_lower_case = do_lower_case

    def __call__(self, answers_raw, answers_start, char2subtoks, **kwargs):
        """Find the first and the last subtoken of every answer.

        Returns:
            batches of answer texts, of start subtoken indices and of end subtoken indices;
            an answer with no mapped characters becomes ``''`` with both indices equal to ``0``
        """
        answers, starts, ends = [], [], []
        for sample_answers, sample_starts, c2sub in zip(answers_raw, answers_start, char2subtoks):
            cur_answers, cur_starts, cur_ends = [], [], []
            answers.append(cur_answers)
            starts.append(cur_starts)
            ends.append(cur_ends)
            for ans, ans_st in zip(sample_answers, sample_starts):
                if self.do_lower_case:
                    ans = ans.lower()
                try:
                    # only the first mapping of the sample is consulted
                    covered = {c2sub[0][pos] for pos in range(ans_st, ans_st + len(ans)) if pos in c2sub[0]}
                    st, end = min(covered), max(covered)
                except ValueError:
                    # no subtoken is covered: 0 - CLS token
                    st = end = 0
                    ans = ''
                cur_starts.append(st)
                cur_ends.append(end)
                cur_answers.append(ans)
        return answers, starts, ends


@register('squad_bert_ans_postprocessor')
class SquadBertAnsPostprocessor(Component):
    """Extract answer and create answer start and end positions in characters from subtoken positions."""

    def __init__(self, *args, **kwargs):
        pass

    def __call__(self, answers_start_batch, answers_end_batch, contexts_batch,
                 subtok2chars_batch, subtokens_batch, ind_batch, *args, **kwargs):
        """Convert predicted subtoken positions to answer texts and character positions.

        Args:
            answers_start_batch: index of the first answer subtoken for each sample
            answers_end_batch: index of the last answer subtoken for each sample
            contexts_batch: list of contexts for each sample
            subtok2chars_batch: list of subtoken-to-character mappings for each sample
            subtokens_batch: list of subtoken lists for each sample
            ind_batch: for each sample, index that selects the context, the mapping and the subtokens

        Returns:
            answers: answer texts, ``''`` if there is no answer
            starts: character positions of the answer starts, ``-1`` if there is no answer
            ends: character positions at which the last answer subtoken starts, ``-1`` if there is no answer
        """
        answers = []
        starts = []
        ends = []
        for answer_st, answer_end, context_list, sub2c_list, subtokens_list, ind in \
                zip(answers_start_batch, answers_end_batch, contexts_batch, subtok2chars_batch, subtokens_batch,
                    ind_batch):
            sub2c = sub2c_list[ind]
            subtok = subtokens_list[ind][answer_end]
            context = context_list[ind]
            # CLS token is no_answer token
            if answer_st == 0 or answer_end == 0:
                answers.append('')
                starts.append(-1)
                ends.append(-1)
            else:
                st = self.get_char_position(sub2c, answer_st)
                end = self.get_char_position(sub2c, answer_end)

                # the answer lasts till the end of its last subtoken
                subtok = subtok[2:] if subtok.startswith('##') else subtok
                answers.append(context[st:end + len(subtok)])
                starts.append(st)
                # the scalar position is stored (the list ``ends`` itself used to be appended here)
                ends.append(end)
        return answers, starts, ends

    @staticmethod
    def get_char_position(sub2c, sub_pos):
        """Return the character position of the closest mapped subtoken at or before ``sub_pos``.

        If all the mapped subtokens are located after ``sub_pos``, the position of the first one is returned.
        NOTE(review): relies on the keys of ``sub2c`` being iterated in ascending order - confirm.
        """
        keys = list(sub2c.keys())
        # number of the keys that are less than or equal to sub_pos
        found_idx = bisect.bisect(keys, sub_pos)
        if found_idx == 0:
            return sub2c[keys[0]]

        return sub2c[keys[found_idx - 1]]


================================================
FILE: deeppavlov/models/preprocessors/str_lower.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union

from deeppavlov.core.common.registry import register


@register('str_lower')
def str_lower(batch: Union[str, list, tuple]):
    """Lowercase every string found at any level of nesting

    Args:
        batch: a string or a list containing strings at some level of nesting

    Returns:
        the lowercased string, or a list of the recursively processed items of ``batch``
    """
    if not isinstance(batch, str):
        return [str_lower(item) for item in batch]
    return batch.lower()


================================================
FILE: deeppavlov/models/preprocessors/str_token_reverser.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Union

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

# Recursive type alias: a list of strings or a list of such lists at any level of nesting.
StrTokenReverserInfo = Union[List[str], List['StrTokenReverserInfo']]


@register('str_token_reverser')
class StrTokenReverser(Component):
    """Component for converting strings to strings with reversed token positions

    Args:
        tokenized: The parameter is only needed to reverse tokenized strings.
    """

    def __init__(self, tokenized: bool = False, *args, **kwargs) -> None:
        self.tokenized = tokenized

    @staticmethod
    def _reverse_str(raw_string):
        """Split the string by whitespace and join its tokens in the reversed order with single spaces."""
        splitted = raw_string.split()
        splitted.reverse()
        string = ' '.join(splitted)
        return string

    @staticmethod
    def _reverse_tokens(raw_tokens):
        """Reverse the list of tokens in place and return it."""
        raw_tokens.reverse()
        return raw_tokens

    def __call__(self, batch: Union[str, list, tuple]) -> StrTokenReverserInfo:
        """Recursively search for strings in a list and convert them to strings with reversed token positions

        Args:
            batch: a string or a list containing strings

        Returns:
            the same structure where all strings tokens are reversed; tuples are returned as lists

        Raises:
            RuntimeError: if ``tokenized`` is set and ``batch`` is neither a list nor a tuple
        """
        is_sequence = isinstance(batch, (list, tuple))
        if is_sequence:
            # a shallow copy protects the input from the in-place reversing; ``list()`` is used
            # instead of ``.copy()`` because tuples have neither ``copy`` nor ``reverse``
            batch = list(batch)

        if self.tokenized:
            if is_sequence:
                # an empty list has no last item to inspect, it is returned as it is
                if not batch or isinstance(batch[-1], str):
                    return self._reverse_tokens(batch)
                return [self(line) for line in batch]
            raise RuntimeError(f'The objects passed to the reverser are not list or tuple! '
                               f' But they are {type(batch)}.'
                               f' If you want to passed str type directly use option tokenized = False')

        if is_sequence:
            return [self(line) for line in batch]
        return self._reverse_str(batch)


================================================
FILE: deeppavlov/models/preprocessors/str_utf8_encoder.py
================================================
# originally based on https://github.com/allenai/bilm-tf/blob/master/bilm/data.py

# Modifications copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import Counter, OrderedDict
from itertools import chain
from logging import getLogger
from typing import Union, List, Tuple

import numpy as np

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.estimator import Estimator

log = getLogger(__name__)

# Recursive type alias used as the return annotation of ``StrUTF8Encoder.__call__``:
# a list of strings or a list of such lists at any level of nesting.
StrUTF8EncoderInfo = Union[List[str], List['StrUTF8EncoderInfo']]


@register('str_utf8_encoder')
class StrUTF8Encoder(Estimator):
    """Component for encoding all strings to utf8 codes

    Args:
        max_word_length: Max length of words of input and output batches.
        pad_special_char_use: Whether to use special char for padding  or not.
        word_boundary_special_char_use: Whether to add word boundaries by special chars or not.
        sentence_boundary_special_char_use: Whether to add sentence boundaries by special chars or not.
        reversed_sentense_tokens: Whether to use reversed sequences of tokens or not.
        bos: Name of a special token of the begin of a sentence.
        eos: Name of a special token of the end of a sentence.
    """

    def __init__(self,
                 max_word_length: int = 50,
                 pad_special_char_use: bool = False,
                 word_boundary_special_char_use: bool = False,
                 sentence_boundary_special_char_use: bool = False,
                 reversed_sentense_tokens: bool = False,
                 bos: str = '<S>',
                 eos: str = '</S>',
                 **kwargs) -> None:
        super().__init__(**kwargs)

        # a word with boundaries takes at least three positions: <begin word>, one byte, <end word>
        if word_boundary_special_char_use and max_word_length < 3:
            raise ConfigError("`max_word_length` should be at least 3!")
        if max_word_length < 1:
            raise ConfigError("`max_word_length` should be at least 1!")

        self._max_word_length = max_word_length
        self._reverse = reversed_sentense_tokens

        self._pad_special_char_use = pad_special_char_use
        self._word_boundary_special_char_use = word_boundary_special_char_use
        self._sentence_boundary_special_char_use = sentence_boundary_special_char_use

        # char ids 0-255 come from utf-8 encoding bytes
        # assign 256-300 to special chars
        self.bos_char = 256  # <begin sentence>
        self.eos_char = 257  # <end sentence>
        self.bow_char = 258  # <begin word>
        self.eow_char = 259  # <end word>
        self.pad_char = 260  # <padding>

        self._len = 261  # an upper bound of all indexes

        # the character representation of the begin/end of sentence characters
        def _make_bos_eos(indx):
            indx = np.array([indx], dtype=np.int32)
            if self._word_boundary_special_char_use:
                code = np.pad(indx, (1, 1), 'constant', constant_values=(self.bow_char, self.eow_char))
            else:
                code = indx
            if self._pad_special_char_use:
                code = np.pad(code, (0, self._max_word_length - code.shape[0]), 'constant',
                              constant_values=self.pad_char)
            return code

        self.bos_chars = _make_bos_eos(self.bos_char)
        self.eos_chars = _make_bos_eos(self.eos_char)

        if self._sentence_boundary_special_char_use:
            self._eos_chars = [self.eos_chars]
            self._bos_chars = [self.bos_chars]
        else:
            self._eos_chars = []
            self._bos_chars = []

        if self.load_path:
            self.load()
        else:
            self.tokens = []
        # cache of the codes of the known tokens
        self._word_char_ids = OrderedDict()

        for token in self.tokens:
            self._word_char_ids[token] = self._convert_word_to_char_ids(token)
        self._word_char_ids[bos] = self.bos_chars
        self._word_char_ids[eos] = self.eos_chars

    def __call__(self, batch: Union[List[str], Tuple[str]]) -> StrUTF8EncoderInfo:
        """Recursively search for strings in a list and utf8 encode

        Args:
            batch: a list (tuple) of strings or a list (tuple) of such lists

        Returns:
            the same structure where all strings are utf8 encoded

        Raises:
            RuntimeError: if ``batch`` is neither a list nor a tuple
        """
        if isinstance(batch, (list, tuple)):
            # the type of the last item tells a sentence (list of words) from a nested batch
            if isinstance(batch[-1], str):
                return self._encode_chars(batch)
            else:
                return [self(line) for line in batch]
        raise RuntimeError(f'The objects passed to the encoder are not list or tuple of str! '
                           f' But they are {type(batch)}.')

    def load(self) -> None:
        """Read the vocabulary from ``load_path``: the first whitespace separated field of every line."""
        if self.load_path:
            if self.load_path.is_file():
                log.debug(f"[loading vocabulary from {self.load_path}]")
                self.tokens = []
                # the context manager closes the file, blank lines are skipped
                with self.load_path.open('r', encoding='utf8') as f:
                    for ln in f:
                        fields = ln.split()
                        if fields:
                            self.tokens.append(fields[0])
            else:
                raise ConfigError(f"Provided `load_path` for {self.__class__.__name__} doesn't exist!")
        else:
            raise ConfigError(f"`load_path` for {self} is not provided!")

    def save(self) -> None:
        """Write all the cached tokens to ``save_path``, one token per line."""
        log.info(f"[saving vocabulary to {self.save_path}]")
        with self.save_path.open('wt', encoding='utf8') as f:
            for token in self._word_char_ids.keys():
                f.write('{}\n'.format(token))

    def fit(self, *args) -> None:
        """Add the codes of the words of the given batches to the cache, the most frequent words first."""
        words = chain(*args)
        # filter(None, <>) -- to filter empty words
        freqs = Counter(filter(None, chain(*words)))
        for token, _ in freqs.most_common():
            if token not in self._word_char_ids:
                self._word_char_ids[token] = self._convert_word_to_char_ids(token)

    def _convert_word_to_char_ids(self, word):
        """Encode the word as an array of its utf-8 bytes and the requested special chars.

        The bytes that do not fit into ``max_word_length`` are dropped; without padding
        the array is cut to the actual length of the code.
        """
        code = np.zeros([self._max_word_length], dtype=np.int32)
        if self._pad_special_char_use:
            code[:] = self.pad_char
        if self._word_boundary_special_char_use:
            # two positions are reserved for the <begin word> and <end word> chars
            word_encoded = word.encode('utf-8', 'ignore')[:self._max_word_length - 2]
            code[0] = self.bow_char

            for k, chr_id in enumerate(word_encoded, start=1):
                code[k] = chr_id

            code[len(word_encoded) + 1] = self.eow_char
        else:
            word_encoded = word.encode('utf-8', 'ignore')[:self._max_word_length]

            for k, chr_id in enumerate(word_encoded):
                code[k] = chr_id

        if not self._pad_special_char_use:
            if self._word_boundary_special_char_use:
                code = code[:len(word_encoded) + 2]
            else:
                code = code[:len(word_encoded)]
        return code

    def _word_to_char_ids(self, word):
        """Return the cached code of the word or compute it for a word missing in the cache."""
        if word in self._word_char_ids:
            return self._word_char_ids[word]
        else:
            return self._convert_word_to_char_ids(word)

    def _encode_chars(self, sentence):
        """
        Encode every word of the sentence (a sequence of words) and add the sentence boundaries.
        """
        chars_ids = [self._word_to_char_ids(cur_word)
                     for cur_word in sentence]
        return self._wrap_in_s_char(chars_ids)

    def _wrap_in_s_char(self, chars_ids):
        """Surround the codes with the sentence boundaries, stack them into one array if padding is used."""
        chars_ids = chars_ids if self._pad_special_char_use else list(chars_ids)
        if self._reverse:
            ret = self._eos_chars + chars_ids + self._bos_chars
        else:
            ret = self._bos_chars + chars_ids + self._eos_chars
        return np.vstack(ret) if self._pad_special_char_use else ret

    def __len__(self):
        return self._len

    @property
    def len(self):
        """
        An upper bound of all indexes.
        """
        return len(self)


================================================
FILE: deeppavlov/models/preprocessors/torch_transformers_preprocessor.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import random
import re
from collections import defaultdict
from dataclasses import dataclass
from logging import getLogger
from pathlib import Path
from typing import Tuple, List, Optional, Union, Dict, Set, Any

import nltk
import numpy as np
import torch
from transformers import AutoTokenizer
from transformers.data.processors.utils import InputFeatures

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import zero_pad
from deeppavlov.core.models.component import Component
from deeppavlov.models.preprocessors.mask import Mask

log = getLogger(__name__)


@register('torch_transformers_multiplechoice_preprocessor')
class TorchTransformersMultiplechoicePreprocessor(Component):
    """Tokenize multiple-choice examples on subtokens and build padded tensors of subtoken ids and masks.

    Args:
        vocab_file: path to a vocabulary file, or a name/path accepted by ``AutoTokenizer.from_pretrained``
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        **kwargs: extra arguments forwarded to the tokenizer

    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        tokenizer: transformers tokenizer used for encoding

    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = True,
                 max_seq_length: int = 512,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        if Path(vocab_file).is_file():
            # ``AutoTokenizer`` can only be created through ``from_pretrained``; calling its constructor
            # raises. A bare vocabulary file is therefore loaded with the BERT WordPiece tokenizer.
            from transformers import BertTokenizer
            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case, **kwargs)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case, **kwargs)

    def tokenize_mc_examples(self,
                             contexts: List[List[str]],
                             choices: List[List[str]]) -> Dict[str, torch.Tensor]:
        """Encode every (context, choice) pair and pad the batch to its longest sequence.

        Args:
            contexts: for each example, one context string per choice
            choices: for each example, the candidate choices

        Returns:
            mapping from feature name to a tensor viewed as ``(batch_size, num_choices, -1)``;
            the number of choices is taken from the first example
        """
        batch_size = len(contexts)
        num_choices = len(contexts[0])

        # tokenize examples in groups of `num_choices`
        examples = []
        for context_list, choice_list in zip(contexts, choices):
            for context, choice in zip(context_list, choice_list):
                # max_length makes truncation honour the configured limit instead of the model default
                examples.append(self.tokenizer.encode_plus(text=context,
                                                           text_pair=choice,
                                                           return_attention_mask=True,
                                                           add_special_tokens=True,
                                                           truncation=True,
                                                           max_length=self.max_seq_length))

        padded_examples = self.tokenizer.pad(
            examples,
            padding=True,
            max_length=self.max_seq_length,
            return_tensors='pt',
        )

        return {k: v.view(batch_size, num_choices, -1) for k, v in padded_examples.items()}

    def __call__(self,
                 texts_a: List[List[str]],
                 texts_b: Optional[List[List[str]]] = None) -> Union[Dict[str, torch.Tensor], List]:
        """Tokenize and create masks.

        texts_a and texts_b are separated by [SEP] token

        Args:
            texts_a: batch of context lists (one context per choice)
            texts_b: batch of choice lists

        Returns:
            mapping from feature name to a ``(batch_size, num_choices, seq_len)`` tensor, or an empty list
            when either input is missing or its first element is empty
        """
        if texts_a and texts_b and texts_a[0] and texts_b[0]:
            return self.tokenize_mc_examples(texts_a, texts_b)
        return []


@register('torch_transformers_preprocessor')
class TorchTransformersPreprocessor(Component):
    """Tokenize text on subtokens, encode subtokens with their indices, create tokens and segment masks.

    Args:
        vocab_file: A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co or
            a path to a `directory` containing vocabulary files required by the tokenizer.
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        **kwargs: extra arguments forwarded to ``AutoTokenizer.from_pretrained``

    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        tokenizer: tokenizer returned by ``AutoTokenizer.from_pretrained``

    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = True,
                 max_seq_length: int = 512,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case, **kwargs)

    def __call__(self, texts_a: List, texts_b: Optional[List[str]] = None) -> Dict[str, torch.Tensor]:
        """Tokenize and create masks.
        texts_a and texts_b are separated by [SEP] token
        Args:
            texts_a: list of texts,
            texts_b: list of texts, it could be None, e.g. single sentence classification task
        Returns:
            the tokenizer output (subtoken ids, attention mask and, if the tokenizer provides them, segment ids)
            as tensors padded to ``max_seq_length``; an empty dict when there is nothing to tokenize
        Raises:
            TypeError: if ``texts_a`` is a single string instead of a batch
        """
        if isinstance(texts_a, str):
            raise TypeError(f'Received string {texts_a} as an input! Check the iterator output')

        # in case of iterator's strange behaviour
        if isinstance(texts_a, tuple):
            texts_a = list(texts_a)

        # handle dummy output: drop None entries, together with their counterparts in texts_b
        # so that the remaining pairs stay aligned
        keep = [text is not None for text in texts_a]
        if texts_b is not None and not all(keep) and len(texts_b) == len(keep):
            texts_b = [text for text, kept in zip(texts_b, keep) if kept]
        texts_a = [text for text, kept in zip(texts_a, keep) if kept]

        # checked after normalisation so that empty tuples and all-None batches are caught as well
        if not texts_a:
            return {}

        input_features = self.tokenizer(text=texts_a,
                                        text_pair=texts_b,
                                        add_special_tokens=True,
                                        max_length=self.max_seq_length,
                                        padding='max_length',
                                        return_attention_mask=True,
                                        truncation=True,
                                        return_tensors='pt')
        return input_features


@register('torch_transformers_entity_ranker_preprocessor')
class TorchTransformersEntityRankerPreprocessor(Component):
    """Class for tokenization of text into subtokens, encoding of subtokens with indices and obtaining positions of
    special [ENT]-tokens
    Args:
        vocab_file: path to a vocabulary file, or a name/path accepted by ``AutoTokenizer.from_pretrained``
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        special_tokens: list of special tokens
        special_token_id: id of special token
        return_special_tokens_pos: whether to return positions of found special tokens
    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = False,
                 max_seq_length: int = 512,
                 special_tokens: List[str] = None,
                 special_token_id: int = None,
                 return_special_tokens_pos: bool = False,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        self.do_lower_case = do_lower_case
        if Path(vocab_file).is_file():
            # ``AutoTokenizer`` can only be created through ``from_pretrained``; calling its constructor
            # raises. A bare vocabulary file is therefore loaded with the BERT WordPiece tokenizer.
            from transformers import BertTokenizer
            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)
        if special_tokens is not None:
            special_tokens_dict = {'additional_special_tokens': special_tokens}
            self.tokenizer.add_special_tokens(special_tokens_dict)
        self.special_token_id = special_token_id
        self.return_special_tokens_pos = return_special_tokens_pos

    def __call__(self, texts_a: List[str]) -> Union[Any, Tuple[Any, List[int]]]:
        """Tokenize and find special tokens positions.
        Args:
            texts_a: list of texts,
        Returns:
            the tokenizer output (subtoken ids and masks as tensors) padded to the longest text of the batch,
            capped by ``max_seq_length``; when ``return_special_tokens_pos`` is set, a tuple of that output and,
            for every text, the index of the first ``special_token_id`` in its subtoken ids (0 if absent)
        """
        # in case of iterator's strange behaviour
        if isinstance(texts_a, tuple):
            texts_a = list(texts_a)
        if self.do_lower_case:
            texts_a = [text.lower() for text in texts_a]

        # First pass: plain per-text encoding, used to measure lengths and to locate the special token.
        # No padding is requested here: the per-text lengths must reflect the real subtoken counts.
        input_ids_batch = [
            self.tokenizer.encode_plus(text_a, add_special_tokens=True, return_attention_mask=True)["input_ids"]
            for text_a in texts_a
        ]

        max_length = min(max(len(input_ids) for input_ids in input_ids_batch), self.max_seq_length)
        input_features = self.tokenizer(text=texts_a,
                                        add_special_tokens=True,
                                        max_length=max_length,
                                        padding='max_length',
                                        return_attention_mask=True,
                                        truncation=True,
                                        return_tensors='pt')

        if not self.return_special_tokens_pos:
            return input_features

        # index of the first special token in every sequence, falling back to 0 when there is none
        special_tokens_pos = [
            next((n for n, input_id in enumerate(input_ids) if input_id == self.special_token_id), 0)
            for input_ids in input_ids_batch
        ]
        return input_features, special_tokens_pos


@register('torch_squad_transformers_preprocessor')
class TorchSquadTransformersPreprocessor(Component):
    """Tokenize question/context pairs on subtokens, splitting long contexts into several chunks.

    Args:
        vocab_file: path to a vocabulary file, or a name/path accepted by ``AutoTokenizer.from_pretrained``
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        add_token_type_ids: whether to build segment ids when the tokenizer does not return them

    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        add_token_type_ids: whether to build segment ids when the tokenizer does not return them
        tokenizer: transformers tokenizer used for encoding

    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = True,
                 max_seq_length: int = 512,
                 add_token_type_ids: bool = False,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        self.add_token_type_ids = add_token_type_ids
        if Path(vocab_file).is_file():
            # ``AutoTokenizer`` can only be created through ``from_pretrained``; calling its constructor
            # raises. A bare vocabulary file is therefore loaded with the BERT WordPiece tokenizer.
            from transformers import BertTokenizer
            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)

    def _split_context(self, question: str, context: Optional[str]) -> List[Optional[str]]:
        """Split ``context`` into sentence-aligned chunks when it does not fit next to ``question``."""
        if context is None:
            # nothing to tokenize or split; the question is encoded on its own
            return [None]
        context_subtokens = self.tokenizer.tokenize(context)
        question_subtokens = self.tokenizer.tokenize(question)
        # 3 subtokens are reserved, presumably for [CLS] and the two [SEP] tokens
        max_chunk_len = self.max_seq_length - len(question_subtokens) - 3
        if not 0 < max_chunk_len < len(context_subtokens):
            return [context]
        sentences = nltk.sent_tokenize(context)
        number_of_chunks = math.ceil(len(context_subtokens) / max_chunk_len)
        # more chunks than sentences would only produce empty contexts
        number_of_chunks = max(1, min(number_of_chunks, len(sentences)))
        return [' '.join(chunk) for chunk in np.array_split(sentences, number_of_chunks)]

    def _make_token_type_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Build segment ids for tokenizers that do not return them."""
        if not self.add_token_type_ids:
            return torch.tensor([0])
        seq_len = input_ids.size(1)
        # everything up to and including the first separator belongs to segment 0
        sep = torch.where(input_ids == self.tokenizer.sep_token_id)[1][0].item()
        len_a = min(sep + 1, seq_len)
        len_b = seq_len - len_a
        return torch.cat((torch.zeros(1, len_a, dtype=torch.long),
                          torch.ones(1, len_b, dtype=torch.long)), dim=1)

    def __call__(self, question_batch: List[str], context_batch: Optional[List[str]] = None) -> Tuple[
        List[List[InputFeatures]],
        List[List[List[str]]],
        List[List[str]]]:
        """Tokenize and create masks.

        question and context are separated by [SEP] token

        Args:
            question_batch: list of questions,
            context_batch: list of contexts, it could be None, in which case every question is encoded alone

        Returns:
            tuple of three batches; every batch element is a list with one entry per context chunk:
            :class:`transformers.data.processors.utils.InputFeatures` with subtoken ids, subtoken mask and
            segment mask, the subtokens of the encoded pair, and the text of the context chunk
        """

        if context_batch is None:
            context_batch = [None] * len(question_batch)

        input_features_batch, tokens_batch, split_context_batch = [], [], []
        for question, context in zip(question_batch, context_batch):
            context_list = self._split_context(question, context)

            input_features_list, tokens_list = [], []
            for context_elem in context_list:
                encoded_dict = self.tokenizer.encode_plus(
                    text=question, text_pair=context_elem,
                    add_special_tokens=True,
                    max_length=self.max_seq_length,
                    truncation=True,
                    padding='max_length',
                    return_attention_mask=True,
                    return_tensors='pt')
                if 'token_type_ids' not in encoded_dict:
                    encoded_dict['token_type_ids'] = self._make_token_type_ids(encoded_dict['input_ids'])

                curr_features = InputFeatures(input_ids=encoded_dict['input_ids'],
                                              attention_mask=encoded_dict['attention_mask'],
                                              token_type_ids=encoded_dict['token_type_ids'],
                                              label=None)
                input_features_list.append(curr_features)
                tokens_list.append(self.tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0]))

            input_features_batch.append(input_features_list)
            tokens_batch.append(tokens_list)
            split_context_batch.append(context_list)

        return input_features_batch, tokens_batch, split_context_batch


@register('rel_ranking_preprocessor')
class RelRankingPreprocessor(Component):
    """Class for tokenization of text and relation labels
    Args:
        vocab_file: name or path accepted by ``AutoTokenizer.from_pretrained``
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = True,
                 max_seq_length: int = 512,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)

    def __call__(self, questions_batch: List[List[str]], rels_batch: List[List[str]] = None) -> Dict[str, torch.Tensor]:
        """Tokenize questions and relations
        question and relations are separated by [SEP] token
        Args:
            questions_batch: list of texts,
            rels_batch: list of relations list; a list of relations is joined with spaces into one string
        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids`` tensors, padded to the longest
            pair of the batch but never beyond ``max_seq_length``
        """
        # First pass: join the relations and measure the length of every encoded pair.
        lengths, proc_rels_batch = [], []
        for question, rels_list in zip(questions_batch, rels_batch):
            rels_str = " ".join(rels_list) if isinstance(rels_list, list) else rels_list
            encoding = self.tokenizer.encode_plus(text=question, text_pair=rels_str,
                                                  return_attention_mask=True, add_special_tokens=True,
                                                  truncation=True)
            lengths.append(len(encoding["input_ids"]))
            proc_rels_batch.append(rels_str)

        # pad to the longest pair, capped by the configured limit (as PathRankingPreprocessor does)
        max_len = min(max(lengths), self.max_seq_length)

        # Second pass: encode again with truncation and padding to the common length.
        input_ids_batch, attention_mask_batch, token_type_ids_batch = [], [], []
        for question, rels_str in zip(questions_batch, proc_rels_batch):
            encoding = self.tokenizer.encode_plus(text=question, text_pair=rels_str,
                                                  truncation=True, max_length=max_len,
                                                  padding='max_length', return_attention_mask=True)
            input_ids_batch.append(encoding["input_ids"])
            attention_mask_batch.append(encoding["attention_mask"])
            # tokenizers without segment ids get a single-zero placeholder
            token_type_ids_batch.append(encoding["token_type_ids"] if "token_type_ids" in encoding else [0])

        input_features = {"input_ids": torch.LongTensor(input_ids_batch),
                          "attention_mask": torch.LongTensor(attention_mask_batch),
                          "token_type_ids": torch.LongTensor(token_type_ids_batch)}
        return input_features


@register('path_ranking_preprocessor')
class PathRankingPreprocessor(Component):
    """Class for tokenization of questions paired with candidate relation paths
    Args:
        vocab_file: name or path accepted by ``AutoTokenizer.from_pretrained``
        additional_special_tokens: special tokens to register in the tokenizer (used to mark up relations)
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
    """

    def __init__(self,
                 vocab_file: str,
                 additional_special_tokens: List[str] = None,
                 do_lower_case: bool = True,
                 max_seq_length: int = 67,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)
        self.additional_special_tokens = additional_special_tokens
        if self.additional_special_tokens:
            self.tokenizer.add_special_tokens({'additional_special_tokens': additional_special_tokens})

    def _format_rels(self, rels: Union[str, List[str]]) -> str:
        """Render one relation path (a single relation or a list of relations) as a marked-up string."""
        if isinstance(rels, str):
            rels = [rels]
        if len(rels) == 1:
            if self.additional_special_tokens:
                return f"<one_rel> {rels[0]} </one_rel>"
            return rels[0]
        if len(rels) == 2:
            if rels[0] == rels[1]:
                return f"<double> {rels[0]} </double>"
            return f"<first_rel> {rels[0]} <mid> {rels[1]} </second_rel>"
        # paths of any other length are represented by an empty string
        return ""

    def __call__(self, questions_batch: List[str], rels_batch: List[List[List[str]]]):
        """Tokenize every question together with each of its candidate relation paths
        Args:
            questions_batch: list of questions
            rels_batch: for every question, a list of candidate relation paths
        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``; every value is a list (per question)
            of lists (per candidate) of ids, padded to the longest pair of the batch, capped by ``max_seq_length``
        """
        # First pass: format the relations and measure the length of every encoded pair.
        lengths, proc_rels_batch = [], []
        for question, rels_list in zip(questions_batch, rels_batch):
            proc_rels_list = [self._format_rels(rels) for rels in rels_list]
            for rels_str in proc_rels_list:
                encoding = self.tokenizer.encode_plus(text=question, text_pair=rels_str,
                                                      return_attention_mask=True, add_special_tokens=True,
                                                      truncation=True)
                lengths.append(len(encoding["input_ids"]))
            proc_rels_batch.append(proc_rels_list)

        max_len = min(max(lengths), self.max_seq_length)

        # Second pass: encode again with truncation and padding to the common length.
        input_ids_batch, attention_mask_batch, token_type_ids_batch = [], [], []
        for question, rels_list in zip(questions_batch, proc_rels_batch):
            input_ids_list, attention_mask_list, token_type_ids_list = [], [], []
            for rels_str in rels_list:
                encoding = self.tokenizer.encode_plus(text=question, text_pair=rels_str,
                                                      truncation=True, max_length=max_len, add_special_tokens=True,
                                                      padding='max_length', return_attention_mask=True)
                input_ids_list.append(encoding["input_ids"])
                attention_mask_list.append(encoding["attention_mask"])
                # tokenizers without segment ids get a single-zero placeholder
                token_type_ids_list.append(encoding["token_type_ids"] if "token_type_ids" in encoding else [0])
            input_ids_batch.append(input_ids_list)
            attention_mask_batch.append(attention_mask_list)
            token_type_ids_batch.append(token_type_ids_list)

        input_features = {"input_ids": input_ids_batch, "attention_mask": attention_mask_batch,
                          "token_type_ids": token_type_ids_batch}
        return input_features


@register('torch_transformers_ner_preprocessor')
class TorchTransformersNerPreprocessor(Component):
    """
    Takes tokens and splits them into bert subtokens, encodes subtokens with their indices.
    Creates a mask of subtokens (one for the first subtoken, zero for the others).

    If tags are provided, calculates tags for subtokens.

    Args:
        vocab_file: path to a vocabulary file, or a name/path accepted by ``AutoTokenizer.from_pretrained``
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        max_subword_length: replace token with [UNK] if it is split into more subtokens than this
            (defaults to None, which is equal to +infinity)
        token_masking_prob: probability of masking token while training
        provide_subword_tags: output tags for subwords or for words
        subword_mask_mode: subword to select inside word tokens, can be "first" or "last"
            (default="first")
        return_features: if True, returns answer in features format

    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        max_subword_length: max number of subtokens a single token may be split into
        tokenizer: transformers tokenizer used for encoding
    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = False,
                 max_seq_length: int = 512,
                 max_subword_length: int = None,
                 token_masking_prob: float = 0.0,
                 provide_subword_tags: bool = False,
                 subword_mask_mode: str = "first",
                 return_features: bool = False,
                 **kwargs):
        self._re_tokenizer = re.compile(r"[\d]+[\d\.,]+[\d]+|[\w'\.:@]+|[^\w ]")
        self.provide_subword_tags = provide_subword_tags
        self.mode = kwargs.get('mode')
        self.max_seq_length = max_seq_length
        self.max_subword_length = max_subword_length
        self.subword_mask_mode = subword_mask_mode
        if Path(vocab_file).is_file():
            # ``AutoTokenizer`` can only be created through ``from_pretrained``; calling its constructor
            # raises. A bare vocabulary file is therefore loaded with the BERT WordPiece tokenizer.
            from transformers import BertTokenizer
            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)
        self.token_masking_prob = token_masking_prob
        self.return_features = return_features

    def _split_raw_texts(self, texts: List[str]) -> Tuple[List[List[str]], List[List[Tuple[int, int]]]]:
        """Split raw strings into tokens with the regex tokenizer, keeping (start, end) character offsets."""
        tokens_batch, tokens_offsets_batch = [], []
        for text in texts:
            tokens_list, tokens_offsets_list = [], []
            matches = tuple(self._re_tokenizer.finditer(text))
            last = len(matches) - 1
            for i, match in enumerate(matches):
                piece, start, end = match[0], match.start(), match.end()
                # Detach a full stop glued to the last token. A lone '.' is kept as it is:
                # splitting it would produce an empty token in front of the dot.
                if i == last and len(piece) > 1 and piece[-1] == '.':
                    tokens_list.extend((piece[:-1], '.'))
                    tokens_offsets_list.extend(((start, end - 1), (end - 1, end)))
                else:
                    tokens_list.append(piece)
                    tokens_offsets_list.append((start, end))
            tokens_batch.append(tokens_list)
            tokens_offsets_batch.append(tokens_offsets_list)
        return tokens_batch, tokens_offsets_batch

    def __call__(self,
                 tokens: Union[List[List[str]], List[str]],
                 tags: List[List[str]] = None,
                 **kwargs):
        """Split tokens into subtokens and build ids, masks and (optionally) tags.

        Args:
            tokens: batch of token lists, or batch of raw strings (detected by the type of the first element),
                which are tokenized with a regular expression first
            tags: batch of tag lists aligned with ``tokens``; every token is tagged 'O' when omitted

        Returns:
            with ``tags`` and ``provide_subword_tags``: tokens, subword tokens, subword ids, attention mask,
            start-of-word markers, subword tags;
            with ``tags`` otherwise: a dict of tensors if ``return_features`` is set, else tokens,
            subword tokens, subword ids, attention mask, start-of-word markers, tags without 'X';
            without ``tags``: a dict of tensors if ``return_features`` is set, else tokens, subword tokens,
            subword ids, start-of-word markers, attention mask, token offsets (note the different order;
            offsets are empty lists unless raw strings were given)

        Raises:
            RuntimeError: if a subtokenized sequence is longer than ``max_seq_length``
        """
        tokens_offsets_batch = [[] for _ in tokens]
        if isinstance(tokens[0], str):
            tokens, tokens_offsets_batch = self._split_raw_texts(tokens)
        subword_tokens, subword_tok_ids, startofword_markers, subword_tags = [], [], [], []
        for i, toks in enumerate(tokens):
            ys = ['O'] * len(toks) if tags is None else tags[i]
            assert len(toks) == len(ys), \
                f"toks({len(toks)}) should have the same length as ys({len(ys)})"
            sw_toks, sw_marker, sw_ys = \
                self._ner_bert_tokenize(toks,
                                        ys,
                                        self.tokenizer,
                                        self.max_subword_length,
                                        mode=self.mode,
                                        subword_mask_mode=self.subword_mask_mode,
                                        token_masking_prob=self.token_masking_prob)
            if self.max_seq_length is not None:
                if len(sw_toks) > self.max_seq_length:
                    raise RuntimeError(f"input sequence after bert tokenization"
                                       f" shouldn't exceed {self.max_seq_length} tokens.")
            subword_tokens.append(sw_toks)
            subword_tok_ids.append(self.tokenizer.convert_tokens_to_ids(sw_toks))
            startofword_markers.append(sw_marker)
            subword_tags.append(sw_ys)
            # the message reports the subword tag count, i.e. the value actually compared
            assert len(sw_marker) == len(sw_toks) == len(subword_tok_ids[-1]) == len(sw_ys), \
                f"length of sow_marker({len(sw_marker)}), tokens({len(sw_toks)})," \
                f" token ids({len(subword_tok_ids[-1])}) and ys({len(sw_ys)})" \
                f" for tokens = `{toks}` should match"

        subword_tok_ids = zero_pad(subword_tok_ids, dtype=int, padding=0)
        startofword_markers = zero_pad(startofword_markers, dtype=int, padding=0)
        attention_mask = Mask()(subword_tokens)

        if tags is not None:
            if self.provide_subword_tags:
                return tokens, subword_tokens, subword_tok_ids, \
                       attention_mask, startofword_markers, subword_tags
            else:
                nonmasked_tags = [[t for t in ts if t != 'X'] for ts in tags]
                # sanity check only: mismatches are logged, not raised
                for swts, swids, swms, ts in zip(subword_tokens,
                                                 subword_tok_ids,
                                                 startofword_markers,
                                                 nonmasked_tags):
                    if (len(swids) != len(swms)) or (len(ts) != sum(swms)):
                        log.warning('Not matching lengths of the tokenization!')
                        log.warning(f'Tokens len: {len(swts)}\n Tokens: {swts}')
                        log.warning(f'Markers len: {len(swms)}, sum: {sum(swms)}')
                        log.warning(f'Masks: {swms}')
                        log.warning(f'Tags len: {len(ts)}\n Tags: {ts}')
            if self.return_features:
                feature_list = ({'input_ids': torch.Tensor(subword_tok_ids),
                                 'attention_mask': torch.Tensor(attention_mask),
                                 'token_type_ids': torch.Tensor(startofword_markers),
                                 'labels': torch.Tensor(nonmasked_tags)})
                return feature_list
            else:
                return tokens, subword_tokens, subword_tok_ids, \
                    attention_mask, startofword_markers, nonmasked_tags
        if self.return_features:
            feature_list = ({'input_ids': torch.Tensor(subword_tok_ids),
                             'attention_mask': torch.Tensor(attention_mask),
                             'token_type_ids': torch.Tensor(startofword_markers)
                             })
            return feature_list
        else:
            return tokens, subword_tokens, subword_tok_ids, \
                startofword_markers, attention_mask, tokens_offsets_batch

    @staticmethod
    def _ner_bert_tokenize(tokens: List[str],
                           tags: List[str],
                           tokenizer: AutoTokenizer,
                           max_subword_len: int = None,
                           mode: str = None,
                           subword_mask_mode: str = "first",
                           token_masking_prob: float = None) -> Tuple[List[str], List[int], List[str]]:
        """Split every token into subtokens and align word markers and tags with them.

        The sequence is wrapped in '[CLS]' and '[SEP]' (marker 0, tag 'X'). A token that yields no subtokens,
        or more than ``max_subword_len`` of them, becomes a single '[UNK]'. In 'train' mode a token is replaced
        by '[MASK]' subtokens with probability ``token_masking_prob``. Only the first subtoken of a token keeps
        its tag, the others get 'X'; the marker is put on the first or the last subtoken depending on
        ``subword_mask_mode`` and is 0 for tokens tagged 'X'.

        Returns:
            subtokens, start-of-word markers and subtoken tags, all of the same length
        """
        do_masking = (mode == 'train') and (token_masking_prob is not None)
        do_cutting = (max_subword_len is not None)
        tokens_subword = ['[CLS]']
        startofword_markers = [0]
        tags_subword = ['X']
        for token, tag in zip(tokens, tags):
            token_marker = int(tag != 'X')
            subwords = tokenizer.tokenize(token)
            if not subwords or (do_cutting and (len(subwords) > max_subword_len)):
                tokens_subword.append('[UNK]')
                startofword_markers.append(token_marker)
                tags_subword.append(tag)
            else:
                if do_masking and (random.random() < token_masking_prob):
                    tokens_subword.extend(['[MASK]'] * len(subwords))
                else:
                    tokens_subword.extend(subwords)
                if subword_mask_mode == "last":
                    startofword_markers.extend([0] * (len(subwords) - 1) + [token_marker])
                else:
                    startofword_markers.extend([token_marker] + [0] * (len(subwords) - 1))
                tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))

        tokens_subword.append('[SEP]')
        startofword_markers.append(0)
        tags_subword.append('X')
        return tokens_subword, startofword_markers, tags_subword


@register('torch_bert_ranker_preprocessor')
class TorchBertRankerPreprocessor(TorchTransformersPreprocessor):
    """Tokenize text to sub-tokens, encode sub-tokens with their indices, create tokens and segment masks for ranking.

    Builds features for a pair of context with each of the response candidates.
    """

    def __call__(self, batch: List[List[str]]) -> List[List[InputFeatures]]:
        """Tokenize and create masks.

        Args:
            batch: list of samples; the first element of every sample is the context and the remaining
                elements are the response candidates. A flat list of strings is treated as a single sample.

        Returns:
            list with one entry per response candidate position (a single entry when samples hold only
            a context); every entry is a list with one
            :class:`transformers.data.processors.utils.InputFeatures` per sample.
        """

        if isinstance(batch[0], str):
            batch = [batch]

        # the context is the first element of every sample, with or without response candidates
        contexts = [el[0] for el in batch]
        if len(batch[0]) == 1:
            # context-only samples: encode every context on its own
            response_columns = [[None] * len(batch)]
        else:
            response_columns = [[el[i] for el in batch] for i in range(1, len(batch[0]))]

        input_features = []
        for responses in response_columns:
            sub_list_features = []
            for context, response in zip(contexts, responses):
                # truncation is requested explicitly; with the deprecated `pad_to_max_length`
                # it was only switched on implicitly by passing `max_length`
                encoded_dict = self.tokenizer.encode_plus(
                    text=context, text_pair=response, add_special_tokens=True, max_length=self.max_seq_length,
                    truncation=True, padding='max_length', return_attention_mask=True, return_tensors='pt')

                curr_features = InputFeatures(input_ids=encoded_dict['input_ids'],
                                              attention_mask=encoded_dict['attention_mask'],
                                              token_type_ids=encoded_dict['token_type_ids'],
                                              label=None)
                sub_list_features.append(curr_features)
            input_features.append(sub_list_features)

        return input_features


@dataclass
class RecordFlatExample:
    """Flattened ReCoRD example: one candidate `entity` together with its `label`
    and the `probability` predicted for it.
    """
    # index shared by all flat examples that are later merged into one nested example
    index: str
    # label of this candidate
    label: int
    # predicted probability for this candidate
    probability: float
    # candidate entity text
    entity: str


@dataclass
class RecordNestedExample:
    """Nested ReCoRD example: the single predicted entity for an `index` plus the list
    of correct answers.
    """
    # index identifying the nested example
    index: str
    # entity chosen as the prediction
    prediction: str
    # entities that are correct answers
    answers: List[str]


@register("torch_record_postprocessor")
class TorchRecordPostprocessor:
    """Combines flat classification examples into nested examples. When called returns nested examples
    that weren't previously returned during current iteration over examples.

    Args:
        is_binary: signifies whether the classifier uses binary classification head
    Attributes:
        record_example_accumulator: underlying accumulator that transforms flat examples
        total_examples: overall number of flat examples that must be processed during current iteration
        is_binary: whether `y_pred_probas` already holds a single probability per example
    """

    def __init__(self, is_binary: bool = False, *args, **kwargs):
        self.record_example_accumulator: RecordExampleAccumulator = RecordExampleAccumulator()
        self.total_examples: Optional[int] = None
        self.is_binary: bool = is_binary

    def __call__(self,
                 idx: List[str],
                 y: List[int],
                 y_pred_probas: np.ndarray,
                 entities: List[str],
                 num_examples: List[int],
                 *args,
                 **kwargs) -> List[RecordNestedExample]:
        """Postprocessor call

        Args:
            idx: list of string indices
            y: list of integer labels
            y_pred_probas: array of predicted probabilities
            entities: list of candidate entities, aligned with `idx`
            num_examples: list of duplicated total numbers of examples

        Returns:
            List[RecordNestedExample]: processed but not previously returned examples (may be empty in some cases)
        """
        if isinstance(y_pred_probas, list):
            # drop missing entries before converting to an array
            y_pred_probas = [k for k in y_pred_probas if k is not None]
            y = [k for k in y if k is not None]
            y_pred_probas = np.array(y_pred_probas)
        # `len` works for lists and arrays alike, unlike comparing against `[]`
        if len(y) == 0:
            return []
        if not self.is_binary:
            # if we have outputs for both classes `0` and `1`
            y_pred_probas = y_pred_probas[:, 1]
        if self.total_examples != num_examples[0]:
            # start over if num_examples is different
            # implying that a different split is being evaluated
            self.reset_accumulator()
            self.total_examples = num_examples[0]

        ready_examples: List[RecordNestedExample] = []
        for index, label, probability, entity in zip(idx, y, y_pred_probas, entities):
            self.record_example_accumulator.add_flat_example(index, label, probability, entity)
            self.record_example_accumulator.collect_nested_example(index)
            if self.record_example_accumulator.examples_processed >= self.total_examples:
                # All examples were processed: hand out what the accumulator still holds *before*
                # replacing it, otherwise the examples completed in this batch would be lost.
                ready_examples.extend(self.record_example_accumulator.return_examples())
                self.reset_accumulator()

        ready_examples.extend(self.record_example_accumulator.return_examples())
        return ready_examples

    def reset_accumulator(self):
        """Reinitialize the underlying accumulator from scratch
        """
        self.record_example_accumulator = RecordExampleAccumulator()


class RecordExampleAccumulator:
    """ReCoRD example accumulator

    Attributes:
        examples_processed: total number of examples processed so far
        record_counter: number of examples processed for each index
        nested_len: expected number of flat examples for a given index
        flat_examples: stores flat examples
        nested_examples: stores nested examples
        collected_indices: indices of collected nested examples
        returned_indices: indices that have been returned
    """

    def __init__(self):
        self.examples_processed: int = 0
        # builtin factories (instead of lambdas) keep the accumulator picklable
        self.record_counter: Dict[str, int] = defaultdict(int)
        self.nested_len: Dict[str, int] = dict()
        self.flat_examples: Dict[str, List[RecordFlatExample]] = defaultdict(list)
        self.nested_examples: Dict[str, RecordNestedExample] = dict()
        self.collected_indices: Set[str] = set()
        self.returned_indices: Set[str] = set()

    def add_flat_example(self, index: str, label: int, probability: float, entity: str):
        """Add a single flat example to the accumulator

        Args:
            index: example index
            label: example label (`-1` means that label is not available)
            probability: predicted probability
            entity: candidate entity
        """
        self.flat_examples[index].append(RecordFlatExample(index, label, probability, entity))
        if index not in self.nested_len:
            self.nested_len[index] = self.get_expected_len(index)
        self.record_counter[index] += 1
        self.examples_processed += 1

    def ready_to_nest(self, index: str) -> bool:
        """Checks whether all the flat examples for a given index were collected at this point.
        Args:
            index: the index of the candidate nested example
        Returns:
            bool: indicates whether the collected flat examples can be combined into a nested example
        """
        return self.record_counter[index] == self.nested_len[index]

    def collect_nested_example(self, index: str):
        """Combines a list of flat examples denoted by the given index into a single nested example
        provided that all the necessary flat examples have been collected by this time.
        Args:
            index: the index of the candidate nested example
        """
        if not self.ready_to_nest(index):
            return

        example_list: List[RecordFlatExample] = self.flat_examples[index]
        probabilities: List[float] = [example.probability for example in example_list]
        # every candidate labelled `1` is a correct answer
        answers: List[str] = [example.entity for example in example_list if example.label == 1]

        # the candidate with the highest predicted probability becomes the prediction
        prediction = example_list[int(np.argmax(probabilities))].entity

        self.nested_examples[index] = RecordNestedExample(index, prediction, answers)
        self.collected_indices.add(index)

    def return_examples(self) -> List[RecordNestedExample]:
        """Determines which nested examples were not yet returned during the current evaluation
        cycle and returns them. May return an empty list if there are no new nested examples
        to return yet.

        Examples are returned in the order in which they were collected, so the output does not
        depend on set iteration order.

        Returns:
            List[RecordNestedExample]: zero or more nested examples
        """
        examples_to_return: List[RecordNestedExample] = [
            example for index, example in self.nested_examples.items()
            if index in self.collected_indices and index not in self.returned_indices
        ]
        self.returned_indices.update(example.index for example in examples_to_return)
        log.debug(f'Returning {examples_to_return}')
        return examples_to_return

    @staticmethod
    def get_expected_len(index: str) -> int:
        """
        Calculates the total number of flat examples denoted by the given index
        (the integer after the last ``-`` in the index).
        Args:
            index: the index to calculate the number of examples for
        Returns:
            int: the expected number of examples for this index
        """
        return int(index.split("-")[-1])


================================================
FILE: deeppavlov/models/preprocessors/transformers_preprocessor.py
================================================
# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from logging import getLogger
from typing import List, Union, Tuple

import numpy as np
from transformers import BertTokenizer

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

log = getLogger(__name__)


def _pad(data: List[List[Union[int, float]]], value: Union[int, float] = 0):
    max_len = max(map(len, data))
    res = np.ones([len(data), max_len], dtype=type(value)) * value
    for i, item in enumerate(data):
        res[i][:len(item)] = item
    return res


@register('transformers_bert_preprocessor')
class TransformersBertPreprocessor(Component):
    """Splits texts (or already tokenized texts) into BERT word pieces and encodes them.

    Args:
        vocab_file: path to the vocabulary file, expanded with ``expand_path``
        do_lower_case: forwarded to ``BertTokenizer``
        max_seq_length: maximal allowed number of subtokens, ``[CLS]`` and ``[SEP]`` included
        tokenize_chinese_chars: forwarded to ``BertTokenizer``
    """

    def __init__(self, vocab_file: str,
                 do_lower_case: bool = False,
                 max_seq_length: int = 512,
                 tokenize_chinese_chars: bool = True,
                 **kwargs):
        self.max_seq_length = max_seq_length
        self.tokenizer = BertTokenizer(vocab_file=expand_path(vocab_file),
                                       do_lower_case=do_lower_case,
                                       tokenize_chinese_chars=tokenize_chinese_chars)

    def __call__(self, tokens_batch: Union[List[str], List[List[str]]]) ->\
            Tuple[List[List[str]], List[List[str]], np.ndarray, np.ndarray, np.ndarray]:
        """Tokenize the batch into word pieces and build padded model inputs.

        Args:
            tokens_batch: batch of raw sentences or batch of token lists

        Returns:
            tuple of (tokens, subtokens, padded subtoken ids, padded start-of-word markers,
            padded attention masks)

        Raises:
            RuntimeError: when a sample yields more than ``max_seq_length`` subtokens
        """
        if isinstance(tokens_batch[0], str):  # raw sentences: run the basic tokenizer first
            basic_tokenize = self.tokenizer.basic_tokenizer.tokenize
            special_tokens = self.tokenizer.all_special_tokens
            tokens_batch = [basic_tokenize(sentence, special_tokens) for sentence in tokens_batch]

        wordpiece_tokenize = self.tokenizer.wordpiece_tokenizer.tokenize
        subtokens_batch = []
        startofword_markers_batch = []
        for tokens in tokens_batch:
            subtokens = ['[CLS]']
            markers = [0]
            for token in tokens:
                pieces = list(wordpiece_tokenize(token))
                if pieces:
                    subtokens.extend(pieces)
                    # only the first piece of a token is flagged as a word start
                    markers.extend([1] + [0] * (len(pieces) - 1))
            subtokens.append('[SEP]')
            markers.append(0)

            if len(subtokens) > self.max_seq_length:
                raise RuntimeError(f"input sequence after bert tokenization"
                                   f" cannot exceed {self.max_seq_length} tokens.")

            subtokens_batch.append(subtokens)
            startofword_markers_batch.append(markers)

        # special tokens were inserted by hand above, hence ``add_special_tokens=False``
        encoded = self.tokenizer.batch_encode_plus([[subtokens, None] for subtokens in subtokens_batch],
                                                   add_special_tokens=False)

        input_ids = _pad(encoded['input_ids'], value=self.tokenizer.pad_token_id)
        startofword_markers = _pad(startofword_markers_batch)
        attention_mask = _pad(encoded['attention_mask'])
        return tokens_batch, subtokens_batch, input_ids, startofword_markers, attention_mask


================================================
FILE: deeppavlov/models/ranking/__init__.py
================================================


================================================
FILE: deeppavlov/models/ranking/metrics.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

from deeppavlov.core.common.metrics_registry import register_metric


@register_metric('rank_response')
def rank_response(y_true, y_pred):
    """Average position of candidate ``0`` when candidates are sorted by decreasing score.

    Args:
        y_true: not used
        y_pred: per-sample lists of candidate scores

    Returns:
        mean zero-based rank of the candidate with index ``0``
    """
    num_examples = float(len(y_pred))
    # candidate indices of every sample, best-scored first
    ranking = np.flip(np.argsort(np.array(y_pred), -1), -1)
    total_rank = 0
    for row in ranking:
        total_rank += next((position for position, candidate in enumerate(row) if candidate == 0), 0)
    return float(total_rank) / num_examples


@register_metric('r@1_insQA')
def r_at_1_insQA(y_true, y_pred):
    """Recall@1 metric: ``recall_at_k_insQA`` evaluated with ``k=1``."""
    top_k = 1
    return recall_at_k_insQA(y_true, y_pred, k=top_k)


def recall_at_k_insQA(y_true, y_pred, k):
    """Share of samples with at least one correct candidate among the ``k`` best-scored ones.

    A candidate of sample ``i`` counts as correct when its index is smaller than ``y_true[i]``.

    Args:
        y_true: per-sample threshold on the candidate index (indices ``0 .. y_true[i] - 1`` are hits)
        y_pred: per-sample lists of candidate scores
        k: number of best-scored candidates to inspect

    Returns:
        fraction of samples having a hit among their top ``k`` candidates
    """
    scores = np.array(y_pred)
    # candidate indices ordered by decreasing score, truncated to the k best
    top_k = np.flip(np.argsort(scores, -1), -1)[:, :k]
    # one threshold per sample, broadcast over the k positions; a single vectorised comparison
    # replaces building ``np.arange(label)`` for every (sample, position) pair
    thresholds = np.expand_dims(np.asarray(y_true)[:top_k.shape[0]], axis=1)
    hits = top_k < thresholds
    return np.mean(hits.any(axis=-1).astype(float))


================================================
FILE: deeppavlov/models/relation_extraction/__init__.py
================================================


================================================
FILE: deeppavlov/models/relation_extraction/losses.py
================================================
"""
This code is copied from ATLOP algorithm (https://github.com/wzhouad/ATLOP/blob/main/losses.py)
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor


class ATLoss(nn.Module):
    """Adaptive-thresholding loss: class ``0`` serves as the threshold (TH) class that gold classes
    are ranked above and the remaining classes are ranked below.
    """

    def __init__(self):
        super().__init__()

    def forward(self, logits: Tensor, labels: Tensor) -> Tensor:
        """Compute the mean loss over the batch.

        Args:
            logits: predicted class scores (shape: batch size x num classes)
            labels: one-hot encoded true labels (shape: batch size x num classes); not modified

        Returns:
            scalar tensor with the loss averaged over the batch
        """
        # Work on a copy: column 0 is zeroed below and the caller's tensor must stay intact.
        labels = labels.clone()

        # TH label
        th_label = torch.zeros_like(labels, dtype=torch.float).to(labels)
        th_label[:, 0] = 1.0
        labels[:, 0] = 0.0

        p_mask = labels + th_label          # = 1 for the gold labels + for 0 (threshold) class, 0 otherwise
        n_mask = 1 - labels         # = 0 for the gold labels, 1 otherwise

        # Rank positive classes to TH
        # logits are kept for gold labels + 0 class, all others are lowered by 1e30 (masked out of the softmax)
        logit1 = logits - (1 - p_mask) * 1e30
        loss1 = -(F.log_softmax(logit1, dim=-1) * labels).sum(1)

        # Rank TH to negative classes
        # logits are kept for non-gold classes and the 0 class, gold ones are lowered by 1e30
        logit2 = logits - (1 - n_mask) * 1e30
        loss2 = -(F.log_softmax(logit2, dim=-1) * th_label).sum(1)

        # Sum two parts
        loss = loss1 + loss2
        loss = loss.mean()
        return loss

    def get_label(self, logits: Tensor, num_labels: int = -1, threshold: float = None) -> Tensor:
        """Convert class scores to multi-hot predictions.

        Args:
            logits: predicted class scores (shape: batch size x num classes)
            num_labels: if positive, at most this many top-scored classes are kept per sample
            threshold: fixed score threshold; when ``None`` the score of class 0 of every sample is used

        Returns:
            tensor shaped like ``logits`` with 1.0 for predicted classes; column 0 is 1.0 only for
            samples without any other predicted class
        """
        if threshold is not None:
            # `is not None` keeps an explicit 0.0 threshold; dtype/device follow `logits`
            th_logit = torch.full((len(logits), 1), threshold, dtype=logits.dtype, device=logits.device)
        else:
            th_logit = logits[:, 0].unsqueeze(1)        # score of class 0 (threshold class) for every sample
        output = torch.zeros_like(logits).to(logits)
        mask = (logits > th_logit)    # for each sample: True where the class score exceeds the threshold
        if num_labels > 0:
            top_v, _ = torch.topk(logits, num_labels, dim=1)        # `num_labels` largest scores; sorted
            top_v = top_v[:, -1]            # the smallest of the kept scores for each sample
            mask = (logits >= top_v.unsqueeze(1)) & mask    # additionally require being among the top scores
        output[mask] = 1.0
        output[:, 0] = (output.sum(1) == 0.).to(logits)         # no relation if no label matched
        return output


================================================
FILE: deeppavlov/models/relation_extraction/relation_extraction_bert.py
================================================
from logging import getLogger
from typing import List, Optional, Union

import numpy as np
import torch

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.torch_model import TorchModel
from deeppavlov.models.classifiers.re_bert import BertWithAdaThresholdLocContextPooling

log = getLogger(__name__)


@register('re_classifier')
class REBertModel(TorchModel):

    def __init__(
            self,
            n_classes: int,
            num_ner_tags: int,
            pretrained_bert: str = None,
            return_probas: bool = False,
            threshold: Optional[float] = None,
            **kwargs
    ) -> None:
        """
        Transformer-based model on PyTorch for relation extraction. It predicts a relation hold between entities in a
        text sample (one or several sentences).
        Args:
            n_classes: number of output classes
            num_ner_tags: number of NER tags
            pretrained_bert: key title of pretrained Bert model (e.g. "bert-base-uncased")
            return_probas: set this to `True` to get a one-hot vector of the best-scored relation
                (the no-relation class excluded) instead of the labels produced by the model
            threshold: manually set value for defining the positively predicted classes (instead of adaptive one)
        Raises:
            ConfigError: if `n_classes` is 0
        """
        self.n_classes = n_classes
        self.return_probas = return_probas

        if self.n_classes == 0:
            raise ConfigError("Please provide a valid number of classes.")

        model = BertWithAdaThresholdLocContextPooling(
            n_classes=self.n_classes,
            pretrained_bert=pretrained_bert,
            bert_tokenizer_config_file=pretrained_bert,
            num_ner_tags=num_ner_tags,
            threshold=threshold,
        )

        super().__init__(model, **kwargs)

    def train_on_batch(
            self, input_ids: List, attention_mask: List, entity_pos: List, entity_tags: List, labels: List
    ) -> float:
        """
        Trains the relation extraction BERT model on the given batch.
        Returns:
            loss value on the batch.
        """

        _input = {
            'input_ids': torch.LongTensor(input_ids).to(self.device),
            'attention_mask': torch.LongTensor(attention_mask).to(self.device),
            'entity_pos': entity_pos,
            'ner_tags': entity_tags,
            'labels': labels
        }

        self.model.train()
        self.model.zero_grad()
        self.optimizer.zero_grad()      # zero the parameter gradients

        hidden_states = self.model(**_input)
        loss = hidden_states[0]         # with labels supplied, the first output is used as the loss
        self._make_step(loss)

        return loss.item()

    def __call__(
            self, input_ids: List, attention_mask: List, entity_pos: List, entity_tags: List
    ) -> Union[List[int], List[np.ndarray]]:
        """ Get model predictions using features as input

        Returns:
            array with one row per sample: a one-hot vector of the best-scored class other than class 0
            if `return_probas` is set, otherwise the label output of the model
        """

        self.model.eval()

        _input = {
            'input_ids': torch.LongTensor(input_ids).to(self.device),
            'attention_mask': torch.LongTensor(attention_mask).to(self.device),
            'entity_pos': entity_pos,
            'ner_tags': entity_tags
        }

        with torch.no_grad():
            indices, probas = self.model(**_input)

        if self.return_probas:
            pred = probas.cpu().numpy()
            pred[np.isnan(pred)] = 0
            pred[:, 0] = 0.0        # eliminate no_relation predictions
            new_pred = np.argmax(pred, axis=1)
            # Build independent rows: `[[0.0] * n] * m` would alias a single row list,
            # so every sample would receive the predictions of all the others.
            one_hot = np.zeros((len(new_pred), self.n_classes))
            one_hot[np.arange(len(new_pred)), new_pred] = 1.0
            pred = one_hot
        else:
            pred = indices.cpu().numpy()
            pred[np.isnan(pred)] = 0
        return pred


================================================
FILE: deeppavlov/models/sklearn/__init__.py
================================================
from .sklearn_component import *


================================================
FILE: deeppavlov/models/sklearn/sklearn_component.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import pickle
from logging import getLogger
from pathlib import Path
from typing import List, Tuple, Union, Callable

import numpy as np
from scipy.sparse import issparse, csr_matrix
from scipy.sparse import spmatrix
from scipy.sparse import vstack, hstack

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register, cls_from_str
from deeppavlov.core.models.estimator import Estimator

log = getLogger(__name__)


@register("sklearn_component")
class SklearnComponent(Estimator):
    """
    Class implements wrapper for sklearn components for feature extraction,
    feature selection, classification, regression etc.

    Args:
        model_class: string with full name of sklearn model to use, e.g. ``sklearn.linear_model:LogisticRegression``
        save_path: save path for model, e.g. full name ``model_path/model.pkl`` \
            or prefix ``model_path/model`` (still model will be saved to ``model_path/model.pkl``)
        load_path: load path for model, e.g. full name ``model_path/model.pkl`` \
            or prefix ``model_path/model`` (still model will be loaded from ``model_path/model.pkl``)
        infer_method: string name of class method to use for inferring model, \
            e.g. ``predict``, ``predict_proba``, ``predict_log_proba``, ``transform``
        ensure_list_output: whether to ensure that output for each sample is iterable (but not string)
        kwargs: dictionary with parameters for the sklearn model

    Attributes:
        model: sklearn model instance
        model_class: string with full name of sklearn model to use, e.g. ``sklearn.linear_model:LogisticRegression``
        model_params: dictionary with parameters for the sklearn model without pipe parameters
        pipe_params: dictionary with parameters for pipe: ``in``, ``out``, ``fit_on``, ``main``, ``name``
        save_path: save path for model, e.g. full name ``model_path/model.pkl`` \
            or prefix ``model_path/model`` (still model will be saved to ``model_path/model.pkl``)
        load_path: load path for model, e.g. full name ``model_path/model.pkl`` \
            or prefix ``model_path/model`` (still model will be loaded from ``model_path/model.pkl``)
        infer_method: bound method of the model used for inference, \
            e.g. ``predict``, ``predict_proba``, ``predict_log_proba``, ``transform``
        ensure_list_output: whether to ensure that output for each sample is iterable (but not string)
    """

    def __init__(self, model_class: str,
                 save_path: Union[str, Path] = None,
                 load_path: Union[str, Path] = None,
                 infer_method: str = "predict",
                 ensure_list_output: bool = False,
                 **kwargs) -> None:
        """
        Initialize component with given parameters
        """

        super().__init__(save_path=save_path, load_path=load_path, **kwargs)
        self.model_class = model_class
        self.model_params = kwargs
        self.model = None
        self.ensure_list_output = ensure_list_output
        self.pipe_params = {}
        # pipe parameters are not model parameters: move them out of ``model_params``
        for required in ["in", "out", "fit_on", "main", "name"]:
            self.pipe_params[required] = self.model_params.pop(required, None)

        self.load()
        self.infer_method = getattr(self.model, infer_method)

    def fit(self, *args) -> None:
        """
        Fit model on the given data

        Args:
            *args: list of x-inputs and, optionally, one y-input (the last one) to fit on.
                Possible input (x0, ..., xK, y) or (x0, ..., xK)
                where K is the number of input data elements (the length of list ``in`` from config). \
                In case of several inputs (K > 1) input features will be stacked. \
                For example, one has x0: (n_samples, n_features0), ..., xK: (n_samples, n_featuresK), \
                then model will be trained on x: (n_samples, n_features0 + ... + n_featuresK).

        Returns:
            None
        """
        n_inputs = len(self.pipe_params["in"]) if isinstance(self.pipe_params["in"], list) else 1
        x_features = self.compose_input_data(args[:n_inputs])
        if len(args) > n_inputs:
            y_ = np.squeeze(np.array(args[-1]))
        else:
            y_ = None

        try:
            log.info("Fitting model {}".format(self.model_class))
            self.model.fit(x_features, y_)
        except (TypeError, ValueError):
            # ``except TypeError or ValueError`` would catch ``TypeError`` only.
            # Retry with the other matrix representation (dense <-> sparse).
            if issparse(x_features):
                log.info("Converting input for model {} to dense array".format(self.model_class))
                self.model.fit(x_features.todense(), y_)
            else:
                log.info("Converting input for model {} to sparse array".format(self.model_class))
                self.model.fit(csr_matrix(x_features), y_)

        return

    def __call__(self, *args):
        """
        Infer on the given data according to given in the config infer method, \
            e.g. ``"predict", "predict_proba", "transform"``

        Args:
            *args: list of inputs

        Returns:
            predictions, e.g. list of labels, array of probability distribution, sparse array of vectorized samples
        """
        x_features = self.compose_input_data(args)

        try:
            predictions = self.infer_method(x_features)
        except (TypeError, ValueError):
            # retry with the other matrix representation (dense <-> sparse)
            if issparse(x_features):
                log.debug("Converting input for model {} to dense array".format(self.model_class))
                predictions = self.infer_method(x_features.todense())
            else:
                log.debug("Converting input for model {} to sparse array".format(self.model_class))
                predictions = self.infer_method(csr_matrix(x_features))

        if isinstance(predictions, list):
            #  ``predict_proba`` sometimes returns list of n_outputs (each output corresponds to a label)
            #  but we will return (n_samples, n_labels)
            #  where each value is a probability of a sample to belong with the label
            predictions_ = [[predictions[j][i][1] for j in range(len(predictions))] for i in range(x_features.shape[0])]
            predictions = np.array(predictions_)

        if self.ensure_list_output and len(predictions.shape) == 1:
            predictions = predictions.reshape(-1, 1)

        if issparse(predictions):
            return predictions
        else:
            return predictions.tolist()

    def init_from_scratch(self) -> None:
        """
        Initialize ``self.model`` as some sklearn model from scratch with given in ``self.model_params`` parameters.

        Returns:
            None

        Raises:
            ConfigError: if ``self.model_class`` cannot be resolved
        """
        log.debug("Initializing model {} from scratch".format(self.model_class))
        model_function = cls_from_str(self.model_class)

        if model_function is None:
            raise ConfigError("Model with {} model_class was not found.".format(self.model_class))

        given_params = {}
        if self.model_params:
            available_params = self.get_function_params(model_function)
            for param_name in self.model_params.keys():
                if param_name in available_params:
                    try:
                        # a parameter value may itself name a class/function to be resolved
                        given_params[param_name] = cls_from_str(self.model_params[param_name])
                    except (AttributeError, ValueError, ConfigError):
                        given_params[param_name] = self.model_params[param_name]

        self.model = model_function(**given_params)
        return

    def load(self, fname: str = None) -> None:
        """
        Initialize ``self.model`` as some sklearn model from saved re-initializing ``self.model_params`` parameters. \
            If in new given parameters ``warm_start`` is set to True and given model admits ``warm_start`` parameter, \
            model will be initialized from saved with opportunity to continue fitting.

        Args:
            fname: string name of path to model to load from

        Returns:
            None
        """
        if fname is None:
            fname = self.load_path

        fname = Path(fname).with_suffix('.pkl')

        if fname.exists():
            log.debug("Loading model {} from {}".format(self.model_class, str(fname)))
            # NOTE(security): unpickling executes code stored in the file -- load trusted model files only.
            with open(fname, "rb") as f:
                self.model = pickle.load(f)

            warm_start = self.model_params.get("warm_start", None)
            self.model_params = {param: getattr(self.model, param) for param in self.get_class_attributes(self.model)}
            # keep the documented ``module:ClassName`` format
            self.model_class = self.model.__module__ + ":" + self.model.__class__.__name__
            log.debug("Model {} loaded  with parameters".format(self.model_class))

            if warm_start and "warm_start" in self.model_params.keys():
                self.model_params["warm_start"] = True
                log.debug("Fitting of loaded model can be continued because `warm_start` is set to True")
            else:
                log.warning("Fitting of loaded model can not be continued. Model can be fitted from scratch."
                            "If one needs to continue fitting, please, look at `warm_start` parameter")
        else:
            log.warning("Cannot load model from {}".format(str(fname)))
            self.init_from_scratch()

        return

    def save(self, fname: str = None) -> None:
        """
        Save ``self.model`` to the file from ``fname`` or, if not given, ``self.save_path``. \
            The ``.pkl`` suffix is enforced on the resulting path \
            (``Path(fname).with_suffix('.pkl')``).

        Args:
            fname:  string name of path to model to save to

        Returns:
            None
        """
        if fname is None:
            fname = self.save_path

        fname = Path(fname).with_suffix('.pkl')

        log.info("Saving model to {}".format(str(fname)))
        with open(fname, "wb") as f:
            pickle.dump(self.model, f, protocol=4)
        return

    @staticmethod
    def compose_input_data(x: List[Union[Tuple[Union[np.ndarray, list, spmatrix, str]],
                                         List[Union[np.ndarray, list, spmatrix, str]],
                                         np.ndarray, spmatrix]]) -> Union[spmatrix, np.ndarray]:
        """
        Stack given list of different types of inputs to the one matrix. If one of the inputs is a sparse matrix, \
            then output will be also a sparse matrix

        Args:
            x: list of data elements

        Returns:
            sparse or dense array of stacked data

        Raises:
            ConfigError: if an input is empty or holds elements of an unsupported type
        """
        x_features = []
        for item in x:
            # The emptiness check has to cover tuples and lists as well as arrays; without the
            # grouping ``and`` binds tighter than ``or`` and only arrays were checked.
            non_empty_sequence = isinstance(item, (tuple, list, np.ndarray)) and len(item)
            non_empty_sparse = issparse(item) and item.shape[0]
            if not (non_empty_sequence or non_empty_sparse):
                raise ConfigError("Input vectors cannot be empty")

            first = item[0]
            if issparse(first):
                x_features.append(vstack(list(item)))
            elif isinstance(first, (np.ndarray, list)):
                x_features.append(np.vstack(list(item)))
            elif isinstance(first, str):
                x_features.append(np.array(item))
            else:
                raise ConfigError('Not implemented this type of vectors')

        # a single sparse input makes the whole stacked result sparse
        if any(issparse(inp) for inp in x_features):
            x_features = hstack(list(x_features))
        else:
            x_features = np.hstack(list(x_features))

        return x_features

    @staticmethod
    def get_function_params(f: Callable) -> List[str]:
        """
        Get list of names of given function's parameters

        Args:
            f: function

        Returns:
            list of names of given function's parameters, positional as well as keyword-only
        """
        spec = inspect.getfullargspec(f)
        # keyword-only parameters (declared after ``*``) are listed separately from ``args``
        return spec.args + spec.kwonlyargs

    @staticmethod
    def get_class_attributes(cls: type) -> List[str]:
        """
        Get list of names of given class' attributes

        Args:
            cls: class (or any object with ``__dict__``)

        Returns:
            list of names of given class' attributes
        """
        return list(cls.__dict__.keys())


================================================
FILE: deeppavlov/models/spelling_correction/__init__.py
================================================


================================================
FILE: deeppavlov/models/spelling_correction/brillmoore/__init__.py
================================================
from .error_model import ErrorModel


================================================
FILE: deeppavlov/models/spelling_correction/brillmoore/error_model.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import itertools
from collections import defaultdict, Counter
from heapq import heappop, heappushpop, heappush
from logging import getLogger
from math import log, exp
from typing import List, Iterable, Tuple

from tqdm import tqdm

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.estimator import Estimator
from deeppavlov.vocabs.typos import StaticDictionary

logger = getLogger(__name__)


@register('spelling_error_model')
class ErrorModel(Estimator):
    """Component that uses statistics based error model to find best candidates in a static dictionary.
    Based on An Improved Error Model for Noisy Channel Spelling Correction by Eric Brill and Robert C. Moore

    Args:
        dictionary: a :class:`~deeppavlov.vocabs.typos.StaticDictionary` object
        window: maximum context window size
        candidates_count: maximum number of replacement candidates to return for every token in the input

    Attributes:
        costs: logarithmic probabilities of character sequences replacements
        dictionary: a :class:`~deeppavlov.vocabs.typos.StaticDictionary` object
        window: maximum context window size
        candidates_count: maximum number of replacement candidates to return for every token in the input
    """

    def __init__(self, dictionary: StaticDictionary, window: int = 1, candidates_count: int = 1, *args, **kwargs):
        """Set up the cost table, pick the search routine and load stored costs.

        Args:
            dictionary: object providing ``alphabet``, ``words_set`` and ``words_trie``
            window: maximum context window size; ``0`` selects the single-character search routine
            candidates_count: maximum number of replacement candidates to return for every token
            *args: forwarded to the parent constructor
            **kwargs: forwarded to the parent constructor
        """
        super().__init__(*args, **kwargs)
        # Every replacement that was never seen has log-probability -inf.
        self.costs = defaultdict(itertools.repeat(float('-inf')).__next__)
        self.dictionary = dictionary
        self.window = window
        self.find_candidates = (self._find_candidates_window_0 if self.window == 0
                                else self._find_candidates_window_n)

        # Keeping the empty string, a word boundary marker or an alphabet symbol
        # unchanged is free (log-probability 0).
        for marker in ('', '⟬', '⟭'):
            self.costs[(marker, marker)] = log(1)
        for symbol in self.dictionary.alphabet:
            self.costs[(symbol, symbol)] = log(1)

        self.load()

        self.candidates_count = candidates_count

    def _find_candidates_window_0(self, word, prop_threshold=1e-6):
        """Best-first search for dictionary words using single-character edits only.

        Args:
            word: token to correct; it is lowercased, 'ё' is replaced with 'е' and
                the result is wrapped in the '⟬' / '⟭' boundary markers
            prop_threshold: probability threshold; its logarithm is the lowest score
                a prefix or a candidate may have to be kept

        Returns:
            list of ``(candidate, score)`` pairs sorted by descending score, limited to
            candidates whose score is above ``log(prop_threshold)``
        """
        threshold = log(prop_threshold)
        # d[prefix][i] is the best score of aligning `prefix` with the first i characters of `word`
        d = {}
        # heap of (negated best score, collection of prefixes to expand)
        prefixes_heap = [(0, {''})]
        # fixed-size min-heap of (score, word); candidates[0] is the worst candidate kept so far
        candidates = [(float('-inf'), '') for _ in range(self.candidates_count)]
        word = '⟬{}⟭'.format(word.lower().replace('ё', 'е'))
        word_len = len(word) + 1
        # stop as soon as the most promising prefix cannot beat the worst kept candidate
        while prefixes_heap and -prefixes_heap[0][0] > candidates[0][0]:
            _, prefixes = heappop(prefixes_heap)
            for prefix in prefixes:
                res = []
                for i in range(word_len):
                    c = word[i - 1:i]
                    # Three ways to reach cell (prefix, i): pair an empty dictionary side with
                    # the input character c, pair the last prefix character with an empty input
                    # side, or pair the last prefix character with c.
                    # The cell for the empty prefix at position 0 is the starting point (score 0).
                    # NOTE: `self.costs` is a defaultdict, so every lookup of an unseen pair
                    # here also stores a -inf entry for it.
                    res.append(max(
                        (res[-1] + self.costs[('', c)]) if i else float('-inf'),
                        d[prefix[:-1]][i] + self.costs[(prefix[-1], '')] if prefix else float(
                            '-inf'),
                        (d[prefix[:-1]][i - 1] + (self.costs[(prefix[-1], c)]))
                        if prefix and i else float('-inf')
                    ) if i or prefix else 0)
                d[prefix] = res
                if prefix in self.dictionary.words_set:
                    # res[-1] is the score of aligning the prefix with the whole input
                    heappushpop(candidates, (res[-1], prefix))
                potential = max(res)
                if potential > threshold:
                    # presumably words_trie[prefix] holds the prefixes one symbol longer — verify
                    # against StaticDictionary
                    heappush(prefixes_heap, (-potential, self.dictionary.words_trie[prefix]))
        return [(w.strip('⟬⟭'), score) for score, w in sorted(candidates, reverse=True) if
                score > threshold]

    def _find_candidates_window_n(self, word, prop_threshold=1e-6):
        """Best-first search for dictionary words using multi-character replacements.

        Replacements pair the last ``li`` characters of a dictionary prefix with the last
        ``ri`` characters of the consumed input, where both lengths run from 1 to
        ``self.window + 1``.

        Args:
            word: token to correct; it is lowercased, 'ё' is replaced with 'е' and
                the result is wrapped in the '⟬' / '⟭' boundary markers
            prop_threshold: probability threshold; its logarithm is the lowest score
                a partial alignment, a prefix or a candidate may have to be kept

        Returns:
            list of ``(candidate, score)`` pairs sorted by descending score, limited to
            candidates whose score is above ``log(prop_threshold)``
        """
        threshold = log(prop_threshold)
        word = '⟬{}⟭'.format(word.lower().replace('ё', 'е'))
        word_len = len(word) + 1
        # despite its name this is NEGATIVE infinity, the score of an impossible alignment
        inf = float('-inf')
        # d[prefix][i] is the best score of aligning `prefix` with the first i characters of `word`
        d = defaultdict(list)
        d[''] = [0.] + [inf] * (word_len - 1)
        # heap of (negated best score, collection of prefixes to expand)
        prefixes_heap = [(0, self.dictionary.words_trie[''])]
        # fixed-size min-heap of (score, word); candidates[0] is the worst candidate kept so far
        candidates = [(inf, '')] * self.candidates_count
        # stop as soon as the most promising prefix cannot beat the worst kept candidate
        while prefixes_heap and -prefixes_heap[0][0] > candidates[0][0]:
            _, prefixes = heappop(prefixes_heap)
            for prefix in prefixes:
                prefix_len = len(prefix)
                # a non-empty prefix cannot be aligned with zero input characters
                d[prefix] = res = [inf]
                for i in range(1, word_len):
                    c_res = [inf]
                    for li in range(1, min(prefix_len + 1, self.window + 2)):
                        for ri in range(1, min(i + 1, self.window + 2)):
                            prev = d[prefix[:-li]][i - ri]
                            if prev > threshold:
                                edit = (prefix[-li:], word[i - ri:i])
                                # membership test avoids adding a default entry to the cost table
                                if edit in self.costs:
                                    c_res.append(prev +
                                                 self.costs[edit])
                    res.append(max(c_res))
                if prefix in self.dictionary.words_set:
                    # res[-1] is the score of aligning the prefix with the whole input
                    heappushpop(candidates, (res[-1], prefix))
                potential = max(res)
                # potential = max(
                #     [e for i in range(self.window + 2) for e in d[prefix[:prefix_len - i]]])
                if potential > threshold:
                    # presumably words_trie[prefix] holds the prefixes one symbol longer — verify
                    # against StaticDictionary
                    heappush(prefixes_heap, (-potential, self.dictionary.words_trie[prefix]))
        return [(w.strip('⟬⟭'), score) for score, w in sorted(candidates, reverse=True) if
                score > threshold]

    def _infer_instance(self, instance: List[str]) -> List[List[Tuple[float, str]]]:
        candidates = []
        for incorrect in instance:
            if any([c not in self.dictionary.alphabet for c in incorrect]):
                candidates.append([(0, incorrect)])
            else:
                res = self.find_candidates(incorrect, prop_threshold=1e-6)
                if res:
                    candidates.append([(score, candidate) for candidate, score in res])
                else:
                    candidates.append([(0, incorrect)])
        return candidates

    def __call__(self, data: Iterable[Iterable[str]], *args, **kwargs) -> List[List[List[Tuple[float, str]]]]:
        """Propose candidates for tokens in sentences

        Args:
            data: batch of tokenized sentences

        Returns:
            batch of lists of probabilities and candidates for every token
        """
        data = list(data)
        if len(data) > 1:
            data = tqdm(data, desc='Infering a batch with the error model', leave=False)
        return [self._infer_instance(instance) for instance in data]

    @staticmethod
    def _distance_edits(seq1, seq2):
        l1, l2 = len(seq1), len(seq2)
        d = [[(i, ()) for i in range(l2 + 1)]]
        d += [[(i, ())] + [(0, ())] * l2 for i in range(1, l1 + 1)]

        for i in range(1, l1 + 1):
            for j in range(1, l2 + 1):
                edits = [
                    (d[i - 1][j][0] + 1, d[i - 1][j][1] + ((seq1[i - 1], ''),)),
                    (d[i][j - 1][0] + 1, d[i][j - 1][1] + (('', seq2[j - 1]),)),
                    (d[i - 1][j - 1][0] + (seq1[i - 1] != seq2[j - 1]),
                     d[i - 1][j - 1][1] + ((seq1[i - 1], seq2[j - 1]),))
                ]
                if i > 1 and j > 1 and seq1[i - 1] == seq2[j - 2] and seq1[i - 2] == seq2[j - 1]:
                    edits.append((d[i - 2][j - 2][0] + (seq1[i - 1] != seq2[j - 1]),
                                  d[i - 2][j - 2][1] + ((seq1[i - 2:i], seq2[j - 2:j]),)))
                d[i][j] = min(edits, key=lambda x: x[0])

        return d[-1][-1]

    def fit(self, x: List[str], y: List[str]):
        """Calculate character sequences replacements probabilities

        Every pair is aligned with ``_distance_edits``; pairs further apart than two edits
        are ignored. Neighbouring edits of the remaining pairs are merged into longer
        replacements, which are counted and turned into smoothed log-probabilities.

        Args:
            x: words with spelling errors
            y: words without spelling errors
        """
        window = 4
        sources = []
        replacements = []
        for error, correct in tqdm(list(zip(x, y)), desc='Training the error model'):
            correct = '⟬{}⟭'.format(' '.join(correct))
            error = '⟬{}⟭'.format(' '.join(error))
            distance, ops = self._distance_edits(correct, error)
            if distance > 2:
                continue
            ops_count = len(ops)
            left, right = zip(*ops)
            # the span borders are part of the set items so that equal replacements
            # found at different positions are counted separately
            spans = set()
            for pos in range(ops_count):
                for start in range(pos, max(0, pos - window) - 1, -1):
                    for stop in range(pos + 1, min(ops_count, start + 2 + window)):
                        merged = (''.join(left[start:stop]), ''.join(right[start:stop]))
                        spans.add((merged, start, stop))
            merged_ops = [span[0] for span in spans]

            sources.extend(source for source, _ in merged_ops)
            replacements.extend(merged_ops)

        source_counts = Counter(sources)
        replacement_counts = Counter(replacements)
        # additive smoothing that favours keeping a sequence unchanged
        incorrect_prior = 1
        correct_prior = 19
        for (w, s), count in replacement_counts.items():
            prior = correct_prior if w == s else incorrect_prior
            total = source_counts[w] + incorrect_prior + correct_prior
            self.costs[(w, s)] = log((count + prior) / total)

    def save(self):
        """Save replacements probabilities to a file

        Every row of the tab separated file holds a source sequence, its replacement and
        the probability of the replacement. Replacements with zero probability are left out:
        they are what the cost table returns for unknown pairs anyway, and ``log`` of a stored
        zero cannot be computed when the file is read back.
        """
        logger.info("[saving error_model to `{}`]".format(self.save_path))

        with open(self.save_path, 'w', newline='', encoding='utf8') as tsv_file:
            writer = csv.writer(tsv_file, delimiter='\t')
            for (w, s), log_p in self.costs.items():
                p = exp(log_p)
                # -inf entries appear in the table as a side effect of lookups of unseen pairs
                if p > 0:
                    writer.writerow([w, s, p])

    def load(self):
        """Load replacements probabilities from a file

        Rows of the tab separated file are ``source, replacement, probability``.
        A stored probability of zero is read as a log-probability of ``-inf``.

        Raises:
            ConfigError: if ``load_path`` is not a file and its parent directory does not exist
        """
        if self.load_path:
            if self.load_path.is_file():
                logger.debug("loading error_model from `{}`".format(self.load_path))
                with open(self.load_path, 'r', newline='', encoding='utf8') as tsv_file:
                    reader = csv.reader(tsv_file, delimiter='\t')
                    for w, s, p in reader:
                        p = float(p)
                        # log(0) raises ValueError, while files written from a table that
                        # contains -inf costs do hold zero probabilities
                        self.costs[(w, s)] = float('-inf') if p == 0 else log(p)
            elif not self.load_path.parent.is_dir():
                raise ConfigError("Provided `load_path` for {} doesn't exist!".format(
                    self.__class__.__name__))
        else:
            logger.warning('No load_path provided, initializing error model from scratch')


================================================
FILE: deeppavlov/models/spelling_correction/electors/__init__.py
================================================


================================================
FILE: deeppavlov/models/spelling_correction/electors/kenlm_elector.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from pathlib import Path
from typing import List, Tuple

import kenlm

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

logger = getLogger(__name__)


@register('kenlm_elector')
class KenlmElector(Component):
    """Component that chooses a candidate with the highest product of base and language model probabilities

    Args:
         load_path: path to the kenlm model file
         beam_size: beam size for highest probability search

    Attributes:
        lm: kenlm object
        beam_size: beam size for highest probability search
    """

    def __init__(self, load_path: Path, beam_size: int = 4, *args, **kwargs):
        self.lm = kenlm.Model(str(expand_path(load_path)))
        self.beam_size = beam_size

    def __call__(self, batch: List[List[List[Tuple[float, str]]]]) -> List[List[str]]:
        """Choose the best candidate for every token

        Args:
            batch: batch of probabilities and string values of candidates for every token in a sentence

        Returns:
            batch of corrected tokenized sentences
        """
        return [self._infer_instance(candidates) for candidates in batch]

    def _infer_instance(self, candidates: List[List[Tuple[float, str]]]):
        """Run a beam search over the candidates of one sentence.

        Args:
            candidates: for every token a list of ``(score, candidate)`` pairs; a candidate
                may consist of several whitespace separated words

        Returns:
            words of the hypothesis with the highest sum of candidate scores and
            language model scores
        """
        # the end-of-sentence token is scored like any other token and removed from the result
        candidates = candidates + [[(0, '</s>')]]
        state = kenlm.State()
        self.lm.BeginSentenceWrite(state)
        # every hypothesis is (total score, language model state, words so far)
        beam = [(0, state, [])]
        for sublist in candidates:
            new_beam = []
            for beam_score, beam_state, beam_words in beam:
                for score, candidate in sublist:
                    prev_state = beam_state
                    c_score = 0
                    words = candidate.split()
                    for word in words:
                        next_state = kenlm.State()
                        c_score += self.lm.BaseScore(prev_state, word, next_state)
                        prev_state = next_state
                    # `prev_state` is the state after the last scored word, or the unchanged
                    # hypothesis state when the candidate holds no words at all
                    new_beam.append((beam_score + score + c_score, prev_state, beam_words + words))
            new_beam.sort(reverse=True)
            beam = new_beam[:self.beam_size]
        score, state, words = beam[0]
        return words[:-1]


================================================
FILE: deeppavlov/models/spelling_correction/electors/top1_elector.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import List, Tuple

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

logger = getLogger(__name__)


@register('top1_elector')
class TopOneElector(Component):
    """Component that keeps, for every token, the candidate with the highest base probability

    """

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; the component has no state."""

    def __call__(self, batch: List[List[List[Tuple[float, str]]]]) -> List[List[str]]:
        """Choose the best candidate for every token

        Args:
            batch: batch of probabilities and string values of candidates for every token in a sentence

        Returns:
            batch of corrected tokenized sentences
        """
        corrected = []
        for candidates in batch:
            sentence = []
            for sublist in candidates:
                # pairs are compared as tuples: by probability first, then by the candidate string
                best = max(sublist)
                sentence.append(best[1])
            corrected.append(sentence)
        return corrected


================================================
FILE: deeppavlov/models/spelling_correction/levenshtein/__init__.py
================================================
from .searcher_component import LevenshteinSearcherComponent


================================================
FILE: deeppavlov/models/spelling_correction/levenshtein/levenshtein_searcher.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import itertools

import numpy as np
from sortedcontainers import SortedListWithKey

from .tabled_trie import Trie, make_trie


class LevenshteinSearcher:
    """
    Class for searching for close words
    according to the Levenshtein distance

    """

    def __init__(self, alphabet, dictionary, operation_costs=None,
                 allow_spaces=False, euristics='none'):
        """Build the dictionary trie, the transducer and the search heuristic.

        Args:
            alphabet: symbols allowed in words
            dictionary: a ready ``Trie`` or data that ``make_trie`` turns into one
            operation_costs: costs of elementary operations passed to ``SegmentTransducer``
            allow_spaces: whether spaces are allowed
            euristics: a non-negative integer, ``None``, ``'none'`` or ``'None'``;
                zero and the three "none" values disable the heuristic

        Raises:
            ValueError: if ``euristics`` is a negative integer or any other unsupported value
        """
        self.alphabet = alphabet
        self.allow_spaces = allow_spaces

        error_text = "Euristics should be non-negative integer or None"
        is_number = isinstance(euristics, int)
        if is_number and euristics < 0:
            raise ValueError(error_text)
        if not is_number and euristics not in ["none", "None", None]:
            raise ValueError(error_text)
        # zero is stored as None, exactly like the explicit "none" values
        self.euristics = (euristics or None) if is_number else None

        if not isinstance(dictionary, Trie):
            # anything that is not a trie yet is converted into one
            dictionary = make_trie(alphabet, dictionary, make_cashed=True,
                                   precompute_symbols=self.euristics,
                                   allow_spaces=self.allow_spaces)
        self.dictionary = dictionary

        self.transducer = SegmentTransducer(
            alphabet, operation_costs=operation_costs, allow_spaces=allow_spaces)
        self._precompute_euristics()
        self._define_h_function()

    def __contains__(self, word):
        return word in self.dictionary

    def search(self, word, d, allow_spaces=True, return_cost=True):
        """
        Finds all dictionary words in d-window from word
        """
        if not all((c in self.alphabet
                    or (c == " " and self.allow_spaces)) for c in word):
            return []
            # raise ValueError("{0} contains an incorrect symbol".format(word))
        return self._trie_search(
            word, d, allow_spaces=allow_spaces, return_cost=return_cost)

    def _trie_search(self, word, d, transducer=None,
                     allow_spaces=True, return_cost=True):
        """
        Finds all words in the prefix trie whose distance to ``word``,
        measured by the given transducer, does not exceed d

        Args:
            word: the word to search for
            d: maximal allowed cost
            transducer: transducer supplying operation costs;
                the inverse of ``self.transducer`` is used when omitted
            allow_spaces: whether a space may split the answer into several dictionary words
                (only effective when the searcher itself allows spaces)
            return_cost: whether to return costs together with the words

        Returns:
            list of ``(word, cost)`` pairs sorted by cost if ``return_cost`` is true,
            otherwise the list of words in the same order
        """
        if transducer is None:
            # TODO: sort out the handling of spaces
            transducer = self.transducer.inverse()
        allow_spaces &= self.allow_spaces
        trie = self.dictionary
        #  variables initialization
        used_agenda_keys = set()
        # items are (key, value) pairs ordered by value, i.e. by the (cost, g, h) tuple
        agenda = SortedListWithKey(key=(lambda x: x[1]))
        h = self.h_func(word, trie.root)
        # agenda[self.agenda_key("", 0, trie.root)] = (0.0, 0.0, h)
        # key: (produced lower string, position in word, trie node)
        key, value = ("", 0, trie.root), (0.0, 0.0, h)
        agenda.add((key, value))
        answer = dict()
        # counter of processed agenda items; it is never read
        k = 0
        # priority queue with intermediate results
        while len(agenda) > 0:
            key, value = agenda.pop(0)
            if key in used_agenda_keys:
                continue
            used_agenda_keys.add(key)
            low, pos, index = key
            cost, g, h = value
            # g is the current cost, h is a lower bound of the future cost
            # cost = g + h is a lower bound of the total cost
            k += 1
            max_upperside_length = min(len(word) - pos, transducer.max_up_length)
            for upperside_length in range(max_upperside_length + 1):
                new_pos = pos + upperside_length
                curr_up = word[pos: new_pos]
                if curr_up not in transducer.operation_costs:
                    continue
                for curr_low, curr_cost in transducer.operation_costs[curr_up].items():
                    new_g = g + curr_cost
                    if new_g > d:  # if g > d there is no need to compute h
                        continue
                    if curr_low == " ":
                        # a space finishes the current dictionary word and restarts from the root
                        if allow_spaces and trie.is_final(index):
                            new_index = trie.root
                        else:
                            new_index = Trie.NO_NODE
                    else:
                        new_index = trie.descend(index, curr_low)
                    # NOTE(review): identity comparison; relies on NO_NODE being a shared object
                    if new_index is Trie.NO_NODE:
                        continue
                    new_low = low + curr_low
                    new_h = self.h_func(word[new_pos:], new_index)
                    new_cost = new_g + new_h
                    if new_cost > d:
                        continue
                    new_key = (new_low, new_pos, new_index)
                    new_value = (new_cost, new_g, new_h)
                    if new_pos == len(word) and trie.is_final(new_index):
                        # keep the cheapest way of producing every answer
                        old_g = answer.get(new_low, None)
                        if old_g is None or new_g < old_g:
                            answer[new_low] = new_g
                    agenda.add((new_key, new_value))
        answer = sorted(answer.items(), key=(lambda x: x[1]))
        if return_cost:
            return answer
        else:
            return [elem[0] for elem in answer]

    def _precompute_euristics(self):
        """
        Предвычисляет будущие символы и стоимости операций с ними
        для h-эвристики
        """
        if self.euristics is None:
            return
        # вычисление минимальной стоимости операции,
        # приводящей к появлению ('+') или исчезновению ('-') данного символа
        removal_costs = {a: np.inf for a in self.alphabet}
        insertion_costs = {a: np.inf for a in self.alphabet}
        if self.allow_spaces:
            removal_costs[' '] = np.inf
            insertion_costs[' '] = np.inf
        for up, costs in self.transducer.operation_costs.items():
            for low, cost in costs.items():
                if up == low:
                    continue
                if up != '':
                    removal_cost = cost / len(up)
                    for a in up:
                        removal_costs[a] = min(removal_costs[a], removal_cost)
                if low != '':
                    insertion_cost = cost / len(low)
                    for a in low:
                        insertion_costs[a] = min(insertion_costs[a], insertion_cost)
        # предвычисление возможных будущих символов в узлах дерева
        # precompute_future_symbols(self.dictionary, self.euristics, self.allow_spaces)
        # предвычисление стоимостей потери символа в узлах дерева
        self._absense_costs_by_node = _precompute_absense_costs(
            self.dictionary, removal_costs, insertion_costs,
            self.euristics, self.allow_spaces)
        # массив для сохранения эвристик
        self._temporary_euristics = [dict() for i in range(len(self.dictionary))]

    def _define_h_function(self):
        if self.euristics in [None, 0]:
            self.h_func = (lambda *x: 0.0)
        else:
            self.h_func = self._euristic_h_function

    def _euristic_h_function(self, suffix, index):
        """
        Вычисление h-эвристики из работы Hulden,2009 для текущей вершины словаря

        Аргументы:
        ----------
        suffix : string
            непрочитанный суффикс входного слова
        index : int
            индекс текущего узла в словаре

        Возвращает:
        -----------
        cost : float
            оценка снизу для стоимости замены,
            приводящей к входному слову с суффиксом suffix,
            если прочитанный префикс слова без опечатки
            привёл в вершину с номером index
        """
        if self.euristics > 0:
            suffix = suffix[:self.euristics]
        # кэширование результатов
        index_temporary_euristics = self._temporary_euristics[index]
        cost = index_temporary_euristics.get(suffix, None)
        if cost is not None:
            return cost
        # извлечение нужных данных из массивов
        absense_costs = self._absense_costs_by_node[index]
        data = self.dictionary.data[index]
        costs = np.zeros(dtype=np.float64, shape=(self.euristics,))
        # costs[j] --- оценка штрафа при предпросмотре вперёд на j символов
        for i, a in enumerate(suffix):
            costs[i:] += absense_costs[a][i:]
        cost = max(costs)
        index_temporary_euristics[suffix] = cost
        return cost

    def _minimal_replacement_cost(self, first, second):
        first_symbols, second_symbols = set(), set()
        removal_cost, insertion_cost = 0, 0
        for a, b in itertools.zip_longest(first, second, fillvalue=None):
            if a is not None:
                first_symbols.add(a)
            if b is not None:
                second_symbols.add(b)
            removal_cost = max(removal_cost, len(first_symbols - second_symbols))
            insertion_cost = max(insertion_cost, len(second_symbols - first_symbols))
        return min(removal_cost, insertion_cost)


def _precompute_absense_costs(dictionary, removal_costs, insertion_costs, n,
                              allow_spaces=False):
    """
    Вычисляет минимальную стоимость появления нового символа в узлах словаря
    в соответствии со штрафами из costs

    Аргументы:
    ---------------
    dictionary : Trie
        словарь, хранящийся в виде ациклического автомата

    removal_costs : dict
        штрафы за удаление символов

    insertion_costs : dict
        штрафы за вставку символов

    n : int
        глубина ``заглядывания вперёд'' в словаре

    Возвращает
    ---------------
    answer : list of dicts, len(answer)=len(dictionary)
        answer[i][a][j] равно минимальному штрафу за появление символа a
        в j-ой позиции в вершине с номером i
    """
    answer = [dict() for node in dictionary.data]
    if n == 0:
        return answer
    curr_alphabet = copy.copy(dictionary.alphabet)
    if allow_spaces:
        curr_alphabet += [' ']
    for l, (costs_in_node, node) in enumerate(zip(answer, dictionary.data)):
        # определение минимальной стоимости удаления символов
        curr_node_removal_costs = np.empty(dtype=np.float64, shape=(n,))
        if len(node[0]) > 0:
            curr_node_removal_costs[0] = min(removal_costs[symbol] for symbol in node[0])
            for j, symbols in enumerate(node[1:], 1):
                if len(symbols) == 0:
                    curr_node_removal_costs[j:] = curr_node_removal_costs[j - 1]
                    break
                curr_cost = min(removal_costs[symbol] for symbol in symbols)
                curr_node_removal_costs[j] = min(curr_node_removal_costs[j - 1], curr_cost)
        else:
            curr_node_removal_costs[:] = np.inf
        # определение минимальной стоимости вставки
        for a in curr_alphabet:
            curr_symbol_costs = np.empty(dtype=np.float64, shape=(n,))
            curr_symbol_costs.fill(insertion_costs[a])
            for j, symbols in enumerate(node):
                if a in symbols:
                    curr_symbol_costs[j:] = 0.0
                    break
                curr_symbol_costs[j] = min(curr_symbol_costs[j], curr_node_removal_costs[j])
            costs_in_node[a] = curr_symbol_costs
    return answer


class SegmentTransducer:
    """
    Weighted finite-state transducer that performs replacements
    taken from a given table of elementary operations.

    Arguments:
    ----------
    alphabet : list
        the alphabet

    operation_costs : dict or None (optional, default=None)
        nested dictionary of the form {up: {low: cost}};
        when None, default costs are generated from the alphabet

    allow_spaces : bool (optional, default=False)
        whether elementary transductions containing a space are allowed
        (only used when operation costs are not given explicitly
        and the default costs are generated)

    """

    def __init__(self, alphabet, operation_costs=None, allow_spaces=False):
        self.alphabet = alphabet
        if operation_costs is None:
            self._make_default_operation_costs(allow_spaces=allow_spaces)
        elif not isinstance(operation_costs, dict):
            raise TypeError("Operation costs must be a dictionary")
        else:
            self.operation_costs = operation_costs
        self._make_reversed_operation_costs()
        self._make_maximal_key_lengths()
        # self.maximal_value_lengths = {}
        # for up, probs in self.operation_costs.items():
        # СЛИШКОМ МНОГО ВЫЗОВОВ, НАДО КАК-ТО ЗАПОМНИТЬ
        # МАКСИМАЛЬНЫЕ ДЛИНЫ КЛЮЧЕЙ ПРИ ОБРАЩЕНИИ
        # max_low_length = max(len(low) for low in probs) if (len(probs) > 0) else -1
        # self.maximal_value_lengths[up] = self.maximal_key_length

    def get_operation_cost(self, up, low):
        """
        Возвращает стоимость элементарной трансдукции up->low
        или np.inf, если такой элементарной трансдукции нет

        Аргументы:
        ----------
        up, low : string
            элементы элементарной трансдукции

        Возвращает:
        -----------
        cost : float
            стоимость элементарной трансдукции up->low
            (np.inf, если такая трансдукция отсутствует)
        """
        up_costs = self.operation_costs.get(up, None)
        if up_costs is None:
            return np.inf
        cost = up_costs.get(low, np.inf)
        return cost

    def inverse(self):
        """
        Строит пробразователь, задающий обратное конечное преобразование
        """
        # УПРОСТИТЬ ОБРАЩЕНИЕ!!!
        inversed_transducer = SegmentTransducer(self.alphabet, operation_costs=dict())
        inversed_transducer.operation_costs = self._reversed_operation_costs
        inversed_transducer._reversed_operation_costs = self.operation_costs
        inversed_transducer.max_low_length = self.max_up_length
        inversed_transducer.max_up_length = self.max_low_length
        inversed_transducer.max_low_lengths_by_up = self.max_up_lengths_by_low
        inversed_transducer.max_up_lengths_by_low = self.max_low_lengths_by_up
        return inversed_transducer

    def distance(self, first, second, return_transduction=False):
        """
        Вычисляет трансдукцию минимальной стоимости,
        отображающую first в second

        Аргументы:
        -----------
        first : string
        second : string
            Верхний и нижний элементы трансдукции

        return_transduction : bool (optional, default=False)
            следует ли возвращать трансдукцию минимального веса
            (см. возвращаемое значение)

        Возвращает:
        -----------
        (final_cost, transductions) : tuple(float, list)
            если return_transduction=True, то возвращает
            минимальную стоимость трансдукции, переводящей first в second
            и список трансдукций с данной стоимостью

        final_cost : float
            если return_transduction=False, то возвращает
            минимальную стоимость трансдукции, переводящей first в second
        """
        if return_transduction:
            add_pred = (lambda x, y: (y == np.inf or x < y))
        else:
            add_pred = (lambda x, y: (y == np.inf or x <= y))
        clear_pred = (lambda x, y: x < y < np.inf)
        update_func = lambda x, y: min(x, y)
        costs, backtraces = self._fill_levenshtein_table(first, second,
                                                         update_func, add_pred, clear_pred)
        final_cost = costs[-1][-1]
        if final_cost == np.inf:
            transductions = [None]
        elif return_transduction:
            transductions = self._backtraces_to_transductions(first, second, backtraces,
                                                              final_cost, return_cost=False)
        if return_transduction:
            return final_cost, transductions
        else:
            return final_cost

    def transduce(self, first, second, threshold):
        """
        Возвращает все трансдукции, переводящие first в second,
        чья стоимость не превышает threshold

        Возвращает:
        ----------
        result : list
            список вида [(трансдукция, стоимость)]
        """
        add_pred = (lambda x, y: x <= threshold)
        clear_pred = (lambda x, y: False)
        update_func = (lambda x, y: min(x, y))
        costs, backtraces = self._fill_levenshtein_table(first, second,
                                                         update_func, add_pred, clear_pred,
                                                         threshold=threshold)
        result = self._backtraces_to_transductions(first, second,
                                                   backtraces, threshold, return_cost=True)
        return result

    def lower_transductions(self, word, max_cost, return_cost=True):
        """
        Возвращает все трансдукции с верхним элементом word,
        чья стоимость не превышает max_cost

    `   Возвращает:
        ----------
        result : list
            список вида [(трансдукция, стоимость)], если return_cost=True
            список трансдукций, если return_cost=False
            список отсортирован в порядке возрастания стоимости трансдукции
        """
        prefixes = [[] for i in range(len(word) + 1)]
        prefixes[0].append(((), 0.0))
        for pos in range(len(prefixes)):
            # вставки
            prefixes[pos] = self._perform_insertions(prefixes[pos], max_cost)
            max_upperside_length = min(len(word) - pos, self.max_up_length)
            for upperside_length in range(1, max_upperside_length + 1):
                up = word[pos: pos + upperside_length]
                for low, low_cost in self.operation_costs.get(up, dict()).items():
                    for transduction, cost in prefixes[pos]:
                        new_cost = cost + low_cost
                        if new_cost <= max_cost:
                            new_transduction = transduction + (up, low)
                            prefixes[pos + upperside_length].append((new_transduction, new_cost))
        answer = sorted(prefixes[-1], key=(lambda x: x[0]))
        if return_cost:
            return answer
        else:
            return [elem[0] for elem in answer]

    def lower(self, word, max_cost, return_cost=True):
        transductions = self.lower_transductions(word, max_cost, return_cost=True)
        answer = dict()
        for transduction, cost in transductions:
            low = "".join(elem[1] for elem in transductions)
            curr_cost = answer.get(low, None)
            if curr_cost is None or cost < curr_cost:
                answer[low] = cost
        answer = sorted(answer.items(), key=(lambda x: x[1]))
        if return_cost:
            return answer
        else:
            return [elem[0] for elem in answer]

    def upper(self, word, max_cost, return_cost=True):
        inversed_transducer = self.inverse()
        return inversed_transducer.lower(word, max_cost, return_cost)

    def upper_transductions(self, word, max_cost, return_cost=True):
        inversed_transducer = self.inverse()
        return inversed_transducer.lower_transductions(word, max_cost, return_cost)

    def _fill_levenshtein_table(self, first, second, update_func, add_pred, clear_pred,
                                threshold=None):
        """
        Dynamically fills the table costs of transduction costs, where
        costs[i][j] is the minimal cost of a transduction
        mapping first[:i] to second[:j]

        Arguments:
        ----------
        first, second : string
            upper and lower sides of the transduction
        update_func : callable, float*float -> float
            update_func(x, y) returns the new value of a cell of costs
            when the old value is y and the candidate value is x
            (all callers in this class pass the minimum)
        add_pred : callable : float*float -> bool
            add_pred(x, y) tells whether a new back-reference of cost x
            is added to the cell backtraces[i][j],
            given the current value costs[i][j]=y
        clear_pred : callable : float*float -> bool
            clear_pred(x, y) tells whether the cell backtraces[i][j]
            is cleared before the back-reference of cost x is added,
            given the current value costs[i][j]=y
        threshold : float or None (optional, default=None)
            steps whose accumulated cost exceeds threshold are skipped;
            if None, the threshold is derived from first and second (see below)

        Returns:
        --------
        costs : array, dtype=float, shape=(len(first)+1, len(second)+1)
            array whose cell (i, j) stores the minimal cost
            of a transduction mapping first[:i] to second[:j]
        backtraces : list of lists of lists, shape=(len(first)+1, len(second)+1)
            table whose cell (i, j) stores the back-references (i', j')
            to the previous cells of the transductions leading to cell (i, j)
        """
        m, n = len(first), len(second)
        # if threshold=None, the threshold is twice the cost of the transduction
        # that maps symbols at equal positions to each other
        # (plus deletions/insertions for the longer string's tail)
        if threshold is None:
            threshold = 0.0
            for a, b in zip(first, second):
                threshold += self.get_operation_cost(a, b)
            if m > n:
                for a in first[n:]:
                    threshold += self.get_operation_cost(a, '')
            elif m < n:
                for b in second[m:]:
                    threshold += self.get_operation_cost('', b)
            threshold *= 2
        # initialisation of the returned tables
        costs = np.zeros(shape=(m + 1, n + 1), dtype=np.float64)
        costs[:] = np.inf
        backtraces = [None] * (m + 1)
        for i in range(m + 1):
            backtraces[i] = [[] for j in range(n + 1)]
        costs[0][0] = 0.0
        for i in range(m + 1):
            for i_right in range(i, min(i + self.max_up_length, m) + 1):
                up = first[i: i_right]
                max_low_length = self.max_low_lengths_by_up.get(up, -1)
                if max_low_length == -1:  # no up key in transduction
                    continue
                up_costs = self.operation_costs[up]
                for j in range(n + 1):
                    if costs[i][j] > threshold:
                        continue
                    if len(backtraces[i][j]) == 0 and i + j > 0:
                        continue  # no back-references were found, the cell is unreachable
                    # when up is empty (i_right == i) low must be non-empty,
                    # so that every step advances in at least one string
                    for j_right in range((j if i_right > i else j + 1),
                                         min(j + max_low_length, n) + 1):
                        low = second[j: j_right]
                        curr_cost = up_costs.get(low, np.inf)
                        old_cost = costs[i_right][j_right]
                        new_cost = costs[i][j] + curr_cost
                        if new_cost > threshold:
                            continue
                        if add_pred(new_cost, old_cost):
                            if clear_pred(new_cost, old_cost):
                                backtraces[i_right][j_right] = []
                            costs[i_right][j_right] = update_func(new_cost, old_cost)
                            backtraces[i_right][j_right].append((i, j))
        return costs, backtraces

    def _make_reversed_operation_costs(self):
        """
        Заполняет массив _reversed_operation_costs
        на основе имеющегося массива operation_costs
        """
        _reversed_operation_costs = dict()
        for up, costs in self.operation_costs.items():
            for low, cost in costs.items():
                if low not in _reversed_operation_costs:
                    _reversed_operation_costs[low] = dict()
                _reversed_operation_costs[low][up] = cost
        self._reversed_operation_costs = _reversed_operation_costs

    def _make_maximal_key_lengths(self):
        """
        Вычисляет максимальную длину элемента low
        в элементарной трансдукции (up, low) для каждого up
        и максимальную длину элемента up
        в элементарной трансдукции (up, low) для каждого low
        """
        self.max_up_length = \
            (max(len(up) for up in self.operation_costs)
             if len(self.operation_costs) > 0 else -1)
        self.max_low_length = \
            (max(len(low) for low in self._reversed_operation_costs)
             if len(self._reversed_operation_costs) > 0 else -1)
        self.max_low_lengths_by_up, self.max_up_lengths_by_low = dict(), dict()
        for up, costs in self.operation_costs.items():
            self.max_low_lengths_by_up[up] = \
                max(len(low) for low in costs) if len(costs) > 0 else -1
        for low, costs in self._reversed_operation_costs.items():
            self.max_up_lengths_by_low[low] = \
                max(len(up) for up in costs) if len(costs) > 0 else -1

    def _backtraces_to_transductions(self, first, second, backtraces, threshold, return_cost=False):
        """
        Восстанавливает трансдукции по таблице обратных ссылок

        Аргументы:
        ----------
        first, second : string
            верхние и нижние элементы трансдукции
        backtraces : array-like, dtype=list, shape=(len(first)+1, len(second)+1)
            таблица обратных ссылок
        threshold : float
            порог для отсева трансдукций,
            возвращаются только трансдукции стоимостью <= threshold
        return_cost : bool (optional, default=False)
            если True, то вместе с трансдукциями возвращается их стоимость

        Возвращает:
        -----------
        result : list
            список вида [(трансдукция, стоимость)], если return_cost=True
            и вида [трансдукция], если return_cost=False,
            содержащий все трансдукции, переводящие first в second,
            чья стоимость не превышает threshold
        """
        m, n = len(first), len(second)
        agenda = [None] * (m + 1)
        for i in range(m + 1):
            agenda[i] = [[] for j in range(n + 1)]
        agenda[m][n] = [((), 0.0)]
        for i_right in range(m, -1, -1):
            for j_right in range(n, -1, -1):
                current_agenda = agenda[i_right][j_right]
                if len(current_agenda) == 0:
                    continue
                for (i, j) in backtraces[i_right][j_right]:
                    up, low = first[i:i_right], second[j:j_right]
                    add_cost = self.operation_costs[up][low]
                    for elem, cost in current_agenda:
                        new_cost = cost + add_cost
                        if new_cost <= threshold:  # удаление трансдукций большой стоимости
                            agenda[i][j].append((((up, low),) + elem, new_cost))
        if return_cost:
            return agenda[0][0]
        else:
            return [elem[0] for elem in agenda[0][0]]

    def _perform_insertions(self, initial, max_cost):
        """
        возвращает все трансдукции стоимости <= max_cost,
        которые можно получить из элементов initial

        Аргументы:
        ----------
        initial : list of tuples
            список исходных трансдукций вида [(трансдукция, стоимость)]
        max_cost : float
            максимальная стоимость трансдукции

        Возвращает:
        -----------
        final : list of tuples
            финальный список трансдукций вида [(трансдукция, стоимость)]
        """
        queue = list(initial)
        final = initial
        while len(queue) > 0:
            transduction, cost = queue[0]
            queue = queue[1:]
            for string, string_cost in self.operation_costs[""].items():
                new_cost = cost + string_cost
                if new_cost <= max_cost:
                    new_transduction = transduction + ("", string)
                    final.append((new_transduction, new_cost))
                    queue.append((new_transduction, new_cost))
        return final

    def _make_default_operation_costs(self, allow_spaces=False):
        """
        sets 1.0 cost for every replacement, insertion, deletion and transposition
        """
        self.operation_costs = dict()
        self.operation_costs[""] = {c: 1.0 for c in list(self.alphabet) + [' ']}
        for a in self.alphabet:
            current_costs = {c: 1.0 for c in self.alphabet}
            current_costs[a] = 0.0
            current_costs[""] = 1.0
            if allow_spaces:
                current_costs[" "] = 1.0
            self.operation_costs[a] = current_costs
        # транспозиции
        for a, b in itertools.permutations(self.alphabet, 2):
            self.operation_costs[a + b] = {b + a: 1.0}
        # пробелы
        if allow_spaces:
            self.operation_costs[" "] = {c: 1.0 for c in self.alphabet}
            self.operation_costs[" "][""] = 1.0


================================================
FILE: deeppavlov/models/spelling_correction/levenshtein/searcher_component.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import string
from logging import getLogger
from math import log10
from typing import Iterable, List, Tuple, Optional

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from .levenshtein_searcher import LevenshteinSearcher

logger = getLogger(__name__)


@register('spelling_levenshtein')
class LevenshteinSearcherComponent(Component):
    """Component proposing replacement candidates for tokens within a given Damerau-Levenshtein distance

    Args:
        words: list of every correct word
        max_distance: maximum allowed Damerau-Levenshtein distance between source words and candidates
        error_probability: assigned probability for every edit
        vocab_penalty: assigned probability of an out of vocabulary token being the correct one without changes

    Attributes:
        max_distance: maximum allowed Damerau-Levenshtein distance between source words and candidates
        error_probability: base-10 logarithm of the probability assigned to every edit
        vocab_penalty: base-10 logarithm of the probability of an out of vocabulary token being the correct one
         without changes (equals ``error_probability`` when no penalty is given)
    """

    _punctuation = frozenset(string.punctuation)

    def __init__(self, words: Iterable[str], max_distance: int = 1, error_probability: float = 1e-4,
                 vocab_penalty: Optional[float] = None, **kwargs):
        # normalise the vocabulary: trimmed, lower-cased, 'ё' folded into 'е', duplicates removed
        vocabulary = list({word.strip().lower().replace('ё', 'е') for word in words})
        alphabet = sorted({letter for word in vocabulary for letter in word})
        self.max_distance = max_distance
        self.error_probability = log10(error_probability)
        if vocab_penalty is None:
            self.vocab_penalty = self.error_probability
        else:
            self.vocab_penalty = log10(vocab_penalty)
        self.searcher = LevenshteinSearcher(alphabet, vocabulary, allow_spaces=True, euristics=2)

    def _infer_instance(self, tokens: Iterable[str]) -> List[List[Tuple[float, str]]]:
        """Build the list of ``(score, candidate)`` pairs for every token of one sentence"""
        proposals = []
        for token in tokens:
            if token in self._punctuation:
                # a punctuation mark is its own single candidate with score 0
                proposals.append([(0, token)])
                continue
            scores = {}
            for candidate, distance in self.searcher.search(token, d=self.max_distance):
                scores[candidate] = self.error_probability * distance
            # the unchanged token is always a candidate; it gets the vocabulary
            # penalty unless the search has already scored it
            scores.setdefault(token, self.vocab_penalty)
            proposals.append([(score, candidate) for candidate, score in scores.items()])
        return proposals

    def __call__(self, batch: Iterable[Iterable[str]], *args, **kwargs) -> List[List[List[Tuple[float, str]]]]:
        """Propose candidates for tokens in sentences

        Args:
            batch: batch of tokenized sentences

        Returns:
            batch of lists of probabilities and candidates for every token
        """
        return [self._infer_instance(sentence) for sentence in batch]


================================================
FILE: deeppavlov/models/spelling_correction/levenshtein/tabled_trie.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
from collections import defaultdict

import numpy as np


class Trie:
    """
    Implementation of a prefix trie (more precisely, of a rooted directed acyclic graph)

    Attributes
    ----------
    alphabet: list, the alphabet
    alphabet_codes: dict, mapping symbol -> code
    root: int, index of the root
    graph: array, type=int, shape=(number of nodes, alphabet size), matrix of children
    graph[i][j] = k <-> node k is the child of node i by the edge labelled with alphabet[j]
    data: array, type=object, shape=(number of nodes), data stored in the nodes
    final: array, type=bool, shape=(number of nodes), array of indicators
    final[i] = True <-> i is a final node

    NOTE(review): earlier notes also listed `compressed` and `cashed` flags;
    no attributes with these names are set in the constructor.
    """
    # value stored in graph for an absent edge
    NO_NODE = -1
    # code assigned to the space symbol in alphabet_codes
    SPACE_CODE = -1

    # names of the boolean-like attributes written to the header by save()
    ATTRS = ['is_numpied', 'precompute_symbols', 'allow_spaces',
             'is_terminated', 'to_make_cashed']

    def __init__(self, alphabet, make_sorted=True, make_alphabet_codes=True,
                 is_numpied=False, to_make_cashed=False,
                 precompute_symbols=None, allow_spaces=False, dict_storage=False):
        self.alphabet = sorted(alphabet) if make_sorted else alphabet
        self.alphabet_codes = ({a: i for i, a in enumerate(self.alphabet)}
                               if make_alphabet_codes else self.alphabet)
        self.alphabet_codes[" "] = Trie.SPACE_CODE
        self.is_numpied = is_numpied
        self.to_make_cashed = to_make_cashed
        self.dict_storage = dict_storage
        self.precompute_symbols = precompute_symbols
        self.allow_spaces = allow_spaces
        self.initialize()

    def initialize(self):
        self.root = 0
        self.graph = [self._make_default_node()]
        self.data, self.final = [None], [False]
        self.nodes_number = 1
        self.descend = self._descend_simple
        self.is_terminated = False

    def _make_default_node(self):
        if self.dict_storage:
            return defaultdict(lambda: -1)
        elif self.is_numpied:
            return np.full(shape=(len(self.alphabet),),
                           fill_value=Trie.NO_NODE, dtype=int)
        else:
            return [Trie.NO_NODE] * len(self.alphabet)

    def save(self, outfile):
        """
        Сохраняет дерево для дальнейшего использования
        """
        with open(outfile, "w", encoding="utf8") as fout:
            attr_values = [getattr(self, attr) for attr in Trie.ATTRS]
            attr_values.append(any(x is not None for x in self.data))
            fout.write("{}\n{}\t{}\n".format(
                " ".join("T" if x else "F" for x in attr_values),
                self.nodes_number, self.root))
            fout.write(" ".join(str(a) for a in self.alphabet) + "\n")
            for index, label in enumerate(self.final):
                letters = self._get_letters(index, return_indexes=True)
                children = self._get_children(index)
                fout.write("{}\t{}\n".format(
                    "T" if label else "F", " ".join("{}:{}".format(*elem)
                                                    for elem in zip(letters, children))))
            if self.precompute_symbols is not None:
                for elem in self.data:
                    fout.write(":".join(",".join(
                        map(str, symbols)) for symbols in elem) + "\n")
        return

    def make_cashed(self):
        """
        Включает кэширование запросов к descend
        """
        self._descendance_cash = [dict() for _ in self.graph]
        self.descend = self._descend_cashed

    def make_numpied(self):
        self.graph = np.array(self.graph)
        self.final = np.asarray(self.final, dtype=bool)
        self.is_numpied = True

    def add(self, s):
        """
        Добавление строки s в префиксный бор
        """
        if self.is_terminated:
            raise TypeError("Impossible to add string to fitted trie")
        if s == "":
            self._set_final(self.root)
            return
        curr = self.root
        for i, a in enumerate(s):
            code = self.alphabet_codes[a]
            next = self.graph[curr][code]
            if next == Trie.NO_NODE:
                curr = self._add_descendant(curr, s[i:])
                break
            else:
                curr = next
        self._set_final(curr)
        return self

    def fit(self, words):
        for s in words:
            self.add(s)
        self.terminate()

    def terminate(self):
        if self.is_numpied:
            self.make_numpied()
        self.terminated = True
        if self.precompute_symbols is not None:
            precompute_future_symbols(self, self.precompute_symbols,
                                      allow_spaces=self.allow_spaces)
        if self.to_make_cashed:
            self.make_cashed()

    def __contains__(self, s):
        if any(a not in self.alphabet for a in s):
            return False
        # word = tuple(self.alphabet_codes[a] for a in s)
        node = self.descend(self.root, s)
        return (node != Trie.NO_NODE) and self.is_final(node)

    def words(self):
        """
        Возвращает итератор по словам, содержащимся в боре
        """
        branch, word, indexes = [self.root], [], [0]
        letters_with_children = [self._get_children_and_letters(self.root)]
        while len(branch) > 0:
            if self.is_final(branch[-1]):
                yield "".join(word)
            while indexes[-1] == len(letters_with_children[-1]):
                indexes.pop()
                letters_with_children.pop()
                branch.pop()
                if len(indexes) == 0:
                    raise StopIteration()
                word.pop()
            next_letter, next_child = letters_with_children[-1][indexes[-1]]
            indexes[-1] += 1
            indexes.append(0)
            word.append(next_letter)
            branch.append(next_child)
            letters_with_children.append(self._get_children_and_letters(branch[-1]))

    def is_final(self, index):
        """
        Аргументы
        ---------
        index: int, номер вершины

        Возвращает
        ----------
        True: если index --- номер финальной вершины
        """
        return self.final[index]

    def find_partitions(self, s, max_count=1):
        """
        Находит все разбиения s = s_1 ... s_m на словарные слова s_1, ..., s_m
        для m <= max_count
        """
        curr_agenda = [(self.root, [], 0)]
        for i, a in enumerate(s):
            next_agenda = []
            for curr, borders, cost in curr_agenda:
                if cost >= max_count:
                    continue
                child = self.graph[curr][self.alphabet_codes[a]]
                # child = self.graph[curr][a]
                if child == Trie.NO_NODE:
                    continue
                next_agenda.append((child, borders, cost))
                if self.is_final(child):
                    next_agenda.append((self.root, borders + [i + 1], cost + 1))
            curr_agenda = next_agenda
        answer = []
        for curr, borders, cost in curr_agenda:
            if curr == self.root:
                borders = [0] + borders
                answer.append([s[left:borders[i + 1]] for i, left in enumerate(borders[:-1])])
        return answer

    def __len__(self):
        return self.nodes_number

    def __repr__(self):
        answer = ""
        for i, (final, data) in enumerate(zip(self.final, self.data)):
            letters, children = self._get_letters(i), self._get_children(i)
            answer += "{0}".format(i)
            if final:
                answer += "F"
            for a, index in zip(letters, children):
                answer += " {0}:{1}".format(a, index)
            answer += "\n"
            if data is not None:
                answer += "data:{0} {1}\n".format(len(data), " ".join(str(elem) for elem in data))
        return answer

    def _add_descendant(self, parent, s, final=False):
        for a in s:
            code = self.alphabet_codes[a]
            parent = self._add_empty_child(parent, code, final)
        return parent

    def _add_empty_child(self, parent, code, final=False):
        """
        Добавление ребёнка к вершине parent по символу с кодом code
        """
        self.graph[parent][code] = self.nodes_number
        self.graph.append(self._make_default_node())
        self.data.append(None)
        self.final.append(final)
        self.nodes_number += 1
        return (self.nodes_number - 1)

    def _descend_simple(self, curr, s):
        """
        Спуск из вершины curr по строке s
        """
        for a in s:
            curr = self.graph[curr][self.alphabet_codes[a]]
            if curr == Trie.NO_NODE:
                break
        return curr

    def _descend_cashed(self, curr, s):
        """
        Спуск из вершины curr по строке s с кэшированием
        """
        if s == "":
            return curr
        curr_cash = self._descendance_cash[curr]
        answer = curr_cash.get(s, None)
        if answer is not None:
            return answer
        # для оптимизации дублируем код
        res = curr
        for a in s:
            res = self.graph[res][self.alphabet_codes[a]]
            # res = self.graph[res][a]
            if res == Trie.NO_NODE:
                break
        curr_cash[s] = res
        return res

    def _set_final(self, curr):
        """
        Делает состояние curr завершающим
        """
        self.final[curr] = True

    def _get_letters(self, index, return_indexes=False):
        """
        Извлекает все метки выходных рёбер вершины с номером index
        """
        if self.dict_storage:
            answer = list(self.graph[index].keys())
        else:
            answer = [i for i, elem in enumerate(self.graph[index])
                      if elem != Trie.NO_NODE]
        if not return_indexes:
            answer = [(self.alphabet[i] if i >= 0 else " ") for i in answer]
        return answer

    def _get_children_and_letters(self, index, return_indexes=False):
        if self.dict_storage:
            answer = list(self.graph[index].items())
        else:
            answer = [elem for elem in enumerate(self.graph[index])
                      if elem[1] != Trie.NO_NODE]
        if not return_indexes:
            for i, (letter_index, child) in enumerate(answer):
                answer[i] = (self.alphabet[letter_index], child)
        return answer

    def _get_children(self, index):
        """
        Извлекает всех потомков вершины с номером index
        """
        if self.dict_storage:
            return list(self.graph[index].values())
        else:
            return [elem for elem in self.graph[index] if elem != Trie.NO_NODE]


class TrieMinimizer:
    """
    Builds a compressed copy of a trie by merging nodes that have the same
    finality flag and the same labelled edges into the same classes of children.
    """

    def __init__(self):
        pass

    def minimize(self, trie, dict_storage=False, make_cashed=False, make_numpied=False,
                 precompute_symbols=None, allow_spaces=False, return_groups=False):
        """
        Build a minimized ``Trie`` equivalent to ``trie``.

        Nodes are visited in post-order and grouped into classes by the key
        ``(edge codes, classes of children, is_final)``. Classes are renumbered
        in reverse, so class ``c`` becomes node ``L - c - 1`` of the new trie.

        Args:
            trie: the source trie; must contain at least one node
            dict_storage: store every row of the new graph as a dict
            make_cashed: call ``make_cashed()`` on the new trie
            make_numpied: store the new graph and finality flags as numpy arrays
            precompute_symbols: number of future-symbol layers to attach to the
                new trie; nothing is attached when it is ``None``
            allow_spaces: passed to the new trie and to the future-symbol computation
            return_groups: additionally return, for every source node, the index
                of the new node it was merged into

        Returns:
            the compressed trie, or ``(compressed, node_classes)`` if ``return_groups``

        Raises:
            ValueError: if ``trie`` is empty
        """
        N = len(trie)
        if N == 0:
            raise ValueError("Trie should be non-empty")
        node_classes = np.full(shape=(N,), fill_value=-1, dtype=int)
        order = self.generate_postorder(trie)
        # processing the first node (in post-order it is always a leaf)
        index = order[0]
        node_classes[index] = 0
        class_representatives = [index]
        node_key = ((), (), trie.is_final(index))
        classes, class_keys = {node_key: 0}, [node_key]
        curr_index = 1
        for index in order[1:]:
            letter_indexes = trie._get_letters(index, return_indexes=True)
            children = trie._get_children(index)
            # dict-storage rows may hold NO_NODE placeholders; they are not edges
            edges = [(letter, child) for letter, child in zip(letter_indexes, children)
                     if child != Trie.NO_NODE]
            key = (tuple(letter for letter, _ in edges),
                   tuple(node_classes[child] for _, child in edges),
                   trie.is_final(index))
            key_class = classes.get(key, None)
            if key_class is not None:
                node_classes[index] = key_class
            else:
                # a new class has appeared
                class_keys.append(key)
                classes[key] = node_classes[index] = curr_index
                # remember the source NODE that represents the class: its data
                # is copied from ``trie.data`` below
                class_representatives.append(index)
                curr_index += 1
        # building the new trie
        compressed = Trie(trie.alphabet, is_numpied=make_numpied,
                          dict_storage=dict_storage, allow_spaces=allow_spaces,
                          precompute_symbols=precompute_symbols)
        L = len(classes)
        new_final = [elem[2] for elem in class_keys[::-1]]
        if dict_storage:
            # a missing edge must resolve to NO_NODE, not to node 0
            new_graph = [defaultdict(lambda: Trie.NO_NODE) for _ in range(L)]
        elif make_numpied:
            new_graph = np.full(shape=(L, len(trie.alphabet)),
                                fill_value=Trie.NO_NODE, dtype=int)
            new_final = np.array(new_final, dtype=bool)
        else:
            new_graph = [[Trie.NO_NODE for a in trie.alphabet] for i in range(L)]
        # class_keys[c] is the key of class c, so no sorting of ``classes`` is needed
        for class_index, (indexes, children, _) in enumerate(class_keys):
            row = new_graph[L - class_index - 1]
            for i, child_index in zip(indexes, children):
                row[i] = L - child_index - 1
        compressed.graph = new_graph
        compressed.root = L - node_classes[trie.root] - 1
        compressed.final = new_final
        compressed.nodes_number = L
        compressed.data = [None] * L
        if make_cashed:
            compressed.make_cashed()
        if precompute_symbols is not None:
            if (trie.is_terminated and trie.precompute_symbols
                    and trie.allow_spaces == allow_spaces):
                # copy the future symbols from the source trie; this is needed so
                # that returning from final states to the initial one behaves the
                # same way in both tries
                for i, node_index in enumerate(class_representatives[::-1]):
                    # future symbols of the representative of the i-th new node
                    compressed.data[i] = copy.copy(trie.data[node_index])
            else:
                precompute_future_symbols(compressed, precompute_symbols, allow_spaces)
        if return_groups:
            node_classes = [L - i - 1 for i in node_classes]
            return compressed, node_classes
        else:
            return compressed

    def generate_postorder(self, trie):
        """
        Reverse topological sort: list the nodes reachable from ``trie.root``
        so that every node appears after all nodes reachable from it.
        """
        order, stack = [], [trie.root]
        colors = ['white'] * len(trie)
        while stack:
            index = stack[-1]
            color = colors[index]
            if color == 'white':  # the node has not been processed yet
                colors[index] = 'grey'
                for child in trie._get_children(index):
                    # check whether the child has been visited before
                    if child != Trie.NO_NODE and colors[child] == 'white':
                        stack.append(child)
            else:
                if color == 'grey':
                    colors[index] = 'black'
                    order.append(index)
                # pop in place instead of copying the whole stack
                stack.pop()
        return order


def load_trie(infile):
    """
    Read a trie from the text file ``infile``.

    The file is parsed as follows:
      * a line of whitespace-separated flags, ``T`` meaning true: one per entry
        of ``Trie.ATTRS`` plus a trailing flag telling whether node data follows;
      * a line with the number of nodes and the root index;
      * a line with the whitespace-separated alphabet;
      * one line per node: the finality label (``T`` for final), optionally
        followed by a tab and whitespace-separated ``code:child`` transitions;
      * if the data flag is set, one line per node with ``:``-separated groups
        of ``,``-separated items, each group stored as a set.

    Raises:
        ValueError: if the number of flags in the first line is wrong
    """
    with open(infile, "r", encoding="utf8") as fin:
        flags = [token == 'T' for token in fin.readline().strip().split()]
        if len(flags) != len(Trie.ATTRS) + 1:
            raise ValueError("Wrong file format")
        nodes_number, root = map(int, fin.readline().strip().split())
        alphabet = fin.readline().strip().split()
        trie = Trie(alphabet)
        # flags has one more entry than ATTRS, so zip covers every attribute
        for attr, flag in zip(Trie.ATTRS, flags):
            setattr(trie, attr, flag)
        read_data = flags[-1]
        final = [False] * nodes_number
        if trie.dict_storage:
            graph = [defaultdict(lambda: -1) for _ in range(nodes_number)]
        elif trie.is_numpied:
            final = np.array(final)
            graph = np.full(shape=(nodes_number, len(alphabet)),
                            fill_value=Trie.NO_NODE, dtype=int)
        else:
            graph = [[Trie.NO_NODE] * len(alphabet) for _ in range(nodes_number)]
        for node in range(nodes_number):
            row = fin.readline().strip()
            if "\t" not in row:
                # a node without outgoing edges: the line is just the label
                final[node] = (row == "T")
                continue
            label, transitions = row.split("\t")
            final[node] = (label == "T")
            for pair in transitions.split():
                code, value = pair.split(":")
                graph[node][int(code)] = int(value)
        trie.graph = graph
        trie.root = root
        trie.final = final
        trie.nodes_number = nodes_number
        trie.data = [None] * nodes_number
        if read_data:
            for node in range(nodes_number):
                # only the newline is stripped here, other whitespace is kept
                groups = fin.readline().strip("\n").split(":")
                trie.data[node] = [set(group.split(",")) for group in groups]
        if trie.to_make_cashed:
            trie.make_cashed()
        return trie


def make_trie(alphabet, words, compressed=True, is_numpied=False,
              make_cashed=False, precompute_symbols=False,
              allow_spaces=False, dict_storage=False):
    """
    Build a trie over ``alphabet`` from ``words`` and optionally minimize it.

    The plain trie is created and fitted first; when ``compressed`` is true it
    is then passed through ``TrieMinimizer.minimize`` and the minimized trie is
    returned instead. ``allow_spaces`` is forwarded to the minimizer only.
    """
    plain_trie = Trie(alphabet, is_numpied=is_numpied, to_make_cashed=make_cashed,
                      precompute_symbols=precompute_symbols, dict_storage=dict_storage)
    plain_trie.fit(words)
    if not compressed:
        return plain_trie
    return TrieMinimizer().minimize(
        plain_trie, dict_storage=dict_storage, make_cashed=make_cashed,
        make_numpied=is_numpied, precompute_symbols=precompute_symbols,
        allow_spaces=allow_spaces)


def precompute_future_symbols(trie, n, allow_spaces=False):
    """
    Collect, for every node, the symbols that can occur 1..n steps ahead.

    After the call ``trie.data[index][d]`` is the set of symbols that may be
    read as the ``(d + 1)``-th symbol when starting from node ``index``. With
    ``allow_spaces`` a final node may also emit a space and continue from the root.
    Nothing is done when ``n == 0`` or when the trie reports that the symbols
    have already been precomputed.
    """
    if n == 0:
        return
    if trie.is_terminated and trie.precompute_symbols:
        # the symbols have already been precomputed
        return
    for index in range(len(trie.final)):
        trie.data[index] = [set() for _ in range(n)]
    # layer 0: labels of the outgoing edges
    for index, is_final in enumerate(trie.final):
        first_layer = set(trie._get_letters(index))
        if allow_spaces and is_final:
            first_layer.add(" ")
        trie.data[index][0] = first_layer
    # layer d is the union of layer d - 1 over the children
    for depth in range(1, n):
        for index, is_final in enumerate(trie.final):
            layer = trie.data[index][depth]
            for child in set(trie._get_children(index)):
                layer |= trie.data[child][depth - 1]
            # the case when returning to the start state via a space is allowed
            if allow_spaces and is_final:
                layer |= trie.data[trie.root][depth - 1]
    # NOTE(review): this sets ``terminated`` while the guard above reads
    # ``is_terminated`` -- confirm which attribute is intended
    trie.terminated = True


================================================
FILE: deeppavlov/models/tokenizers/__init__.py
================================================


================================================
FILE: deeppavlov/models/tokenizers/lazy_tokenizer.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger

from nltk import word_tokenize

from deeppavlov.core.common.registry import register

log = getLogger(__name__)


@register('lazy_tokenizer')
def lazy_tokenizer(batch):
    """Tokenize the batch with ``word_tokenize`` only if it holds strings.

    Only the first element is inspected; an empty batch, or a batch whose first
    element is not a string, is returned unchanged.
    """
    needs_tokenizing = len(batch) > 0 and isinstance(batch[0], str)
    if not needs_tokenizing:
        return batch
    return [word_tokenize(utterance) for utterance in batch]


================================================
FILE: deeppavlov/models/tokenizers/nltk_moses_tokenizer.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union, List

from sacremoses import MosesDetokenizer, MosesTokenizer

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register("nltk_moses_tokenizer")
class NLTKMosesTokenizer(Component):
    """Class for splitting texts on tokens (and joining them back) with the Moses
    tokenizer and detokenizer from ``sacremoses``

    Attributes:
        escape: whether escape characters for use in html markup
        tokenizer: ``sacremoses.MosesTokenizer`` instance
        detokenizer: ``sacremoses.MosesDetokenizer`` instance

    Args:
        escape: whether escape characters for use in html markup
    """

    def __init__(self, escape: bool = False, *args, **kwargs):
        self.escape = escape
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()

    def __call__(self, batch: List[Union[str, List[str]]]) -> List[Union[List[str], str]]:
        """Tokenize given batch of strings or detokenize given batch of lists of tokens

        The direction is chosen by the type of the first element of the batch.

        Args:
            batch: list of text samples or list of lists of tokens

        Returns:
            list of lists of tokens or list of text samples; an empty list for
            an empty batch
        """
        if len(batch) == 0:
            # there is no first element to inspect; ``batch[0]`` would raise IndexError
            return []
        if isinstance(batch[0], str):
            return [self.tokenizer.tokenize(line, escape=self.escape) for line in batch]
        else:
            return [self.detokenizer.detokenize(line, return_str=True, unescape=self.escape)
                    for line in batch]


================================================
FILE: deeppavlov/models/tokenizers/nltk_tokenizer.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

import nltk

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register("nltk_tokenizer")
class NLTKTokenizer(Component):
    """Splits texts into tokens with a function taken from ``nltk.tokenize``

    Args:
        tokenizer: name of the tokenizing function to look up in ``nltk.tokenize``
        download: whether to call ``nltk.download()`` before the lookup

    Attributes:
        tokenizer: the callable found in ``nltk.tokenize`` under the given name
    """

    def __init__(self, tokenizer: str = "wordpunct_tokenize", download: bool = False,
                 *args, **kwargs):
        if download:
            nltk.download()
        tokenize_fn = getattr(nltk.tokenize, tokenizer, None)
        self.tokenizer = tokenize_fn
        if not callable(tokenize_fn):
            raise AttributeError("Tokenizer {} is not defined in nltk.tokenizer".format(tokenizer))

    def __call__(self, batch: List[str]) -> List[List[str]]:
        """Apply the tokenizer to every sample of the batch

        Args:
            batch: list of text samples

        Returns:
            list of lists of tokens
        """
        tokenized = []
        for sample in batch:
            tokenized.append(self.tokenizer(sample))
        return tokenized


================================================
FILE: deeppavlov/models/tokenizers/spacy_tokenizer.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import List, Generator, Any, Optional, Union, Tuple, Iterable

import spacy
import spacy.language

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.models.tokenizers.utils import detokenize, ngramize

logger = getLogger(__name__)


# TODO: make proper handling through spacy.cli.download in the stage of python -m deeppavlov download
def _try_load_spacy_model(model_name: str, disable: Iterable[str] = ()):
    """Load a spaCy model by name, falling back to importing it as a module.

    ``spacy.load`` is tried first. If it raises ``OSError``, the module called
    ``model_name`` is imported and its ``load`` is used instead. If the fallback
    fails for any reason, or does not return a ``spacy.language.Language``, the
    original ``OSError`` is re-raised.
    """
    disabled_components = set(disable)
    try:
        return spacy.load(model_name, disable=disabled_components)
    except OSError as load_error:
        try:
            candidate = __import__(model_name).load(disable=disabled_components)
            if not isinstance(candidate, spacy.language.Language):
                raise RuntimeError(f'{model_name} is not a spacy model module')
        except Exception:
            # report the original loading problem rather than the fallback's one
            raise load_error
        return candidate


@register('stream_spacy_tokenizer')
class StreamSpacyTokenizer(Component):
    """Tokenize or lemmatize a list of documents. Default spacy model is **en_core_web_sm**.
    Return a list of tokens or lemmas for a whole document.
    If called on ``List[List[str]]``, performs the detokenizing procedure instead.

    Args:
        disable: spacy pipeline elements to disable; ``['parser', 'ner']`` are disabled
         if nothing is passed
        filter_stopwords: whether to ignore stopwords during tokenizing/lemmatizing and ngrams creation
        batch_size: a batch size for spaCy buffering
        ngram_range: size of ngrams to create; only unigrams are returned by default
        lemmas: whether to perform lemmatizing or not
        lowercase: whether to perform lowercasing or not; is performed by default by :meth:`_tokenize`
         and :meth:`_lemmatize` methods
        alphas_only: whether to filter out non-alpha tokens; is performed by default by
         :meth:`_filter` method
        spacy_model: a string name of spacy model to use; DeepPavlov searches for this name in
         downloaded spacy models; default model is **en_core_web_sm**, it downloads automatically
         during DeepPavlov installation


    Attributes:
        stopwords: a collection of stopwords that should be ignored during tokenizing/lemmatizing
         and ngrams creation
        model: a loaded spacy model
        batch_size: a batch size for spaCy buffering
        ngram_range: size of ngrams to create; only unigrams are returned by default
        lemmas: whether to perform lemmatizing or not
        lowercase: whether to perform lowercasing or not; is performed by default by :meth:`_tokenize`
         and :meth:`_lemmatize` methods
        alphas_only: whether to filter out non-alpha tokens; is performed by default by :meth:`_filter`
         method

    """

    def __init__(self, disable: Optional[Iterable[str]] = None, filter_stopwords: bool = False,
                 batch_size: Optional[int] = None, ngram_range: Optional[List[int]] = None,
                 lemmas: bool = False, lowercase: Optional[bool] = None, alphas_only: Optional[bool] = None,
                 spacy_model: str = 'en_core_web_sm', **kwargs):

        if disable is None:
            disable = ['parser', 'ner']
        if ngram_range is None:
            ngram_range = [1, 1]
        self.model = _try_load_spacy_model(spacy_model, disable=disable)
        self.stopwords = self.model.Defaults.stop_words if filter_stopwords else set()
        self.batch_size = batch_size
        self.ngram_range = tuple(ngram_range)  # cast JSON array to tuple
        self.lemmas = lemmas
        self.lowercase = lowercase
        self.alphas_only = alphas_only

    def __call__(self, batch: Union[List[str], List[List[str]]]) -> Union[List[List[str]], List[str]]:
        """Tokenize or detokenize strings, depends on the type structure of passed arguments.

        Args:
            batch: a batch of documents to perform tokenizing/lemmatizing;
             or a batch of lists of tokens/lemmas to perform detokenizing

        Returns:
            a batch of lists of tokens/lemmas; or a batch of detokenized strings;
            an empty list for an empty batch

        Raises:
            TypeError: If the first element of ``batch`` is neither List, nor str.

        """
        if len(batch) == 0:
            # there is no first element to dispatch on; ``batch[0]`` would raise IndexError
            return []
        if isinstance(batch[0], str):
            if self.lemmas:
                return list(self._lemmatize(batch))
            else:
                return list(self._tokenize(batch))
        if isinstance(batch[0], list):
            return [detokenize(doc) for doc in batch]
        raise TypeError(
            "StreamSpacyTokenizer.__call__() is not implemented for `{}`".format(type(batch[0])))

    def _tokenize(self, data: List[str], ngram_range: Optional[Tuple[int, int]] = None, batch_size: int = 10000,
                  lowercase: bool = True) -> Generator[List[str], Any, None]:
        """Tokenize a list of documents.

        Instance-level ``batch_size`` and ``lowercase`` settings, when set, take
        precedence over the arguments; a passed ``ngram_range`` takes precedence
        over the instance-level one.

        Args:
            data: a list of documents to tokenize
            ngram_range: size of ngrams to create; only unigrams are returned by default
            batch_size: a batch size for spaCy buffering
            lowercase: whether to perform lowercasing or not; is performed by default by
                :meth:`_tokenize` and :meth:`_lemmatize` methods

        Yields:
            a list of ngramized tokens for every document

        Returns:
            None

        """
        _batch_size = self.batch_size or batch_size
        _ngram_range = ngram_range or self.ngram_range

        if self.lowercase is None:
            _lowercase = lowercase
        else:
            _lowercase = self.lowercase

        for i, doc in enumerate(
                self.model.tokenizer.pipe(data, batch_size=_batch_size)):
            if _lowercase:
                tokens = [t.lower_ for t in doc]
            else:
                tokens = [t.text for t in doc]
            filtered = self._filter(tokens)
            # ngramize keeps only the ngrams that occur in the source document
            processed_doc = ngramize(filtered, ngram_range=_ngram_range, doc=data[i])
            yield from processed_doc

    def _lemmatize(self, data: List[str], ngram_range: Optional[Tuple[int, int]] = None, batch_size: int = 10000,
                   lowercase: bool = True) -> Generator[List[str], Any, None]:
        """Lemmatize a list of documents.

        Instance-level ``batch_size`` and ``lowercase`` settings, when set, take
        precedence over the arguments; a passed ``ngram_range`` takes precedence
        over the instance-level one.

        Args:
            data: a list of documents to lemmatize
            ngram_range: size of ngrams to create; only unigrams are returned by default
            batch_size: a batch size for spaCy buffering
            lowercase: whether to lowercase the lemmas or not

        Yields:
            a list of ngramized lemmas for every document

        Returns:
            None

        """
        _batch_size = self.batch_size or batch_size
        _ngram_range = ngram_range or self.ngram_range

        if self.lowercase is None:
            _lowercase = lowercase
        else:
            _lowercase = self.lowercase

        for i, doc in enumerate(
                self.model.pipe(data, batch_size=_batch_size)):
            lemmas = [t.lemma_ for t in doc]
            if _lowercase:
                lemmas = [t.lower() for t in lemmas]
            # ngrams are checked against the lemmatized text, not the raw document
            lemm_doc = " ".join(lemmas)
            filtered = self._filter(lemmas)
            processed_doc = ngramize(filtered, ngram_range=_ngram_range, doc=lemm_doc)
            yield from processed_doc

    def _filter(self, items: List[str], alphas_only: bool = True) -> List[str]:
        """Filter a list of tokens/lemmas.

        Whitespace-only items and stopwords are always dropped. The instance-level
        ``alphas_only`` setting, when set, takes precedence over the argument.

        Args:
            items: a list of tokens/lemmas to filter
            alphas_only: whether to filter out non-alpha tokens

        Returns:
            a list of filtered tokens/lemmas

        """
        if self.alphas_only is None:
            _alphas_only = alphas_only
        else:
            _alphas_only = self.alphas_only

        stopwords = self.stopwords
        if _alphas_only:
            return [x for x in items
                    if x.isalpha() and not x.isspace() and x not in stopwords]
        return [x for x in items if not x.isspace() and x not in stopwords]

    def set_stopwords(self, stopwords: List[str]) -> None:
        """Redefine a list of stopwords.

        Args:
            stopwords: a list of stopwords

        Returns:
            None

        """
        self.stopwords = stopwords


================================================
FILE: deeppavlov/models/tokenizers/split_tokenizer.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register("split_tokenizer")
class SplitTokenizer(Component):
    """
    Splits every utterance into tokens with plain ``str.split()``.

    Takes no parameters.
    """

    def __init__(self, **kwargs) -> None:
        pass

    def __call__(self, batch: List[str]) -> List[List[str]]:
        """
        Split every text of the batch on whitespace

        Args:
            batch: list (or tuple) of texts to tokenize

        Returns:
            tokenized batch

        Raises:
            NotImplementedError: if ``batch`` is neither a list nor a tuple
        """
        if not isinstance(batch, (list, tuple)):
            raise NotImplementedError('not implemented for types other than'
                                      ' list or tuple')
        return [text.split() for text in batch]


================================================
FILE: deeppavlov/models/tokenizers/utils.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import List, Generator, Any


def detokenize(tokens):
    """
    Join tokens back into a text, undoing the tokenizing operation: punctuation
    and spaces are restored to the places that people expect them to be.
    Ideally, `detokenize(tokenize(text))` should be identical to `text`,
    except for line breaks.
    """
    text = ' '.join(tokens)
    # ellipsis, quotes and brackets
    for old, new in (('. . .', '...'),
                     ("`` ", '"'),
                     (" ''", '"'),
                     (" ( ", " ("),
                     (" ) ", ") ")):
        text = text.replace(old, new)
    # attach punctuation to the preceding word, inside the text and at its end
    text = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", text)
    text = re.sub(r' ([.,:;?!%]+)$', r"\1", text)
    # apostrophes, contractions and the remaining backtick
    for old, new in ((" '", "'"),
                     (" n't", "n't"),
                     (" nt", "nt"),
                     ("can not", "cannot"),
                     (" ` ", " '")):
        text = text.replace(old, new)
    return text.strip()


def ngramize(items: List[str], ngram_range=(1, 1), doc: str = None) -> Generator[List[str], Any, None]:
    """
    Build ngrams from a list of tokens/lemmas
    :param items: list of tokens, lemmas or other strings to form ngrams
    :param ngram_range: inclusive range of ngram sizes, e.g. (1, 2) gives unigrams + bigrams
    and (2, 2) gives bigrams only
    :param doc: optional text; when given, only the ngrams that occur in it or in its
    lowercased version are kept
    :return: generator yielding a single list with all ngrams (as strings), ordered by size
    """

    min_size, max_size = ngram_range[0], ngram_range[1]
    grams = []
    for size in range(min_size, max_size + 1):
        # zipping ``size`` progressively shifted copies yields all windows of that size
        shifted = [items[offset:] for offset in range(size)]
        grams.extend(' '.join(window) for window in zip(*shifted))

    if doc is not None:
        lowered = doc.lower()
        grams = [gram for gram in grams if gram in doc or gram in lowered]

    yield grams


================================================
FILE: deeppavlov/models/torch_bert/__init__.py
================================================


================================================
FILE: deeppavlov/models/torch_bert/crf.py
================================================
import numpy as np
import torch
from torch import nn
from torchcrf import CRF as CRFbase


class CRF(CRFbase):
    """Conditional Random Field from the PyTorch-CRF library whose ``forward``
    is replaced: instead of computing a likelihood it counts observed tag
    bigrams and writes the transition matrix from these counts.
    """

    def __init__(self, num_tags: int, batch_first: bool = False) -> None:
        super().__init__(num_tags=num_tags, batch_first=batch_first)
        for parameter in (self.transitions, self.start_transitions, self.end_transitions):
            nn.init.zeros_(parameter)
        square = (num_tags, num_tags)
        # how many times tag j was seen right after tag i
        self.stats = torch.zeros(square, dtype=torch.float)
        # constant matrices the transition scores are chosen from
        self.zeros = torch.zeros(square, dtype=torch.float)
        self.neg = torch.full(square, -1000.0)

    def forward(self, tags_batch: torch.LongTensor, y_masks: np.ndarray):
        """Accumulate tag-bigram counts and rebuild ``self.transitions``.

        The length of every sequence is the sum of its mask row. Transitions
        seen at least once so far get score 0, all the others get -1000.
        """
        lengths = np.sum(y_masks, axis=1)
        for length, tags in zip(lengths, tags_batch):
            if not length > 1:
                continue
            for position in range(length - 1):
                prev_tag = int(tags[position])
                next_tag = int(tags[position + 1])
                self.stats[prev_tag][next_tag] += 1.0
        with torch.no_grad():
            scores = torch.where(self.stats > 0, self.zeros, self.neg)
            self.transitions.copy_(scores)


================================================
FILE: deeppavlov/models/torch_bert/multitask_transformer.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections.abc import Iterable
from logging import getLogger
from pathlib import Path
from typing import Dict, Optional

import numpy as np
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss
from transformers import AutoConfig, AutoModel

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.torch_model import TorchModel
from deeppavlov.models.torch_bert.torch_transformers_sequence_tagger import token_from_subtoken, \
    token_labels_to_subtoken_labels

log = getLogger(__name__)


class FocalLoss(nn.Module):
    """Non weighted version of Focal Loss

    The per-element base loss (cross-entropy or BCE-with-logits) is scaled by
    ``alpha_t * (1 - p_t) ** gamma`` with ``p_t = exp(-loss)``, and the mean is returned.

    Args:
        alpha: factor applied to elements whose target is 0; elements whose
            target is 1 get ``1 - alpha``
        gamma: focusing exponent
        categorical_loss: use ``CrossEntropyLoss`` as the base loss instead of
            ``BCEWithLogitsLoss``
        weight: ``weight`` argument passed to the base loss
    """

    def __init__(self, alpha=.5, gamma=2, categorical_loss=False, weight=None):
        super(FocalLoss, self).__init__()
        # Created on the CPU and moved to the device of the loss on every call,
        # so the module can also be used on hosts without CUDA.
        self.alpha = torch.tensor([alpha, 1 - alpha])
        self.gamma = gamma
        self.categorical = categorical_loss
        self.weight = weight

    def forward(self, inputs, targets):
        """Return the mean focal loss of ``inputs`` (logits) against ``targets``.

        ``targets`` are cast to integers and used as indices into the two-element
        ``alpha`` tensor.
        """
        if self.categorical:
            loss = CrossEntropyLoss(weight=self.weight, reduction='none')(inputs, targets)
        else:
            loss = BCEWithLogitsLoss(weight=self.weight, reduction='none')(inputs, targets)
        targets = targets.type(torch.long)
        alpha = self.alpha.to(loss.device)
        # keep the shape of the targets so that the factor lines up with the loss
        # element by element instead of being broadcast as a flat vector
        at = alpha.gather(0, targets.reshape(-1)).reshape(targets.shape)
        pt = torch.exp(-loss)
        F_loss = at * (1 - pt) ** self.gamma * loss
        return F_loss.mean()


def SoftCrossEntropyLoss(inputs, targets):
    """Cross-entropy between the logits ``inputs`` and soft ``targets``.

    Log-softmax is taken over dimension 1; the summed loss is divided by the
    size of dimension 0 of ``inputs``.
    """
    batch_size = inputs.shape[0]
    log_probs = torch.nn.functional.log_softmax(inputs, dim=1)
    weighted = targets * log_probs
    return -weighted.sum() / batch_size


def we_transform_input(name):
    """Tell whether ``name`` is one of the task types 'sequence_labeling' or 'multiple_choice'."""
    transformed_task_types = ('sequence_labeling', 'multiple_choice')
    return name in transformed_task_types


class BertForMultiTask(nn.Module):
    """
    Transformer backbone shared by several task-specific heads.

    Supported task types are ``classification``, ``regression``, ``multiple_choice``,
    ``binary_head`` and ``sequence_labeling``. Every task owns a linear layer in
    ``self.bert.final_classifier``; all task types except ``sequence_labeling`` additionally
    pass the selected token representation through a shared linear layer, ``Tanh`` and dropout.

    Args:
        tasks_num_classes: number of classes for every task. ``multiple_choice``, ``regression``
            and ``binary_head`` heads always have a single output unit.
        multilabel: for every task, whether it is trained with ``BCEWithLogitsLoss`` over all classes.
        task_types: type of every task (see the list above).
        weights: for every task, loss weights (a number or a sequence of numbers) or None.
        backbone_model: name or path passed to ``AutoConfig`` / ``AutoModel``.
        dropout: dropout probability. If None, the first attribute found in the backbone config among
            ``hidden_dropout_prob``, ``seq_classif_dropout`` and ``dropout`` is used, otherwise 0.
        new_model: selects the token representation fed to the heads. Falsy - the first token;
            ``2`` - the token at position ``task_id``; any other truthy value - the first token
            concatenated with the token at position ``task_id + 1`` (doubles the head input size).
        focal: use ``FocalLoss`` for non-multilabel ``classification`` and for ``binary_head`` tasks.
        max_seq_len: stored on the instance; not used by this class itself.
        model_takes_token_type_ids: whether ``token_type_ids`` are passed to the backbone.
    """

    def __init__(self, tasks_num_classes, multilabel, task_types,
                 weights, backbone_model='bert_base_uncased',
                 dropout=None, new_model=False, focal=False,
                 max_seq_len=320, model_takes_token_type_ids=True):

        super(BertForMultiTask, self).__init__()
        config = AutoConfig.from_pretrained(backbone_model, output_hidden_states=True, output_attentions=True)
        self.bert = AutoModel.from_pretrained(pretrained_model_name_or_path=backbone_model,
                                              config=config)
        self.classes = tasks_num_classes  # classes for every task
        self.weights = weights
        self.multilabel = multilabel
        self.new_model = new_model
        self.model_takes_token_type_ids = model_takes_token_type_ids
        if dropout is None:
            # Different backbone families keep the dropout probability under different names
            dropout = 0
            for attribute in ('hidden_dropout_prob', 'seq_classif_dropout', 'dropout'):
                if hasattr(config, attribute):
                    dropout = getattr(config, attribute)
                    break
        self.dropout = nn.Dropout(dropout)
        self.max_seq_len = max_seq_len
        self.activation = nn.Tanh()
        self.task_types = task_types
        self.focal = focal
        out_dim = config.hidden_size
        if self.new_model and self.new_model != 2:
            # Two token representations are concatenated in ``get_logits``
            out_dim = out_dim * 2
        single_output_types = ('multiple_choice', 'regression', 'binary_head')
        # Heads and the pooling layer are attached to the backbone module, so their
        # parameter names start with ``bert.``
        self.bert.final_classifier = nn.ModuleList(
            [
                nn.Linear(out_dim, 1 if self.task_types[i] in single_output_types else num_labels)
                for i, num_labels in enumerate(self.classes)
            ]
        )
        if self.new_model:
            self.bert.pooling_layer = nn.Linear(out_dim, out_dim)
        else:
            self.bert.pooler = nn.Linear(out_dim, out_dim)

    def get_logits(self, task_id, input_ids, attention_mask, token_type_ids):
        """Run the backbone and select the hidden representation used by the task head.

        Despite its name the method returns hidden states, not logits.

        Args:
            task_id: index of the task.
            input_ids: token ids.
            attention_mask: attention mask.
            token_type_ids: token type ids or None.

        Returns:
            the whole last hidden state for ``sequence_labeling`` tasks, otherwise the token
            representation selected according to ``new_model``.
        """
        name = self.task_types[task_id]
        if we_transform_input(name):
            input_ids = input_ids.view(-1, input_ids.size(-1))
            attention_mask = attention_mask.view(-1, attention_mask.size(-1))
            if token_type_ids is not None:
                token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
        if token_type_ids is None or not self.model_takes_token_type_ids:
            outputs = self.bert(input_ids=input_ids.long(),
                                attention_mask=attention_mask.long())
        else:
            try:
                outputs = self.bert(input_ids=input_ids.long(),
                                    token_type_ids=token_type_ids.long(),
                                    attention_mask=attention_mask.long())
            except TypeError as e:
                if "forward() got an unexpected keyword argument 'token_type_ids'" not in str(e):
                    raise
                # The backbone has no token type embeddings: retry without them
                # and remember that for the following calls.
                outputs = self.bert(input_ids=input_ids.long(),
                                    attention_mask=attention_mask.long())
                self.model_takes_token_type_ids = False
        hidden = outputs.last_hidden_state
        if name == 'sequence_labeling':
            return hidden
        if self.new_model == 2:
            return hidden[:, task_id]
        if self.new_model:
            return torch.cat([hidden[:, 0], hidden[:, task_id + 1]], dim=1)
        return hidden[:, 0]

    def _pool(self, hidden):
        """Apply the pooling projection that was created in ``__init__``."""
        layer = self.bert.pooling_layer if self.new_model else self.bert.pooler
        return layer(hidden)

    def _loss_weight(self, task_id, device):
        """Return the loss weights of the task as a flat float tensor on ``device``, or None."""
        weight = self.weights[task_id]
        if weight is None:
            return None
        # Flattened because ``CrossEntropyLoss`` only accepts one-dimensional weights
        return torch.as_tensor(weight, dtype=torch.float32).reshape(-1).to(device)

    def predict_on_top(self, task_id, last_hidden_state, labels=None):
        """Apply the head of the task to the output of ``get_logits``.

        Args:
            task_id: index of the task.
            last_hidden_state: representation returned by ``get_logits`` for this task.
            labels: targets of the task; if given, the loss is computed as well.

        Returns:
            ``logits`` if ``labels`` is None, otherwise the tuple ``(loss, logits)``.

        Raises:
            Exception: if the task type is not supported or the numbers of ``multiple_choice``
                logits and labels differ.
        """
        name = self.task_types[task_id]
        if name == 'sequence_labeling':
            # last hidden state holds the representations of all tokens
            logits = self.bert.final_classifier[task_id](self.dropout(last_hidden_state))
            if labels is None:
                return logits
            active_logits = logits.view(-1, self.classes[task_id])
            if self.multilabel[task_id]:
                loss = BCEWithLogitsLoss()(active_logits, labels)
            else:
                loss = CrossEntropyLoss()(active_logits, labels.view(-1))
            return loss, logits
        if name in ['classification', 'regression', 'multiple_choice']:
            # last hidden state is the representation of the selected token(s)
            pooled_output = self.dropout(self.activation(self._pool(last_hidden_state)))
            logits = self.bert.final_classifier[task_id](pooled_output)
            if name == 'multiple_choice':
                # One logit per option: regroup the options of every sample into a row
                logits = logits.view((-1, self.classes[task_id]))
                if labels is not None:
                    l1, l2 = len(logits), len(labels)
                    if l1 != l2:
                        raise Exception(f'Len of logits {l1} and labels {l2} not match')
            if labels is None:
                return logits
            if name == 'regression':
                loss = MSELoss()(logits, labels.unsqueeze(1))
            elif self.multilabel[task_id]:
                loss = BCEWithLogitsLoss()(logits, labels)
            else:
                weight = self._loss_weight(task_id, logits.device)
                if self.focal:
                    # NOTE(review): FocalLoss is created with its default (binary) base loss
                    # while the labels are flattened class ids - confirm whether
                    # ``categorical_loss=True`` was intended here.
                    loss_fct = FocalLoss(weight=weight)
                else:
                    loss_fct = CrossEntropyLoss(weight=weight)
                loss = loss_fct(logits, labels.view(-1))
            return loss, logits
        if name == 'binary_head':
            # Use whichever pooling layer ``__init__`` created: with ``new_model`` set
            # ``self.bert.pooler`` is not the projection trained for the heads.
            pooled_output = self._pool(self.dropout(last_hidden_state))
            pooled_output = self.dropout(self.activation(pooled_output))
            logits = self.bert.final_classifier[task_id](pooled_output)
            if labels is None:
                return logits
            weight = self._loss_weight(task_id, logits.device)
            if self.focal:
                loss_fct = FocalLoss(weight=weight)
            else:
                loss_fct = BCEWithLogitsLoss(weight=weight)
            if len(labels.shape) == 1 and len(logits.shape) == 2:
                labels = labels.unsqueeze(1)
            loss = loss_fct(logits, labels)
            return loss, logits
        raise Exception(f'Unsupported name {name}')

    def forward(self, task_id, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the backbone and the head of the task; see ``predict_on_top`` for the return value."""
        last_hidden_state = self.get_logits(task_id, input_ids, attention_mask, token_type_ids)
        return self.predict_on_top(task_id, last_hidden_state, labels)


@register('multitask_transformer')
class MultiTaskTransformer(TorchModel):
    """
    Multi-Task transformer-agnostic model

    Args:
        tasks: dict mapping every task name to the dict of its parameters. Recognised parameters:
            ``type`` (required; task type understood by ``BertForMultiTask``),
            ``options`` (number of classes, default 1), ``weight`` (loss weights, default None),
            ``multilabel`` (default False) and ``type_to_cache`` (tasks with the same value other
            than -1 reuse the backbone output computed within one ``__call__``; default -1).
        max_seq_len(int): maximum length of the input token sequence.
        gradient_accumulation_steps(default:1): number of gradient accumulation steps,
        steps_per_epoch(int): number of steps taken per epoch. Specify if gradient_accumulation_steps > 1
        backbone_model(str): name of HuggingFace.Transformers backbone model. Default: 'bert-base-cased'
        focal(default: False): passed to ``BertForMultiTask`` as ``focal``,
        return_probas(default: False): set true to return prediction probabilities,
        freeze_embeddings(default: False): set true to freeze, after loading, every backbone parameter
            whose name contains neither ``final_classifier`` nor ``pool``
        new_model(default: False): passed to ``BertForMultiTask`` as ``new_model``,
        dropout(default: None): dropout for the final model layer.
            If not set, defaults to the parameter hidden_dropout_prob of original model
        binary_threshold(default: 0.5): probability threshold used to turn ``binary_head`` and
            multilabel probabilities into labels when ``return_probas`` is False
        seed(default:42): Torch manual_random_seed
    """

    def __init__(
            self,
            tasks: Dict[str, Dict],
            max_seq_len: int = 320,
            gradient_accumulation_steps: Optional[int] = 1,
            steps_per_epoch: Optional[int] = None,
            backbone_model: str = "bert-base-cased",
            focal: bool = False,
            return_probas: bool = False,
            freeze_embeddings: bool = False,
            new_model=False,
            dropout: Optional[float] = None,
            binary_threshold: float = 0.5,
            seed: int = 42,
            *args,
            **kwargs,
    ) -> None:
        self.return_probas = return_probas
        self.max_seq_len = max_seq_len
        self.task_names = []
        self.task_types = []
        self.tasks_num_classes = []
        self.multilabel = []
        self.types_to_cache = []
        weights = []
        for task_name, task_params in tasks.items():
            self.task_names.append(task_name)
            self.tasks_num_classes.append(task_params.get('options', 1))
            weights.append(task_params.get('weight', None))
            self.task_types.append(task_params['type'])
            self.multilabel.append(task_params.get('multilabel', False))
            self.types_to_cache.append(task_params.get('type_to_cache', -1))
        if self.return_probas and 'sequence_labeling' in self.task_types:
            log.warning('Return_probas for sequence_labeling not supported yet. Returning ids for this task')
        self.n_tasks = len(tasks)
        self.train_losses = [[] for _ in self.task_names]
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.steps_per_epoch = steps_per_epoch
        self.steps_taken = 0
        self.prev_id = None
        self.printed = False
        self.freeze_embeddings = freeze_embeddings
        self.binary_threshold = binary_threshold
        self._reset_cache()
        torch.manual_seed(seed)

        model = BertForMultiTask(
            backbone_model=backbone_model,
            tasks_num_classes=self.tasks_num_classes,
            weights=weights,
            multilabel=self.multilabel,
            task_types=self.task_types,
            new_model=new_model,
            focal=focal,
            dropout=dropout)

        super().__init__(model, **kwargs)

    def _reset_cache(self):
        """Forget the backbone outputs cached for every ``type_to_cache`` key."""
        self.preds_cache = {index_: None for index_ in self.types_to_cache if index_ != -1}

    def load(self, fname: Optional[str] = None, *args, **kwargs) -> None:
        """
        Loads weights.

        If ``freeze_embeddings`` is set, gradients are then disabled for every parameter of the
        backbone whose name contains neither ``final_classifier`` nor ``pool``.
        """
        super().load(fname)
        if self.freeze_embeddings:
            for n, p in self.model.bert.named_parameters():
                if not ('final_classifier' in n or 'pool' in n):
                    p.requires_grad = False

    def _make_input(self, task_features, task_id, labels=None):
        """Build keyword arguments of the model from the features (and labels) of one task.

        Args:
            task_features: features of the task holding ``input_ids``, ``attention_mask`` and,
                optionally, ``token_type_ids`` as keys or attributes; a one-element list is unwrapped.
            task_id: index of the task.
            labels: labels of the batch; ignored if every label is None.

        Returns:
            tuple of the input dict (tensors moved to ``self.device``, absent elements set to None)
            and the size of the first dimension of the last feature tensor found.

        Raises:
            Exception: if the numbers of labels and of inputs differ (not checked for ``multiple_choice``).
        """
        batch_input_size = None
        if len(task_features) == 1 and isinstance(task_features, list):
            task_features = task_features[0]

        if isinstance(labels, Iterable) and all(k is None for k in labels):
            labels = None
        task_type = self.task_types[task_id]
        _input = {}
        element_list = ["input_ids", "attention_mask", "token_type_ids"]
        for elem in element_list:
            if elem in task_features:
                _input[elem] = task_features[elem]
                batch_input_size = _input[elem].shape[0]
            elif hasattr(task_features, elem):
                _input[elem] = getattr(task_features, elem)
                batch_input_size = _input[elem].shape[0]
            if elem in _input and we_transform_input(task_type):
                _input[elem] = _input[elem].view((-1, _input[elem].size(-1)))

        if labels is not None:
            if task_type in ["regression", "binary_head"]:
                _input["labels"] = torch.tensor(
                    np.array(labels, dtype=float), dtype=torch.float32
                )
            elif task_type == 'multiple_choice':
                _input['labels'] = torch.Tensor(labels).long()
            elif task_type == 'sequence_labeling':
                # token_type_ids are passed to the helper as its token mask argument
                subtoken_labels = [token_labels_to_subtoken_labels(y_el, y_mask, input_mask)
                                   for y_el, y_mask, input_mask in zip(labels, _input['token_type_ids'].numpy(),
                                                                       _input['attention_mask'].numpy())]
                _input['labels'] = torch.from_numpy(
                    np.array(subtoken_labels)).to(torch.int64)
            elif not self.multilabel[task_id]:
                _input["labels"] = torch.from_numpy(np.array(labels))
            else:
                # Every label is an iterable of active class indices; build multi-hot rows from them
                num_classes = self.tasks_num_classes[task_id]
                _input['labels'] = torch.zeros((len(labels), num_classes))
                for i in range(len(labels)):
                    for label_ind in labels[i]:
                        _input['labels'][i][label_ind] = 1
            element_list = element_list + ['labels']
        for elem in element_list:
            if elem not in _input:
                _input[elem] = None
            else:
                _input[elem] = _input[elem].to(self.device)
        if 'labels' in _input and task_type != 'multiple_choice':
            error_msg = f'Len of labels {len(_input["labels"])} does not match len of ids {len(_input["input_ids"])}'
            if len(_input['labels']) != len(_input['input_ids']):
                raise Exception(error_msg)
        return _input, batch_input_size

    def _logits_to_predictions(self, task_id, logits, _input):
        """Convert the logits of one task into the predictions returned by ``__call__``."""
        task_type = self.task_types[task_id]
        if task_type == 'sequence_labeling':
            y_mask = _input['token_type_ids'].cpu()
            logits = token_from_subtoken(logits.cpu(), y_mask)
            predicted_ids = torch.argmax(logits, dim=-1).int().tolist()
            seq_lengths = torch.sum(y_mask, dim=1).int().tolist()
            return [prediction[:seq_len] for seq_len, prediction in zip(seq_lengths, predicted_ids)]
        if task_type in ['regression', 'binary_head']:
            pred = logits[:, 0]
            if task_type == 'binary_head':
                pred = torch.sigmoid(logits).squeeze(1)
                if not self.return_probas:
                    pred = (pred > self.binary_threshold).int()
            return pred.cpu().numpy()
        if self.multilabel[task_id]:
            probs = torch.sigmoid(logits)
            if self.return_probas:
                return probs.cpu().numpy()
            # For every sample collect the indices of classes above the threshold
            numbers_of_sample, numbers_of_class = (probs > self.binary_threshold).nonzero(as_tuple=True)
            numbers_of_sample, numbers_of_class = numbers_of_sample.cpu().numpy(), numbers_of_class.cpu().numpy()
            pred = [[] for _ in range(len(logits))]
            for sample_num, class_num in zip(numbers_of_sample, numbers_of_class):
                pred[sample_num].append(int(class_num))
            return pred
        if self.return_probas:
            pred = torch.softmax(logits, dim=-1)
        else:
            pred = torch.argmax(logits, dim=1)
        return pred.cpu().numpy()

    def __call__(self, *args):
        """Make prediction for given features (texts).
        Args:
            features: batch of InputFeatures for all tasks
        Returns:
            predicted classes or probabilities of each class: the predictions of the only task
            if a single argument is given, otherwise a list with one element per argument
            (an empty list for tasks without samples)
        """
        # IMPROVE ARGS CHECKING AFTER DEBUG
        log.debug('Calling %s', args)
        self.validation_predictions = [None for _ in range(len(args))]
        model = self.model.module if self.is_data_parallel else self.model
        try:
            for task_id in range(len(self.task_names)):
                if len(args[task_id]) == 0:
                    continue
                _input, _ = self._make_input(task_features=args[task_id], task_id=task_id)

                # _make_input fills absent elements with None, so test the value, not the key
                if _input.get('input_ids') is None:
                    raise Exception(f'No input_ids in _input {_input}')
                cache_key = self.types_to_cache[task_id]
                with torch.no_grad():
                    if cache_key != -1 and self.preds_cache[cache_key] is not None:
                        last_hidden_state = self.preds_cache[cache_key]
                    else:
                        last_hidden_state = model.get_logits(task_id, **_input)
                        if cache_key != -1:
                            self.preds_cache[cache_key] = last_hidden_state
                    logits = model.predict_on_top(task_id, last_hidden_state)
                self.validation_predictions[task_id] = self._logits_to_predictions(task_id, logits, _input)
        finally:
            # Cached backbone outputs belong to this batch only: drop them on every exit path,
            # otherwise the next call would reuse hidden states of the previous batch.
            self._reset_cache()
        if len(args) == 1:
            return self.validation_predictions[0]
        for i in range(len(self.validation_predictions)):
            if self.validation_predictions[i] is None:
                self.validation_predictions[i] = []
        log.debug(self.validation_predictions)
        return self.validation_predictions

    def train_on_batch(self, *args):
        """Train model on given batch.
        This method calls train_op using features and y (labels).
        Args:
            features: batch of InputFeatures for every task followed by the batch of labels
                for every task; samples of exactly one task must be non-empty
        Returns:
            dict with loss for each task
        Raises:
            Exception: if the number of arguments is not ``2 * n_tasks`` or the batch does not
                hold samples of exactly one task
        """
        log.debug('Training for %s', args)
        error_msg = f'Len of arguments {len(args)} is WRONG. ' \
                    f'Correct is {2 * self.n_tasks} as n_tasks is {self.n_tasks}'
        if len(args) != 2 * self.n_tasks:
            raise Exception(error_msg)
        ids_to_iterate = [k for k in range(self.n_tasks) if len(args[k]) > 0]
        if len(ids_to_iterate) == 0:
            raise Exception(f'No examples given! Given args {args}')
        elif len(ids_to_iterate) > 1:
            raise Exception('Samples from more than 1 task in train_on_batch')
        task_id = ids_to_iterate[0]
        _input, batch_size = self._make_input(task_features=args[task_id], task_id=task_id,
                                              labels=args[task_id + self.n_tasks])
        if _input == {}:
            raise Exception('Empty input!')

        if self.prev_id is None:
            self.prev_id = task_id
        elif self.prev_id != task_id and not self.printed:
            log.info('Seen samples from different tasks')
            self.printed = True
        _input.setdefault('token_type_ids', None)
        loss, logits = self.model(task_id=task_id, **_input)
        if self.is_data_parallel:
            loss = loss.mean()
        loss = loss / self.gradient_accumulation_steps
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        if self.clip_norm:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm)

        # Step once per accumulation window and at the end of an epoch
        if (self.steps_taken + 1) % self.gradient_accumulation_steps == 0 or (
                self.steps_per_epoch is not None and (self.steps_taken + 1) % self.steps_per_epoch == 0):
            self.optimizer.step()
            self.optimizer.zero_grad()
        # Stores the latest loss of the task, already divided by gradient_accumulation_steps
        self.train_losses[task_id] = loss.item()
        self.steps_taken += 1
        log.debug('train %s %s', task_id, logits)
        return {"losses": self.train_losses}


================================================
FILE: deeppavlov/models/torch_bert/torch_bert_ranker.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from pathlib import Path
from typing import List, Dict, Union, Optional

import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoConfig
from transformers.data.processors.utils import InputFeatures

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.torch_model import TorchModel

log = getLogger(__name__)


@register('torch_bert_ranker')
class TorchBertRankerModel(TorchModel):
    """Interaction-based text ranker built on a sequence classification transformer (PyTorch).

    A classification head is trained on top of the transformer; the predicted probability of
    class 1 (or the most probable class id) is used as the score of a context-response pair.

    Args:
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        bert_config_file: path to Bert configuration file (not used if pretrained_bert is key title)
        n_classes: number of classes
        return_probas: set True if class probabilities are returned instead of the most probable label
    """

    def __init__(self, pretrained_bert: str = None,
                 bert_config_file: Optional[str] = None,
                 n_classes: int = 2,
                 return_probas: bool = True,
                 **kwargs) -> None:

        self.return_probas = return_probas

        if n_classes == 1 and self.return_probas:
            raise RuntimeError('Set return_probas to False for regression task!')

        if pretrained_bert:
            log.debug(f"From pretrained {pretrained_bert}.")
            local_checkpoint = expand_path(pretrained_bert)
            if Path(local_checkpoint).exists():
                pretrained_bert = str(local_checkpoint)
            config = AutoConfig.from_pretrained(pretrained_bert,
                                                output_attentions=False,
                                                output_hidden_states=False)

            model = AutoModelForSequenceClassification.from_pretrained(pretrained_bert, config=config)

            def resize_output_layer(layer):
                # Re-create the output projection when the checkpoint was trained
                # for another number of classes.
                hidden_size = layer.in_features
                if n_classes != model.num_labels:
                    layer.weight = torch.nn.Parameter(torch.randn(n_classes, hidden_size))
                    layer.bias = torch.nn.Parameter(torch.randn(n_classes))
                    layer.out_features = n_classes
                    model.num_labels = n_classes

            # TODO: make better exception handling here and at
            # deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel.load
            try:
                # heads that expose the output projection as ``classifier.out_proj``
                resize_output_layer(model.classifier.out_proj)
            except AttributeError:
                # heads whose ``classifier`` is the output layer itself
                resize_output_layer(model.classifier)

        elif bert_config_file and expand_path(bert_config_file).is_file():
            self.bert_config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))
            model = AutoModelForSequenceClassification.from_config(config=self.bert_config)

        else:
            raise ConfigError("No pre-trained BERT model is given.")

        super().__init__(model, **kwargs)

    def _batch_tensors(self, features):
        """Concatenate ids and attention masks of the features and move them to the model device."""
        ids = [feature.input_ids for feature in features]
        masks = [feature.attention_mask for feature in features]
        return torch.cat(ids, dim=0).to(self.device), torch.cat(masks, dim=0).to(self.device)

    def train_on_batch(self, features_li: List[List[InputFeatures]], y: Union[List[int], List[List[int]]]) -> Dict:
        """Make one optimization step on the given batch.

        Args:
            features_li: list with the single element containing the batch of InputFeatures
            y: batch of labels (class id or one-hot encoding)

        Returns:
            dict with the loss value
        """
        b_input_ids, b_input_masks = self._batch_tensors(features_li[0])
        b_labels = torch.from_numpy(np.array(y)).to(self.device)

        self.optimizer.zero_grad()

        loss, logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_masks,
                                  labels=b_labels, return_dict=False)
        self._make_step(loss)

        return {'loss': loss.item()}

    def __call__(self, features_li: List[List[InputFeatures]]) -> Union[List[int], List[List[float]]]:
        """Score contexts against their response candidates.

        Args:
            features_li: list of elements where each element contains the batch of features
             for contexts with particular response candidates

        Returns:
            predicted scores for contexts over response candidates: a single array if
            ``features_li`` has one element, otherwise an array with one column per element
        """
        if len(features_li) == 1 and len(features_li[0]) == 1:
            msg = f"It is not intended to use the {self.__class__} in the interact mode."
            log.error(msg)
            return [msg]

        predictions = []
        for candidate_features in features_li:
            b_input_ids, b_input_masks = self._batch_tensors(candidate_features)

            with torch.no_grad():
                # Forward pass; the first element of the model output holds the logits
                model_output = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_masks)
                logits = model_output[0]

            if self.return_probas:
                # probability of class 1 is the ranking score
                scores = torch.softmax(logits, dim=-1)[:, 1].detach().cpu().numpy()
            else:
                scores = np.argmax(logits.detach().cpu().numpy(), axis=1)

            predictions.append(scores)

        if len(features_li) == 1:
            return predictions[0]
        return np.hstack([np.expand_dims(el, 1) for el in predictions])


================================================
FILE: deeppavlov/models/torch_bert/torch_transformers_classifier.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from pathlib import Path
from typing import List, Dict, Union, Optional, Tuple

import numpy as np
import torch
from torch.nn import BCEWithLogitsLoss
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoModel, AutoTokenizer
from transformers.modeling_outputs import SequenceClassifierOutput

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.torch_model import TorchModel

log = getLogger(__name__)


@register('torch_transformers_classifier')
class TorchTransformersClassifierModel(TorchModel):
    """Bert-based model for text classification on PyTorch.

    It uses output from [CLS] token and predicts labels using linear transformation.

    Args:
        n_classes: number of classes (``1`` is handled as a regression task)
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        multilabel: set True if it is multi-label classification
        return_probas: set True if return class probabilities instead of most probable label needed
        attention_probs_keep_prob: keep_prob for Bert self-attention layers
            (applied only when the model is built from ``bert_config_file``)
        hidden_keep_prob: keep_prob for Bert hidden layers
            (applied only when the model is built from ``bert_config_file``)
        bert_config_file: path to Bert configuration file (not used if pretrained_bert is key title)
        is_binary: whether classification task is binary or multi-class
        num_special_tokens: number of special tokens used by classification model;
            when given, token embeddings are resized to ``len(tokenizer) + num_special_tokens``
        **kwargs: passed through to ``TorchModel``

    Raises:
        RuntimeError: if ``multilabel`` is set without ``return_probas``, or
            ``return_probas`` is set for a regression task (``n_classes == 1``)
        ConfigError: if neither ``pretrained_bert`` nor an existing ``bert_config_file`` is given,
            or ``num_special_tokens`` is set without ``pretrained_bert``
    """

    def __init__(self, n_classes,
                 pretrained_bert,
                 multilabel: bool = False,
                 return_probas: bool = False,
                 attention_probs_keep_prob: Optional[float] = None,
                 hidden_keep_prob: Optional[float] = None,
                 bert_config_file: Optional[str] = None,
                 is_binary: Optional[bool] = False,
                 num_special_tokens: Optional[int] = None,
                 **kwargs) -> None:

        self.return_probas = return_probas
        self.multilabel = multilabel
        self.n_classes = n_classes
        self.is_binary = is_binary

        if self.multilabel and not self.return_probas:
            raise RuntimeError('Set return_probas to True for multilabel classification!')

        if self.return_probas and self.n_classes == 1:
            raise RuntimeError('Set return_probas to False for regression task!')

        if pretrained_bert:
            log.debug(f"From pretrained {pretrained_bert}.")
            config = AutoConfig.from_pretrained(pretrained_bert,
                                                # num_labels=self.n_classes,
                                                output_attentions=False,
                                                output_hidden_states=False)

            if self.is_binary:
                config.add_pooling_layer = False
                model = AutoModelForBinaryClassification(pretrained_bert, config)
            else:
                model = AutoModelForSequenceClassification.from_pretrained(pretrained_bert, config=config)
                self._resize_classification_head(model)

        elif bert_config_file and Path(bert_config_file).is_file():
            bert_config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))
            if attention_probs_keep_prob is not None:
                bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
            if hidden_keep_prob is not None:
                bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob
            model = AutoModelForSequenceClassification.from_config(config=bert_config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        # The tokenizer is needed only to compute the resized vocabulary, so it is
        # loaded lazily; loading it unconditionally breaks the config-file branch,
        # where `pretrained_bert` is empty.
        if num_special_tokens is not None:
            if not pretrained_bert:
                raise ConfigError("num_special_tokens requires pretrained_bert to load the tokenizer.")
            tokenizer = AutoTokenizer.from_pretrained(pretrained_bert)
            # AutoModelForBinaryClassification is a plain torch.nn.Module that keeps the
            # transformer in `.model`, so the resize has to be delegated to that attribute.
            resizable = model if hasattr(model, 'resize_token_embeddings') else model.model
            resizable.resize_token_embeddings(len(tokenizer) + num_special_tokens)

        super().__init__(model, **kwargs)

    # TODO need a better solution here and at
    # deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel.load
    def _resize_classification_head(self, model) -> None:
        """Re-create the output layer of ``model`` when it does not have ``self.n_classes`` outputs.

        The output layer is ``model.classifier.out_proj`` when the head exposes such an
        attribute and ``model.classifier`` itself otherwise. New weights are randomly
        initialized with ``torch.randn``.
        """
        classifier = model.classifier
        output_layer = getattr(classifier, 'out_proj', classifier)
        hidden_size = output_layer.in_features

        if self.n_classes != model.num_labels:
            output_layer.weight = torch.nn.Parameter(torch.randn(self.n_classes, hidden_size))
            output_layer.bias = torch.nn.Parameter(torch.randn(self.n_classes))
            output_layer.out_features = self.n_classes
            model.num_labels = self.n_classes

    def train_on_batch(self, features: Dict[str, torch.Tensor], y: Union[List[int], List[List[int]]]) -> Dict:
        """Train model on given batch.
        This method calls train_op using features and y (labels).

        Args:
            features: batch of InputFeatures
            y: batch of labels (class id or one-hot encoding)

        Returns:
            dict with the loss value under the ``'loss'`` key
        """

        _input = {key: value.to(self.device) for key, value in features.items()}

        if self.n_classes > 1 and not self.is_binary:
            _input["labels"] = torch.from_numpy(np.array(y)).to(self.device)

        # regression or binary classification: float targets with a trailing singleton dimension
        else:
            _input["labels"] = torch.from_numpy(np.array(y, dtype=np.float32)).unsqueeze(1).to(self.device)

        self.optimizer.zero_grad()

        # keep only the entries whose names appear among the model's forward() variable names
        tokenized = {key: value for (key, value) in _input.items()
                     if key in self.accepted_keys}

        loss = self.model(**tokenized).loss
        if self.is_data_parallel:
            loss = loss.mean()
        self._make_step(loss)

        return {'loss': loss.item()}

    def __call__(self, features: Dict[str, torch.Tensor]) -> Union[List[int], List[List[float]]]:
        """Make prediction for given features (texts).

        Args:
            features: batch of InputFeatures

        Returns:
            predicted classes or probabilities of each class

        """

        _input = {key: value.to(self.device) for key, value in features.items()}

        with torch.no_grad():
            tokenized = {key: value for (key, value) in _input.items()
                         if key in self.accepted_keys}

            # Forward pass, calculate logit predictions
            logits = self.model(**tokenized)[0]

        if self.return_probas:
            if self.is_binary:
                pred = torch.sigmoid(logits).squeeze(1)
            elif not self.multilabel:
                pred = torch.nn.functional.softmax(logits, dim=-1)
            else:
                # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid
                pred = torch.sigmoid(logits)
            pred = pred.detach().cpu().numpy()
        elif self.n_classes > 1:
            pred = np.argmax(logits.detach().cpu().numpy(), axis=1)
        # regression
        else:
            pred = logits.squeeze(-1).detach().cpu().numpy()

        return pred

    # TODO move to the super class
    @property
    def accepted_keys(self) -> Tuple[str, ...]:
        """Variable names of the wrapped model's ``forward`` code object (unwrapping data-parallel models)."""
        if self.is_data_parallel:
            accepted_keys = self.model.module.forward.__code__.co_varnames
        else:
            accepted_keys = self.model.forward.__code__.co_varnames
        return accepted_keys


class AutoModelForBinaryClassification(torch.nn.Module):
    """Transformer encoder followed by a single-logit classification head.

    Args:
        pretrained_bert: pretrained checkpoint path or key title passed to ``AutoModel.from_pretrained``
        config: model configuration; also provides the sizes used by ``BinaryClassificationHead``
    """

    def __init__(self, pretrained_bert, config):
        super().__init__()
        self.pretrained_bert = pretrained_bert
        self.config = config

        # The configuration has to be passed by keyword: positional extras of
        # `from_pretrained` are forwarded to the model constructor as additional
        # arguments instead of being used as the configuration.
        self.model = AutoModel.from_pretrained(self.pretrained_bert, config=self.config)
        self.classifier = BinaryClassificationHead(config)

        self.classifier.init_weights()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                labels=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None):
        """Run the encoder and the classification head.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds,
            output_attentions, output_hidden_states: forwarded unchanged to the wrapped encoder
            labels: optional targets; when given, ``BCEWithLogitsLoss`` between the logits and
                ``labels`` is computed
            return_dict: whether to return a ``SequenceClassifierOutput``; defaults to
                ``config.use_return_dict``

        Returns:
            ``SequenceClassifierOutput`` when ``return_dict`` is true, otherwise a tuple
            ``(loss, logits, ...)`` (``loss`` is omitted when ``labels`` is None).
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids,
                             position_ids=position_ids,
                             head_mask=head_mask,
                             inputs_embeds=inputs_embeds,
                             output_attentions=output_attentions,
                             output_hidden_states=output_hidden_states,
                             return_dict=return_dict)

        # the first encoder output is fed to the head, which reads position 0 of every sequence
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(loss=loss,
                                        logits=logits,
                                        hidden_states=outputs.hidden_states,
                                        attentions=outputs.attentions)


class BinaryClassificationHead(torch.nn.Module):
    """Head that maps the first position of a sequence of hidden states to a single logit.

    Args:
        config: object providing ``hidden_size``, ``hidden_dropout_prob`` and ``initializer_range``
    """

    def __init__(self, config):
        super().__init__()

        self.config = config
        width = config.hidden_size

        self.dense = torch.nn.Linear(width, width)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = torch.nn.Linear(width, 1)

    def init_weights(self):
        """Re-initialize the dense layer: normal weights (std ``initializer_range``), zero bias."""
        layer = self.dense
        layer.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if layer.bias is None:
            return
        layer.bias.data.zero_()

    def forward(self, features, **kwargs):
        """Return one logit per sequence, computed from ``features[:, 0, :]``."""
        first_position = features[:, 0, :]
        projected = self.dense(self.dropout(first_position))
        activated = self.dropout(torch.tanh(projected))
        return self.out_proj(activated)


================================================
FILE: deeppavlov/models/torch_bert/torch_transformers_el_ranker.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from pathlib import Path
from typing import List, Optional, Dict, Tuple, Union, Any

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoConfig, AutoTokenizer, AutoModel

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.torch_model import TorchModel
from deeppavlov.models.preprocessors.torch_transformers_preprocessor import TorchTransformersEntityRankerPreprocessor

# Module-level logger named after this module.
log = getLogger(__name__)


@register('torch_transformers_el_ranker')
class TorchTransformersElRanker(TorchModel):
    """Class for ranking of entities by context and description
    Args:
        encoder_save_path: path to save the encoder checkpoint
        bilinear_save_path: path to save bilinear layer checkpoint
        block_size: size of block in bilinear layer
        emb_size: entity embedding size
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased");
            the same value is also handed to the underlying model as its ``bert_config_file``
        return_probas: set this to `True` if you need the probabilities instead of raw answers
        **kwargs: passed through to ``TorchModel``
    """

    def __init__(
            self,
            encoder_save_path: str,
            bilinear_save_path: str,
            block_size: int,
            emb_size: int,
            pretrained_bert: str = None,
            return_probas: bool = False,
            **kwargs
    ):
        self.return_probas = return_probas

        model = SiameseBertElModel(
            pretrained_bert=pretrained_bert,
            encoder_save_path=encoder_save_path,
            bilinear_save_path=bilinear_save_path,
            bert_config_file=pretrained_bert,
            block_size=block_size,
            emb_size=emb_size
        )

        super().__init__(model, **kwargs)

    def _build_model_input(self, q_features: List[Dict],
                           c_features: List[Dict],
                           entity_tokens_pos: List[int]) -> Dict[str, Any]:
        """Collect ``input_ids``/``attention_mask`` of both feature batches into device tensors.

        The values are read as attributes of every feature object and stored under the
        ``q_*`` (context) and ``c_*`` (description) keys expected by the model's forward.
        """
        _input = {'entity_tokens_pos': entity_tokens_pos}
        for prefix, features in (('q', q_features), ('c', c_features)):
            for elem in ('input_ids', 'attention_mask'):
                inp_elem = [getattr(f, elem) for f in features]
                _input[f"{prefix}_{elem}"] = torch.LongTensor(inp_elem).to(self.device)
        return _input

    def train_on_batch(self, q_features: List[Dict],
                       c_features: List[Dict],
                       entity_tokens_pos: List[int],
                       labels: List[int]) -> float:
        """Make one optimization step on a batch.

        Args:
            q_features: batch of indices of text subwords
            c_features: batch of indices of entity description subwords
            entity_tokens_pos: list of indices of special tokens
            labels: 1 if entity is appropriate to context, 0 - otherwise

        Returns:
            the value of loss
        """
        _input = self._build_model_input(q_features, c_features, entity_tokens_pos)
        _input['labels'] = labels

        self.model.train()
        self.model.zero_grad()
        self.optimizer.zero_grad()  # zero the parameter gradients

        # with labels the model returns (loss, scores); the scores are not needed for training
        loss, _ = self.model(**_input)
        self._make_step(loss)

        return loss.item()

    def __call__(self, q_features: List[Dict],
                 c_features: List[Dict],
                 entity_tokens_pos: List[int]) -> Union[List[int], List[np.ndarray]]:
        """ Predicts entity labels (1 if the entity description is appropriate to the context, 0 - otherwise)

        Args:
            q_features: batch of indices of text subwords
            c_features: batch of indices of entity description subwords
            entity_tokens_pos: list of indices of special tokens

        Returns:
            Label indices as a numpy array, or, if ``return_probas`` is set, the scores
            exactly as returned by the model (a torch tensor).

        """
        self.model.eval()

        _input = self._build_model_input(q_features, c_features, entity_tokens_pos)

        with torch.no_grad():
            softmax_scores = self.model(**_input)
            if self.return_probas:
                pred = softmax_scores
            else:
                pred = torch.argmax(softmax_scores, dim=1).cpu().numpy()

        return pred

    def save(self, fname: Optional[str] = None, *args, **kwargs) -> None:
        """Save the full model/optimizer checkpoint, then the model's own component checkpoints.

        Args:
            fname: target path (its suffix is replaced with ``.pth.tar``); defaults to ``self.save_path``

        Raises:
            ConfigError: if the parent directory of the target path does not exist
        """
        if fname is None:
            fname = self.save_path
        # accept plain strings as the annotation promises: `.parent` exists only on Path objects
        fname = Path(fname)
        if not fname.parent.is_dir():
            raise ConfigError("Provided save path is incorrect!")
        weights_path = fname.with_suffix(".pth.tar")
        log.info(f"Saving model to {weights_path}.")
        torch.save({
            "model_state_dict": self.model.cpu().state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "epochs_done": self.epochs_done
        }, weights_path)
        # The model's own save() moves its sub-modules between devices as well, so the
        # working device is restored only after it has finished.
        self.model.save()
        self.model.to(self.device)


class TextEncoder(nn.Module):
    """Class for obtaining the BERT output for CLS-token and special entity token
    Args:
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name
        device: device to use
    """

    def __init__(self, pretrained_bert: str = None,
                 bert_config_file: str = None,
                 device: torch.device = torch.device('cpu')):
        super().__init__()
        self.pretrained_bert = pretrained_bert
        self.bert_config_file = bert_config_file
        # `bert_config` is never populated; the attribute is kept so existing readers do not break
        self.encoder, self.config, self.bert_config = None, None, None
        self.device = device
        self.load()
        if self.pretrained_bert:
            self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_bert)
            # one extra embedding row on top of the tokenizer vocabulary
            self.encoder.resize_token_embeddings(len(self.tokenizer) + 1)
        else:
            # Built from a bare config file: there is no checkpoint to load a tokenizer from,
            # so the embedding size is left as the config defines it.
            # NOTE(review): confirm the config's vocab_size already accounts for the extra token.
            self.tokenizer = None
        self.encoder.to(self.device)

    def forward(self,
                input_ids: Tensor,
                attention_mask: Tensor,
                entity_tokens_pos: List[int] = None
                ) -> Tensor:
        """Encode a batch and pick one hidden vector per sample.

        Args:
            input_ids: batch of subword indices
            attention_mask: batch of attention masks
            entity_tokens_pos: for every sample, the position whose hidden state is returned;
                when None, position 0 is used for all samples

        Returns:
            tensor with one last-layer hidden state per sample
        """
        hidden_states = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        if entity_tokens_pos is None:
            return hidden_states[:, 0, :]

        # The result stays on the device of the encoder output: `self.device` is only the
        # construction-time device and is not updated when the module is moved with `.to()`.
        return torch.stack([hidden_states[i, pos] for i, pos in enumerate(entity_tokens_pos)], dim=0)

    def load(self) -> None:
        """Create the encoder from ``pretrained_bert`` or, failing that, from ``bert_config_file``.

        Raises:
            ConfigError: if neither a pretrained name/path nor an existing config file is given
        """
        if self.pretrained_bert:
            log.debug(f"From pretrained {self.pretrained_bert}.")
            self.config = AutoConfig.from_pretrained(
                self.pretrained_bert, output_hidden_states=True
            )
            self.encoder = AutoModel.from_pretrained(self.pretrained_bert, config=self.config)

        elif self.bert_config_file and Path(self.bert_config_file).is_file():
            # AutoConfig offers `from_pretrained` (which accepts a path to a JSON config file)
            # rather than `from_json_file`; the encoder is built from the config just loaded.
            self.config = AutoConfig.from_pretrained(str(expand_path(self.bert_config_file)))
            self.encoder = AutoModel.from_config(config=self.config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")
        self.encoder.to(self.device)


class BilinearRanking(nn.Module):
    """Block-wise bilinear scorer for a pair of vectors.

    Both inputs are split into blocks of ``block_size`` values, the outer product of the
    matching blocks is taken, and the flattened products are fed to a linear layer.

    Args:
        n_classes: number of classes for classification
        emb_size: entity embedding size
        block_size: size of block in bilinear layer
    """

    def __init__(self, n_classes: int = 2, emb_size: int = 768, block_size: int = 8):
        super().__init__()
        self.n_classes, self.emb_size, self.block_size = n_classes, emb_size, block_size
        self.bilinear = nn.Linear(emb_size * block_size, n_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text1: Tensor, text2: Tensor):
        """Return ``(softmax scores, log-softmax scores)`` for every pair of rows."""
        n_blocks = self.emb_size // self.block_size
        flat_size = self.emb_size * self.block_size

        left = text1.view(-1, n_blocks, self.block_size)
        right = text2.view(-1, n_blocks, self.block_size)
        # outer product inside every block: (batch, n_blocks, block_size, block_size)
        block_products = left.unsqueeze(3) * right.unsqueeze(2)

        logits = self.bilinear(block_products.view(-1, flat_size))
        return self.softmax(logits), F.log_softmax(logits, dim=-1)


class SiameseBertElModel(nn.Module):
    """Class with model for ranking of entities by context and description
    Args:
        emb_size: entity embedding size
        block_size: size of block in bilinear layer
        encoder_save_path: path to save the encoder checkpoint
        bilinear_save_path: path to save bilinear layer checkpoint
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name
            (stored on the instance; the encoder is created from `pretrained_bert` only)
        device: device to use
    """

    def __init__(
            self,
            emb_size: int,
            block_size: int,
            encoder_save_path: str,
            bilinear_save_path: str,
            pretrained_bert: str = None,
            bert_config_file: str = None,
            device: torch.device = torch.device('cpu')
    ):
        super().__init__()
        self.pretrained_bert = pretrained_bert
        self.encoder_save_path = encoder_save_path
        self.bilinear_save_path = bilinear_save_path
        self.bert_config_file = bert_config_file
        self.device = device

        self.encoder = TextEncoder(pretrained_bert=self.pretrained_bert, device=self.device)
        # Keyword arguments are required: the first positional parameter of BilinearRanking
        # is `n_classes`, so passing (emb_size, block_size) positionally builds a wrong layer.
        self.bilinear_ranker = BilinearRanking(emb_size=emb_size, block_size=block_size)

    def forward(
            self,
            q_input_ids: Tensor,
            q_attention_mask: Tensor,
            c_input_ids: Tensor,
            c_attention_mask: Tensor,
            entity_tokens_pos: List,
            labels: List[int] = None
    ) -> Union[Tuple[Tensor, Tensor], Tensor]:
        """Score every (context, description) pair and optionally compute the loss.

        Args:
            q_input_ids: subword indices of the contexts
            q_attention_mask: attention masks of the contexts
            c_input_ids: subword indices of the entity descriptions
            c_attention_mask: attention masks of the entity descriptions
            entity_tokens_pos: per-sample position of the entity token in the context
            labels: class index for every sample; when given, the loss is returned as well

        Returns:
            ``(loss, softmax_scores)`` if ``labels`` is given, ``softmax_scores`` otherwise.
            The loss is the mean negative log-likelihood of the labelled class.
        """

        entity_emb = self.encoder(input_ids=q_input_ids, attention_mask=q_attention_mask,
                                  entity_tokens_pos=entity_tokens_pos)
        c_cls_emb = self.encoder(input_ids=c_input_ids, attention_mask=c_attention_mask)
        # Both embeddings must share a device before they are multiplied (no-op when they already do).
        entity_emb = entity_emb.to(c_cls_emb.device)
        softmax_scores, log_softmax = self.bilinear_ranker(entity_emb, c_cls_emb)

        if labels is None:
            return softmax_scores

        # Targets are created on the device of the scores: `self.device` is only the
        # construction-time value and is not updated when the module is moved with `.to()`.
        target = torch.as_tensor(labels, dtype=torch.long, device=log_softmax.device)
        loss = F.nll_loss(log_softmax, target)
        return loss, softmax_scores

    @staticmethod
    def _save_state_dict(module: nn.Module, weights_path: Path) -> None:
        """Save ``module``'s CPU state dict to ``weights_path`` and move the module back afterwards."""
        first_param = next(module.parameters(), None)
        original_device = first_param.device if first_param is not None else None
        torch.save({"model_state_dict": module.cpu().state_dict()}, weights_path)
        if original_device is not None:
            module.to(original_device)

    def save(self) -> None:
        """Save encoder and bilinear layer weights to their ``.pth.tar`` checkpoint files.

        Each sub-module is returned to the device it was on before saving.
        """
        encoder_weights_path = expand_path(self.encoder_save_path).with_suffix(".pth.tar")
        log.info(f"Saving encoder to {encoder_weights_path}.")
        self._save_state_dict(self.encoder, encoder_weights_path)
        bilinear_weights_path = expand_path(self.bilinear_save_path).with_suffix(".pth.tar")
        log.info(f"Saving bilinear weights to {bilinear_weights_path}.")
        self._save_state_dict(self.bilinear_ranker, bilinear_weights_path)


@register('torch_transformers_entity_ranker_infer')
class TorchTransformersEntityRankerInfer:
    """Class for inference of the model for ranking of entities from a knowledge base by context and description
    Args:
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        encoder_weights_path: path to the encoder checkpoint to load
        bilinear_weights_path: path to the bilinear layer checkpoint to load
        special_token_id: id of special token
        do_lower_case: whether to lower case the text
        batch_size: batch size for model inference
        emb_size: entity embedding size
        block_size: size of block in bilinear layer
        device: `cpu` or `gpu` device to use
    """

    def __init__(self, pretrained_bert,
                 encoder_weights_path,
                 bilinear_weights_path,
                 special_token_id: int,
                 do_lower_case: bool = False,
                 batch_size: int = 5,
                 emb_size: int = 300,
                 block_size: int = 8,
                 device: str = "gpu", **kwargs):
        self.device = torch.device("cuda" if torch.cuda.is_available() and device == "gpu" else "cpu")
        self.pretrained_bert = pretrained_bert
        self.preprocessor = TorchTransformersEntityRankerPreprocessor(vocab_file=self.pretrained_bert,
                                                                      do_lower_case=do_lower_case,
                                                                      special_tokens=["[ENT]"])
        self.config = AutoConfig.from_pretrained(self.pretrained_bert, output_hidden_states=True)
        self.emb_size = emb_size
        self.block_size = block_size
        self.encoder = TextEncoder(pretrained_bert=self.pretrained_bert, device=self.device)
        self.encoder_weights_path = str(expand_path(encoder_weights_path))
        self.bilinear_weights_path = str(expand_path(bilinear_weights_path))
        # NOTE: torch.load unpickles the file, so checkpoints must come from a trusted source.
        encoder_checkpoint = torch.load(self.encoder_weights_path, map_location=self.device)
        self.encoder.load_state_dict(encoder_checkpoint["model_state_dict"])
        self.encoder.to(self.device)
        self.bilinear_ranking = BilinearRanking(emb_size=self.emb_size, block_size=self.block_size)
        bilinear_checkpoint = torch.load(self.bilinear_weights_path, map_location=self.device)
        self.bilinear_ranking.load_state_dict(bilinear_checkpoint["model_state_dict"])
        self.bilinear_ranking.to(self.device)
        # this component is used for inference only
        self.encoder.eval()
        self.bilinear_ranking.eval()
        self.special_token_id = special_token_id
        self.batch_size = batch_size

    def __call__(self, contexts_batch: List[str],
                 candidate_entities_batch: List[List[str]],
                 candidate_entities_descr_batch: List[List[str]]):
        """Rank candidate entities of every context by their descriptions.

        Args:
            contexts_batch: contexts, one per sample
            candidate_entities_batch: candidate entities for every context
            candidate_entities_descr_batch: descriptions of the candidate entities

        Returns:
            for every context, a list of ``(entity, score)`` pairs sorted by descending score
            (an empty list when the context has no candidates); the score is the second
            column of the bilinear layer's softmax output
        """
        # Gradients are never needed here; without no_grad every forward pass would
        # keep its autograd graph alive.
        with torch.no_grad():
            entity_emb_batch = self._encode_contexts(contexts_batch)
            return [self._rank_candidates(entity_emb, candidate_entities_list, candidate_entities_descr_list)
                    for entity_emb, candidate_entities_list, candidate_entities_descr_list
                    in zip(entity_emb_batch, candidate_entities_batch, candidate_entities_descr_batch)]

    def _find_special_token_pos(self, input_ids: Tensor) -> List[int]:
        """Return, for every row, the index of the first special token (0 when it is absent)."""
        is_special = (input_ids == self.special_token_id).int()
        # argmax returns the first maximal index: the first match, or 0 for an all-zero row
        return is_special.argmax(dim=1).tolist()

    def _encode_contexts(self, contexts_batch: List[str]) -> List[Tensor]:
        """Encode contexts in chunks of ``batch_size``; returns one entity-token embedding per context."""
        entity_emb_batch = []
        for start in range(0, len(contexts_batch), self.batch_size):
            contexts_list = contexts_batch[start:start + self.batch_size]
            context_features = self.preprocessor(contexts_list)
            context_input_ids = context_features["input_ids"].to(self.device)
            context_attention_mask = context_features["attention_mask"].to(self.device)
            special_tokens_pos = self._find_special_token_pos(context_input_ids)

            cur_entity_emb_batch = self.encoder(input_ids=context_input_ids,
                                                attention_mask=context_attention_mask,
                                                entity_tokens_pos=special_tokens_pos)

            entity_emb_batch.extend(cur_entity_emb_batch.detach().unbind(0))
        return entity_emb_batch

    def _rank_candidates(self, entity_emb: Tensor,
                         candidate_entities_list: List[str],
                         candidate_entities_descr_list: List[str]) -> list:
        """Score the candidates of one context against its entity embedding and sort them."""
        if not candidate_entities_list:
            return []

        # the same context embedding is paired with every candidate description
        entity_emb = entity_emb.unsqueeze(0).repeat(len(candidate_entities_list), 1).to(self.device)
        descr_features = self.preprocessor(candidate_entities_descr_list)
        descr_input_ids = descr_features["input_ids"].to(self.device)
        descr_attention_mask = descr_features["attention_mask"].to(self.device)
        candidate_entities_emb = self.encoder(input_ids=descr_input_ids,
                                              attention_mask=descr_attention_mask)
        scores_list, _ = self.bilinear_ranking(entity_emb, candidate_entities_emb)
        scores_list = scores_list.detach().cpu().numpy()
        scores_list = [score[1] for score in scores_list]
        entities_with_scores = list(zip(candidate_entities_list, scores_list))
        return sorted(entities_with_scores, key=lambda x: x[1], reverse=True)


================================================
FILE: deeppavlov/models/torch_bert/torch_transformers_multiplechoice.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from pathlib import Path
from typing import List, Dict, Union, Optional

import numpy as np
import torch
from transformers import AutoModelForMultipleChoice, AutoConfig

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.torch_model import TorchModel

# Module-level logger named after this module.
log = getLogger(__name__)


@register('torch_transformers_multiplechoice')
class TorchTransformersMultiplechoiceModel(TorchModel):
    """Transformer-based multiple-choice model on PyTorch.

    The underlying network is a ``transformers.AutoModelForMultipleChoice`` created either from a
    pretrained checkpoint or from a configuration file.

    Args:
        n_classes: number of classes (passed to the transformer config as ``num_labels``)
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        multilabel: set True if it is multi-label classification
        return_probas: set True if return class probabilities instead of most probable label needed
        attention_probs_keep_prob: keep_prob for Bert self-attention layers
            (applied only when the model is built from ``bert_config_file``)
        hidden_keep_prob: keep_prob for Bert hidden layers
            (applied only when the model is built from ``bert_config_file``)
        bert_config_file: path to Bert configuration file (not used if pretrained_bert is key title)

    Raises:
        RuntimeError: if ``multilabel`` is set without ``return_probas``, or if ``return_probas``
            is set while ``n_classes`` equals 1
        ConfigError: if neither ``pretrained_bert`` nor an existing ``bert_config_file`` is given
    """

    def __init__(self, n_classes,
                 pretrained_bert,
                 multilabel: bool = False,
                 return_probas: bool = False,
                 attention_probs_keep_prob: Optional[float] = None,
                 hidden_keep_prob: Optional[float] = None,
                 bert_config_file: Optional[str] = None,
                 **kwargs) -> None:

        self.return_probas = return_probas
        self.multilabel = multilabel
        self.n_classes = n_classes

        if self.multilabel and not self.return_probas:
            raise RuntimeError('Set return_probas to True for multilabel classification!')

        if self.return_probas and self.n_classes == 1:
            raise RuntimeError('Set return_probas to False for regression task!')

        if pretrained_bert:
            log.debug(f"From pretrained {pretrained_bert}.")
            config = AutoConfig.from_pretrained(pretrained_bert, num_labels=self.n_classes,
                                                output_attentions=False, output_hidden_states=False)

            model = AutoModelForMultipleChoice.from_pretrained(pretrained_bert, config=config)

        elif bert_config_file and Path(bert_config_file).is_file():
            # AutoConfig exposes from_pretrained (which accepts a path to a saved JSON config),
            # not the PretrainedConfig.from_json_file classmethod.
            bert_config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))
            if attention_probs_keep_prob is not None:
                bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
            if hidden_keep_prob is not None:
                bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob
            model = AutoModelForMultipleChoice.from_config(config=bert_config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        super().__init__(model, **kwargs)

    def _is_data_parallel(self) -> bool:
        """Tell whether the network is wrapped (e.g. in DataParallel) by the base class."""
        return bool(getattr(self, 'is_data_parallel', False))

    def _forward_arg_names(self):
        """Return the variable names of the real network's ``forward``.

        When the network is wrapped for data parallelism the wrapper's own ``forward`` only takes
        ``*inputs, **kwargs``, so the wrapped module has to be inspected instead.
        """
        network = self.model.module if self._is_data_parallel() else self.model
        return network.forward.__code__.co_varnames

    def train_on_batch(self, features: Dict[str, torch.tensor], y: Union[List[int], List[List[int]]]) -> Dict:
        """Train model on given batch.
        This method calls train_op using features and y (labels).

        Args:
            features: batch of InputFeatures
            y: batch of labels (class id or one-hot encoding)

        Returns:
            dict with the loss value
        """

        _input = {key: value.to(self.device) for key, value in features.items()}

        _input["labels"] = torch.tensor(y).long().to(self.device)

        self.optimizer.zero_grad()

        # keep only the inputs that the network's forward can actually receive
        accepted = self._forward_arg_names()
        tokenized = {key: value for (key, value) in _input.items() if key in accepted}

        loss = self.model(**tokenized).loss
        if self._is_data_parallel():
            # one loss value per replica is returned; reduce them to a scalar
            loss = loss.mean()
        self._make_step(loss)

        return {'loss': loss.item()}

    def __call__(self, features: Dict[str, torch.tensor]) -> Union[List[int], List[List[float]]]:
        """Make prediction for given features (texts).

        Args:
            features: batch of InputFeatures

        Returns:
            predicted classes or probabilities of each class

        """

        _input = {key: value.to(self.device) for key, value in features.items()}

        with torch.no_grad():
            accepted = self._forward_arg_names()
            tokenized = {key: value for (key, value) in _input.items() if key in accepted}

            # Forward pass, calculate logit predictions
            logits = self.model(**tokenized)
            logits = logits[0]

        if self.return_probas:
            if not self.multilabel:
                pred = torch.nn.functional.softmax(logits, dim=-1)
            else:
                pred = torch.nn.functional.sigmoid(logits)
            pred = pred.detach().cpu().numpy()
        elif self.n_classes > 1:
            logits = logits.detach().cpu().numpy()
            pred = np.argmax(logits, axis=1)
        else:  # regression
            pred = logits.squeeze(-1).detach().cpu().numpy()

        return pred


================================================
FILE: deeppavlov/models/torch_bert/torch_transformers_nll_ranking.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from pathlib import Path
from typing import List, Dict, Tuple, Union, Any

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoConfig, AutoModel, AutoTokenizer

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.torch_model import TorchModel

log = getLogger(__name__)


@register('torch_transformers_nll_ranker')
class TorchTransformersNLLRanker(TorchModel):
    """Class for ranking of relations using the model trained with NLL loss

    Args:
        pretrained_bert: pretrained transformer checkpoint path or key title (e.g. "bert-base-uncased")
        encoder_save_path: path to save the encoder checkpoint
        linear_save_path: path to save linear layer checkpoint
        return_probas: set this to `True` to get the per-candidate scores produced by the network
            instead of the index of the best candidate
    """

    def __init__(
            self,
            pretrained_bert: str = None,
            encoder_save_path: str = None,
            linear_save_path: str = None,
            return_probas: bool = False,
            **kwargs
    ):
        self.return_probas = return_probas

        model = NLLRanking(
            pretrained_bert=pretrained_bert,
            encoder_save_path=encoder_save_path,
            linear_save_path=linear_save_path,
            bert_tokenizer_config_file=pretrained_bert,
        )

        super().__init__(model, **kwargs)

    def _to_device_tensors(self, input_features: Dict[str, Any]) -> Dict[str, Any]:
        """Convert the tokenizer output into LongTensors placed on the model device."""
        return {key: torch.LongTensor(input_features[key]).to(self.device)
                for key in ("input_ids", "attention_mask", "token_type_ids")}

    def train_on_batch(self, input_features: Dict[str, Any], positive_idx: List[int]) -> float:
        """Run one optimization step on a batch.

        Args:
            input_features: dict with "input_ids", "attention_mask" and "token_type_ids"
            positive_idx: indices of the positive candidates, forwarded to the network

        Returns:
            value of the loss on the batch
        """
        _input = self._to_device_tensors(input_features)
        _input['positive_idx'] = positive_idx

        self.model.train()
        self.model.zero_grad()
        self.optimizer.zero_grad()  # zero the parameter gradients

        loss, _ = self.model(**_input)
        loss.backward()

        # Clip the norm of the gradients to prevent the "exploding gradients" problem.
        # This has to happen between backward() and step(): clipping after the step
        # would leave the applied update unclipped.
        if self.clip_norm:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm)

        self.optimizer.step()

        return loss.item()

    def __call__(self, input_features: Dict[str, Any]) -> Union[List[int], List[np.ndarray]]:
        """Score the candidates of every sample in the batch.

        Args:
            input_features: dict with "input_ids", "attention_mask" and "token_type_ids"

        Returns:
            nested list of scores if ``return_probas`` is set, otherwise a numpy array with the
            index of the highest-scored candidate for every sample
        """
        self.model.eval()
        _input = self._to_device_tensors(input_features)

        with torch.no_grad():
            output = self.model(**_input)
            if isinstance(output, tuple) and len(output) == 2:
                # (loss, scores) pair: only the scores are needed at inference time
                _, softmax_scores = output
            else:
                softmax_scores = output
        if self.return_probas:
            return softmax_scores.cpu().numpy().tolist()
        return torch.argmax(softmax_scores, dim=1).cpu().numpy()


class NLLRanking(nn.Module):
    """Class which implements the relation ranking model

    A transformer encoder produces an embedding of the first token of every candidate sequence,
    and a single linear layer turns that embedding into one score per candidate.

    Args:
        pretrained_bert: pretrained transformer checkpoint path or key title (e.g. "bert-base-uncased")
        encoder_save_path: path to save the encoder checkpoint
        linear_save_path: path to save linear layer checkpoint
        bert_tokenizer_config_file: path to configuration file of transformer tokenizer; when it is
            omitted or is not an existing file, the tokenizer is loaded from ``pretrained_bert``
        device: cpu or gpu

    Raises:
        ConfigError: if ``pretrained_bert`` is not given
    """

    def __init__(
            self,
            pretrained_bert: str = None,
            encoder_save_path: str = None,
            linear_save_path: str = None,
            bert_tokenizer_config_file: str = None,
            device: str = "gpu"
    ):
        super().__init__()
        self.pretrained_bert = pretrained_bert
        self.encoder_save_path = encoder_save_path
        self.linear_save_path = linear_save_path
        self.device = torch.device("cuda" if torch.cuda.is_available() and device == "gpu" else "cpu")

        # initialize parameters that would be filled later
        self.encoder, self.config, self.bert_config = None, None, None
        self.load()

        # Path(None) raises TypeError, so the file system is probed only when a path was supplied
        if bert_tokenizer_config_file is not None and Path(bert_tokenizer_config_file).is_file():
            vocab_file = str(expand_path(bert_tokenizer_config_file))
            # NOTE(review): transformers documents AutoTokenizer as creatable only through
            # from_pretrained, so this direct call presumably fails - confirm and replace it
            # with the concrete tokenizer class that matches the vocabulary file.
            tokenizer = AutoTokenizer(vocab_file=vocab_file)
        else:
            tokenizer = AutoTokenizer.from_pretrained(pretrained_bert)
        # presumably leaves room for 7 additional special tokens - TODO confirm against the preprocessor
        self.encoder.resize_token_embeddings(len(tokenizer) + 7)

    def forward(
            self,
            input_ids: Tensor,
            attention_mask: Tensor,
            token_type_ids: Tensor,
            positive_idx: List[List[int]] = None
    ) -> Union[Tuple[Tensor, Tensor], Tensor]:
        """Score every candidate of every sample.

        Args:
            input_ids: tensor of size (batch size, number of candidates, sequence length)
            attention_mask: attention mask with the same size as ``input_ids``
            token_type_ids: segment ids with the same size as ``input_ids``
            positive_idx: when not None, the loss is computed as well. The passed values are not
                used: the target is index 0 for every sample.

        Returns:
            tensor of raw scores of size (batch size, number of candidates) if ``positive_idx`` is
            None, otherwise a tuple of the mean NLL loss and the log-softmax of the scores
        """
        bs, samples_num, seq_len = input_ids.size()
        # flatten candidates into the batch dimension so that the encoder sees plain sequences
        input_ids = input_ids.reshape(bs * samples_num, -1)
        attention_mask = attention_mask.reshape(bs * samples_num, -1)
        token_type_ids = token_type_ids.reshape(bs * samples_num, -1)

        encoder_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if hasattr(self.config, "type_vocab_size"):
            # segment ids are passed only to encoders whose config declares them
            encoder_kwargs["token_type_ids"] = token_type_ids
        encoder_output = self.encoder(**encoder_kwargs)

        cls_emb = encoder_output.last_hidden_state[:, 0, :]
        scores = self.fc(cls_emb).reshape(bs, samples_num)

        if positive_idx is None:
            return scores

        scores = F.log_softmax(scores, dim=1)
        # the target is always the first candidate, regardless of the supplied indices
        target = torch.tensor([0] * bs).to(scores.device)
        loss = F.nll_loss(scores, target, reduction="mean")
        return loss, scores

    def load(self) -> None:
        """Create the encoder and the scoring layer from ``pretrained_bert`` and move them to the device."""
        if not self.pretrained_bert:
            raise ConfigError("No pre-trained BERT model is given.")

        log.info(f"From pretrained {self.pretrained_bert}.")
        self.config = AutoConfig.from_pretrained(
            self.pretrained_bert, output_hidden_states=True
        )
        self.encoder = AutoModel.from_pretrained(self.pretrained_bert, config=self.config)
        self.fc = nn.Linear(self.config.hidden_size, 1)

        self.encoder.to(self.device)
        self.fc.to(self.device)


================================================
FILE: deeppavlov/models/torch_bert/torch_transformers_sequence_tagger.py
================================================
# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from pathlib import Path
from typing import List, Union, Dict, Optional, Tuple

import numpy as np
import torch
from transformers import AutoModelForTokenClassification, AutoConfig

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.torch_model import TorchModel
from deeppavlov.models.torch_bert.crf import CRF

log = getLogger(__name__)


def token_from_subtoken(units: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    """ Assemble token level units from subtoken level units

    Only the units located at positions flagged in ``mask`` are kept; they are packed to the left
    of every sample and the remaining positions are filled with zeros.

    Args:
        units: torch.Tensor of shape [batch_size, SUBTOKEN_seq_length, n_features]
        mask: mask of token beginnings. For example: for tokens

                [[``[CLS]`` ``My``, ``capybara``, ``[SEP]``],
                [``[CLS]`` ``Your``, ``aar``, ``##dvark``, ``is``, ``awesome``, ``[SEP]``]]

            the mask will be

                [[0, 1, 1, 0, 0, 0, 0],
                [0, 1, 1, 0, 1, 1, 0]]

    Returns:
        word_level_units: Units assembled from ones in the mask. For the
            example above this units will correspond to the following

                [[``My``, ``capybara``],
                [``Your``, ``aar``, ``is``, ``awesome``,]]

            the shape of this tensor will be [batch_size, TOKEN_seq_length, n_features],
            where TOKEN_seq_length is the largest number of flagged positions in one sample.
            The values are cast to torch.float64.
    """
    # NOTE(review): `.numpy()` is called on intermediate tensors below, so the inputs are
    # presumably expected to live on the CPU - confirm against callers.
    shape = units.size()
    batch_size = shape[0]
    nf = shape[2]
    nf_int = units.size()[-1]

    # number of flagged (token start) positions in every sample
    token_seq_lengths = torch.sum(mask, 1).to(torch.int64)

    # total number of tokens in the whole batch
    n_words = torch.sum(token_seq_lengths)

    max_token_seq_len = torch.max(token_seq_lengths)

    # one row per token: (index of the sample in the batch, subtoken position)
    idxs = torch.stack(torch.nonzero(mask, as_tuple=True), dim=1)

    # sample index of every token, shifted right by one (a zero is prepended)
    sample_ids_in_batch = torch.nn.functional.pad(input=idxs[:, 0], pad=[1, 0])

    # True for tokens whose sample index differs from the one of the previous token
    # NOTE(review): the first token is compared with the prepended zero; a batch whose first
    # sample has no flagged positions looks unsupported by the bookkeeping below - confirm.
    a = torch.logical_not(torch.eq(sample_ids_in_batch[1:], sample_ids_in_batch[:-1]).to(torch.int64))

    # global index of the first token of every sample after the first one, with a leading zero
    q = a * torch.arange(n_words).to(torch.int64)
    count_to_substract = torch.nn.functional.pad(torch.masked_select(q, q.to(torch.bool)), [1, 0])

    # index of every token inside its own sample
    new_word_indices = torch.arange(n_words).to(torch.int64) - torch.gather(
        count_to_substract, dim=0, index=torch.cumsum(a, 0))

    # flat position of every token in a [batch_size * max_token_seq_len] layout
    n_total_word_elements = (batch_size * max_token_seq_len).to(torch.int32)
    word_indices_flat = (idxs[:, 0] * max_token_seq_len + new_word_indices).to(torch.int64)
    # boolean mask over the flat layout: True where a real token is placed
    x_mask = torch.sum(torch.nn.functional.one_hot(word_indices_flat, n_total_word_elements), 0)
    x_mask = x_mask.to(torch.bool)

    # flat positions that are left for padding
    full_range = torch.arange(batch_size * max_token_seq_len).to(torch.int64)
    nonword_indices_flat = torch.masked_select(full_range, torch.logical_not(x_mask))

    def gather_nd(params, indices):
        # picks params[i, j] for every (i, j) row of indices
        assert type(indices) == torch.Tensor
        return params[indices.transpose(0, 1).long().numpy().tolist()]

    # feature vectors of the flagged subtokens
    elements = gather_nd(units, idxs)

    # zero rows for all padding positions: (number of padding positions, n_features)
    sh = tuple(torch.stack([torch.sum(max_token_seq_len - token_seq_lengths), torch.tensor(nf)], 0).numpy())
    paddings = torch.zeros(sh, dtype=torch.float64)

    def dynamic_stitch(indices, data):
        # Interleaves the rows of every data[i] into one list, putting row k of data[i]
        # at position indices[i][k]; every row is cast to float64.
        # https://discuss.pytorch.org/t/equivalent-of-tf-dynamic-partition/53735/2
        n = sum(idx.numel() for idx in indices)
        res = [None] * n
        for i, data_ in enumerate(data):
            idx = indices[i].view(-1)
            if idx.numel() > 0:
                d = data_.view(idx.numel(), -1)
                k = 0
                for idx_ in idx:
                    res[idx_] = d[k].to(torch.float64)
                    k += 1
        return res

    # token rows and padding rows merged into the flat layout
    tensor_flat = torch.stack(dynamic_stitch([word_indices_flat, nonword_indices_flat], [elements, paddings]))

    tensor = torch.reshape(tensor_flat, (batch_size, max_token_seq_len.item(), nf_int))

    return tensor


def token_labels_to_subtoken_labels(labels, y_mask, input_mask):
    """Spread token-level labels over subtokens.

    Args:
        labels: labels of the tokens, one per flagged position of ``y_mask``
        y_mask: mask of token beginnings on the subtoken level
        input_mask: attention mask of the subtoken sequence; its sum is taken as the number
            of subtokens including the two special ones

    Returns:
        list with one label per position of ``input_mask``: 0 for the first position, the label
        of the enclosing token for every inner subtoken, and 0 for everything after them
    """
    real_len = int(np.sum(input_mask))

    expanded = [0]  # label of the leading special subtoken
    consumed = 0  # number of token labels used so far
    for starts_token in y_mask[1:real_len - 1]:
        if starts_token == 1:
            consumed += 1
        # a continuation subtoken repeats the label of the most recent token
        expanded.append(labels[consumed - 1])

    # trailing special subtoken and the padded tail
    expanded.extend([0] * (len(input_mask) - real_len + 1))
    return expanded


@register('torch_transformers_sequence_tagger')
class TorchTransformersSequenceTagger(TorchModel):
    """Transformer-based model on PyTorch for text tagging. It predicts a label for every token (not subtoken)
    in the text. You can use it for sequence labeling tasks, such as morphological tagging or named entity recognition.

    Args:
        n_tags: number of distinct tags
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name
        attention_probs_keep_prob: keep_prob for Bert self-attention layers
            (applied only when the model is built from `bert_config_file`)
        hidden_keep_prob: keep_prob for Bert hidden layers
            (applied only when the model is built from `bert_config_file`)
        use_crf: whether to use Conditional Random Field to decode tags

    Raises:
        ConfigError: if neither `pretrained_bert` nor an existing `bert_config_file` is given
    """

    def __init__(self,
                 n_tags: int,
                 pretrained_bert: str,
                 bert_config_file: Optional[str] = None,
                 attention_probs_keep_prob: Optional[float] = None,
                 hidden_keep_prob: Optional[float] = None,
                 use_crf: bool = False,
                 **kwargs) -> None:

        if pretrained_bert:
            config = AutoConfig.from_pretrained(pretrained_bert, num_labels=n_tags,
                                                output_attentions=False, output_hidden_states=False)
            model = AutoModelForTokenClassification.from_pretrained(pretrained_bert, config=config)
        elif bert_config_file and Path(bert_config_file).is_file():
            # AutoConfig exposes from_pretrained (which accepts a path to a saved JSON config),
            # not the PretrainedConfig.from_json_file classmethod.
            bert_config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))

            if attention_probs_keep_prob is not None:
                bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
            if hidden_keep_prob is not None:
                bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob
            # Auto classes cannot be instantiated directly; from_config builds an
            # untrained network of the architecture described by the configuration.
            model = AutoModelForTokenClassification.from_config(config=bert_config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        self.crf = CRF(n_tags) if use_crf else None

        super().__init__(model, **kwargs)

    def train_on_batch(self,
                       input_ids: Union[List[List[int]], np.ndarray],
                       input_masks: Union[List[List[int]], np.ndarray],
                       y_masks: Union[List[List[int]], np.ndarray],
                       y: List[List[int]],
                       *args, **kwargs) -> Dict[str, float]:
        """Train the model on a batch.

        Args:
            input_ids: batch of indices of subwords (converted with ``torch.from_numpy``,
                so a numpy array is required at runtime)
            input_masks: batch of masks which determine what should be attended
                (converted with ``torch.from_numpy`` as well)
            y_masks: batch of masks which flag the first subword of every word
            y: batch of token-level tag indices
            args: accepted for interface compatibility, not used
            kwargs: accepted for interface compatibility, not used

        Returns:
            dict with field 'loss'
        """
        b_input_ids = torch.from_numpy(input_ids).to(self.device)
        b_input_masks = torch.from_numpy(input_masks).to(self.device)
        # the network is trained on subtoken level, so token labels are spread over subtokens
        subtoken_labels = [token_labels_to_subtoken_labels(y_el, y_mask, input_mask)
                           for y_el, y_mask, input_mask in zip(y, y_masks, input_masks)]
        b_labels = torch.from_numpy(np.array(subtoken_labels)).to(torch.int64).to(self.device)
        self.optimizer.zero_grad()

        loss = self.model(input_ids=b_input_ids,
                          attention_mask=b_input_masks,
                          labels=b_labels).loss
        if self.crf is not None:
            # NOTE(review): the value returned by the CRF is discarded and does not
            # contribute to the loss - confirm that this is intended.
            self.crf(y, y_masks)
        if self.is_data_parallel:
            loss = loss.mean()
        self._make_step(loss)

        return {'loss': loss.item()}

    def __call__(self,
                 input_ids: Union[List[List[int]], np.ndarray],
                 input_masks: Union[List[List[int]], np.ndarray],
                 y_masks: Union[List[List[int]], np.ndarray]) -> Tuple[List[List[int]], List[np.ndarray]]:
        """ Predicts tag indices for a given subword tokens batch

        Args:
            input_ids: indices of the subwords
            input_masks: mask that determines where to attend and where not to
            y_masks: mask which determines the first subword units in the the word

        Returns:
            Label indices for each token (not subtoken), cut to the real number of tokens of
            every sample, and the array of class probabilities for each token

        """
        b_input_ids = torch.from_numpy(input_ids).to(self.device)
        b_input_masks = torch.from_numpy(input_masks).to(self.device)

        with torch.no_grad():
            # Forward pass, calculate logit predictions
            logits = self.model(b_input_ids, attention_mask=b_input_masks)

            # Move logits to CPU and collapse subtoken positions to token positions
            logits = token_from_subtoken(logits[0].detach().cpu(), torch.from_numpy(y_masks))

        probas = torch.nn.functional.softmax(logits, dim=-1)
        probas = probas.detach().cpu().numpy()
        if self.crf is not None:
            # the CRF receives logits with the two leading dimensions swapped
            logits = logits.transpose(1, 0).to(self.device)
            pred = self.crf.decode(logits)
        else:
            logits = logits.detach().cpu().numpy()
            pred = np.argmax(logits, axis=-1)
        # drop predictions made for padded token positions
        seq_lengths = np.sum(y_masks, axis=1)
        pred = [p[:l] for l, p in zip(seq_lengths, pred)]

        return pred, probas

    def load(self, fname=None):
        """Load the transformer weights and, if a CRF is used, the CRF weights stored next to them."""
        super().load(fname)
        if self.crf is not None:
            self.crf = self.crf.to(self.device)
            if self.load_path:
                weights_path_crf = Path(f"{self.load_path}_crf").resolve()
                weights_path_crf = weights_path_crf.with_suffix(".pth.tar")
                if weights_path_crf.exists():
                    checkpoint = torch.load(weights_path_crf, map_location=self.device)
                    self.crf.load_state_dict(checkpoint["model_state_dict"], strict=False)
                else:
                    log.warning(f"Init from scratch. Load path {weights_path_crf} does not exist.")

    def save(self, fname: Optional[str] = None, *args, **kwargs) -> None:
        """Save the transformer weights and, if a CRF is used, the CRF weights to `<fname>_crf.pth.tar`."""
        super().save(fname, *args, **kwargs)
        if self.crf is not None:
            if fname is None:
                fname = self.save_path
            weights_path_crf = Path(f"{fname}_crf").resolve()
            weights_path_crf = weights_path_crf.with_suffix(".pth.tar")
            # the state is stored from the CPU copy; the CRF is moved back to the device afterwards
            torch.save({"model_state_dict": self.crf.cpu().state_dict()}, weights_path_crf)
            self.crf.to(self.device)


================================================
FILE: deeppavlov/models/torch_bert/torch_transformers_squad.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple
from logging import getLogger
from pathlib import Path
from typing import List, Tuple, Optional, Dict

import numpy as np
import torch
from transformers import AutoModelForQuestionAnswering, AutoConfig, AutoModel
from transformers.data.processors.utils import InputFeatures

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.torch_model import TorchModel

logger = getLogger(__name__)


def softmax_mask(val, mask):
    """Push the values at masked-out positions towards minus infinity.

    Args:
        val: tensor of values (e.g. logits)
        mask: tensor with 1 at positions to keep and 0 at positions to suppress

    Returns:
        ``val`` with -1e30 added wherever ``mask`` is 0, so that a subsequent softmax
        assigns a vanishing probability to those positions
    """
    keep = mask.to(torch.float32)
    penalty = -1e30 * (1 - keep)
    return penalty + val


class PassageReaderClassifier(torch.nn.Module):
    """Transformer encoder followed by two linear heads: one yields the answer start and end logits for
    every position, the other yields a single logit per passage computed from the first position.

    Args:
        config: Transformer configuration object (its ``hidden_size`` sets the input size of both heads)
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = AutoModel.from_config(config=config)
        hidden_size = config.hidden_size
        # span head: two logits (start, end) per position
        self.qa_outputs = torch.nn.Linear(hidden_size, 2)
        # passage head: one logit per sequence
        self.qa_classifier = torch.nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run the encoder and both heads.

        Returns:
            namedtuple with fields ``start_logits``, ``end_logits`` and ``rank_logits``
        """
        encoded = self.encoder(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        sequence_output = encoded[0]

        span_logits = self.qa_outputs(sequence_output)
        start_logits = span_logits[..., 0]
        end_logits = span_logits[..., 1]

        # the passage logit is computed from the representation of the first position
        rank_logits = self.qa_classifier(sequence_output[:, 0, :])

        result_type = namedtuple("outputs", "start_logits end_logits rank_logits")
        return result_type(start_logits, end_logits, rank_logits)


@register('torch_transformers_squad')
class TorchTransformersSquad(TorchModel):
    """BERT-based PyTorch model for the SQuAD-like problem setting:
    it predicts the start and end positions of the answer for a given question and context.

    [CLS] token is used as no_answer. If model selects [CLS] token as most probable
    answer, it means that there is no answer in given context.

    Start and end position of answer are predicted by linear transformation
    of Bert outputs.

    Args:
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        attention_probs_keep_prob: keep_prob for Bert self-attention layers
        hidden_keep_prob: keep_prob for Bert hidden layers
        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name
        psg_cls: whether to use a separate linear layer to define if a passage contains the answer to the question
        batch_size: batch size for inference of squad model
    """

    def __init__(self,
                 pretrained_bert: str,
                 attention_probs_keep_prob: Optional[float] = None,
                 hidden_keep_prob: Optional[float] = None,
                 bert_config_file: Optional[str] = None,
                 psg_cls: bool = False,
                 batch_size: int = 10,
                 **kwargs) -> None:
        """Build the question answering network and pass it to the base class.

        The network is created from ``pretrained_bert`` when it is given, otherwise from
        ``bert_config_file``. The keep_prob arguments are applied only in the latter case.

        Raises:
            ConfigError: if neither ``pretrained_bert`` nor an existing ``bert_config_file`` is given
        """
        self.batch_size = batch_size
        self.psg_cls = psg_cls

        if pretrained_bert:
            logger.debug(f"From pretrained {pretrained_bert}.")
            config = AutoConfig.from_pretrained(pretrained_bert, output_attentions=False, output_hidden_states=False)
            if self.psg_cls:
                model = PassageReaderClassifier(config=config)
            else:
                model = AutoModelForQuestionAnswering.from_pretrained(pretrained_bert, config=config)

        elif bert_config_file and Path(bert_config_file).is_file():
            # AutoConfig exposes from_pretrained (which accepts a path to a saved JSON config),
            # not the PretrainedConfig.from_json_file classmethod.
            bert_config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))
            if attention_probs_keep_prob is not None:
                bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
            if hidden_keep_prob is not None:
                bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob
            # the configuration is a local variable: no `bert_config` attribute is ever set on self
            if self.psg_cls:
                model = PassageReaderClassifier(config=bert_config)
            else:
                # Auto classes cannot be instantiated directly; from_config builds an
                # untrained network of the architecture described by the configuration.
                model = AutoModelForQuestionAnswering.from_config(config=bert_config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")
        super().__init__(model, **kwargs)

    def train_on_batch(self, features: List[List[InputFeatures]],
                       y_st: List[List[int]], y_end: List[List[int]]) -> Dict:
        """Perform one training step on a batch.

        Only the first InputFeatures instance and the first start/end position of every sample
        take part in training.

        Args:
            features: batch of InputFeatures instances
            y_st: batch of lists of ground truth answer start positions
            y_end: batch of lists of ground truth answer end positions

        Returns:
            dict with the loss value

        """
        first_features = [sample[0] for sample in features]

        def to_batch(tensors):
            # per-sample tensors are joined along the batch dimension and moved to the device
            return torch.cat(tensors, dim=0).to(self.device)

        def to_positions(answers):
            # the first position of every sample, as a tensor on the device
            firsts = [positions[0] for positions in answers]
            return torch.from_numpy(np.array(firsts)).to(self.device)

        model_kwargs = {
            'input_ids': to_batch([feat.input_ids for feat in first_features]),
            'attention_mask': to_batch([feat.attention_mask for feat in first_features]),
            'token_type_ids': to_batch([feat.token_type_ids for feat in first_features]),
            'start_positions': to_positions(y_st),
            'end_positions': to_positions(y_end),
            'return_dict': True
        }

        self.optimizer.zero_grad()

        # arguments unknown to the network's forward are dropped
        allowed = self.accepted_keys
        model_kwargs = {name: value for name, value in model_kwargs.items() if name in allowed}

        loss = self.model(**model_kwargs).loss
        if self.is_data_parallel:
            loss = loss.mean()
        self._make_step(loss)

        return {'loss': loss.item()}

    @property
    def accepted_keys(self) -> Tuple[str]:
        """Variable names of the network's ``forward``, taken from the wrapped module under data parallelism."""
        network = self.model.module if self.is_data_parallel else self.model
        return network.forward.__code__.co_varnames

    def __call__(self, features_batch: List[List[InputFeatures]]) -> Tuple[
        List[List[int]], List[List[int]], List[List[float]], List[List[float]], List[int]]:
        """Predict an answer span for every sample, choosing the best paragraph piece per sample.

        Every sample is a list of features (one per paragraph piece). All pieces of all samples
        are flattened, run through the model in chunks of ``self.batch_size``, and then, for each
        sample, the piece with the highest span logit is selected.

        Args:
            features_batch: batch of InputFeatures instances; ``features_batch[n]`` holds the
                pieces of the n-th sample

        Returns:
            start_pred_batch: answer start positions (one per sample)
            end_pred_batch: answer end positions (one per sample)
            logits_batch: answer logits (one per sample)
            scores_batch: answer confidences (one per sample)
            ind_batch: indices of paragraph pieces where the answer was found

            Samples whose feature list is empty never get an entry in ``predictions`` and are
            therefore absent from all five returned lists.

        """
        # sample index -> list of (start, end, logit, score) tuples, one per paragraph piece
        predictions = {}
        # TODO: refactor batchification
        # Flatten all pieces of all samples; `indices[k]` remembers which sample piece k belongs to.
        indices, input_ids, input_masks, input_type_ids = [], [], [], []
        for n, features_list in enumerate(features_batch):
            for f in features_list:
                input_ids.append(f.input_ids)
                input_masks.append(f.attention_mask)
                input_type_ids.append(f.token_type_ids)
                indices.append(n)

        # Ceiling division: number of chunks of size `batch_size` needed to cover all pieces.
        num_batches = len(indices) // self.batch_size + int(len(indices) % self.batch_size > 0)
        for i in range(num_batches):
            # Concatenation along dim 0 — assumes every feature tensor carries a leading
            # batch dimension of size 1 (TODO confirm against the preprocessor).
            b_input_ids = torch.cat(input_ids[i * self.batch_size:(i + 1) * self.batch_size], dim=0).to(self.device)
            b_input_masks = torch.cat(input_masks[i * self.batch_size:(i + 1) * self.batch_size], dim=0).to(self.device)
            b_input_type_ids = torch.cat(input_type_ids[i * self.batch_size:(i + 1) * self.batch_size],
                                         dim=0).to(self.device)
            input_ = {
                'input_ids': b_input_ids,
                'attention_mask': b_input_masks,
                'token_type_ids': b_input_type_ids,
                'return_dict': True
            }

            with torch.no_grad():
                # Drop the keys that the model's forward() does not list among its variable names.
                input_ = {arg_name: arg_value for arg_name, arg_value in input_.items()
                          if arg_name in self.accepted_keys}
                # Forward pass, calculate logit predictions
                outputs = self.model(**input_)

                logits_st = outputs.start_logits
                logits_end = outputs.end_logits

                bs = b_input_ids.size()[0]
                seq_len = b_input_ids.size()[-1]
                # `mask` is 1 at position 0 and 0 elsewhere; added to the token type ids it marks
                # position 0 plus every position whose token type id is non-zero.
                mask = torch.cat([torch.ones(bs, 1, dtype=torch.int32),
                                  torch.zeros(bs, seq_len - 1, dtype=torch.int32)], dim=-1).to(self.device)
                logit_mask = b_input_type_ids + mask
                # softmax_mask is defined elsewhere; presumably it suppresses logits where the
                # mask is 0 — confirm against its definition.
                logits_st = softmax_mask(logits_st, logit_mask)
                logits_end = softmax_mask(logits_end, logit_mask)

                start_probs = torch.nn.functional.softmax(logits_st, dim=-1)
                end_probs = torch.nn.functional.softmax(logits_end, dim=-1)
                if self.psg_cls:
                    scores = outputs.rank_logits.squeeze(1)
                else:
                    # One minus the probability that both start and end point at position 0.
                    scores = torch.tensor(1) - start_probs[:, 0] * end_probs[:, 0]

                # outer[b, s, e] = start_probs[b, s] * end_probs[b, e]
                outer = torch.matmul(start_probs.view(*start_probs.size(), 1),
                                     end_probs.view(end_probs.size()[0], 1, end_probs.size()[1]))
                # outer_logits[b, s, e] = exp(logits_st[b, s] + logits_end[b, e])
                outer_logits = torch.exp(logits_st.view(*logits_st.size(), 1) + logits_end.view(
                    logits_end.size()[0], 1, logits_end.size()[1]))

                # Largest per-row sum of token type ids in this chunk.
                context_max_len = torch.max(torch.sum(b_input_type_ids, dim=1)).to(torch.int64)

                # Capped at the hard-coded constant 20.
                max_ans_length = torch.min(torch.tensor(20).to(self.device), context_max_len).to(torch.int64).item()

                # Keep only entries with 0 <= (end - start) < seq_len - max_ans_length.
                # NOTE(review): the band width is `seq_len - max_ans_length`, not `max_ans_length`;
                # confirm this is the intended limit on answer length.
                outer = torch.triu(outer, diagonal=0) - torch.triu(outer, diagonal=outer.size()[1] - max_ans_length)
                outer_logits = torch.triu(outer_logits, diagonal=0) - torch.triu(
                    outer_logits, diagonal=outer_logits.size()[1] - max_ans_length)

                # Best start: row holding the largest entry; best end: column holding the largest entry.
                start_pred = torch.argmax(torch.max(outer, dim=2)[0], dim=1)
                end_pred = torch.argmax(torch.max(outer, dim=1)[0], dim=1)
                # Largest value of the banded outer_logits matrix for each piece.
                logits = torch.max(torch.max(outer_logits, dim=2)[0], dim=1)[0]

            # Move predictions, logits and scores to CPU; positions stay numpy, the rest become lists
            start_pred = start_pred.detach().cpu().numpy()
            end_pred = end_pred.detach().cpu().numpy()
            logits = logits.detach().cpu().numpy().tolist()
            scores = scores.detach().cpu().numpy().tolist()

            # Group the per-piece results by the sample they belong to.
            for j, (start_pred_elem, end_pred_elem, logits_elem, scores_elem) in \
                    enumerate(zip(start_pred, end_pred, logits, scores)):
                ind = indices[i * self.batch_size + j]
                if ind in predictions:
                    predictions[ind] += [(start_pred_elem, end_pred_elem, logits_elem, scores_elem)]
                else:
                    predictions[ind] = [(start_pred_elem, end_pred_elem, logits_elem, scores_elem)]

        # For every sample pick the piece with the highest logit (tuple element 2).
        start_pred_batch, end_pred_batch, logits_batch, scores_batch, ind_batch = [], [], [], [], []
        for ind in sorted(predictions.keys()):
            prediction = predictions[ind]
            max_ind = np.argmax([pred[2] for pred in prediction])
            start_pred_batch.append(prediction[max_ind][0])
            end_pred_batch.append(prediction[max_ind][1])
            logits_batch.append(prediction[max_ind][2])
            scores_batch.append(prediction[max_ind][3])
            ind_batch.append(max_ind)

        return start_pred_batch, end_pred_batch, logits_batch, scores_batch, ind_batch


================================================
FILE: deeppavlov/models/torch_bert/torch_transformers_syntax_parser.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from logging import getLogger
from pathlib import Path
from typing import List, Dict, Union, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoConfig, AutoModel

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.torch_model import TorchModel
from deeppavlov.models.torch_bert.torch_transformers_sequence_tagger import token_from_subtoken

logger = getLogger(__name__)


class Biaffine(nn.Module):
    """Pairwise bilinear scorer whose inputs are extended with a constant feature.

    A column of ones is appended to the last dimension of both inputs before they are
    passed to :class:`PairwiseBilinear`, whose weight and bias are zeroed at construction.

    Args:
        in1_features: size of the last dimension of the first input
        in2_features: size of the last dimension of the second input
        out_features: number of scores produced for every pair of positions
    """

    def __init__(self, in1_features: int, in2_features: int, out_features: int):
        super().__init__()
        # "+ 1" makes room for the constant feature appended in ``forward``.
        bilinear = PairwiseBilinear(in1_features + 1, in2_features + 1, out_features)
        bilinear.weight.data.zero_()
        bilinear.bias.data.zero_()
        self.bilinear = bilinear

    def forward(self, input1: torch.Tensor, input2: torch.Tensor) -> torch.Tensor:
        """Append a ones feature to both inputs and score every pair of positions."""
        ones1 = input1.new_ones(*input1.shape[:-1], 1)
        ones2 = input2.new_ones(*input2.shape[:-1], 1)
        extended1 = torch.cat((input1, ones1), dim=-1)
        extended2 = torch.cat((input2, ones2), dim=-1)
        return self.bilinear(extended1, extended2)


class PairwiseBilinear(nn.Module):
    """Bilinear form evaluated for every pair of positions of two sequences.

    For inputs ``input1`` of shape ``(b, n1, in1_features)`` and ``input2`` of shape
    ``(b, n2, in2_features)`` the output ``y`` has shape ``(b, n1, n2, out_features)`` with
    ``y[b, i, j, o] = input1[b, i] @ weight[:, o, :] @ input2[b, j] + bias[o]``.

    https://github.com/stanfordnlp/stanza/blob/v1.1.1/stanza/models/common/biaffine.py#L5  # noqa

    Args:
        in1_features: size of the last dimension of the first input
        in2_features: size of the last dimension of the second input
        out_features: number of output scores per pair of positions
        bias: whether to add a learnable bias of size ``out_features``
    """

    def __init__(self, in1_features: int, in2_features: int, out_features: int, bias: bool = True):
        super().__init__()
        self.in1_features = in1_features
        self.in2_features = in2_features
        self.out_features = out_features
        # Storage is allocated uninitialised and filled by ``reset_parameters`` below.
        self.weight = nn.Parameter(torch.empty(in1_features, out_features, in2_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(out_features))
        else:
            self.register_parameter("bias", None)
        self.reset_parameters()

    def reset_parameters(self):
        """Initialise weight (and bias) uniformly in ``[-1/sqrt(in1_features), 1/sqrt(in1_features)]``."""
        bound = 1 / math.sqrt(self.weight.size(0))
        nn.init.uniform_(self.weight, -bound, bound)
        if self.bias is not None:
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input1: torch.Tensor, input2: torch.Tensor) -> torch.Tensor:
        """Score every pair of positions.

        Args:
            input1: tensor of shape ``(b, n1, in1_features)``
            input2: tensor of shape ``(b, n2, in2_features)``

        Returns:
            tensor of shape ``(b, n1, n2, out_features)`` (a transposed view, not contiguous)
        """
        d1, d2, out = self.in1_features, self.in2_features, self.out_features
        n1, n2 = input1.size(1), input2.size(1)
        # (b * n1, d1) @ (d1, out * d2) => (b * n1, out * d2)
        # ``reshape`` (not ``view``) so that non-contiguous inputs, e.g. transposed tensors, work too.
        x1W = torch.mm(input1.reshape(-1, d1), self.weight.view(d1, out * d2))
        # (b, n1 * out, d2) @ (b, d2, n2) => (b, n1 * out, n2)
        x1Wx2 = x1W.view(-1, n1 * out, d2).bmm(input2.transpose(1, 2))
        # (b, n1, out, n2) => (b, n1, n2, out)
        y = x1Wx2.view(-1, n1, out, n2).transpose(2, 3)
        if self.bias is not None:
            # bias broadcasts over the trailing ``out`` dimension
            y.add_(self.bias)
        return y  # (b, n1, n2, out)

    def extra_repr(self) -> str:
        return "in1_features={}, in2_features={}, out_features={}, bias={}".format(
            self.in1_features, self.in2_features, self.out_features, self.bias is not None
        )


@torch.no_grad()
def mask_arc(lengths: torch.Tensor, mask_diag: bool = True) -> Optional[torch.Tensor]:
    """Build a mask of admissible (token, head) pairs for a batch of sentences.

    The mask has shape ``(b, n, n + 1)`` where ``b`` is the number of sentences and ``n`` the
    longest sentence length. Row ``i`` describes token ``i``; there is one column more than
    there are tokens, so token ``j`` occupies column ``j + 1`` (column 0 is the extra leading
    position, presumably the artificial root). For a sentence of length ``l`` the entries
    ``[:l, :l + 1]`` are 1 and everything else is 0.

    Args:
        lengths: 1-D integer tensor with the number of tokens in every sentence
        mask_diag: if True, additionally forbid a token from being its own head, i.e. zero
            the entries ``[i, i + 1]``

    Returns:
        the float mask, or ``None`` when nothing has to be masked (all sentences have the same
        length and ``mask_diag`` is False)
    """
    b, n = lengths.numel(), int(lengths.max())
    if bool(torch.all(lengths == n)):
        if not mask_diag:
            return None
        mask = torch.ones(b, n, n + 1)
    else:
        mask = torch.zeros(b, n, n + 1)
        for i, length in enumerate(lengths.tolist()):
            mask[i, :length, :length + 1] = 1
    if mask_diag:
        # Token i sits in column i + 1, so the "self head" cells form the identity matrix
        # shifted one column to the right. A plain (n, n) eye cannot broadcast against the
        # (b, n, n + 1) mask.
        self_head = torch.zeros(n, n + 1, dtype=torch.bool)
        self_head[:, 1:] = torch.eye(n, dtype=torch.bool)
        mask.masked_fill_(self_head, 0)
    return mask


class SyntaxParserNetwork(torch.nn.Module):
    """The model which defines heads in syntax tree and dependencies for text tokens.
       Text token ids are fed into Transformer encoder, hidden states are passed into dense layers followed by
       two biaffine layers (first for prediction of pairwise probabilities of a token to be the head for other token,
       second - for prediction of syntax dependency of a token).

    Args:
        n_deps: number of syntax dependency labels
        pretrained_bert: pretrained checkpoint path or model name; takes precedence over `bert_config_file`
        encoder_layer_ids: ids of encoder layers whose hidden states are summed and used as token features
        bert_config_file: path to a model configuration JSON file, used only when `pretrained_bert` is empty
        attention_probs_keep_prob: keep probability for attention dropout (applied only with `bert_config_file`)
        hidden_keep_prob: keep probability for hidden dropout (applied only with `bert_config_file`)
        state_size: output size of the dense layers that follow the encoder
        device: "gpu" to use CUDA when it is available, anything else selects CPU

    Raises:
        ConfigError: if neither `pretrained_bert` nor an existing `bert_config_file` is given
    """

    def __init__(self, n_deps: int, pretrained_bert: str, encoder_layer_ids: List[int] = (-1,),
                 bert_config_file: Optional[str] = None, attention_probs_keep_prob: Optional[float] = None,
                 hidden_keep_prob: Optional[float] = None, state_size: int = 256, device: str = "gpu"):
        super().__init__()

        self.device = torch.device("cuda" if torch.cuda.is_available() and device == "gpu" else "cpu")
        self.n_deps = n_deps
        self.encoder_layer_ids = encoder_layer_ids
        self.state_size = state_size
        if pretrained_bert:
            logger.debug(f"From pretrained {pretrained_bert}.")
            config = AutoConfig.from_pretrained(pretrained_bert, output_attentions=False, output_hidden_states=False)
            self.encoder = AutoModel.from_pretrained(pretrained_bert, config=config)

        elif bert_config_file and Path(bert_config_file).is_file():
            # The configuration must be bound to `config`: the layers below read
            # `config.hidden_size` and `config.hidden_dropout_prob`.
            # `AutoConfig.from_pretrained` accepts a path to a configuration JSON file.
            config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))
            if attention_probs_keep_prob is not None:
                config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
            if hidden_keep_prob is not None:
                config.hidden_dropout_prob = 1.0 - hidden_keep_prob
            # Auto classes cannot be instantiated directly; `from_config` builds an
            # encoder with freshly initialised weights.
            self.encoder = AutoModel.from_config(config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        self.head_embs1 = torch.nn.Linear(config.hidden_size, state_size)
        self.dep_embs1 = torch.nn.Linear(config.hidden_size, state_size)
        self.head_embs2 = torch.nn.Linear(config.hidden_size, state_size)
        self.dep_embs2 = torch.nn.Linear(config.hidden_size, state_size)
        # Learnable vectors prepended to the `dep` projections as an extra leading position.
        self.zero_emb1 = torch.nn.Parameter(torch.randn(state_size, ), requires_grad=True)
        self.zero_emb2 = torch.nn.Parameter(torch.randn(state_size, ), requires_grad=True)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.biaf_head = Biaffine(state_size, state_size, 1)
        self.biaf_dep = Biaffine(state_size, state_size, n_deps)

    def forward(self, input_ids: np.ndarray, attention_mask: np.ndarray, subtoken_mask: np.ndarray,
                y_heads=None, y_dep=None):
        """Compute the training loss or, without targets, head probabilities and dependency ids.

        Args:
            input_ids: numpy array with subtoken indices
            attention_mask: numpy array marking the subtokens to attend to
            subtoken_mask: numpy array that is summed over the last axis to get the number of
                tokens per sample; presumably it marks the first subtoken of every token
            y_heads: per-sample sequences of gold head ids, or None for inference
            y_dep: per-sample sequences of gold dependency ids; required whenever `y_heads` is given

        Returns:
            the sum of head and dependency losses if `y_heads` is given, otherwise a tuple of
            (per-sample head probability matrices, per-sample lists of dependency ids), both
            truncated to the sample's number of tokens
        """
        input_ids = torch.from_numpy(input_ids).to(self.device)
        attention_mask = torch.from_numpy(attention_mask).to(self.device)
        # stays on CPU: it is consumed by token_from_subtoken and used to compute lengths
        subtoken_mask = torch.from_numpy(subtoken_mask)

        outputs = self.encoder(input_ids, attention_mask, output_hidden_states=True)
        hidden_states = outputs.hidden_states
        layer_output_list = []
        for layer_id in self.encoder_layer_ids:
            # Non-negative ids are shifted by one (presumably because hidden_states[0] holds
            # the embedding output); -1 keeps addressing the last layer.
            layer_id = layer_id + 1 if layer_id != -1 else layer_id
            layer_output_list.append(hidden_states[layer_id])
        # Sum the selected layers element-wise.
        layer_output = torch.stack(layer_output_list)
        layer_output = torch.sum(layer_output, dim=0)

        layer_output = token_from_subtoken(layer_output, subtoken_mask)
        bs = layer_output.size(0)

        layer_output = layer_output.float().to(self.device)
        lengths = torch.sum(subtoken_mask, dim=-1)

        head1 = self.head_embs1(layer_output)
        dep1 = self.dep_embs1(layer_output)
        # Prepend the learnable extra position, so `dep*` has one position more than `head*`.
        dep1_zero = [self.zero_emb1 for _ in range(bs)]
        dep1_zero = torch.stack(dep1_zero).unsqueeze(1).to(self.device)
        dep1 = torch.cat([dep1_zero, dep1], dim=1)

        head2 = self.head_embs2(layer_output)
        dep2 = self.dep_embs2(layer_output)
        dep2_zero = [self.zero_emb2 for _ in range(bs)]
        dep2_zero = torch.stack(dep2_zero).unsqueeze(1).to(self.device)
        dep2 = torch.cat([dep2_zero, dep2], dim=1)

        head1 = self.dropout(head1)
        dep1 = self.dropout(dep1)
        head2 = self.dropout(head2)
        dep2 = self.dropout(dep2)

        # (bs, n, n + 1): score of every candidate head position for every token
        logits_head_init = self.biaf_head(head1, dep1).squeeze_(3)
        # (bs, n, n + 1, n_deps): dependency label scores for every (token, head) pair
        logits_deprel = self.biaf_dep(head2, dep2)
        mask = mask_arc(lengths, mask_diag=False)
        if mask is not None:
            # Padded positions get a fixed low score before the softmax.
            logits_head_init.masked_fill_(mask.logical_not().to(logits_head_init.device), -10.0)
        logits_head = F.softmax(logits_head_init, dim=-1)

        head_loss, dep_loss = None, None
        if y_heads is not None:
            y_heads = tuple(torch.LongTensor(yh).to(self.device) for yh in y_heads)
            # -1 marks padding and is ignored by the loss.
            y_heads_pd = nn.utils.rnn.pad_sequence(y_heads, batch_first=True, padding_value=-1)

            logits_head_flatten = logits_head.contiguous().view(-1, logits_head.size(-1))
            y_heads_flatten = y_heads_pd.contiguous().view(-1)
            # NOTE(review): `logits_head` is already a softmax output and F.cross_entropy applies
            # log-softmax again — confirm this is intended before changing it.
            head_loss = F.cross_entropy(logits_head_flatten, y_heads_flatten, ignore_index=-1, reduction="sum")
            head_loss.div_((y_heads_flatten != -1).sum())

            y_dep = tuple(torch.LongTensor(ydp).to(self.device) for ydp in y_dep)
            y_dep_pd = nn.utils.rnn.pad_sequence(y_dep, batch_first=True, padding_value=-1)
            # Padding (-1) is not a valid gather index; it is replaced by 0 and the corresponding
            # positions are ignored through `y_dep_pd`.
            y_heads_new = y_heads_pd.masked_fill(y_heads_pd == -1, 0)
            gather_index = y_heads_new.view(*y_heads_new.size(), 1, 1).expand(-1, -1, -1, logits_deprel.size(-1))

            # Pick, for every token, the label scores that belong to its gold head.
            logits_deprel = torch.gather(logits_deprel, dim=2, index=gather_index)
            logits_deprel_flatten = logits_deprel.contiguous().view(-1, logits_deprel.size(-1))
            y_dep_flatten = y_dep_pd.contiguous().view(-1)
            dep_loss = F.cross_entropy(logits_deprel_flatten, y_dep_flatten, ignore_index=-1, reduction="sum")
            dep_loss.div_((y_dep_flatten != -1).sum())
        else:
            logits_head = logits_head.detach().cpu().numpy()
            head_ids = np.argmax(logits_head, axis=-1).tolist()

            # Pick, for every token, the label scores that belong to its predicted head.
            head_ids_new = torch.LongTensor(head_ids)
            steps = torch.arange(head_ids_new.size(1))
            logits_deprel = [logits_deprel[i, steps, heads] for i, heads in enumerate(head_ids_new)]
            logits_deprel = torch.stack(logits_deprel, dim=0)
            deprels = logits_deprel.argmax(dim=2).detach().cpu().numpy().tolist()

            # Cut the padding off: keep `l` tokens and `l + 1` candidate head positions.
            head_probas = [head_probas_list[:l, :l + 1] for l, head_probas_list in zip(lengths, logits_head)]
            deprels = [deprel[:l] for l, deprel in zip(lengths, deprels)]

        if y_heads is not None:
            return head_loss + dep_loss
        else:
            return head_probas, deprels


@register('torch_transformers_syntax_parser')
class TorchTransformersSyntaxParser(TorchModel):
    """Transformer-based syntax parser on PyTorch. For every text token it predicts the
       probabilities of candidate heads and the id of the syntax dependency.

    Args:
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        n_deps: number of syntax dependencies
        encoder_layer_ids: indexes of the encoder layers whose outputs are passed to the
            biaffine layers that predict heads and dependencies
        state_size: size of the dense layers which follow the transformer encoder
        attention_probs_keep_prob: keep_prob for Bert self-attention layers
        hidden_keep_prob: keep_prob for Bert hidden layers
        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name
    """

    def __init__(self, pretrained_bert: str,
                 n_deps: int,
                 encoder_layer_ids: List[int] = (-1,),
                 state_size: int = 256,
                 attention_probs_keep_prob: Optional[float] = None,
                 hidden_keep_prob: Optional[float] = None,
                 bert_config_file: Optional[str] = None,
                 **kwargs) -> None:
        network = SyntaxParserNetwork(
            n_deps=n_deps,
            pretrained_bert=pretrained_bert,
            encoder_layer_ids=encoder_layer_ids,
            bert_config_file=bert_config_file,
            attention_probs_keep_prob=attention_probs_keep_prob,
            hidden_keep_prob=hidden_keep_prob,
            state_size=state_size,
        )
        super().__init__(network, **kwargs)

    def train_on_batch(self, input_ids: Union[List[List[int]], np.ndarray],
                       input_masks: Union[List[List[int]], np.ndarray],
                       y_masks: Union[List[List[int]], np.ndarray],
                       y_heads: List[List[int]], y_dep: List[List[int]]) -> Dict:
        """Run one optimisation step on a batch.

        Args:
            input_ids: indices of the subwords
            input_masks: mask that determines where to attend and where not to
            y_masks: mask which determines the first subword units in the word
            y_heads: for each token - id of the token which is its head in the syntax tree
            y_dep: syntax dependencies for each token

        Returns:
            dict with the batch loss under the key ``'loss'``
        """
        self.optimizer.zero_grad()
        batch_loss = self.model(input_ids, input_masks, y_masks, y_heads, y_dep)
        batch_loss.backward()
        if self.clip_norm:
            # Limit the gradient norm to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm)
        self.optimizer.step()

        return {'loss': batch_loss.item()}

    def __call__(self, input_ids: Union[List[List[int]], np.ndarray],
                 input_masks: Union[List[List[int]], np.ndarray],
                 y_masks: Union[List[List[int]], np.ndarray]) -> Tuple[List[List[List[float]]], List[List[int]]]:
        """Predict head probabilities and dependency ids for tokens.

        Args:
            input_ids: indices of the subwords
            input_masks: mask that determines where to attend and where not to
            y_masks: mask which determines the first subword units in the word

        Returns:
            Probas of heads and dependency ids for each token (not subtoken)

        """
        with torch.no_grad():
            heads, deps = self.model(input_ids, input_masks, y_masks)
        return heads, deps


================================================
FILE: deeppavlov/models/vectorizers/__init__.py
================================================


================================================
FILE: deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import Counter
from logging import getLogger
from typing import List, Any, Generator, Tuple, KeysView, ValuesView, Dict, Optional

import numpy as np
import scipy as sp
from scipy import sparse
from sklearn.utils import murmurhash3_32

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.estimator import Estimator

logger = getLogger(__name__)

Sparse = sp.sparse.csr_matrix


def hash_(token: str, hash_size: int) -> int:
    """Map a token to a bucket index by hashing it and reducing modulo ``hash_size``.

    Args:
        token: a word
        hash_size: hash size (number of buckets)

    Returns:
        int, hashed token

    """
    unsigned_hash = murmurhash3_32(token, positive=True)
    return unsigned_hash % hash_size


@register('hashing_tfidf_vectorizer')
class HashingTfIdfVectorizer(Estimator):
    """Create a tfidf matrix from collection of documents of size [n_documents X n_features(hash_size)].

    Args:
        tokenizer: a tokenizer class
        hash_size: a hash size, power of two
        doc_index: a dictionary of document ids and their titles
        save_path: a path to **.npz** file where tfidf matrix is saved
        load_path: a path to **.npz** file where tfidf matrix is loaded from

    Attributes:
        hash_size: a hash size
        tokenizer: instance of a tokenizer class
        term_freqs: a dictionary with tfidf terms and their frequencies
        doc_index: provided by a user ids or generated automatically ids
        rows: tfidf matrix rows corresponding to terms
        cols: tfidf matrix cols corresponding to docs
        data: tfidf matrix data corresponding to tfidf values

    """

    def __init__(self, tokenizer: Component, hash_size: int = 2 ** 24, doc_index: Optional[dict] = None,
                 save_path: Optional[str] = None, load_path: Optional[str] = None, **kwargs):

        mode = kwargs.get('mode', 'infer')
        super().__init__(save_path=save_path, load_path=load_path, mode=mode)

        self.hash_size = hash_size
        self.tokenizer = tokenizer
        self.rows = []
        self.cols = []
        self.data = []

        if mode == 'infer':
            # In inference mode everything, including the hash size, comes from the saved file.
            self.tfidf_matrix, opts = self.load()
            self.ngram_range = opts['ngram_range']
            self.hash_size = opts['hash_size']
            self.term_freqs = opts['term_freqs'].squeeze()
            self.doc_index = opts['doc_index']
            self.index2doc = self.get_index2doc()
        else:
            self.term_freqs = None
            self.doc_index = doc_index or {}

    def __call__(self, questions: List[str]) -> Sparse:
        """Transform input list of documents to tfidf vectors.

        Args:
            questions: a list of input strings

        Returns:
            transformed documents as a csr_matrix with shape [n_documents X :attr:`hash_size`];
            an empty input yields a matrix with zero rows

        """

        sp_tfidfs = []
        # The number of indexed documents does not change while a batch is being transformed.
        size = len(self.doc_index)

        for question in questions:
            ngrams = list(self.tokenizer([question]))
            hashes = [hash_(ngram, self.hash_size) for ngram in ngrams[0]]

            if not hashes:
                # Nothing to weight: the question is represented by an all-zero row.
                sp_tfidfs.append(Sparse((1, self.hash_size)))
                continue

            hashes_unique, q_hashes = np.unique(hashes, return_counts=True)
            tfs = np.log1p(q_hashes)

            # Number of documents that contain each hashed term.
            Ns = self.term_freqs[hashes_unique]
            idfs = np.log((size - Ns + 0.5) / (Ns + 0.5))
            # Terms occurring in more than half of the documents would get a negative weight.
            idfs[idfs < 0] = 0

            tfidf = np.multiply(tfs, idfs).astype("float32")

            # Single-row CSR matrix: all values belong to row 0.
            indptr = np.array([0, len(hashes_unique)])
            sp_tfidf = Sparse((tfidf, hashes_unique, indptr), shape=(1, self.hash_size))
            sp_tfidfs.append(sp_tfidf)

        if not sp_tfidfs:
            # `vstack` raises on an empty list of blocks; return an empty matrix instead.
            return Sparse((0, self.hash_size), dtype="float32")

        transformed = sp.sparse.vstack(sp_tfidfs)
        return transformed

    def get_index2doc(self) -> Dict[int, Any]:
        """Invert doc_index.

        Returns:
            inverted doc_index dict (values of :attr:`doc_index` become keys)

        """
        return dict(zip(self.doc_index.values(), self.doc_index.keys()))

    def get_counts(self, docs: List[str], doc_ids: List[Any]) \
            -> Generator[Tuple[KeysView, ValuesView, List[int]], Any, None]:
        """Get term counts for a list of documents.

        Args:
            docs: a list of input documents
            doc_ids: a list of document ids corresponding to input documents

        Yields:
            a tuple of term hashes, count values and column ids

        Returns:
            None

        """
        logger.debug("Tokenizing batch...")
        batch_ngrams = list(self.tokenizer(docs))
        logger.debug("Counting hash...")
        doc_id = iter(doc_ids)
        for ngrams in batch_ngrams:
            counts = Counter([hash_(gram, self.hash_size) for gram in ngrams])
            hashes = counts.keys()
            values = counts.values()
            _id = self.doc_index[next(doc_id)]
            # Every counted term of this document goes to the same matrix column.
            if values:
                col_id = [_id] * len(values)
            else:
                col_id = []
            yield hashes, values, col_id

    def get_count_matrix(self, row: List[int], col: List[int], data: List[int], size: int) \
            -> Sparse:
        """Get count matrix.

        Args:
            row: tfidf matrix rows corresponding to terms
            col:  tfidf matrix cols corresponding to docs
            data: tfidf matrix data corresponding to tfidf values
            size: :attr:`doc_index` size

        Returns:
            a count csr_matrix

        """
        count_matrix = Sparse((data, (row, col)), shape=(self.hash_size, size))
        count_matrix.sum_duplicates()
        return count_matrix

    @staticmethod
    def get_tfidf_matrix(count_matrix: Sparse) -> Tuple[Sparse, np.array]:
        """Convert a count matrix into a tfidf matrix.

        Args:
            count_matrix: a count matrix

        Returns:
            a tuple of tfidf matrix and term frequencies

        """

        binary = (count_matrix > 0).astype(int)
        # Number of documents (columns) in which every term (row) occurs.
        term_freqs = np.array(binary.sum(1)).squeeze()
        idfs = np.log((count_matrix.shape[1] - term_freqs + 0.5) / (term_freqs + 0.5))
        idfs[idfs < 0] = 0
        idfs = sp.sparse.diags(idfs, 0)
        tfs = count_matrix.log1p()
        # Scale every row of term frequencies by the idf of its term.
        tfidfs = idfs.dot(tfs)
        return tfidfs, term_freqs

    def save(self) -> None:
        """Save tfidf matrix into **.npz** format.

        Returns:
            None

        """
        logger.info("Saving tfidf matrix to {}".format(self.save_path))
        count_matrix = self.get_count_matrix(self.rows, self.cols, self.data,
                                             size=len(self.doc_index))
        tfidf_matrix, term_freqs = self.get_tfidf_matrix(count_matrix)
        self.term_freqs = term_freqs

        opts = {'hash_size': self.hash_size,
                'ngram_range': self.tokenizer.ngram_range,
                'doc_index': self.doc_index,
                'term_freqs': self.term_freqs}

        data = {
            'data': tfidf_matrix.data,
            'indices': tfidf_matrix.indices,
            'indptr': tfidf_matrix.indptr,
            'shape': tfidf_matrix.shape,
            'opts': opts
        }
        np.savez(self.save_path, **data)

        # release memory
        self.reset()

    def reset(self) -> None:
        """Clear :attr:`rows`, :attr:`cols` and :attr:`data`

        Returns:
            None

        """
        self.rows.clear()
        self.cols.clear()
        self.data.clear()

    def load(self) -> Tuple[Sparse, Dict]:
        """Load a tfidf matrix as csr_matrix.

        Returns:
            a tuple of tfidf matrix and csr data.

        Raises:
            FileNotFoundError if :attr:`load_path` doesn't exist.

        Todo:
            * implement loading from URL

        """
        if not self.load_path.exists():
            raise FileNotFoundError("HashingTfIdfVectorizer path doesn't exist!")

        logger.debug("Loading tfidf matrix from {}".format(self.load_path))
        # SECURITY NOTE: `allow_pickle=True` is needed for the pickled `opts` dict, but it executes
        # arbitrary code from a malicious file — load only trusted archives.
        # The context manager closes the underlying archive once the arrays are read.
        with np.load(self.load_path, allow_pickle=True) as loader:
            matrix = Sparse((loader['data'], loader['indices'],
                             loader['indptr']), shape=loader['shape'])
            opts = loader['opts'].item(0)
        return matrix, opts

    def partial_fit(self, docs: List[str], doc_ids: List[Any], doc_nums: List[int]) -> None:
        """Partially fit on one batch.

        Args:
            docs: a list of input documents
            doc_ids: a list of document ids corresponding to input documents
            doc_nums: a list of document integer ids as they appear in a database

        Returns:
            None

        """
        for doc_id, i in zip(doc_ids, doc_nums):
            self.doc_index[doc_id] = i

        # `get_counts` yields (hashes, counts, column ids): hashes are rows, counts are data.
        for batch_rows, batch_data, batch_cols in self.get_counts(docs, doc_ids):
            self.rows.extend(batch_rows)
            self.cols.extend(batch_cols)
            self.data.extend(batch_data)

    def fit(self, docs: List[str], doc_ids: List[Any], doc_nums: List[int]) -> None:
        """Fit the vectorizer from scratch, discarding previously accumulated counts.

        Args:
            docs: a list of input documents
            doc_ids: a list of document ids corresponding to input documents
            doc_nums: a list of document integer ids as they appear in a database

        Returns:
            None

        """
        self.doc_index = {}
        self.rows = []
        self.cols = []
        self.data = []
        return self.partial_fit(docs, doc_ids, doc_nums)


================================================
FILE: deeppavlov/paramsearch.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import sys
from copy import deepcopy
from itertools import product
from logging import getLogger
from pathlib import Path

import numpy as np
from sklearn.model_selection import train_test_split

from deeppavlov.core.commands.train import train_evaluate_model_from_config, get_iterator_from_config, \
    read_data_by_config
from deeppavlov.core.commands.utils import parse_config
from deeppavlov.core.common.cross_validation import calc_cv_score
from deeppavlov.core.common.file import save_json, find_config, read_json
from deeppavlov.core.common.params_search import ParamsSearch

# Append the parent of the directory containing this file to the import path.
# Note that this happens after the `deeppavlov` imports above have already run.
p = (Path(__file__) / ".." / "..").resolve()
sys.path.append(str(p))

log = getLogger(__name__)

# Command line interface of the parameter search script.
parser = argparse.ArgumentParser()
parser.add_argument("config_path", help="path to a pipeline json config", type=str)
# Kept as a string: `main` accepts either a number of folds or the literal 'loo'.
parser.add_argument("--folds", help="number of folds", type=str, default=None)
parser.add_argument("--search_type", help="search type: grid or random search", type=str, default='grid')


def get_best_params(combinations, scores, param_names, target_metric):
    """Return the parameter combination with the highest score.

    Args:
        combinations: sequence of parameter value tuples, aligned with `scores`
        scores: score of every combination
        param_names: names of the parameters, aligned with the values of a combination
        target_metric: key under which the best score is stored in the result

    Returns:
        dict mapping parameter names to the best values, plus `target_metric` -> best score
    """
    best_idx = np.argmax(scores)
    winner = combinations[best_idx]
    best_params = {name: value for name, value in zip(param_names, winner)}
    best_params[target_metric] = scores[best_idx]
    return best_params


def main():
    """Grid-search the ``search_choice`` parameters of a pipeline config and save the best config.

    Command-line arguments are read from the module-level ``parser``. Every
    combination of the ``search_choice`` values found in the config is
    evaluated either by cross-validation (when ``--folds`` is a number or
    ``'loo'``) or on the validation split (when ``--folds`` is omitted). The
    values of the best-scoring combination are then written into the config,
    which is saved next to the original file with the ``.cvbest.json`` suffix.

    Raises:
        NotImplementedError: If ``--folds`` is neither a number nor ``'loo'``,
            or if ``--search_type`` is not ``'grid'``.
    """
    params_helper = ParamsSearch()

    args = parser.parse_args()
    is_loo = False
    n_folds = None
    if args.folds == 'loo':
        is_loo = True
    elif args.folds is not None:
        if not args.folds.isdigit():
            raise NotImplementedError('Not implemented this type of CV')
        n_folds = int(args.folds)

    # read config
    pipeline_config_path = find_config(args.config_path)
    config_init = read_json(pipeline_config_path)
    config = parse_config(config_init)
    data = read_data_by_config(config)
    # the first metric listed in the config is the one being optimised
    target_metric = parse_config(config_init)['train']['metrics'][0]
    if isinstance(target_metric, dict):
        target_metric = target_metric['name']

    # get all params for search
    param_paths = list(params_helper.find_model_path(config, 'search_choice'))
    param_values = []
    param_names = []
    for path in param_paths:
        value = params_helper.get_value_from_config(config, path)
        param_names.append(path[-1])
        param_values.append(value['search_choice'])

    # find optimal params
    if args.search_type == 'grid':
        # generate params combinations for grid search
        combinations = list(product(*param_values))

        # calculate cv scores
        scores = []
        for comb in combinations:
            # start every run from the untouched initial config
            config = deepcopy(config_init)
            for param_path, param_value in zip(param_paths, comb):
                params_helper.insert_value_or_dict_into_config(config, param_path, param_value)
            config = parse_config(config)

            if n_folds is not None or is_loo:
                # CV for model evaluation
                score_dict = calc_cv_score(config, data=data, n_folds=n_folds, is_loo=is_loo)
                score = score_dict[next(iter(score_dict))]
            else:
                # train/valid for model evaluation
                data_to_evaluate = data.copy()
                if len(data_to_evaluate['valid']) == 0:
                    # no validation split provided: hold out 20% of the training data
                    data_to_evaluate['train'], data_to_evaluate['valid'] = train_test_split(data_to_evaluate['train'],
                                                                                            test_size=0.2)
                iterator = get_iterator_from_config(config, data_to_evaluate)
                score = train_evaluate_model_from_config(config, iterator=iterator)['valid'][target_metric]

            scores.append(score)

        # get model with best score
        best_params_dict = get_best_params(combinations, scores, param_names, target_metric)
        # Keep the winning combination itself: it stays aligned with ``param_paths`` even
        # when two searched parameters share the same name (the dict above is keyed by name
        # only and would collapse them).
        best_combination = combinations[int(np.argmax(scores))]
        log.info('Best model params: {}'.format(best_params_dict))
    else:
        raise NotImplementedError('Not implemented this type of search')

    # save config
    best_config = config_init
    for param_path, param_value in zip(param_paths, best_combination):
        params_helper.insert_value_or_dict_into_config(best_config, param_path, param_value)

    best_model_filename = pipeline_config_path.with_suffix('.cvbest.json')
    save_json(best_config, best_model_filename)
    log.info('Best model saved in json-file: {}'.format(best_model_filename))


# try to run (config_path is a positional argument):
# path_to_config.json --folds 2
if __name__ == "__main__":
    main()


================================================
FILE: deeppavlov/requirements/datasets.txt
================================================
datasets>=1.16.0,<2.5.0;python_version<="3.10"
datasets==2.2.*;python_version=="3.11.*"


================================================
FILE: deeppavlov/requirements/dependency_decoding.txt
================================================
ufal.chu-liu-edmonds


================================================
FILE: deeppavlov/requirements/en_core_web_sm.txt
================================================
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl
spacy


================================================
FILE: deeppavlov/requirements/faiss.txt
================================================
faiss-cpu==1.7.2;python_version<="3.10"
faiss-cpu==1.7.4;python_version=="3.11.*"


================================================
FILE: deeppavlov/requirements/fasttext.txt
================================================
fasttext==0.9.*


================================================
FILE: deeppavlov/requirements/hdt.txt
================================================
hdt==2.3


================================================
FILE: deeppavlov/requirements/kenlm.txt
================================================
pypi-kenlm==0.1.20220713;python_version<="3.10"
kenlm==0.2.*;python_version=="3.11.*"


================================================
FILE: deeppavlov/requirements/lxml.txt
================================================
lxml==4.9.*


================================================
FILE: deeppavlov/requirements/opt_einsum.txt
================================================
opt-einsum==3.3.*


================================================
FILE: deeppavlov/requirements/protobuf.txt
================================================
protobuf<=3.20


================================================
FILE: deeppavlov/requirements/pytorch.txt
================================================
torch>=1.6.0,<1.14.0


================================================
FILE: deeppavlov/requirements/rapidfuzz.txt
================================================
rapidfuzz==2.1.*


================================================
FILE: deeppavlov/requirements/razdel.txt
================================================
razdel==0.5.0


================================================
FILE: deeppavlov/requirements/ru_core_news_sm.txt
================================================
https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.5.0/ru_core_news_sm-3.5.0-py3-none-any.whl
spacy


================================================
FILE: deeppavlov/requirements/sacremoses.txt
================================================
sacremoses==0.0.53


================================================
FILE: deeppavlov/requirements/sentencepiece.txt
================================================
sentencepiece==0.2.0


================================================
FILE: deeppavlov/requirements/slovnet.txt
================================================
slovnet==0.5.*
navec


================================================
FILE: deeppavlov/requirements/sortedcontainers.txt
================================================
sortedcontainers==2.4.*


================================================
FILE: deeppavlov/requirements/torchcrf.txt
================================================
pytorch-crf==0.7.*


================================================
FILE: deeppavlov/requirements/transformers.txt
================================================
transformers>=4.13.0,<4.25.0;python_version<"3.8"
transformers==4.30.0;python_version>="3.8"


================================================
FILE: deeppavlov/requirements/udapi.txt
================================================
udapi==0.3.*


================================================
FILE: deeppavlov/requirements/whapi.txt
================================================
bs4
whapi==0.6.*


================================================
FILE: deeppavlov/settings.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

from deeppavlov.core.common.paths import get_settings_path, populate_settings_dir

# Command-line interface of the settings utility.
parser = argparse.ArgumentParser()

# Boolean flag: when given, main() repopulates the settings directory with defaults.
parser.add_argument("-d", "--default", action="store_true", help="return to defaults")


def main():
    """DeepPavlov console configuration utility.

    Without ``--default`` only the current settings path is printed. With
    ``--default`` the settings directory is repopulated with default files and
    the outcome is reported.
    """
    args = parser.parse_args()
    settings_dir = get_settings_path()

    if not args.default:
        print(f'Current DeepPavlov settings path: {settings_dir}')
        return

    if populate_settings_dir(force=True):
        message = f'Populated {settings_dir} with default settings files'
    else:
        message = f'{settings_dir} is already a default settings directory'
    print(message)


# Run the settings utility only when this module is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: deeppavlov/utils/__init__.py
================================================


================================================
FILE: deeppavlov/utils/benchmarks/__init__.py
================================================


================================================
FILE: deeppavlov/utils/benchmarks/benchmarks.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from collections import defaultdict
from logging import getLogger

import numpy as np
from tqdm import tqdm

from deeppavlov import build_model
from deeppavlov.core.commands.train import read_data_by_config, get_iterator_from_config
from deeppavlov.core.commands.utils import parse_config, expand_path
from deeppavlov.core.common.file import save_jsonl

log = getLogger(__name__)

# Command-line interface of the benchmark submission utility.
parser = argparse.ArgumentParser()

parser.add_argument('config_path', help='path to a pipeline json config', type=str)
parser.add_argument('benchmark_name', help='benchmark name to be submitted',
                    choices=['glue', 'superglue', 'russian_superglue'])
parser.add_argument('-o', '--output-file', default=None, help='path to save output', type=str)
parser.add_argument('-d', '--download', action='store_true', help='download model components')

# Task name (as produced by split_config) -> stem of the default GLUE submission file name.
GLUE_TASKS = {
    'cola': 'CoLA',
    'mnli-m': 'MNLI-m',
    'mnli-mm': 'MNLI-mm',
    'mrpc': 'MRPC',
    'qnli': 'QNLI',
    'qqp': 'QQP',
    'rte': 'RTE',
    'sst2': 'SST-2',
    'stsb': 'STS-B',
    'wnli': 'WNLI'
}

# Task name -> stem of the default SuperGLUE submission file name.
SUPER_GLUE_TASKS = {
    'copa': 'COPA',
    'multirc': 'MultiRC',
    'boolq': 'BoolQ',
    'record': 'ReCoRD',
    'wic': 'WiC'
}

# Task name -> stem of the default Russian SuperGLUE submission file name.
RSG_TASKS = {
    'lidirus': 'LiDiRus',
    'rcb': 'RCB',
    'parus': 'PARus',
    'muserc': 'MuSeRC',
    'terra': 'TERRa',
    'russe': 'RUSSE',
    'rwsd': 'RWSD',
    'danetqa': 'DaNetQA',
    'rucos': 'RuCoS'
}


def split_config(config_path, download):
    """Build the model, the test batch generator and the task name from a config.

    Args:
        config_path: Path to the model configuration file.
        download: Passed to ``build_model`` as its ``download`` argument.

    Returns:
        A ``(model, data_gen, task_name)`` tuple. ``data_gen`` yields
        unshuffled test batches of size 1. ``task_name`` is the dataset reader
        name, except that ``'mnli'`` is specialised to ``'mnli-m'`` or
        ``'mnli-mm'`` depending on the reader's ``valid`` split.
    """

    pipeline = parse_config(config_path)
    dataset = read_data_by_config(pipeline)
    batch_iterator = get_iterator_from_config(pipeline, dataset)

    reader_cfg = pipeline['dataset_reader']
    task_name = reader_cfg['name']
    if task_name == 'mnli':
        # matched and mismatched MNLI are submitted as separate tasks
        if reader_cfg['valid'] == 'validation_matched':
            task_name = 'mnli-m'
        else:
            task_name = 'mnli-mm'

    test_batches = batch_iterator.gen_batches(1, data_type='test', shuffle=False)
    return build_model(pipeline, download=download), test_batches, task_name


def get_predictions(model, data_gen, replace_word=None, round_res=False):
    """Collect one prediction per batch, optionally binarised or rounded.

    Args:
        model: Object whose ``compute`` method returns the predictions for a batch.
        data_gen: Iterable of ``(x, y)`` batches; only ``x`` is used.
        replace_word: If truthy, a prediction equal to it becomes 1 and any
            other prediction becomes 0. If falsy, predictions are kept as is.
        round_res: If True, predictions are rounded to 3 decimal places (used in stsb).

    Returns:
        A dict with two aligned lists: ``'index'`` (running batch number) and
        ``'prediction'``.
    """

    indices, predictions = [], []
    for position, (features, _) in enumerate(tqdm(data_gen)):
        # batches hold a single sample, so only the first output is relevant
        output = model.compute(features)[0]
        if replace_word:
            output = 1 if output == replace_word else 0
        if round_res:
            output = round(output, 3)
        indices.append(position)
        predictions.append(output)
    return {'index': indices, 'prediction': predictions}


def submit_glue(config_path, output_path, download):
    """Create a tab-separated submission file for a GLUE task.

    Args:
        config_path: Path to the model configuration file.
        output_path: Path to the output file. If falsy, the file name is derived
            from the task name via ``GLUE_TASKS``.
        download: Forwarded to ``split_config`` to control model download.

    Raises:
        ValueError: If the task name is not a known GLUE task.
    """

    model, data_gen, task_name = split_config(config_path, download)

    # tasks whose textual prediction is converted to 1 (this label) or 0 (anything else)
    positive_labels = {
        'cola': 'acceptable',
        'mrpc': 'equivalent',
        'sst2': 'positive',
        'wnli': 'entailment',
    }

    if task_name in positive_labels:
        submission = get_predictions(model, data_gen, positive_labels[task_name])
    elif task_name == 'stsb':
        # regression task: keep the raw value, rounded
        submission = get_predictions(model, data_gen, None, True)
    elif task_name.startswith('mnli') or task_name in GLUE_TASKS:
        submission = get_predictions(model, data_gen)
    else:
        raise ValueError(f'Unexpected GLUE task name: {task_name}')

    save_path = expand_path(output_path or f'{GLUE_TASKS[task_name]}.tsv')
    save_path.parent.mkdir(parents=True, exist_ok=True)

    # header row with the column names, followed by one row per prediction
    header = [list(submission.keys())]
    rows = np.array(list(submission.values())).transpose()
    np.savetxt(save_path, np.vstack((header, rows)), delimiter='\t', fmt='%s')
    log.info(f'Prediction saved to {save_path}')


def commonsense_reasoning_prediction(model, data_gen):
    """Common part for ReCoRD and RuCoS tasks that gets their predictions in needed format.

    Candidates sharing the same example key are grouped and, for every group,
    the candidate with the highest score in column 1 of the model output is
    selected.

    Args:
        model: Object whose ``compute`` method returns a 2-D array of scores for a batch.
        data_gen: Iterable of ``(x, y)`` batches; ``x[0]`` is a 5-tuple whose
            first item is the example key and whose fourth item is the candidate.

    Returns:
        A list of ``{'idx': ..., 'label': ...}`` dicts, one per example key, where
        ``idx`` is the integer taken from the second ``'-'``-separated part of the key.
    """

    grouped = defaultdict(lambda: {'predicted': [], 'probability': []})

    for batch, _ in tqdm(data_gen):
        example_key, _, _, candidate, _ = batch[0]
        score = model.compute(batch)[:, 1]
        bucket = grouped[example_key]
        bucket['predicted'].append(candidate)
        bucket['probability'].append(score)

    submission = []
    for example_key, bucket in grouped.items():
        best = np.argmax(bucket['probability'])
        submission.append({
            'idx': int(example_key.split('-')[1]),
            'label': bucket['predicted'][best],
        })
    return submission


def multi_sentence_comprehension_prediction(model, data_gen):
    """Common part for MultiRC and MuSeRC tasks that gets their predictions in needed format.

    Every sample is one (paragraph, question, answer) triple. Predictions are
    regrouped into one record per paragraph, each holding its questions, each
    of which holds its answers. Every answer is recorded exactly once.

    Args:
        model: Callable invoked as ``model([contexts], [answers], indices)``; the
            first element of its result is compared with the string ``'True'``.
        data_gen: Iterable of ``(x, y)`` batches; ``x[0]`` is a
            ``(contexts, answers, indices)`` triple where ``indices`` is a mapping
            with the ``'paragraph'``, ``'question'`` and ``'answer'`` keys.

    Returns:
        A list of paragraph records of the form
        ``{'idx': ..., 'passage': {'questions': [{'idx': ..., 'answers': [{'idx': ..., 'label': 0 or 1}]}]}}``
        in order of first appearance.
    """

    output = {}

    for x, _ in tqdm(data_gen):
        contexts, answers, indices = x[0]

        prediction = model([contexts], [answers], indices)

        paragraph_idx = indices['paragraph']
        question_idx = indices['question']
        answer_idx = indices['answer']

        label = int(prediction[0] == 'True')

        # Create the paragraph record on first sight, with no questions yet, so the
        # answer below is added through a single code path and never duplicated.
        paragraph = output.setdefault(paragraph_idx, {
            'idx': paragraph_idx,
            'passage': {'questions': []}
        })
        questions = paragraph['passage']['questions']

        question = next((el for el in questions if el['idx'] == question_idx), None)
        if question is None:
            question = {'idx': question_idx, 'answers': []}
            questions.append(question)

        question['answers'].append({'idx': answer_idx, 'label': label})

    submission = list(output.values())
    return submission


def submit_superglue(config_path, output_path, download):
    """Create a JSON Lines submission file for a SuperGLUE task.

    Args:
        config_path: Path to the model configuration file.
        output_path: Path to the output file. If None, the file name is derived
            from the task name via ``SUPER_GLUE_TASKS``.
        download: Forwarded to ``split_config`` to control model download.

    Raises:
        ValueError: If the task name is not a known SuperGLUE task.
    """

    model, data_gen, task_name = split_config(config_path, download)
    submission = []

    if task_name == 'record':
        submission = commonsense_reasoning_prediction(model, data_gen)

    elif task_name == 'multirc':
        submission = multi_sentence_comprehension_prediction(model, data_gen)

    elif task_name == 'copa':
        # the label is 1 when the model picks the second alternative, 0 otherwise
        submission = [
            {'idx': idx, 'label': int(model.compute(x)[0] == 'choice2')}
            for idx, (x, _) in enumerate(tqdm(data_gen))
        ]

    elif task_name in SUPER_GLUE_TASKS:
        for idx, (x, _) in enumerate(tqdm(data_gen)):
            label = model.compute(x)
            # peel nested lists until the bare prediction is reached
            while isinstance(label, list):
                label = label[0]
            submission.append({'idx': idx, 'label': label})

    else:
        raise ValueError(f'Unexpected SuperGLUE task name: {task_name}')

    if output_path is None:
        output_path = f'{SUPER_GLUE_TASKS[task_name]}.jsonl'
    save_path = expand_path(output_path)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    save_jsonl(submission, save_path)
    log.info(f'Prediction saved to {save_path}')


def submit_rsg(config_path, output_path, download):
    """Create a JSON Lines submission file for a Russian SuperGLUE task.

    Args:
        config_path: Path to the model configuration file.
        output_path: Path to the output file. If None, the file name is derived
            from the task name via ``RSG_TASKS``.
        download: Forwarded to ``split_config`` to control model download.

    Raises:
        ValueError: If the task name is not a known Russian SuperGLUE task.
    """

    model, data_gen, task_name = split_config(config_path, download)
    submission = []

    if task_name == 'rucos':
        submission = commonsense_reasoning_prediction(model, data_gen)

    elif task_name == 'muserc':
        submission = multi_sentence_comprehension_prediction(model, data_gen)

    elif task_name == 'parus':
        # the label is 1 when the model picks the second alternative, 0 otherwise
        submission = [
            {'idx': idx, 'label': int(model.compute(x)[0] == 'choice2')}
            for idx, (x, _) in enumerate(tqdm(data_gen))
        ]

    elif task_name in RSG_TASKS:
        for idx, (x, _) in enumerate(tqdm(data_gen)):
            label = model.compute(x)
            # peel nested lists until the bare prediction is reached
            while isinstance(label, list):
                label = label[0]
            submission.append({'idx': idx, 'label': label})

    else:
        raise ValueError(f'Unexpected Russian SuperGLUE task name: {task_name}')

    if output_path is None:
        output_path = f'{RSG_TASKS[task_name]}.jsonl'
    save_path = expand_path(output_path)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    save_jsonl(submission, save_path)
    log.info(f'Prediction saved to {save_path}')


def main():
    """Parse the command line and run the submission routine of the chosen benchmark."""
    args = parser.parse_args()
    handlers = {
        'glue': submit_glue,
        'superglue': submit_superglue,
        'russian_superglue': submit_rsg,
    }
    submit = handlers.get(args.benchmark_name)
    if submit is not None:
        submit(args.config_path, args.output_file, args.download)


# Run the benchmark submission utility only when this module is executed as a script.
if __name__ == '__main__':
    main()


================================================
FILE: deeppavlov/utils/connector/__init__.py
================================================
from .dialog_logger import DialogLogger


================================================
FILE: deeppavlov/utils/connector/dialog_logger.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from datetime import datetime
from logging import getLogger
from pathlib import Path
from typing import Any, Optional, Hashable

from deeppavlov.core.common.file import read_json
from deeppavlov.core.common.paths import get_settings_path
from deeppavlov.core.data.utils import jsonify_data

# Name of the dialog logger config file inside the DeepPavlov settings directory.
LOGGER_CONFIG_FILENAME = 'dialog_logger_config.json'
# strftime format used both for log file names and for per-message timestamps.
LOG_TIMESTAMP_FORMAT = '%Y-%m-%d_%H-%M-%S_%f'

log = getLogger(__name__)


class DialogLogger:
    """DeepPavlov dialog logging facility.

    DialogLogger is an entity which provides tools for dialogs logging.
    Utterances are written as one JSON object per line; when the current log
    file reaches the configured size a new file is started.

    Args:
        enabled: DialogLogger on/off flag. The logger is also enabled when the
            ``enabled`` key of the logger config is truthy.
        logger_name: Dialog logger name that is used for organising log files.
            Falls back to the ``logger_name`` key of the logger config.

    Attributes:
        config: Logger configuration read from the settings directory.
        enabled: Whether utterances are actually logged.
        logger_name: Dialog logger name which is used for organising log files.
            Set only when the logger is enabled.
        log_max_size: Maximum size of log file, kb. Set only when the logger is enabled.
        log_file: Current log file object. Set only when the logger is enabled.
    """
    def __init__(self, enabled: bool = False, logger_name: Optional[str] = None) -> None:
        self.config: dict = read_json(get_settings_path() / LOGGER_CONFIG_FILENAME)
        self.enabled: bool = enabled or self.config['enabled']

        if self.enabled:
            self.logger_name: str = logger_name or self.config['logger_name']
            self.log_max_size: int = self.config['logfile_max_size_kb']
            self.log_file = self._get_log_file()
            self.log_file.write('"Dialog logger initiated"\n')

    @staticmethod
    def _get_timestamp_utc_str() -> str:
        """Returns str converted current UTC timestamp.

        Returns:
            utc_timestamp_str: str converted current UTC timestamp.
        """
        utc_timestamp_str = datetime.strftime(datetime.utcnow(), LOG_TIMESTAMP_FORMAT)
        return utc_timestamp_str

    def _get_log_file(self):
        """Returns opened file object for writing dialog logs.

        The file is created in ``<log_path>/<logger_name>/`` (the directory is
        created if needed) and named after the current UTC timestamp and the
        logger name. It is opened line-buffered in append mode.

        Returns:
            log_file: opened Python file object.
        """
        log_dir: Path = Path(self.config['log_path']).expanduser().resolve() / self.logger_name
        log_dir.mkdir(parents=True, exist_ok=True)
        log_file_path = Path(log_dir, f'{self._get_timestamp_utc_str()}_{self.logger_name}.log')
        log_file = open(log_file_path, 'a', buffering=1, encoding='utf8')
        return log_file

    def _log(self, utterance: Any, direction: str, dialog_id: Optional[Hashable]=None):
        """Logs single dialog utterance to current dialog log file.

        When the current file has reached the size limit, a new file is opened
        first and the utterance is written to it.

        Args:
            utterance: Dialog utterance.
            direction: 'in' or 'out' utterance direction.
            dialog_id: Dialog ID.
        """
        if isinstance(utterance, str):
            pass
        elif isinstance(utterance, (list, dict)):
            utterance = jsonify_data(utterance)
        else:
            utterance = str(utterance)

        dialog_id = str(dialog_id) if not isinstance(dialog_id, str) else dialog_id

        # Rotate before writing so that the utterance which triggers the rotation
        # ends up in the new file instead of being dropped.
        if self.log_file.tell() >= self.log_max_size * 1024:
            self.log_file.close()
            self.log_file = self._get_log_file()

        try:
            log_msg = {}
            log_msg['timestamp'] = self._get_timestamp_utc_str()
            log_msg['dialog_id'] = dialog_id
            log_msg['direction'] = direction
            log_msg['message'] = utterance
            log_str = json.dumps(log_msg, ensure_ascii=self.config['ensure_ascii'])
            self.log_file.write(f'{log_str}\n')
        except IOError:
            # dialog logging is best-effort: report the failure and carry on
            log.error('Failed to write dialog log.')

    def log_in(self, utterance: Any, dialog_id: Optional[Hashable] = None) -> None:
        """Wraps _log method for all input utterances.

        Does nothing when the logger is disabled.

        Args:
            utterance: Dialog utterance.
            dialog_id: Dialog ID.
        """
        if self.enabled:
            self._log(utterance, 'in', dialog_id)

    def log_out(self, utterance: Any, dialog_id: Optional[Hashable] = None) -> None:
        """Wraps _log method for all output utterances.

        Does nothing when the logger is disabled.

        Args:
            utterance: Dialog utterance.
            dialog_id: Dialog ID.
        """
        if self.enabled:
            self._log(utterance, 'out', dialog_id)


================================================
FILE: deeppavlov/utils/pip_wrapper/__init__.py
================================================
from .pip_wrapper import *


================================================
FILE: deeppavlov/utils/pip_wrapper/pip_wrapper.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import subprocess
import sys
from logging import getLogger
from pathlib import Path

from deeppavlov.core.commands.utils import expand_path, parse_config
from deeppavlov.core.data.utils import get_all_elems_from_json

log = getLogger(__name__)

# Matches a requirement that starts with the bare package name ``tensorflow``
# (optionally surrounded by whitespace) followed by a version/marker character
# or the end of the string. Group 1 captures that following character.
_tf_re = re.compile(r'\s*tensorflow\s*([<=>;]|$)')


def install(*packages):
    """Install packages with pip, running it under the current interpreter.

    If ``tensorflow`` is requested while ``pip freeze`` reports
    ``tensorflow-gpu``, the requirement is rewritten to ``tensorflow-gpu``.
    All whitespace is stripped from every requirement before it is passed to pip.

    Args:
        *packages: Requirement specifiers to install.

    Returns:
        The return value of ``subprocess.check_call`` for the pip command.

    Raises:
        subprocess.CalledProcessError: If a pip invocation exits with a non-zero status.
    """
    # pip freeze is only queried when tensorflow is actually requested
    if any(_tf_re.match(pkg) for pkg in packages):
        frozen = subprocess.check_output([sys.executable, '-m', 'pip', 'freeze'],
                                         env=os.environ.copy())
        if b'tensorflow-gpu' in frozen:
            log.warning('found tensorflow-gpu installed, so upgrading it instead of tensorflow')
            packages = [_tf_re.sub(r'tensorflow-gpu\1', pkg) for pkg in packages]

    command = [sys.executable, '-m', 'pip', 'install']
    command.extend(re.sub(r'\s', '', pkg) for pkg in packages)
    return subprocess.check_call(command, env=os.environ.copy())


def get_config_requirements(config: [str, Path, dict]):
    """Collect the requirements of a config and of every config it references.

    Args:
        config: A config, or a path to one, in any form accepted by ``parse_config``.

    Returns:
        A set with the entries of ``metadata.requirements`` of this config and,
        recursively, of every config referenced through a ``config_path`` element.
    """
    config = parse_config(config)

    requirements = set(config.get('metadata', {}).get('requirements', []))

    # resolve every referenced config first, then gather their requirements recursively
    referenced = [expand_path(ref) for ref in get_all_elems_from_json(config, 'config_path')]
    for nested_config in referenced:
        requirements.update(get_config_requirements(nested_config))

    return requirements


def install_from_config(config: [str, Path, dict]):
    """Install every requirement listed in the requirements files of a config.

    The requirements files are gathered with ``get_config_requirements``. Blank
    lines, comment lines and duplicates are skipped; the remaining requirements
    are installed one by one in order of first appearance.

    Args:
        config: A config, or a path to one, in any form accepted by ``parse_config``.
    """
    requirements_files = get_config_requirements(config)

    if not requirements_files:
        log.warning('No requirements found in config')
        return

    collected = []
    for req_file in requirements_files:
        with expand_path(req_file).open(encoding='utf8') as stream:
            for raw_line in stream:
                spec = re.sub(r'\s', '', raw_line.strip())
                if not spec or spec.startswith('#') or spec in collected:
                    continue
                collected.append(spec)

    for spec in collected:
        install(spec)


================================================
FILE: deeppavlov/utils/server/__init__.py
================================================
from .server import get_server_params, get_ssl_params, redirect_root_to_docs, start_model_server


================================================
FILE: deeppavlov/utils/server/metrics.py
================================================
# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
from typing import Tuple

from prometheus_client import CONTENT_TYPE_LATEST, REGISTRY, generate_latest
from prometheus_client import Counter, Gauge, Histogram
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
from starlette.requests import Request
from starlette.responses import Response
from starlette.types import ASGIApp

# Prometheus metrics updated by PrometheusMiddleware for every non-ignored request.
# Request counter, labelled by endpoint path and response status code.
REQUESTS_COUNT = Counter('http_requests_count', 'Number of processed requests', ['endpoint', 'status_code'])
# Latency histogram in seconds, labelled by endpoint path.
REQUESTS_LATENCY = Histogram('http_requests_latency_seconds', 'Request latency histogram', ['endpoint'])
# Gauge of requests currently in flight, labelled by endpoint path.
REQUESTS_IN_PROGRESS = Gauge('http_requests_in_progress', 'Number of requests currently being processed', ['endpoint'])


def metrics(request: Request) -> Response:
    """Return the current state of the default Prometheus registry as an HTTP response.

    Args:
        request: Incoming request; not used.
    """
    payload = generate_latest(REGISTRY)
    return Response(payload, media_type=CONTENT_TYPE_LATEST)


class PrometheusMiddleware(BaseHTTPMiddleware):
    """Middleware that records per-endpoint Prometheus metrics.

    Args:
        app: The wrapped ASGI application.
        ignore_paths: Request paths that are passed through without being measured.
    """

    def __init__(self, app: ASGIApp, ignore_paths: Tuple = ()) -> None:
        super().__init__(app)
        self.ignore_paths = ignore_paths

    async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
        """Handle a request and update the request metrics around it.

        The in-progress gauge is raised for the duration of the call and the
        request counter is incremented with the resulting status code. If the
        downstream handler raises, the request is counted with status 500 and
        the exception propagates. Latency is recorded only for responses with
        status 200.
        """
        path = request.url.path

        if path in self.ignore_paths:
            return await call_next(request)

        REQUESTS_IN_PROGRESS.labels(endpoint=path).inc()

        started_at = time.perf_counter()
        # stays 500 when call_next raises before a response is produced
        status_code = 500

        try:
            response = await call_next(request)
            status_code = response.status_code
        finally:
            if status_code == 200:
                elapsed = time.perf_counter() - started_at
                REQUESTS_LATENCY.labels(endpoint=path).observe(elapsed)
            REQUESTS_COUNT.labels(endpoint=path, status_code=status_code).inc()
            REQUESTS_IN_PROGRESS.labels(endpoint=path).dec()

        return response


================================================
FILE: deeppavlov/utils/server/server.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import os
from collections import namedtuple
from logging import getLogger
from pathlib import Path
from ssl import PROTOCOL_TLSv1_2
from typing import Dict, List, Optional, Union

import uvicorn
from fastapi import Body, FastAPI, HTTPException
from fastapi.utils import generate_operation_id_for_path
from pydantic import BaseConfig, BaseModel
from pydantic.fields import Field, ModelField
from pydantic.main import ModelMetaclass
from starlette.middleware.cors import CORSMiddleware
from starlette.responses import RedirectResponse

from deeppavlov.core.commands.infer import build_model
from deeppavlov.core.commands.utils import parse_config
from deeppavlov.core.common.chainer import Chainer
from deeppavlov.core.common.file import read_json
from deeppavlov.core.common.log import log_config
from deeppavlov.core.common.paths import get_settings_path
from deeppavlov.core.data.utils import check_nested_dict_keys, jsonify_data
from deeppavlov.utils.connector import DialogLogger
from deeppavlov.utils.server.metrics import metrics, PrometheusMiddleware

# Location of the server settings file that ``get_server_params`` reads.
SERVER_CONFIG_PATH = get_settings_path() / 'server_config.json'
# Container for the SSL arguments passed to ``uvicorn.run``; ``get_ssl_params`` fills it with None when HTTPS is off.
SSLConfig = namedtuple('SSLConfig', ['version', 'keyfile', 'certfile'])

log = getLogger(__name__)
dialog_logger = DialogLogger(logger_name='rest_api')

# ``os.getenv`` returns a string whenever the variable is set, so any value (including an empty string or "0")
# enables the compatibility mode; only an unset variable leaves the default ``False``.
COMPATIBILITY_MODE = os.getenv('COMPATIBILITY_MODE', False)

if COMPATIBILITY_MODE is not False:
    log.warning('DeepPavlov riseapi mode will use the old model response data format used up and including 1.0.0rc1.\n'
                'COMPATIBILITY_MODE will be removed in the DeepPavlov 1.2.0.\n'
                'Please, update your client code according to the new format.')

# Single application instance; ``start_model_server`` registers the model routes on it.
app = FastAPI()

# Requests to the listed service paths are forwarded by the middleware without updating any metric.
app.add_middleware(
    PrometheusMiddleware,
    ignore_paths=('/', '/metrics', '/api', '/probe', '/docs', '/openapi.json')
)

# CORS is fully open: any origin, method and header is allowed, with credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=['*'],
    allow_headers=['*']
)

# Endpoint serving the Prometheus metrics.
app.add_route("/metrics", metrics)


def get_server_params(model_config: Union[str, Path]) -> Dict:
    """Collect server parameters for a model from the server settings file and the model config.

    The ``common_defaults`` section of the settings file is taken as the base. If the model config declares
    ``metadata.server_utils`` and the settings file has a matching ``model_defaults`` entry, every truthy value
    of that entry overrides the common one.

    Args:
        model_config: Model config (or path to it) passed to ``parse_config``.

    Returns:
        Server parameters. ``model_endpoint`` falls back to ``'/model'`` when absent, and ``model_args_names``
        is always a list, taken from the ``in`` of the model chainer when the settings leave it empty.
    """
    settings = read_json(SERVER_CONFIG_PATH)
    model_config = parse_config(model_config)

    server_params = settings['common_defaults']

    if check_nested_dict_keys(model_config, ['metadata', 'server_utils']):
        tag = model_config['metadata']['server_utils']
        if check_nested_dict_keys(settings, ['model_defaults', tag]):
            # Falsy model-specific values do not override the common defaults.
            overrides = {name: value for name, value in settings['model_defaults'][tag].items() if value}
            server_params.update(overrides)

    server_params.setdefault('model_endpoint', '/model')

    names = server_params['model_args_names'] or model_config['chainer']['in']
    server_params['model_args_names'] = [names] if isinstance(names, str) else names

    return server_params


def get_ssl_params(server_params: dict,
                   https: Optional[bool],
                   ssl_key: Optional[str],
                   ssl_cert: Optional[str]) -> SSLConfig:
    """Build the SSL configuration for the server.

    Args:
        server_params: Server parameters; ``https``, ``https_key_path`` and ``https_cert_path`` are used as
            fallbacks for the corresponding arguments.
        https: Whether to enable HTTPS. A falsy value defers to ``server_params['https']``.
        ssl_key: Path to the key file. A falsy value defers to ``server_params['https_key_path']``.
        ssl_cert: Path to the certificate file. A falsy value defers to ``server_params['https_cert_path']``.

    Returns:
        ``SSLConfig`` with TLS 1.2 and resolved file paths when HTTPS is enabled, otherwise ``SSLConfig`` with
        all fields set to None.

    Raises:
        FileNotFoundError: If HTTPS is enabled and the key or the certificate path is not an existing file.
    """

    def existing_file(location: str, not_found_msg: str) -> str:
        # Resolves the path and returns it as a string; logs and raises when it is not a file.
        resolved = Path(location).resolve()
        if not resolved.is_file():
            error = FileNotFoundError(not_found_msg)
            log.error(error)
            raise error
        return str(resolved)

    if not (https or server_params['https']):
        return SSLConfig(None, None, None)

    # The key is validated before the certificate settings are even looked at.
    keyfile = existing_file(ssl_key or server_params['https_key_path'],
                            'Ssh key file not found: please provide correct path in --key param or '
                            'https_key_path param in server configuration file')
    certfile = existing_file(ssl_cert or server_params['https_cert_path'],
                             'Ssh certificate file not found: please provide correct path in --cert param or '
                             'https_cert_path param in server configuration file')

    return SSLConfig(version=PROTOCOL_TLSv1_2, keyfile=keyfile, certfile=certfile)


def redirect_root_to_docs(fast_app: FastAPI, func_name: str, endpoint: str, method: str) -> None:
    """Register a ``GET /`` route that redirects to the docs page section describing `endpoint`.

    Args:
        fast_app: Application to add the route to.
        func_name: Name of the function that handles `endpoint`.
        endpoint: Path of the documented endpoint.
        method: HTTP method of the documented endpoint.
    """

    @fast_app.get('/', include_in_schema=False)
    async def redirect_to_docs() -> RedirectResponse:
        # The operation id is the anchor of the endpoint description on the docs page.
        anchor = generate_operation_id_for_path(name=func_name, path=endpoint, method=method)
        return RedirectResponse(url=f'/docs#/default/{anchor}')


def interact(model: Chainer, payload: Dict[str, Optional[List]]) -> List:
    """Validate a request payload, run the model on it and return a JSON-compatible result.

    Args:
        model: Model to call with the payload values as positional arguments.
        payload: Mapping from argument name to a batch of values, or None for an omitted argument.

    Returns:
        Model prediction converted with ``jsonify_data``.

    Raises:
        HTTPException: With status 400 if no argument is given, an argument is an empty array, or the
            arguments have different batch sizes.
    """
    dialog_logger.log_in(payload)
    model_args = payload.values()
    sizes = {len(batch) for batch in model_args if batch is not None}

    # Checks are listed in priority order; the first failed one supplies the error message.
    checks = (
        (not sizes, 'got empty request'),
        (0 in sizes, 'got empty array as model argument'),
        (len(sizes) > 1, 'got several different batch sizes'),
    )
    error_msg = next((message for failed, message in checks if failed), None)

    if error_msg is not None:
        log.error(error_msg)
        raise HTTPException(status_code=400, detail=error_msg)

    # Exactly one batch size is left at this point.
    (batch_size,) = sizes
    # Omitted arguments are replaced with a batch of None values.
    model_args = [batch or [None] * batch_size for batch in model_args]

    prediction = model(*model_args)

    # TODO: remove in 1.2.0
    if COMPATIBILITY_MODE is not False:
        outputs = [prediction] if len(model.out_params) == 1 else prediction
        prediction = list(zip(*outputs))

    result = jsonify_data(prediction)
    dialog_logger.log_out(result)
    return result


def test_interact(model: Chainer, payload: Dict[str, Optional[List]]) -> List[str]:
    """Check that the model can be called, substituting a test string for every missing argument.

    Args:
        model: Model to call.
        payload: Mapping from argument name to a batch of values; falsy values are replaced with
            ``["Test string."]``.

    Returns:
        ``["Test passed"]`` if the model call did not raise.

    Raises:
        HTTPException: With status 400 and the ``repr`` of the original exception if the model call failed.
    """
    model_args = [batch if batch else ["Test string."] for batch in payload.values()]
    try:
        model(*model_args)
    except Exception as e:
        raise HTTPException(status_code=400, detail=repr(e))
    return ["Test passed"]


def start_model_server(model_config: Path,
                       https: Optional[bool] = None,
                       ssl_key: Optional[str] = None,
                       ssl_cert: Optional[str] = None,
                       port: Optional[int] = None) -> None:
    """Build the model, register its endpoints on the module-level ``app`` and run the uvicorn server.

    Args:
        model_config: Path to the model config file.
        https: Whether to serve over HTTPS; a falsy value defers to the server settings.
        ssl_key: Path to the SSL key file; a falsy value defers to the server settings.
        ssl_cert: Path to the SSL certificate file; a falsy value defers to the server settings.
        port: Port to listen on; a falsy value defers to the server settings.
    """

    server_params = get_server_params(model_config)

    host = server_params['host']
    port = port or server_params['port']
    model_endpoint = server_params['model_endpoint']
    model_args_names = server_params['model_args_names']

    ssl_config = get_ssl_params(server_params, https, ssl_key=ssl_key, ssl_cert=ssl_cert)

    model = build_model(model_config)

    def batch_decorator(cls: ModelMetaclass) -> ModelMetaclass:
        # Replaces the annotations and fields of the decorated model class with one non-required ``list`` field
        # (default None) per model argument name, so the request schema follows the model arguments.
        cls.__annotations__ = {arg_name: list for arg_name in model_args_names}
        cls.__fields__ = {arg_name: ModelField(name=arg_name, type_=list, class_validators=None,
                                               model_config=BaseConfig, required=False, field_info=Field(None))
                          for arg_name in model_args_names}
        return cls

    # Request body model; its fields are filled in by ``batch_decorator``.
    @batch_decorator
    class Batch(BaseModel):
        pass

    # 'answer' and 'post' must match the name and the method of the model endpoint handler defined below.
    redirect_root_to_docs(app, 'answer', model_endpoint, 'post')

    model_endpoint_post_example = {arg_name: ['string'] for arg_name in model_args_names}

    @app.post(model_endpoint, summary='A model endpoint')
    async def answer(item: Batch = Body(..., example=model_endpoint_post_example)) -> List:
        # The model call is run through ``run_in_executor`` instead of being called directly in the coroutine.
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, interact, model, item.dict())

    @app.post('/probe', include_in_schema=False)
    async def probe(item: Batch) -> List[str]:
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, test_interact, model, item.dict())

    @app.get('/api', summary='Model argument names')
    async def api() -> Dict[str, List[str]]:
        # In compatibility mode the plain list of argument names is returned instead of the in/out mapping.
        if COMPATIBILITY_MODE is not False:
            return model_args_names
        return {
            'in': model.in_x,
            'out': model.out_params
        }

    uvicorn.run(app, host=host, port=port, log_config=log_config, ssl_version=ssl_config.version,
                ssl_keyfile=ssl_config.keyfile, ssl_certfile=ssl_config.certfile, timeout_keep_alive=20)


================================================
FILE: deeppavlov/utils/settings/__init__.py
================================================


================================================
FILE: deeppavlov/utils/settings/dialog_logger_config.json
================================================
{
  "enabled": false,
  "logger_name": "default",
  "log_path": "~/.deeppavlov/dialog_logs",
  "logfile_max_size_kb": 10240,
  "ensure_ascii": false
}

================================================
FILE: deeppavlov/utils/settings/log_config.json
================================================
{
  "version": 1,
  "disable_existing_loggers": false,
  "loggers": {
    "deeppavlov": {
      "level": "INFO",
      "handlers": [
        "stderr"
      ],
      "propagate": true
    },
    "uvicorn.access": {
      "level": "INFO",
      "handlers": [
        "uvicorn_handler"
      ],
      "propagate": true
    },
    "uvicorn.error": {
      "level": "INFO",
      "handlers": [
        "uvicorn_handler"
      ],
      "propagate": true
    },
    "train_report": {
      "level": "INFO",
      "handlers": [
        "train_handler"
      ],
      "propagate": true
    },
    "filelock": {
      "level": "WARNING",
      "handlers": [
        "stdout"
      ],
      "propagate": true
    }
  },
  "formatters": {
    "default": {
      "format": "%(asctime)s.%(msecs)d %(levelname)s in '%(name)s'['%(module)s'] at line %(lineno)d: %(message)s",
      "datefmt": "%Y-%m-%d %H:%M:%S"
    },
    "uvicorn_fmt": {
      "format": "%(asctime)s %(message)s",
      "datefmt": "%Y-%m-%d %H:%M:%S"
    },
    "message": {
      "format": "%(message)s"
    }
  },
  "handlers": {
    "file": {
      "class": "logging.FileHandler",
      "level": "DEBUG",
      "formatter": "default",
      "filename": "~/.deeppavlov/log.log"
    },
    "stdout": {
      "class": "logging.StreamHandler",
      "level": "DEBUG",
      "formatter": "default",
      "stream": "ext://sys.stdout"
    },
    "stderr": {
      "class": "logging.StreamHandler",
      "level": "DEBUG",
      "formatter": "default",
      "stream": "ext://sys.stderr"
    },
    "uvicorn_handler": {
      "class": "logging.StreamHandler",
      "level": "INFO",
      "formatter": "uvicorn_fmt",
      "stream": "ext://sys.stdout",
      "filters": ["probeFilter"]
    },
    "train_handler": {
      "class": "logging.StreamHandler",
      "level": "INFO",
      "formatter": "message",
      "stream": "ext://sys.stdout"
    }
  },
  "filters": {
    "probeFilter": {
      "()": "deeppavlov.core.common.log.ProbeFilter"
    }
  }
}


================================================
FILE: deeppavlov/utils/settings/server_config.json
================================================
{
  "common_defaults": {
    "host": "0.0.0.0",
    "port": 5000,
    "model_args_names": [],
    "https": false,
    "https_cert_path": "",
    "https_key_path": "",
    "socket_type": "TCP",
    "unix_socket_file": "/tmp/deeppavlov_socket.s",
    "socket_launch_message": "launching socket server at"
  }
}


================================================
FILE: deeppavlov/utils/socket/__init__.py
================================================
from .socket import encode, start_socket_server


================================================
FILE: deeppavlov/utils/socket/socket.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import json
from logging import getLogger
from pathlib import Path
from struct import pack, unpack
from typing import Any, List, Optional, Tuple, Union

from deeppavlov.core.commands.infer import build_model
from deeppavlov.core.common.chainer import Chainer
from deeppavlov.core.data.utils import jsonify_data
from deeppavlov.utils.connector import DialogLogger
from deeppavlov.utils.server import get_server_params

# ``struct`` format of the 4-byte message header: little-endian unsigned int holding the body length in bytes.
HEADER_FORMAT = '<I'

log = getLogger(__name__)
dialog_logger = DialogLogger(logger_name='socket_api')


def encode(data: Any) -> bytes:
    r"""Converts data to the socket server input formatted bytes array.

    Serializes ``data`` to a JSON formatted bytes array and prepends a 4-byte header with the packed length of
    that array in bytes. Header format is "<I"
    (see https://docs.python.org/3/library/struct.html#struct-format-strings)

    Args:
        data: Object to pack to the bytes array.

    Raises:
        TypeError: If data is not JSON-serializable object.

    Examples:
        >>> from deeppavlov.utils.socket import encode
        >>> encode({'a':1})
        b'\x08\x00\x00\x00{"a": 1}'
        >>> encode([42])
        b'\x04\x00\x00\x00[42]'

    """
    body = json.dumps(jsonify_data(data)).encode()
    header = pack(HEADER_FORMAT, len(body))
    return header + body


class SocketServer:
    r"""Creates socket server that sends the received data to the DeepPavlov model and returns model response.

    The server receives bytes array consists of the `header` and the `body`. The `header` is the first 4 bytes
    of the array - `body` length in bytes represented by a packed unsigned int (byte order is little-endian).
    `body` is dictionary serialized to JSON formatted bytes array that server sends to the model. The dictionary
    keys should match model arguments names, the values should be lists or tuples of inferenced values.

    Socket server request creation example:
        >>> from deeppavlov.utils.socket import encode
        >>> request = encode({"x":["Elon Musk launched his cherry Tesla roadster to the Mars orbit"]})
        >>> request
        b'I\x00\x00\x00{"x": ["Elon Musk launched his cherry Tesla roadster to the Mars orbit"]}'

    Socket server response, like the request, consists of the header and the body. Response body is dictionary
    {'status': status, 'payload': payload} serialized to a JSON formatted byte array, where:
        status (str): 'OK' if the model successfully processed the data, else - error message.
        payload: (Optional[List[Tuple]]): The model result if no error has occurred, otherwise None.

    """
    _launch_msg: str
    _loop: asyncio.AbstractEventLoop
    _model: Chainer
    _model_args_names: List

    def __init__(self,
                 model_config: Path,
                 socket_type: str,
                 port: Optional[int] = None,
                 socket_file: Optional[Union[str, Path]] = None) -> None:
        """Initializes socket server.

        Args:
            model_config: Path to the config file.
            socket_type: Socket family. "TCP" for the AF_INET socket server, "UNIX" for UNIX Domain Socket server.
            port: Port number for the AF_INET address family. If parameter is not defined, the port number from the
                utils/settings/server_config.json is used.
            socket_file: Path to the file to which UNIX Domain Socket server connects. If parameter is not defined,
                the path from the utils/settings/server_config.json is used.

        Raises:
            ValueError: If ``socket_type`` parameter is neither "TCP" nor "UNIX".

        """
        server_params = get_server_params(model_config)
        socket_type = socket_type or server_params['socket_type']
        self._loop = asyncio.get_event_loop()

        # The server start is only scheduled here; it actually happens once ``start`` runs the loop.
        if socket_type == 'TCP':
            host = server_params['host']
            port = port or server_params['port']
            self._launch_msg = f'{server_params["socket_launch_message"]} http://{host}:{port}'
            self._loop.create_task(asyncio.start_server(self._handle_client, host, port))
        elif socket_type == 'UNIX':
            socket_file = socket_file or server_params['unix_socket_file']
            socket_path = Path(socket_file).resolve()
            # A socket file left over from a previous run is removed before the server is started.
            if socket_path.exists():
                socket_path.unlink()
            self._launch_msg = f'{server_params["socket_launch_message"]} {socket_file}'
            self._loop.create_task(asyncio.start_unix_server(self._handle_client, socket_file))
        else:
            raise ValueError(f'socket type "{socket_type}" is not supported')

        self._model = build_model(model_config)
        self._model_args_names = server_params['model_args_names']

    def start(self) -> None:
        """Launches socket server"""
        log.info(self._launch_msg)
        try:
            self._loop.run_forever()
        except KeyboardInterrupt:
            pass
        except Exception as e:
            log.error(f'got exception {e} while running server')
        finally:
            self._loop.close()

    @staticmethod
    async def _read_exactly(reader: asyncio.StreamReader, size: int) -> bytes:
        """Reads exactly ``size`` bytes, or whatever was received if the stream ends earlier.

        ``StreamReader.read(n)`` may return fewer than ``n`` bytes even though the rest of the message is still
        on its way, which would truncate headers and bodies split across several packets.

        Args:
            reader: Stream to read from.
            size: Number of bytes to read.

        Returns:
            ``size`` bytes, or the shorter (possibly empty) data received before the end of the stream.

        """
        try:
            return await reader.readexactly(size)
        except asyncio.IncompleteReadError as e:
            return e.partial

    async def _handle_client(self, reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
        """Handles connection from a client.

        Validates requests, sends request body to DeepPavlov model, sends responses to client.

        """
        addr = writer.get_extra_info('peername')
        log.info(f'handling connection from {addr}')
        while True:
            header = await self._read_exactly(reader, 4)
            if not header:
                # End of stream on a message boundary: the client has closed the connection.
                log.info(f'closing connection from {addr}')
                writer.close()
                break
            elif len(header) != 4:
                # The stream ended in the middle of a header.
                error_msg = f'header "{header}" length less than 4 bytes'
                log.error(error_msg)
                response = self._response(error_msg)
            else:
                data_len = unpack(HEADER_FORMAT, header)[0]
                # A body cut short by the end of the stream fails JSON parsing below and is reported as such.
                request_body = await self._read_exactly(reader, data_len)
                try:
                    data = json.loads(request_body)
                    response = await self._interact(data)
                except ValueError:
                    error_msg = f'request "{request_body}" type is not json'
                    log.error(error_msg)
                    response = self._response(error_msg)
            writer.write(response)
            await writer.drain()

    async def _interact(self, data: dict) -> bytes:
        """Validates request data, runs the model on it and returns the encoded response.

        Args:
            data: Deserialized request body; expected to map model argument names to non-empty lists.

        Returns:
            Encoded response with the model prediction, or with an error message if validation failed.

        """
        # Valid JSON is not necessarily an object; anything else has no argument names to look up.
        if not isinstance(data, dict):
            error_msg = f'dictionary expected but got {repr(data)}'
            log.error(error_msg)
            return self._response(error_msg)

        dialog_logger.log_in(data)
        model_args = []
        for param_name in self._model_args_names:
            param_value = data.get(param_name)
            if param_value is None or (isinstance(param_value, list) and len(param_value) > 0):
                model_args.append(param_value)
            else:
                error_msg = f"nonempty array expected but got '{param_name}'={repr(param_value)}"
                log.error(error_msg)
                return self._response(error_msg)
        lengths = {len(i) for i in model_args if i is not None}

        if not lengths:
            error_msg = 'got empty request'
            log.error(error_msg)
            return self._response(error_msg)
        elif len(lengths) > 1:
            error_msg = f'got several different batch sizes: {lengths}'
            log.error(error_msg)
            return self._response(error_msg)

        batch_size = next(iter(lengths))
        # Omitted arguments are replaced with a batch of None values.
        model_args = [arg or [None] * batch_size for arg in model_args]

        # in case when some parameters were not described in model_args
        model_args += [[None] * batch_size for _ in range(len(self._model.in_x) - len(model_args))]

        # The model is run through ``run_in_executor`` rather than being called directly in the coroutine.
        prediction = await self._loop.run_in_executor(None, self._model, *model_args)
        if len(self._model.out_params) == 1:
            prediction = [prediction]
        # Regroup per-output batches into one tuple of outputs per sample.
        prediction = list(zip(*prediction))
        dialog_logger.log_out(prediction)
        return self._response(payload=prediction)

    @staticmethod
    def _response(status: str = 'OK', payload: Optional[List[Tuple]] = None) -> bytes:
        """Puts arguments into dict and serialize it to JSON formatted byte array with header.

        Args:
            status: Response status. 'OK' if no error has occurred, otherwise error message.
            payload: DeepPavlov model result if no error has occurred, otherwise None.

        Returns:
            dict({'status': status, 'payload': payload}) serialized to a JSON formatted byte array starting with the
                4-byte header - the length of serialized dict in bytes.

        """
        return encode({'status': status, 'payload': payload})


def start_socket_server(model_config: Path, socket_type: str, port: Optional[int],
                        socket_file: Optional[Union[str, Path]]) -> None:
    """Create a ``SocketServer`` for the model and launch it.

    Args:
        model_config: Path to the model config file.
        socket_type: Socket family, "TCP" or "UNIX".
        port: Port number for the TCP server.
        socket_file: Path to the socket file for the UNIX server.
    """
    SocketServer(model_config, socket_type, port, socket_file).start()


================================================
FILE: deeppavlov/vocabs/__init__.py
================================================


================================================
FILE: deeppavlov/vocabs/typos.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import shutil
from collections import defaultdict
from logging import getLogger
from pathlib import Path

import requests
from lxml import html

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.file import load_pickle, save_pickle
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import is_done, mark_done

log = getLogger(__name__)


@register('static_dictionary')
class StaticDictionary:
    """Trie vocabulary used in spelling correction algorithms

    The vocabulary is built once and stored as pickle files; later instances load the stored files.

    Args:
        data_dir: path to the directory where the built trie will be stored. Relative paths are interpreted as
            relative to pipeline's data directory
        dictionary_name: logical name of the dictionary, used as the name of a subdirectory of ``data_dir``
        raw_dictionary_path: path to the source file with the list of words

    Attributes:
        alphabet: set of all the characters used in this dictionary
        words_set: set of all the words
        words_trie: trie structure of all the words
    """

    def __init__(self, data_dir: [Path, str] = '', *args, dictionary_name: str = 'dictionary', **kwargs):
        data_dir = expand_path(data_dir) / dictionary_name

        alphabet_path = data_dir / 'alphabet.pkl'
        words_path = data_dir / 'words.pkl'
        words_trie_path = data_dir / 'words_trie.pkl'

        if is_done(data_dir):
            log.debug('Loading a dictionary from {}'.format(data_dir))
        else:
            log.debug('Trying to build a dictionary in {}'.format(data_dir))
            # Start from a clean directory: leftovers of an unfinished build are dropped.
            if data_dir.is_dir():
                shutil.rmtree(str(data_dir))
            data_dir.mkdir(parents=True)

            raw_words = self._get_source(data_dir, *args, **kwargs)
            words = {self._normalize(raw_word) for raw_word in raw_words}

            # The word boundary markers added by ``_normalize`` are not part of the alphabet.
            alphabet = {char for word in words for char in word}
            for boundary in ('⟬', '⟭'):
                alphabet.remove(boundary)

            save_pickle(alphabet, alphabet_path)
            save_pickle(words, words_path)

            # Every prefix maps to the set of its one-character extensions; a complete word maps to nothing.
            trie = defaultdict(set)
            for word in words:
                for end in range(1, len(word) + 1):
                    trie[word[:end - 1]].add(word[:end])
                trie[word] = set()
            words_trie = {prefix: sorted(extensions) for prefix, extensions in trie.items()}

            save_pickle(words_trie, words_trie_path)

            mark_done(data_dir)
            log.debug('built')

        self.alphabet = load_pickle(alphabet_path)
        self.words_set = load_pickle(words_path)
        self.words_trie = load_pickle(words_trie_path)

    @staticmethod
    def _get_source(data_dir, raw_dictionary_path, *args, **kwargs):
        # Each line contributes the text before its first tab, with surrounding whitespace stripped.
        with expand_path(raw_dictionary_path).open(newline='', encoding='utf8') as f:
            return [line.strip().partition('\t')[0] for line in f]

    @staticmethod
    def _normalize(word):
        # Lowercase, replace 'ё' with 'е' and wrap the word in boundary markers.
        cleaned = word.strip().lower().replace('ё', 'е')
        return '⟬{}⟭'.format(cleaned)


@register('russian_words_vocab')
class RussianWordsVocab(StaticDictionary):
    """Implementation of :class:`~deeppavlov.vocabs.typos.StaticDictionary` that builds data from https://github.com/danakt/russian-words/

    Args:
        data_dir: path to the directory where the built trie will be stored. Relative paths are interpreted as
            relative to pipeline's data directory

    Attributes:
        alphabet: set of all the characters used in this dictionary
        words_set: set of all the words
        words_trie: trie structure of all the words
    """

    def __init__(self, data_dir: [Path, str] = '', *args, **kwargs):
        # The dictionary name is fixed for this vocabulary and overrides any value passed by the caller.
        kwargs['dictionary_name'] = 'russian_words_vocab'
        super().__init__(data_dir, *args, **kwargs)

    @staticmethod
    def _get_source(*args, **kwargs):
        """Download the word list and return it as a list of stripped lines.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        log.debug('Downloading russian vocab from https://github.com/danakt/russian-words/')
        url = 'https://github.com/danakt/russian-words/raw/master/russian.txt'
        page = requests.get(url)
        # Fail loudly on an HTTP error: otherwise the error page would be parsed as a word list and the
        # resulting dictionary would be stored and reused by later runs.
        page.raise_for_status()
        # The source file is encoded in cp1251.
        return [word.strip() for word in page.content.decode('cp1251').strip().split('\n')]


@register('wikitionary_100K_vocab')
class Wiki100KDictionary(StaticDictionary):
    """Implementation of :class:`~deeppavlov.vocabs.typos.StaticDictionary` that builds data
    from `Wiktionary <https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#Project_Gutenberg>`__

    Args:
        data_dir: path to the directory where the built trie will be stored. Relative paths are interpreted as
            relative to pipeline's data directory

    Attributes:
        alphabet: set of all the characters used in this dictionary
        words_set: set of all the words
        words_trie: trie structure of all the words
    """

    def __init__(self, data_dir: [Path, str] = '', *args, **kwargs):
        # The dictionary name is fixed for this vocabulary and overrides any value passed by the caller.
        kwargs['dictionary_name'] = 'wikipedia_100K_vocab'
        super().__init__(data_dir, *args, **kwargs)

    @staticmethod
    def _get_source(*args, **kwargs):
        """Download the frequency list pages and return the words found on them.

        Raises:
            requests.HTTPError: If the server answers with an error status for any of the pages.
        """
        words = []
        log.debug('Downloading english vocab from Wiktionary')
        # The list is split into ten pages of 10000 words each: 1-10000, 10001-20000, ...
        for i in range(1, 100000, 10000):
            k = 10000 + i - 1
            url = 'https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/PG/2005/08/{}-{}'.format(i, k)
            page = requests.get(url)
            # Fail loudly on an HTTP error instead of silently building the dictionary from an error page.
            page.raise_for_status()
            tree = html.fromstring(page.content)
            words += tree.xpath('//div[@class="mw-parser-output"]/p/a/text()')
        return words


================================================
FILE: deeppavlov/vocabs/wiki_sqlite.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import List, Any, Optional, Union

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.dataset_iterators.sqlite_iterator import SQLiteDataIterator

logger = getLogger(__name__)


@register('wiki_sqlite_vocab')
class WikiSQLiteVocab(SQLiteDataIterator, Component):
    """Get content from SQLite database by document ids.

    Args:
        load_path: a path to local DB file
        join_docs: whether to join extracted docs with ' ' or not
        shuffle: whether to shuffle data or not

    Attributes:
        join_docs: whether to join extracted docs with ' ' or not

    """

    def __init__(self, load_path: str, join_docs: bool = True, shuffle: bool = False, **kwargs) -> None:
        SQLiteDataIterator.__init__(self, load_path=load_path, shuffle=shuffle)
        self.join_docs = join_docs

    def __call__(self, doc_ids: Optional[List[List[Any]]] = None, *args, **kwargs) -> List[Union[str, List[str]]]:
        """Fetch document contents for every list of ids in the batch.

        Args:
            doc_ids: a batch of lists of ids to get contents for; when it is empty or missing, a single list
                with all document ids is used instead

        Returns:
            one item per list of ids: the contents joined with ' ' if ``join_docs`` is set, otherwise the list
            of contents
        """
        if not doc_ids:
            logger.warning('No doc_ids are provided in WikiSqliteVocab, return all docs')
            doc_ids = [self.get_doc_ids()]

        def fetch(ids):
            # Contents of one list of ids, joined or kept as a list depending on ``join_docs``.
            docs = [self.get_doc_content(doc_id) for doc_id in ids]
            return ' '.join(docs) if self.join_docs else docs

        return [fetch(ids) for ids in doc_ids]


================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# Every target name is handed to "sphinx-build -M <target>", so for example
# "make html" runs the html builder with output placed under $(BUILDDIR).

# You can set these variables from the command line.
# Default SPHINXOPTS: -W turns warnings into errors and -T prints the full
# traceback on an unhandled exception (see the sphinx-build documentation).
SPHINXOPTS    = -WT
SPHINXBUILD   = sphinx-build
SPHINXPROJ    = DeepPavlov
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# Phony targets are skipped by implicit-rule search, which keeps the
# catch-all rule below from being applied to "help" or to the Makefile itself.
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

================================================
FILE: docs/_static/deeppavlov.css
================================================
/* Colour and width overrides for the documentation pages.
   NOTE(review): the wy-* class names presumably come from the
   Read the Docs theme - confirm against the theme's stylesheet. */

/* Blue background for the .wy-side-nav-search box. */
.wy-side-nav-search {
    background-color: #0176bd;
}

/* Cap the width of the .wy-nav-content container at 1000px. */
.wy-nav-content {
    max-width: 1000px;
}

/* White text for the div.version placed directly inside .wy-side-nav-search. */
.wy-side-nav-search>div.version {
    color: #ffffff;
}

================================================
FILE: docs/_static/my_blocks.css
================================================
/* Size and padding of the SVG icon inside button.copybtn.
   NOTE(review): presumably the button added by the sphinx_copybutton
   extension - confirm. */
button.copybtn svg {
    width: 1.3em;
    height: 1.3em;
    padding: 0.1em;
}

/* Size and vertical offset of the button itself. */
button.copybtn {
    top: 0.2em;
    width: 1.4em;
    height: 1.4em;   
}

/* Font size and line height for line-number gutters, highlighted code
   blocks and literal blocks inside .rst-content. */
.rst-content .linenodiv pre, .rst-content div[class^=highlight] pre, .rst-content pre.literal-block {
    font-size: 13px;
    line-height: 1.4;
}


================================================
FILE: docs/_templates/footer.html
================================================
{#{% extends '!footer.html' %}#}

<footer>
  <!-- Yandex.Metrika counter -->
  <script type="text/javascript" >
     (function(m,e,t,r,i,k,a){m[i]=m[i]||function(){(m[i].a=m[i].a||[]).push(arguments)};
     m[i].l=1*new Date();k=e.createElement(t),a=e.getElementsByTagName(t)[0],k.async=1,k.src=r,a.parentNode.insertBefore(k,a)})
     (window, document, "script", "https://mc.yandex.ru/metrika/tag.js", "ym");
     ym(72484825, "init", {
          clickmap:true,
          trackLinks:true,
          accurateTrackBounce:true,
          webvisor:true
     });
  </script>
  <noscript><div><img src="https://mc.yandex.ru/watch/72484825" style="position:absolute; left:-9999px;" alt="" /></div></noscript>
  <!-- /Yandex.Metrika counter -->
  {% if (theme_prev_next_buttons_location == 'bottom' or theme_prev_next_buttons_location == 'both') and (next or prev) %}
    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
      {% if next %}
        <a href="{{ next.link|e }}" class="btn btn-neutral float-right" title="{{ next.title|striptags|e }}" accesskey="n" rel="next">{{ _('Next') }} <span class="fa fa-arrow-circle-right"></span></a>
      {% endif %}
      {% if prev %}
        <a href="{{ prev.link|e }}" class="btn btn-neutral float-left" title="{{ prev.title|striptags|e }}" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> {{ _('Previous') }}</a>
      {% endif %}
    </div>
  {% endif %}

  <hr/>

  <div role="contentinfo">
    {%- block extrafooter %}
        {# Support links (forum, demo) followed by a row of social-media icon links.
           Each icon's alt text names the service it links to. #}
        <p>Problem? <a href="https://forum.deeppavlov.ai">Ask a Question</a> or <a href="https://demo.deeppavlov.ai/">try our Demo</a></p>
        <p>
            <a href="https://medium.com/deeppavlov"><img style="width: 30px; height: 30px;" src="{{ pathto('_static/social/Medium_Monogram.svg', 1) }}" alt="medium"></a>
            <a href="https://twitter.com/deeppavlov"><img style="width: 30px; height: 30px;" src="{{ pathto('_static/social/Twitter_Social_Icon_Circle_Color.svg', 1) }}" alt="twitter"></a>
            <a href="https://www.youtube.com/channel/UCJ-6K2HGA0hpQytlSM7FBVQ"><img style="width: 30px; height: 30px;" src="{{ pathto('_static/social/youtube_social_circle_red.png', 1) }}" alt="youtube"></a>
            <a href="https://t.me/deeppavlov"><img style="width: 30px; height: 30px;" src="{{ pathto('_static/social/telegram.png', 1) }}" alt="telegram"></a>
        </p>
    {% endblock %}
    <p>
    {%- if show_copyright %}
      {%- if hasdoc('copyright') %}
        {% set path = pathto('copyright') %}
        {% set copyright = copyright|e %}
        &copy; <a href="{{ path }}">{% trans %}Copyright{% endtrans %}</a> {{ copyright }}
      {%- else %}
        {% set copyright = copyright|e %}
        &copy; {% trans %}Copyright{% endtrans %} {{ copyright }}
      {%- endif %}
    {%- endif %}

    {%- if build_id and build_url %}
      <span class="build">
        {# Translators: Build is a noun, not a verb #}
        {% trans %}Build{% endtrans %}
        <a href="{{ build_url }}">{{ build_id }}</a>.
      </span>
    {%- elif commit %}
      <span class="commit">
        {% trans %}Revision{% endtrans %} <code>{{ commit }}</code>.
      </span>
    {%- elif last_updated %}
      <span class="lastupdated">
        {% trans last_updated=last_updated|e %}Last updated on {{ last_updated }}.{% endtrans %}
      </span>
    {%- endif %}

    </p>
  </div>

  {%- if show_sphinx %}
    {% set sphinx_web = '<a href="http://sphinx-doc.org/">Sphinx</a>' %}
    {% set readthedocs_web = '<a href="https://readthedocs.org">Read the Docs</a>'  %}
      {% trans sphinx_web=sphinx_web, readthedocs_web=readthedocs_web %}Built with {{ sphinx_web }} using a{% endtrans %} <a href="https://github.com/rtfd/sphinx_rtd_theme">{% trans %}theme{% endtrans %}</a> {% trans %}provided by {{ readthedocs_web }}{% endtrans %}.
  {%- endif %}

</footer>


================================================
FILE: docs/apiref/core/commands.rst
================================================
deeppavlov.core.commands
========================
Basic training and inference functions.

.. automodule:: deeppavlov.core.commands.infer
   :members:

.. automodule:: deeppavlov.core.commands.train
   :members:


================================================
FILE: docs/apiref/core/common.rst
================================================
deeppavlov.core.common
======================
Registration and classes initialization functionality, class method decorators.

.. autoclass:: deeppavlov.core.common.chainer.Chainer
   :members:

   .. automethod:: __call__

.. autoclass:: deeppavlov.core.common.base.Element

    .. automethod:: __init__

.. autoclass:: deeppavlov.core.common.base.Model

    .. automethod:: __init__

.. automodule:: deeppavlov.core.common.metrics_registry
   :members:

.. automodule:: deeppavlov.core.common.params
   :members:

.. automodule:: deeppavlov.core.common.registry
   :members:


================================================
FILE: docs/apiref/core/data.rst
================================================
deeppavlov.core.data
====================
DatasetReader, Vocab, DataLearningIterator and DataFittingIterator classes.

.. autoclass:: deeppavlov.core.data.dataset_reader.DatasetReader

.. autoclass:: deeppavlov.core.data.data_fitting_iterator.DataFittingIterator

.. autoclass:: deeppavlov.core.data.data_learning_iterator.DataLearningIterator

.. autoclass:: deeppavlov.core.data.simple_vocab.SimpleVocabulary


================================================
FILE: docs/apiref/core/models.rst
================================================
deeppavlov.core.models
======================
Abstract model classes and interfaces.

.. autoclass:: deeppavlov.core.models.component.Component

.. autoclass:: deeppavlov.core.models.serializable.Serializable

.. autoclass:: deeppavlov.core.models.estimator.Estimator

.. autoclass:: deeppavlov.core.models.nn_model.NNModel

.. autoclass:: deeppavlov.core.models.torch_model.TorchModel


================================================
FILE: docs/apiref/core/trainers.rst
================================================
deeppavlov.core.trainers
========================
Trainer classes.

.. autoclass:: deeppavlov.core.trainers.FitTrainer
   :members:

.. autoclass:: deeppavlov.core.trainers.NNTrainer
   :members:
   :inherited-members:


================================================
FILE: docs/apiref/core.rst
================================================
core
====
DeepPavlov Core

.. automodule:: deeppavlov.core
   :members:

.. toctree::
   :glob:
   :caption: Core

   core/*


================================================
FILE: docs/apiref/dataset_iterators.rst
================================================
dataset_iterators
=================
Concrete DatasetIterator classes.

.. autoclass:: deeppavlov.dataset_iterators.basic_classification_iterator.BasicClassificationDatasetIterator
    :members:

.. autoclass:: deeppavlov.dataset_iterators.siamese_iterator.SiameseIterator

.. autoclass:: deeppavlov.dataset_iterators.sqlite_iterator.SQLiteDataIterator

.. autoclass:: deeppavlov.dataset_iterators.squad_iterator.SquadIterator

.. automodule:: deeppavlov.dataset_iterators.typos_iterator
    :members:

.. automodule:: deeppavlov.dataset_iterators.multitask_iterator
    :members:


================================================
FILE: docs/apiref/dataset_readers.rst
================================================
dataset_readers
===============
Concrete DatasetReader classes.

.. autoclass:: deeppavlov.dataset_readers.basic_classification_reader.BasicClassificationDatasetReader
   :members:

.. autoclass:: deeppavlov.dataset_readers.conll2003_reader.Conll2003DatasetReader

.. autoclass:: deeppavlov.dataset_readers.faq_reader.FaqDatasetReader
   :members:

.. autoclass:: deeppavlov.dataset_readers.line_reader.LineReader
   :members:

.. autoclass:: deeppavlov.dataset_readers.paraphraser_reader.ParaphraserReader

.. autoclass:: deeppavlov.dataset_readers.squad_dataset_reader.SquadDatasetReader
   :members:

.. automodule:: deeppavlov.dataset_readers.typos_reader
   :members:

.. automodule:: deeppavlov.dataset_readers.ubuntu_v2_reader
   :members:

.. automodule:: deeppavlov.dataset_readers.multitask_reader
   :members:


================================================
FILE: docs/apiref/metrics.rst
================================================
metrics
=======
Different Metric functions.

.. automodule:: deeppavlov.metrics
   :members:

.. autofunction:: deeppavlov.metrics.accuracy.sets_accuracy

.. autofunction:: deeppavlov.metrics.fmeasure.round_f1

.. autofunction:: deeppavlov.metrics.fmeasure.round_f1_macro

.. autofunction:: deeppavlov.metrics.fmeasure.round_f1_weighted

.. autofunction:: deeppavlov.metrics.fmeasure.ner_f1

.. autofunction:: deeppavlov.metrics.fmeasure.ner_token_f1

.. autofunction:: deeppavlov.metrics.log_loss.sk_log_loss

.. autofunction:: deeppavlov.metrics.roc_auc_score.roc_auc_score


================================================
FILE: docs/apiref/models/api_requester.rst
================================================
deeppavlov.models.api_requester
===============================

.. automodule:: deeppavlov.models.api_requester
    :members:

.. autoclass:: deeppavlov.models.api_requester.api_requester.ApiRequester

    .. automethod:: __call__
    .. automethod:: get_async_response


.. autoclass:: deeppavlov.models.api_requester.api_router.ApiRouter

    .. automethod:: __call__


================================================
FILE: docs/apiref/models/classifiers.rst
================================================
deeppavlov.models.classifiers
=============================

.. automodule:: deeppavlov.models.classifiers
   :members:

.. autoclass:: deeppavlov.models.classifiers.torch_classification_model.TorchTextClassificationModel
    :members:

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.classifiers.cos_sim_classifier.CosineSimilarityClassifier
    :members:

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.classifiers.proba2labels.Proba2Labels
    :members:

    .. automethod:: __call__


================================================
FILE: docs/apiref/models/doc_retrieval.rst
================================================
deeppavlov.models.doc_retrieval
===============================

Document retrieval classes.

.. automodule:: deeppavlov.models.doc_retrieval

.. autoclass:: deeppavlov.models.doc_retrieval.tfidf_ranker.TfidfRanker
    :members:

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.doc_retrieval.logit_ranker.LogitRanker
    :members:

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.doc_retrieval.pop_ranker.PopRanker
    :members:

    .. automethod:: __call__

================================================
FILE: docs/apiref/models/embedders.rst
================================================
deeppavlov.models.embedders
============================

.. autoclass:: deeppavlov.models.embedders.fasttext_embedder.FasttextEmbedder

   .. automethod:: __call__
   .. automethod:: __iter__

.. autoclass:: deeppavlov.models.embedders.tfidf_weighted_embedder.TfidfWeightedEmbedder

   .. automethod:: __call__

.. autoclass:: deeppavlov.models.embedders.transformers_embedder.TransformersBertEmbedder

   .. automethod:: __call__


================================================
FILE: docs/apiref/models/entity_extraction.rst
================================================
deeppavlov.models.entity_extraction
===================================

.. autoclass:: deeppavlov.models.entity_extraction.ner_chunker.NerChunker

    .. automethod:: __init__
    .. automethod:: __call__

.. autoclass:: deeppavlov.models.entity_extraction.entity_linking.EntityLinker

    .. automethod:: __init__
    .. automethod:: __call__

.. autoclass:: deeppavlov.models.entity_extraction.entity_detection_parser.EntityDetectionParser

    .. automethod:: __init__
    .. automethod:: __call__

.. autoclass:: deeppavlov.models.entity_extraction.entity_detection_parser.QuestionSignChecker


================================================
FILE: docs/apiref/models/kbqa.rst
================================================
deeppavlov.models.kbqa
======================

.. automodule:: deeppavlov.models.kbqa

.. autoclass:: deeppavlov.models.kbqa.type_define.AnswerTypesExtractor

    .. automethod:: __init__
    .. automethod:: __call__

.. autoclass:: deeppavlov.models.kbqa.query_generator.QueryGenerator

    .. automethod:: __init__
    .. automethod:: __call__

.. autoclass:: deeppavlov.models.kbqa.query_generator_base.QueryGeneratorBase

    .. automethod:: __init__
    .. automethod:: __call__

.. autoclass:: deeppavlov.models.kbqa.rel_ranking_infer.RelRankerInfer

    .. automethod:: __init__
    .. automethod:: __call__

.. autoclass:: deeppavlov.models.kbqa.template_matcher.TemplateMatcher

    .. automethod:: __init__
    .. automethod:: __call__

.. autoclass:: deeppavlov.models.kbqa.ru_adj_to_noun.RuAdjToNoun

    .. automethod:: __init__
    .. automethod:: __call__

.. autoclass:: deeppavlov.models.kbqa.tree_to_sparql.TreeToSparql

    .. automethod:: __init__
    .. automethod:: __call__

.. autoclass:: deeppavlov.models.kbqa.wiki_parser.WikiParser

    .. automethod:: __init__
    .. automethod:: __call__


================================================
FILE: docs/apiref/models/preprocessors.rst
================================================
deeppavlov.models.preprocessors
===============================

.. autoclass:: deeppavlov.models.preprocessors.dirty_comments_preprocessor.DirtyCommentsPreprocessor

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.preprocessors.mask.Mask

.. autoclass:: deeppavlov.models.preprocessors.one_hotter.OneHotter

.. autoclass:: deeppavlov.models.preprocessors.sanitizer.Sanitizer

.. autofunction:: deeppavlov.models.preprocessors.str_lower.str_lower

.. autoclass:: deeppavlov.models.preprocessors.str_token_reverser.StrTokenReverser

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.preprocessors.str_utf8_encoder.StrUTF8Encoder

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.preprocessors.odqa_preprocessors.DocumentChunker

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.preprocessors.odqa_preprocessors.StringMultiplier

    .. automethod:: __call__


================================================
FILE: docs/apiref/models/relation_extraction.rst
================================================
deeppavlov.models.relation_extraction
=====================================

.. autoclass:: deeppavlov.models.relation_extraction.relation_extraction_bert.REBertModel

    .. automethod:: __init__
    .. automethod:: __call__
    .. automethod:: train_on_batch


================================================
FILE: docs/apiref/models/sklearn.rst
================================================
deeppavlov.models.sklearn
=============================

.. automodule:: deeppavlov.models.sklearn
   :members:

.. autoclass:: deeppavlov.models.sklearn.sklearn_component.SklearnComponent

    .. automethod:: __call__
    .. automethod:: fit
    .. automethod:: init_from_scratch
    .. automethod:: load
    .. automethod:: save
    .. automethod:: compose_input_data
    .. automethod:: get_class_attributes
    .. automethod:: get_function_params


================================================
FILE: docs/apiref/models/spelling_correction.rst
================================================
deeppavlov.models.spelling_correction
=====================================

.. autoclass:: deeppavlov.models.spelling_correction.brillmoore.ErrorModel

    .. automethod:: __call__
    .. automethod:: fit
    .. automethod:: save
    .. automethod:: load

.. autoclass:: deeppavlov.models.spelling_correction.levenshtein.LevenshteinSearcherComponent

    .. automethod:: __call__


.. autoclass:: deeppavlov.models.spelling_correction.electors.top1_elector.TopOneElector

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.spelling_correction.electors.kenlm_elector.KenlmElector

    .. automethod:: __call__


================================================
FILE: docs/apiref/models/tokenizers.rst
================================================
deeppavlov.models.tokenizers
============================

.. autoclass:: deeppavlov.models.tokenizers.nltk_moses_tokenizer.NLTKMosesTokenizer

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.tokenizers.nltk_tokenizer.NLTKTokenizer

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.tokenizers.split_tokenizer.SplitTokenizer

.. autoclass:: deeppavlov.models.tokenizers.spacy_tokenizer.StreamSpacyTokenizer

    .. automethod:: __call__

================================================
FILE: docs/apiref/models/torch_bert.rst
================================================
deeppavlov.models.torch_bert
============================

.. automodule:: deeppavlov.models.torch_bert
   :members:

.. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersPreprocessor

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersNerPreprocessor

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchBertRankerPreprocessor

    .. automethod:: __call__

.. autoclass:: deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel

    .. automethod:: __call__
    .. automethod:: train_on_batch

.. autoclass:: deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger

    .. automethod:: __call__
    .. automethod:: train_on_batch

.. autoclass:: deeppavlov.models.torch_bert.torch_transformers_squad.TorchTransformersSquad

    .. automethod:: __call__
    .. automethod:: train_on_batch

.. autoclass:: deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel

    .. automethod:: __call__
    .. automethod:: train_on_batch


================================================
FILE: docs/apiref/models/vectorizers.rst
================================================
deeppavlov.models.vectorizers
=============================


.. autoclass:: deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer
    :members:

    .. automethod:: __call__


================================================
FILE: docs/apiref/models.rst
================================================
models
======
Concrete Model classes.

.. automodule:: deeppavlov.models
   :members:

.. toctree::
   :glob:
   :caption: Models

   models/*

================================================
FILE: docs/apiref/vocabs.rst
================================================
vocabs
======
Concrete Vocab classes.

.. automodule:: deeppavlov.vocabs
   :members:

.. autoclass:: deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab
   :members:

   .. automethod:: __call__

.. automodule:: deeppavlov.vocabs.typos
   :members:


================================================
FILE: docs/conf.py
================================================
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.

import sphinx_rtd_theme

import deeppavlov

# -- Project information -----------------------------------------------------

# Author and version strings are read from the imported ``deeppavlov`` package.
project = 'DeepPavlov'
author = deeppavlov.__author__
copyright = f'2018, {author}'

# The short X.Y version and the full version (including alpha/beta/rc tags)
# are the same string for this project.
version = release = deeppavlov.__version__


# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
# Settings for intersphinx, todo, extlinks, autodoc and nbsphinx are given
# further down in this file.
extensions = [
    # extensions shipped with Sphinx
    'sphinx.ext.autodoc',
    'sphinx.ext.doctest',
    'sphinx.ext.intersphinx',
    'sphinx.ext.todo',
    'sphinx.ext.coverage',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode',
    'sphinx.ext.mathjax',
    'sphinx.ext.extlinks',
    # extensions from other packages
    'nbsphinx',
    'IPython.sphinxext.ipython_console_highlighting',
    'sphinx_copybutton'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path .
# '_build' matches BUILDDIR in docs/Makefile, so built output is not re-read
# as source.
exclude_patterns = ['_build', 
                    'Thumbs.db', 
                    '.DS_Store', 
                    '**.ipynb_checkpoints'
]

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Theme search path as reported by the imported sphinx_rtd_theme package.
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
# NOTE(review): option semantics are defined by sphinx_rtd_theme - confirm
# against the theme version in use.
html_theme_options = {
    'collapse_navigation': False,
    'display_version': True,
    'logo_only': True,
}

# Logo image path, relative to this directory.
html_logo = '_static/deeppavlov.png'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Extra style sheets; both files are located in the _static directory.
html_css_files = ['my_blocks.css', 'deeppavlov.css']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself.  Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}

# reST snippet for nbsphinx: a raw HTML <style> element that hides the
# ``.prompt`` elements inside ``.nbinput`` and ``.nboutput`` containers.
nbsphinx_prolog = """
.. raw:: html

    <style>
    .nbinput .prompt,
    .nboutput .prompt {
        display: none;
    }
    </style>
"""
# NOTE(review): presumably tells nbsphinx not to execute notebooks during the
# build - confirm in the nbsphinx documentation.
nbsphinx_execute = 'never'


# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.

htmlhelp_basename = f'{project}-Docs'


# -- Options for LaTeX output ------------------------------------------------

# LaTeX engine used to build the documents.
latex_engine = 'xelatex'

latex_elements = {

    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',

    # Extra options appended to the LaTeX document class options.
    'extraclassoptions': 'openany,oneside',

    # Load the fncychap package with its "Sonny" style.
    'fncychap': r'\usepackage[Sonny]{fncychap}'

}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, f'{project}.tex', f'{project} Documentation',
     author, 'manual'),
]


# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, project.lower(), f'{project} Documentation',
     [author], 1)
]


# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
# Description and category are taken from the deeppavlov package metadata;
# the keywords object is converted with str() to fill the category field.
texinfo_documents = [
    (master_doc, project, f'{project} Documentation',
     author, project, deeppavlov.__description__,
     str(deeppavlov.__keywords__)),
]


# -- Extension configuration -------------------------------------------------

# Modules that autodoc replaces with mocks, so the documented code can be
# imported without these packages being installed.
autodoc_mock_imports = ['bs4', 'faiss', 'fasttext', 'hdt', 'kenlm', 'lxml', 'navec', 'nltk', 'opt_einsum', 'rapidfuzz',
                        'razdel', 'sacremoses', 'slovnet', 'sortedcontainers', 'spacy', 'torch', 'torchcrf',
                        'transformers', 'udapi', 'ufal', 'whapi']

# Shortcut roles for sphinx.ext.extlinks: ``config`` and ``dp_file`` build
# GitHub URLs under ``blob/{release}``; ``%s`` is replaced by the role target.
extlinks = {
    'config': (f'https://github.com/deeppavlov/DeepPavlov/blob/{release}/deeppavlov/configs/%s', None),
    'dp_file': (f'https://github.com/deeppavlov/DeepPavlov/blob/{release}/%s', None)
}

# -- Options for intersphinx extension ---------------------------------------

# Configuration for intersphinx
# Each entry maps a project name to (base URL of its docs, inventory location);
# None means the inventory is looked up at the base URL.
intersphinx_mapping = {
    'python': ('https://docs.python.org/3.6', None),
    'scipy': ('https://docs.scipy.org/doc/scipy/reference', None)
}

# -- Options for todo extension ----------------------------------------------

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


================================================
FILE: docs/devguides/contribution_guide.rst
================================================

Contribution Guide
=====================

We are happy that you share your research with us and want to improve our code!

Please follow the steps below to contribute to our project.

If you have any questions or suggestions about the contributing process,
please share them with us on the `forum <https://forum.deeppavlov.ai>`_.
Please note that we do not answer general questions in the GitHub issues interface.

If you are a regular contributor in the DeepPavlov open source project,
you can receive an invitation to one of our events or an opportunity to become a part of our team.

How to contribute:

#. Don't start coding right away.
   You should do a quick search over `existing issues <https://github.com/deeppavlov/DeepPavlov/issues?q=is%3Aissue>`_
   for the project to see if your suggestion was already discussed or even resolved.
   If nothing relevant was found, please create a new one and state what exactly you would like
   to implement or fix.
   You may proceed with coding once someone on our team accepts your offer.

#. `Fork <https://guides.github.com/activities/forking/>`_ the
   `DeepPavlov repository <https://github.com/deeppavlov/DeepPavlov>`_

#. Checkout the ``dev`` branch from
   `the upstream <https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/configuring-a-remote-for-a-fork>`_
   as a base for your code:

    .. code:: bash

        git clone https://github.com/<OWNER>/<REPOSITORY>.git
        cd <REPOSITORY>
        git remote add upstream https://github.com/deeppavlov/DeepPavlov.git
        git fetch upstream
        git checkout -b dev --track upstream/dev

   afterwards to sync the ``dev`` branch with external updates you can run:

    .. code:: bash

        git checkout dev
        git fetch upstream
        git pull

#. **Create a new branch and switch** to it. Give it a meaningful name:

    .. code:: bash

        git checkout -b what_my_code_does_branch

#. **Install DeepPavlov** in editable mode:

   .. code:: bash

       pip install -e .

   or

   .. code:: bash

       pip install -e .[docs,tests]

   In editable mode, changes to the files in the repository directory are automatically reflected in your
   Python environment. The last command, with ``[docs,tests]``, installs the additional requirements needed to build
   the documentation and run the tests.

#. **Write readable code** and keep it
   `PEP8 <https://www.python.org/dev/peps/pep-0008/>`_-ed, **add docstrings**
   and keep them consistent with the
   `Google Style <http://google.github.io/styleguide/pyguide.html#381-docstrings>`_.
   Please note that we maintain type annotations in every function
   declaration.

   Accompany your code with **clear comments** to let other people understand the
   flow of your mind.

   If you create new models, refer to the :doc:`Register your model
   </devguides/registry>` section to add it to the DeepPavlov registry of
   models.

#. We ask you to **add some tests**. This will help us maintain the
   framework, and this will help users to understand the feature you introduce.
   Examples of implemented tests are available in `tests/
   <https://github.com/deeppavlov/DeepPavlov/tree/dev/tests>`_
   directory.

#. Please, **update the documentation**, if you committed significant changes
   to our code. Make sure that documentation could be built after your changes
   and check how it looks using:

   .. code:: bash

       cd docs
       make html

   The built documentation will be added to ``docs/_build`` directory. Open it with your browser.

#. **Commit your changes and push** your feature branch to your GitHub fork:

    .. code:: bash

        git add my_files
        git commit -m "fix: resolve issue #271"
        git push origin what_my_code_does_branch

    Follow the `semantic commit notation <https://seesparkbox.com/foundry/semantic_commit_messages>`_
    for the name of the commit.

#. Create a new `pull request <https://github.com/deeppavlov/DeepPavlov/pulls>`_
   to get your feature branch merged into dev for others to use.
   Don't forget to `reference <https://help.github.com/en/github/writing-on-github/autolinked-references-and-urls>`_
   the GitHub issue associated with your task in the description.

#. **Relax and wait** : )

Some time after that your commit will be assigned to somebody from our team
to check your code. 
After a code review and a successful completion of all tests, your pull request will be approved and
pushed into the framework.

If you still have any questions, either on the contribution process or about
the framework itself, please share them with us on our `forum <https://forum.deeppavlov.ai>`_.
Join our official `Telegram channel <https://t.me/deeppavlov>`_ to get notified about our updates & news.


================================================
FILE: docs/devguides/registry.rst
================================================
Register your model
===================

In order to extend the library, you need to register your classes and functions; it is done in two steps.

1. Decorate your :class:`~deeppavlov.core.models.component.Component`
   (or :class:`~deeppavlov.core.data.dataset_reader.DatasetReader`,
   or :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator`,
   or :class:`~deeppavlov.core.data.data_fitting_iterator.DataFittingIterator`)
   using :func:`~deeppavlov.core.common.registry.register` and/or metrics function
   using :func:`~deeppavlov.core.common.metrics_registry.register_metric`.

2. Rebuild the registry by running the following command from the DeepPavlov root directory:

::

    python -m utils.prepare.registry

This script imports all the modules in deeppavlov package, builds the registry from them and writes it to a file.


However, it is possible to use some classes and functions inside configuration files without registering them explicitly.
There are two options available here:

- instead of ``{"class_name": "registered_component_name"}`` in config file use key-value pair similar to
  ``{"class_name": "my_package.my_module:MyClass"}``

- if your classes/functions are properly decorated but not included in the registry, use ``"metadata"`` section of
  your config file specifying imports as ``"metadata": {"imports": ["my_local_package.my_module", "global_package.module"]}``;
  then the second step described above will be unnecessary (local packages are imported from the current working
  directory).


================================================
FILE: docs/features/hypersearch.rst
================================================
Hyperparameters optimization
============================

You can search for best hyperparameters of your model in DeepPavlov by means of cross-validation.

Cross-validation
~~~~~~~~~~~~~~~~

You can run cross-validation in DeepPavlov to select best parameters of your model.
For this purpose you have to run the special command 'paramsearch', for example:

.. code:: bash

    python -m deeppavlov.paramsearch path_to_json_config.json --folds 5


Parameters
----------

The cross-validation command has several parameters:

-  ``config_path``:
    Specify the config path, where your model is located.
-  ``--folds``:
    This parameter shows how many folds you need in cross validation.
    Do you want to use leave one out cross validation instead of folds?
    Just specify this: ``--folds loo``.
    If you do not want to cross-validate, just omit this parameter.
-  ``--search_type``:
    This parameter is optional - default value is "grid" (grid search).


.. note::

    Folds will be created automatically from union of train and validation datasets.


Special parameters in config
----------------------------
The config file of the model should contain parameter ranges for the search.
For example, if you want to optimize the regularization coefficient of the model,
you should add an additional parameter to the config with the suffix '_range'.
Let's see example for logistic regression model:

.. code:: python

      {
        "class_name": "faq_logreg_model",
        "in": "q_vect",
        "fit_on": ["q_vect", "y"],
        "c": {"search_choice": [1, 10, 100, 1000]},
        "out": ["answer", "score"]
      }

In this example the parameter "c" is described as a search_choice with the values for grid search:

.. code:: python

    {"search_choice": [value_0, ..., value_n]}


Results
-------
As a result you'll have new json config with best model parameters.
It'll be stored in the same directory as config file and will have suffix '_cvbest.json'.
Also you'll see final log messages about best model:

.. code:: bash

    INFO in '__main__'['paramsearch'] at line 169: Best model params: {'C': 10000, 'penalty': 'l1', 'accuracy': 0.81466}
    INFO in '__main__'['paramsearch'] at line 184: Best model saved in json-file: path_to_model_config_cvbest.json


================================================
FILE: docs/features/models/KBQA.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Knowledge Base Question Answering (KBQA)\n",
    "\n",
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/KBQA.ipynb)\n",
    "\n",
    "# Table of contents \n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1. [Predict using Python](#4.1-Predict-using-Python)\n",
    "    \n",
    "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
    "\n",
    "    4.3. [Using entity linking and Wiki parser as standalone tools for KBQA](#4.3-Using-entity-linking-and-Wiki-parser-as-standalone-tools-for-KBQA)\n",
    "     \n",
    "5. [Customize the model](#5.-Customize-the-model)\n",
    "    \n",
    "    5.1. [Description of config parameters](#5.1-Description-of-config-parameters)\n",
    "    \n",
    "    5.2. [Train KBQA components](#5.2-Train-KBQA-components)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "\n",
    "The knowledge base:\n",
    "\n",
    "* is a comprehensive repository of information about given domain or a number of domains;\n",
    "\n",
    "* reflects the ways we model knowledge about given subject or subjects, in terms of concepts, entities, properties, and relationships;\n",
    "\n",
    "* enables us to use this structured knowledge where appropriate, e.g. answering factoid questions.\n",
    "\n",
    "Currently, we support Wikidata as a Knowledge Base (Knowledge Graph). In the future, we will expand support for custom knowledge bases.\n",
    "\n",
    "The question answerer:\n",
    "\n",
    "* validates questions against the preconfigured list of question templates, disambiguates entities using entity linking and answers questions asked in natural language;\n",
    "\n",
    "* can be used with Wikidata (English, Russian) and (in the future versions) with custom knowledge graphs.\n",
    "\n",
    "Here are some of the most popular types of questions supported by the model:\n",
    "\n",
    "* **Complex questions with numerical values:** “What position did Angela Merkel hold on November 10, 1994?”\n",
    "* **Complex question where the answer is a number or a date:** “When did Jean-Paul Sartre move to Le Havre?”\n",
    "* **Questions with counting of answer entities:** “How many sponsors are for Juventus F.C.?”\n",
    "* **Questions with ordering of answer entities by ascending or descending of some parameter:** “Which country has highest individual tax rate?”\n",
    "* **Simple questions:** “What is crew member Yuri Gagarin’s Vostok?”\n",
    "\n",
    "The following models are used to find the answer (the links are for the English language model):\n",
    "\n",
    "* [BERT model](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/classifiers/query_pr.json) for prediction of query template type. Model performs classification of questions into 8 classes corresponding to 8 query template types;\n",
    "* [BERT entity detection model](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/entity_extraction/entity_detection_en.json) for extraction of entity substrings from the questions;\n",
    "* Substring extracted by the entity detection model is used for [entity linking](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/entity_extraction/entity_linking_en.json). Entity linking performs matching the substring with one of the Wikidata entities. Matching is based on the Levenshtein distance between the substring and an entity title. The result of the matching procedure is a set of candidate entities. There is also the search for the entity among this set with one of the top-k relations predicted by classification model;\n",
    "* [BERT model](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/ranking/rel_ranking_bert_en.json) for ranking candidate relation paths;\n",
    "* Query generator model is used to fill query template with candidate entities and relations to find valid combinations of entities and relations for query template. Query generation model uses Wikidata HDT file.\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation](https://deeppavlov-test.readthedocs.io/en/latest/notebooks/Get%20Started%20with%20DeepPavlov.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install --q deeppavlov"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then make sure that all the required packages for the model are installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov install kbqa_cq_en\n",
    "! python -m deeppavlov install kbqa_cq_ru"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`kbqa_cq_en` and `kbqa_cq_ru` here are the names of the model's *config_files*. [What is a Config File?](https://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
    "\n",
    "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n",
    "The full list of KBQA models with their config names can be found in the [table](#3.-Models-list).\n",
    "\n",
    "# 3. Models list\n",
    "\n",
    "The table presents a list of all of the KBQA-models available in DeepPavlov Library.\n",
    "\n",
    "| Config name  | Database | Language | RAM | GPU |\n",
    "| :--- | --- | --- | --- | --- |\n",
    "| [kbqa_cq_en](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/kbqa/kbqa_cq_en.json)    | Wikidata | En | 3.1 Gb | 3.4 Gb |\n",
    "| [kbqa_cq_ru](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/kbqa/kbqa_cq_ru.json)    | Wikidata | Ru | 4.3 Gb | 8.0 Gb |\n",
    "\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "## 4.1 Predict using Python\n",
    "\n",
    "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import configs, build_model\n",
    "\n",
    "kbqa = build_model('kbqa_cq_en', download=True, install=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Input**: List[sentences]\n",
    "\n",
    "**Output**: List[answers]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['Robert Zemeckis'],\n",
       " [['Q187364']],\n",
       " [['SELECT ?answer WHERE { wd:Q134773 wdt:P57 ?answer. }']]]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kbqa(['Who directed Forrest Gump?'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['United States senator'],\n",
       " [['Q4416090']],\n",
       " [['SELECT ?answer WHERE { wd:Q11613 p:P39 ?ent . ?ent ps:P39 ?answer . ?ent ?p ?x filter(contains(?x, n)). }']]]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kbqa(['What position was held by Harry S. Truman on 1/3/1935?'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['FC Barcelona B, Argentina national under-20 football team'],\n",
       " [['Q10467', 'Q1187790']],\n",
       " [['SELECT ?answer WHERE { wd:Q615 p:P54 ?ent . ?ent ps:P54 ?answer . ?ent ?p ?x filter(contains(?x, n)). }']]]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kbqa(['What teams did Lionel Messi play for in 2004?'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "KBQA model for complex question answering in Russian can be used from Python using the following code:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import configs, build_model\n",
    "\n",
    "kbqa = build_model('kbqa_cq_ru', download=True, install=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['26 мая 1799, 06 июня 1799'],\n",
       " [['+1799-05-26^^T', '+1799-06-06^^T']],\n",
       " [['SELECT ?answer WHERE { wd:Q7200 wdt:P569 ?answer. }']]]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kbqa(['Когда родился Пушкин?'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2 Predict using CLI\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov interact kbqa_cq_en [-d]\n",
    "! python -m deeppavlov interact kbqa_cq_ru [-d]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`-d` is an optional download key (alternative to `download=True` in Python code). It is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n",
    "\n",
    "Or make predictions for samples from *stdin*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov predict kbqa_cq_en -f <file-name>\n",
    "! python -m deeppavlov predict kbqa_cq_ru -f <file-name>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.3 Using entity linking and Wiki parser as standalone tools for KBQA\n",
    "\n",
    "Default configuration for KBQA was designed to use all of the supporting models together as a part of the KBQA pipeline. However, there might be a case when you want to work with some of these models in addition to KBQA.\n",
    "\n",
    "For example, you might want to use entity linking model as an annotator in your [multiskill AI Assistant](https://github.com/deeppavlov/dream). Or, you might want to use Wiki Parser component to directly run SPARQL queries against your copy of Wikidata. To support these usages, you can also deploy supporting models as standalone components.\n",
    "\n",
    "Don’t forget to replace the `url` parameter values in the examples below with correct URLs.\n",
    "\n",
    "Config [entity_linking_en](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/entity_extraction/entity_linking_en.json) can be used with the following commands:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov install entity_linking_en -d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov riseapi entity_linking_en [-d] [-p <port>]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "payload = {\"entity_substr\": [[\"Forrest Gump\"]], \"tags\": [[\"PERSON\"]], \"probas\": [[0.9]],\n",
    "           \"sentences\": [[\"Who directed Forrest Gump?\"]]}\n",
    "response = requests.post(entity_linking_url, json=payload).json()\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Config [wiki_parser](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/kbqa/wiki_parser.json) can be used with the following command:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov riseapi wiki_parser [-d] [-p <port>]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Arguments of the annotator are `parser_info` (what we want to extract from Wikidata) and `query`.\n",
    "\n",
    "**Examples of queries:**\n",
    "\n",
    "To extract triplets for entities, the `query` argument should be the list of entities ids. `parser_info` should be the list of “find_triplets” strings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "requests.post(wiki_parser_url, json = {\"parser_info\": [\"find_triplets\"], \"query\": [\"Q159\"]}).json()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To extract all relations of the entities, the `query` argument should be the list of entities ids, and `parser_info` should be the list of “find_rels” strings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "requests.post(wiki_parser_url, json = {\"parser_info\": [\"find_rels\"], \"query\": [\"Q159\"]}).json()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To find labels for entities ids, the `query` argument should be the list of entities ids, and `parser_info` should be the list of “find_label” strings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "requests.post(wiki_parser_url, json = {\"parser_info\": [\"find_label\"], \"query\": [[\"Q159\", \"\"]]}).json()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this example, the second element of the list (an empty string) can be replaced with a sentence.\n",
    "\n",
    "To execute SPARQL queries, the `query` argument should be the list of tuples with the info about SPARQL queries, and `parser_info` should be the list of “query_execute” strings.\n",
    "\n",
    "Let us consider an example of the question “What is the deepest lake in Russia?” with the corresponding SPARQL query `SELECT ?ent WHERE { ?ent wdt:P31 wd:T1 . ?ent wdt:R1 ?obj . ?ent wdt:R2 wd:E1 } ORDER BY ASC(?obj) LIMIT 5`\n",
    "\n",
    "Arguments:\n",
    "\n",
    "* *what_return*: ```[“?obj”]```,\n",
    "* *query_seq*: ```[[“?ent”, “P17”, “Q159”], [“?ent”, “P31”, “Q23397”], [“?ent”, “P4511”, “?obj”]]```,\n",
    "* *filter_info*: ```[]```,\n",
    "* *order_info*: ```order_info(variable=’?obj’, sorting_order=’asc’)```."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "requests.post(\"wiki_parser_url\", json = {\"parser_info\": [\"query_execute\"], \"query\": [[[\"?obj\"], [[\"Q159\", \"P36\", \"?obj\"]], [], [], True]]}).json()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To use entity linking model in KBQA, you should add following API Requester component to the `pipe` in the *config_file*:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "{\n",
    "    \"class_name\": \"api_requester\",\n",
    "    \"id\": \"linker_entities\",\n",
    "    \"url\": \"entity_linking_url\",\n",
    "    \"out\": [\"entity_substr\", \"entity_ids\", \"entity_conf\", \"entity_pages\", \"entity_labels\"],\n",
    "    \"param_names\": [\"entity_substr\", \"tags\", \"probas\", \"sentences\"]\n",
    " }\n",
    " ```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To use Wiki parser service in KBQA, you should add following API Requester component to the `pipe` in the *config_file*:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "{\n",
    "    \"class_name\": \"api_requester\",\n",
    "    \"id\": \"wiki_p\",\n",
    "    \"url\": \"wiki_parser_url\",\n",
    "    \"out\": [\"wiki_parser_output\"],\n",
    "    \"param_names\": [\"parser_info\", \"query\"]\n",
    " }\n",
    " ```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Customize the model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.1 Description of config parameters"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Parameters of ``entity_linker`` component:\n",
    "\n",
    "- ``num_entities_to_return: int`` - the number of entity IDs, returned for each entity mention in text;\n",
    "- ``lemmatize: bool`` - whether to lemmatize entity mentions before searching candidate entity IDs in the inverted index;\n",
    "- ``use_descriptions: bool`` - whether to perform ranking of candidate entities by similarity of their descriptions to the context;\n",
    "- ``use_connections: bool`` - whether to use connections between candidate entities for different mentions for ranking;\n",
    "- ``use_tags: bool`` - whether to search only those entity IDs in the inverted index, which have the same tag as the entity mention;\n",
    "- ``prefixes: Dict[str, Any]`` - prefixes in the knowledge base for entities and relations;\n",
    "- ``alias_coef: float`` - the coefficient which is multiplied by the substring matching score of the entity if the entity mention in the text matches with the entity title."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Parameters of ``rel_ranking_infer`` component:\n",
    "\n",
    "- ``return_elements: List[str]`` - what elements should be returned by the component in the output tuple (answers are returned by default, optional elements are `\"confidences\"`, `\"answer_ids\"`, `\"entities_and_rels\"` (entities and relations from SPARQL queries), `\"queries\"` (SPARQL queries), `\"triplets\"` (triplets from SPARQL queries));\n",
    "- ``batch_size: int`` - candidate relations list will be split into N batches of the size `batch_size` for further ranking;\n",
    "- ``softmax: bool`` - whether to apply softmax function to the confidences list of candidate relations for a question;\n",
    "- ``use_api_requester: bool`` - true if wiki_parser [is called through api_requester](#4.3-Using-entity-linking-and-Wiki-parser-as-standalone-tools-for-KBQA);\n",
    "- ``rank: bool`` - whether to perform ranking of candidate relation paths;\n",
    "- ``nll_rel_ranking: bool`` - in DeepPavlov we have two types of relation ranking models: 1) the model which takes a question and a relation and is trained to classify question-relation by two classes (relevant / irrelevant relation) 2) the model which takes a question and a list of relations (one relevant relation and others - irrelevant) and is trained to define the relevant relation in the list with NLL loss; the output format in two cases is different;\n",
    "- ``nll_path_ranking: bool`` - the same case as `nll_rel_ranking` for ranking of relation paths;\n",
    "- ``top_possible_answers: int`` - SPARQL query execution can result in several valid answers, so `top_possible_answers` is the number of these answers which we leave in the output;\n",
    "- ``top_n: int`` - number of candidate SPARQL queries (and corresponding answers) in the output for a question;\n",
    "- ``pos_class_num: int`` - if we use the model which classifies question-relation into two classes (relevant / irrelevant), we should set the number of positive class (0 or 1);\n",
    "- ``rel_thres: float`` - we keep only relations with confidence above this threshold;\n",
    "- ``type_rels: List[str]`` - relations which connect entity and its type in the knowledge graph."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Parameters of ``query_generator`` component:\n",
    "\n",
    "- ``entities_to_leave: int`` - how many entity IDs to use to make a combination of entities and relations for filling in the slots of the SPARQL query template;\n",
    "- ``rels_to_leave: int`` - how many relations to use to make a combination of entities and relations for filling in the slots of the SPARQL query template;\n",
    "- ``max_comb_num: int`` - maximal number of combinations of entities and relations for filling in the slots of SPARQL query template;\n",
    "- ``map_query_str_to_kb: List[Tuple[str, str]]`` - a list of elements like [\"wd:\", \"http://we/\"], where the first element is a prefix of an entity (\"wd:\") or relation in the SPARQL query template, the second - the corresponding prefix in the knowledge base (\"http://we/\");\n",
    "- ``kb_prefixes: Dict[str, str]`` - a dictionary {\"entity\": \"wd:E\", \"rel\": \"wdt:R\", ...} - prefixes of entities, relations and types in the knowledge base;\n",
    "- ``gold_query_info: Dict[str, str]`` - names of unknown variables in SPARQL queries in the dataset (LC-QuAD2.0 or RuBQ2.0);\n",
    "- ``syntax_structure_known: bool`` - whether the syntax structure of the question is known (is True in kbqa_cq_ru.json, because this config performs syntax parsing with slovnet_syntax_parser)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.2 Train KBQA components"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train Query Prediction Model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The dataset for training query prediction model consists of three *.csv* files: *train.csv*, *valid.csv* and *test.csv*. Each line in these files contains a question and the corresponding query template type, for example:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "\"What is the longest river in the UK?\", 6\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train Entity Detection Model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The dataset is a pickle file. The dataset must be split into three parts: train, test, and validation. Each part is a list of tuples of question tokens and tags for each token. An example of a training sample:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "(['What', 'is', 'the', 'complete', 'list', 'of', 'records', 'released', 'by', 'Jerry', 'Lee', 'Lewis', '?'],\n",
    " ['O', 'O', 'O', 'O', 'B-T', 'I-T', 'I-T', 'O', 'O', 'B-E', 'I-E', 'I-E', 'O'])\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`B-T` corresponds to tokens of entity types substrings beginning, `I-T` - to tokens of inner part of entity types substrings, `B-E` and `I-E` - for entities, `O` - for other tokens."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train Path Ranking Model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The dataset (in pickle format) is a dict of three keys: \"train\", \"valid\" and \"test\". The value by each key is the list of samples, an example of a sample:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "(['What is the Main St. Exile label, which Nik Powell co-founded?', ['record label', 'founded by']], '1')\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The sample contains the question, relations in the question and label (1 - if the relations correspond to the question, 0 - otherwise)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Adding Templates For New SPARQL Queries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Templates can be added to sparql_queries.json file, which is a dictionary, where keys are template types and values are templates with additional information. An example of a template:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "{\n",
    "    \"query_template\": \"SELECT ?obj WHERE { wd:E1 p:R1 ?s . ?s ps:R1 ?obj . ?s ?p ?x filter(contains(?x, N)) }\",\n",
    "    \"rank_rels\": [\"wiki\", \"do_not_rank\", \"do_not_rank\"],\n",
    "    \"rel_types\": [\"no_type\", \"statement\", \"qualifier\"],\n",
    "    \"query_sequence\": [1, 2, 3],\n",
    "    \"return_if_found\": true,\n",
    "    \"template_num\": \"0\",\n",
    "    \"alternative_templates\": []\n",
    " }\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* `query_template` is the template of the SPARQL query;\n",
    "* `rank_rels` is a list which defines whether to rank relations, in this example **p:R1** relations we extract from Wikidata for **wd:E1** entities and rank with RelRanker, **ps:R1** and **?p** relations we do not extract or rank;\n",
    "* `rel_types` - direct, statement or qualifier relations;\n",
    "* `query_sequence` - the sequence in which the triplets will be extracted from the Wikidata hdt file;\n",
    "* `return_if_found` - the parameter which controls the iteration over all possible combinations of entities, relations and types: if true - return the first valid combination found, if false - consider all combinations;\n",
    "* `template_num` - the type of a template;\n",
    "* `alternative_templates` - type numbers of alternative templates to use if the answer was not found using the current template."
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: docs/features/models/NER.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Named Entity Recognition (NER)\n",
    "\n",
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/NER.ipynb)\n",
    "\n",
    "# Table of contents \n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1. [Predict using Python](#4.1-Predict-using-Python)\n",
    "    \n",
    "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
    "    \n",
    "5. [Evaluate](#5.-Evaluate)\n",
    "    \n",
    "    5.1. [Evaluate from Python](#5.1-Evaluate-from-Python)\n",
    "    \n",
    "    5.2. [Evaluate from CLI](#5.2-Evaluate-from-CLI)\n",
    "\n",
    "6. [Customize the model](#6.-Customize-the-model)\n",
    "    \n",
    "    6.1. [Train your model from Python](#6.1-Train-your-model-from-Python)\n",
    "    \n",
    "    6.2. [Train your model from CLI](#6.2-Train-your-model-from-CLI)\n",
    "\n",
    "7. [NER-tags list](#7.-NER-tags-list)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "\n",
    "**Named Entity Recognition (NER)** is a task of assigning a tag (from a predefined set of tags) to each token in a given sequence. In other words, NER-task consists of identifying named entities in the text and classifying them into types (e.g. person name, organization, location etc). \n",
    "\n",
    "**BIO encoding schema** is usually used in NER task. It uses 3 tags: B for the beginning of the entity, I for the inside of the entity, and O for non-entity tokens. The second part of the tag stands for the entity type.\n",
    "\n",
    "Here is an example of a tagged sequence:\n",
    "\n",
    "| Elon | Musk | founded | Tesla| in | 2003 | . |\n",
    "| --- | --- | --- | --- | --- | --- | --- |\n",
    "| B-PER | I-PER | O | B-ORG | O | B-DATE | O |\n",
    "\n",
    "Here we can see three extracted named entities: *Elon Musk* (which is a person's name), *Tesla* (which is a name of an organization) and *2003* (which is a date). To see more examples try out our [Demo](https://demo.deeppavlov.ai/#/en/ner).\n",
    "\n",
    "The list of possible types of NER entities may vary depending on your dataset domain. The list of tags used in DeepPavlov's models can be found in the [table](#7.-NER-tags-list).\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q deeppavlov"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then make sure that all the required packages for the model are installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov install ner_ontonotes_bert"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`ner_ontonotes_bert` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
    "\n",
    "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n",
    "The full list of NER models with their config names can be found in the [table](#3.-Models-list).\n",
    "\n",
    "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\n",
    "\n",
    "# 3. Models list\n",
    "\n",
    "The table presents a list of all of the NER-models available in the DeepPavlov Library.\n",
    "\n",
    "| Config name | Dataset | Language | Model Size | F1 score (ner_f1) | F1 score (ner_f1_token) |\n",
    "| :--- | --- | --- | --- | --- | ---: |\n",
    "| ner_case_agnostic_mdistilbert| [CoNLL-2003](https://paperswithcode.com/dataset/conll-2003)   | En | 1.6 GB | 89.9 | 91.6 |\n",
    "| ner_conll2003_bert | [CoNLL-2003](https://paperswithcode.com/dataset/conll-2003) | En | 1.3 GB | **91.9** | **93.4** |\n",
    "| ner_ontonotes_bert | [OntoNotes](https://paperswithcode.com/dataset/ontonotes-5-0) | En | 1.3 GB | 89.2 | 92.7 |\n",
    "| ner_collection3_bert | [Collection3](https://www.researchgate.net/publication/313808701_Combining_Knowledge_and_CRF-Based_Approach_to_Named_Entity_Recognition_in_Russian) | Ru | 2.1 GB | **98.5** | **98.9** |\n",
    "| ner_rus_bert | [Collection3](https://www.researchgate.net/publication/313808701_Combining_Knowledge_and_CRF-Based_Approach_to_Named_Entity_Recognition_in_Russian) | Ru | 2.1 GB | 97.6 | 98.5 |\n",
    "| ner_rus_convers_distilrubert_2L | [Collection-rus](https://www.researchgate.net/publication/313808701_Combining_Knowledge_and_CRF-Based_Approach_to_Named_Entity_Recognition_in_Russian) | Ru | 1.3 GB | 92.9 | 96.6 |\n",
    "| ner_rus_convers_distilrubert_6L | [Collection-rus](https://www.researchgate.net/publication/313808701_Combining_Knowledge_and_CRF-Based_Approach_to_Named_Entity_Recognition_in_Russian) | Ru | 1.6 GB | 96.7 | 98.5 |\n",
    "| ner_rus_bert_probas | [Wiki-NER-rus](https://aclanthology.org/I17-1042/) | Ru | 2.1 GB | 72.6 | 79.5 |\n",
    "| ner_ontonotes_bert_mult | [OntoNotes](https://paperswithcode.com/dataset/ontonotes-5-0) | Multi | 2.1 GB | 88.9 | 92.0 |\n",
    "\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "## 4.1 Predict using Python\n",
    "\n",
    "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model\n",
    "\n",
    "ner_model = build_model('ner_ontonotes_bert', download=True, install=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `download` argument defines whether it is necessary to download the files defined in the `download` section of the config: usually it provides the links to the train and test data, to the pretrained models, or to the embeddings.\n",
    "\n",
    "Setting the `install` argument to `True` is equivalent to executing the command line `install` command. If set to `True`, it will first install all the required packages.\n",
    "\n",
    "**Input**: List[sentences]\n",
    "\n",
    "**Output**: List[tokenized sentences, corresponding NER-tags]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[['Bob', 'Ross', 'lived', 'in', 'Florida'],\n",
       "  ['Elon', 'Musk', 'founded', 'Tesla']],\n",
       " [['B-PERSON', 'I-PERSON', 'O', 'O', 'B-GPE'],\n",
       "  ['B-PERSON', 'I-PERSON', 'O', 'B-ORG']]]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ner_model(['Bob Ross lived in Florida', 'Elon Musk founded Tesla'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2 Predict using CLI\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI (Сommand Line Interface)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov interact ner_ontonotes_bert -d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n",
    "\n",
    "Or make predictions for samples from *stdin*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov predict ner_ontonotes_bert -f <file-name>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Evaluate\n",
    "\n",
    "There are two metrics that are used to evaluate a NER model in DeepPavlov:\n",
    "\n",
    "`ner_f1` is measured on the entity-level (actual text spans should match exactly)\n",
    "\n",
    "`ner_token_f1` is measured on a token level (correct tokens from not fully extracted entities will still be counted as TPs (true positives))\n",
    "\n",
    "## 5.1 Evaluate from Python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import evaluate_model\n",
    "\n",
    "model = evaluate_model('ner_ontonotes_bert', download=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.2 Evaluate from CLI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov evaluate ner_ontonotes_bert"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6. Customize the model\n",
    "\n",
    "## 6.1 Train your model from Python\n",
    "\n",
    "### Provide your data path\n",
    "\n",
    "To train the model on your data, you need to change the path to the training data in the *config_file*.\n",
    " \n",
    "Parse the *config_file* and change the path to your data from Python."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "~/.deeppavlov/downloads/ontonotes/\n"
     ]
    }
   ],
   "source": [
    "from deeppavlov import train_model\n",
    "from deeppavlov.core.commands.utils import parse_config\n",
    "\n",
    "model_config = parse_config('ner_ontonotes_bert')\n",
    "\n",
    "# dataset that the model was trained on\n",
    "print(model_config['dataset_reader']['data_path'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Provide a *data_path* to your own dataset. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# download and unzip a new example dataset\n",
    "!wget http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz\n",
    "!tar -xzvf \"conll2003_v2.tar.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# provide a path to the train file\n",
    "model_config['dataset_reader']['data_path'] = 'contents/train.txt'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "### Train dataset format\n",
    "\n",
    "To train the model, you need to have a txt-file with a dataset in the following format:\n",
    "\n",
    "```\n",
    "EU B-ORG\n",
    "rejects O\n",
    "the O\n",
    "call O\n",
    "of O\n",
    "Germany B-LOC\n",
    "to O\n",
    "boycott O\n",
    "lamb O\n",
    "from O\n",
    "Great B-LOC\n",
    "Britain I-LOC\n",
    ". O\n",
    "\n",
    "China B-LOC\n",
    "says O\n",
    "time O\n",
    "right O\n",
    "for O\n",
    "Taiwan B-LOC\n",
    "talks O\n",
    ". O\n",
    "```\n",
    "\n",
    "The source text is **tokenized** and **tagged**. For each token, there is a tag with **BIO** markup. Tags are separated from tokens with **whitespaces**. Sentences are separated with **empty lines**.\n",
    "\n",
    "\n",
    "### Train the model using new config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ner_model = train_model(model_config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use your model for prediction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[['Bob', 'Ross', 'lived', 'in', 'Florida'],\n",
       "  ['Elon', 'Musk', 'founded', 'Tesla']],\n",
       " [['B-PERSON', 'I-PERSON', 'O', 'O', 'B-GPE'],\n",
       "  ['B-PERSON', 'I-PERSON', 'O', 'B-ORG']]]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ner_model(['Bob Ross lived in Florida', 'Elon Musk founded Tesla'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.2 Train your model from CLI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov train ner_ontonotes_bert"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 7. NER-tags list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The table presents a list of all of the NER entity tags used in DeepPavlov's NER-models."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "|              |                                                        |\n",
    "| ------------ | ------------------------------------------------------ |\n",
    "| **PERSON**       | People including fictional                             |\n",
    "| **NORP**         | Nationalities or religious or political groups         |\n",
    "| **FACILITY**     | Buildings, airports, highways, bridges, etc.           |\n",
    "| **ORGANIZATION** | Companies, agencies, institutions, etc.                |\n",
    "| **GPE**          | Countries, cities, states                              |\n",
    "| **LOCATION**     | Non-GPE locations, mountain ranges, bodies of water    |\n",
    "| **PRODUCT**      | Vehicles, weapons, foods, etc. (Not services)          |\n",
    "| **EVENT**        | Named hurricanes, battles, wars, sports events, etc.   |\n",
    "| **WORK OF ART**  | Titles of books, songs, etc.                           |\n",
    "| **LAW**          | Named documents made into laws                         |\n",
    "| **LANGUAGE**     | Any named language                                     |\n",
    "| **DATE**         | Absolute or relative dates or periods                  |\n",
    "| **TIME**         | Times smaller than a day                               |\n",
    "| **PERCENT**      | Percentage (including “%”)                             |\n",
    "| **MONEY**        | Monetary values, including unit                        |\n",
    "| **QUANTITY**     | Measurements such as weight or distance                |\n",
    "| **ORDINAL**      | “first”, “second”, etc.                                |\n",
    "| **CARDINAL**     | Numerals that do not fall under another type           |"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: docs/features/models/ODQA.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Open Domain Question Answering (ODQA)\n",
    "\n",
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/ODQA.ipynb)\n",
    "\n",
    "# Table of contents \n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1 [Predict using Python](#4.1-Predict-using-Python)\n",
    "\n",
    "    4.2 [Predict using CLI](#4.2-Predict-using-CLI)\n",
    "\n",
    "5. [Customize the model](#5.-Customize-the-model)\n",
    "\n",
    "    5.1 [Description of config parameters](#5.1-Description-of-config-parameters)\n",
    "    \n",
    "    5.2 [Building the index and training the reader model](#5.2-Building-the-index-and-training-the-reader-model)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "\n",
    "**Open Domain Question Answering (ODQA)** is a task to find an exact answer\n",
    "to any question in **Wikipedia** articles. Thus, given only a question, the system outputs\n",
    "the best answer it can find.\n",
    "The default ODQA implementation takes a batch of queries as input and returns the best answer.\n",
    "\n",
    "English ODQA version consists of the following components:\n",
    "\n",
    "- TF-IDF ranker, which defines top-N most relevant paragraphs in TF-IDF index;\n",
    "- Binary Passage Retrieval (BPR) ranker, which defines top-K most relevant in binary index;\n",
    "- a database of paragraphs (by default, from Wikipedia) which finds N + K most relevant paragraph text by IDs, defined by TF-IDF and BPR ranker;\n",
    "- Reading Comprehension component, which finds answers in paragraphs and defines answer confidences.\n",
    "\n",
    "Russian ODQA version performs retrieval only with TF-IDF index.\n",
    "\n",
    "Binary Passage Retrieval is resource-efficient the method of building a dense passage index. The dual encoder (with BERT or other Tranformer as backbone) is trained on question answering dataset (Natural Questions in our case) to maximize dot product of question and passage with answer embeddings and minimize otherwise. The question or passage embeddings are obtained the following way: vector of BERT CLS-token is fed into a dense layer followed by a hash function which turns dense vector into binary one.\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q deeppavlov"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The example below is given for basic ODQA config [en_odqa_infer_wiki](https://github.com/deeppavlov/DeepPavlov/blob/1.1.1/deeppavlov/configs/odqa/en_odqa_infer_wiki.json).\n",
    "Check what [other ODQA configs](#3.-Models-list) are available and simply replace `en_odqa_infer_wiki`\n",
    "with the config name of your preference. [What is a Config File?](https://docs.deeppavlov.ai/en/master/intro/configuration.html)\n",
    "\n",
    "Before using the model make sure that all required packages are installed running the command:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov install en_odqa_infer_wiki"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\n",
    "\n",
    "# 3. Models list\n",
    "\n",
    "The table presents a list of all of the ODQA models available in the DeepPavlov Library.\n",
    "\n",
    "| Config | Description |\n",
    "| :--- | :--- |\n",
    "| odqa/en_odqa_infer_wiki.json | Basic config for **English** language. Consists of of Binary Passage Retrieval, TF-IDF retrieval and reader. |\n",
    "| odqa/en_odqa_pop_infer_wiki.json | Extended config for **English** language. Consists of of Binary Passage Retrieval, TF-IDF retrieval, popularity ranker and reader. |\n",
    "| odqa/ru_odqa_infer_wiki.json | Basic config for **Russian** language. Consists of TF-IDF ranker and reader. |\n",
    "\n",
    "The table presents the scores on Natural Questions and SberQuAD dataset and memory consumption.\n",
    "\n",
    "| Config | Number of<br>paragraphs | Dataset | F1 | EM | RAM | GPU | Time for <br> 1 query |\n",
    "| :--- | :---: | :--- | :---: | :---: | :---: | :---: | :---: |\n",
    "| odqa/en_odqa_infer_wiki.json | 200 | Natural Questions | 45.2 | 37.0 | 10.4 | 2.4 | 4.9 s |\n",
    "| odqa/ru_odqa_infer_wiki.json | 100 | SberQuAD | 59.2 | 49.0 | 13.1 | 5.3 | 2.0 s |\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "## 4.1 Predict using Python\n",
    "\n",
    "### English"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model\n",
    "\n",
    "odqa_en = build_model('en_odqa_infer_wiki', download=True, install=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Input**: List[questions]\n",
    "\n",
    "**Output**: Tuple[List[answers], List[answer scores], List[answer places in paragraph]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['Luke Skywalker'], [4.196979999542236]]\n"
     ]
    }
   ],
   "source": [
    "odqa_en([\"What is the name of Darth Vader's son?\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Russian"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model\n",
    "\n",
    "odqa_ru = build_model('ru_odqa_infer_wiki', download=True, install=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['на востоке и юге Австралии'], [0.9999760985374451]]\n"
     ]
    }
   ],
   "source": [
    "odqa_ru([\"Где живут кенгуру?\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2 Predict using CLI\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI (Сommand Line Interface)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov interact en_odqa_infer_wiki -d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n",
    "\n",
    "# 5. Customize the model\n",
    "\n",
    "## 5.1 Description of config parameters\n",
    "\n",
    "Parameters of ``bpr`` component:\n",
    "    \n",
    "- ``load_path`` - path with checkpoint of query encoder and bpr index;\n",
    "- ``query_encoder_file`` - filename of query encoder (Transformer-based model which takes a question as input and obtains its binary embedding);\n",
    "- ``bpr_index`` - filename with BPR index (matrix of paragraph binary vectors);\n",
    "- ``pretrained_model`` - Transformer model, used in query encoder;\n",
    "- ``max_query_length`` - maximal length (in sub-tokens) of the input to the query encoder;\n",
    "- ``top_n`` - how many paragraph IDs to return per a question.\n",
    "\n",
    "Parameters of ``tfidf_ranker`` component:\n",
    "\n",
    "- ``top_n`` - how many paragraph IDs to return per a question.\n",
    "\n",
    "Parameters of ``logit_ranker`` component:\n",
    "\n",
    "- ``batch_size`` - the paragraphs from the database (some of which contain the answer to the question, others - do not contain) will be split into batches with the size ``batch_size`` for extraction of candidate answer in each paragraph;\n",
    "- ``squad_model`` - the model which finds spans of an answer in a paragraph;\n",
    "- ``sort_noans`` - whether to put paragraphs with no answer in the end of paragraph list, sorted by confidences;\n",
    "- ``top_n`` - the number of possible answers for a question;\n",
    "- ``return_answer_sentence`` - whether to return the sentence from the paragraph with the answer.\n",
    "\n",
    "## 5.2 Building the index and training the reader model\n",
    "\n",
    "There are two customizable components in ODQA configs:\n",
    "\n",
    "- TF-IDF ranker;\n",
    "- Reading comprehension model.\n",
    "\n",
    "If you would like to build the TF-IDF index for your own text database, read [here](https://docs.deeppavlov.ai/en/master/features/models/tfidf_ranking.html#ranker-training). \n",
    "\n",
    "In addition, to train the Reader on your data, read [here](https://docs.deeppavlov.ai/en/master/features/models/SQuAD.html#4.1-Train-your-model-from-Python)."
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: docs/features/models/SQuAD.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Context Question Answering\n",
    "\n",
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/SQuAD.ipynb)\n",
    "\n",
    "[![Medium](https://img.shields.io/badge/Medium-12100E?style=for-the-badge&logo=medium&logoColor=white)](https://medium.com/deeppavlov/developing-qa-systems-for-any-language-with-deeppavlov-a9033d5231a8)\n",
    "\n",
    "# Table of contents \n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1. [Predict using Python](#4.1-Predict-using-Python)\n",
    "    \n",
    "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
    "     \n",
    "5. [Train the model on your data](#5.-Train-the-model-on-your-data)\n",
    "    \n",
    "    5.1. [from Python](#5.1-Train-your-model-from-Python)\n",
    "    \n",
    "    5.2. [from CLI](#5.2-Train-your-model-from-CLI)\n",
    "    \n",
    "6. [Evaluate](#6.-Evaluate)\n",
    "    \n",
    "    6.1. [from Python](#6.1-Evaluate-from-Python)\n",
    "    \n",
    "    6.2. [from CLI](#6.2-Evaluate-from-CLI)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "\n",
    "Context Question Answering is a task of finding a fragment with an answer to a question in a given segment of context.\n",
    "\n",
    "**Context**:\n",
    "\n",
    "```\n",
    "In meteorology, precipitation is any product of the condensation \n",
    "of atmospheric water vapor that falls under gravity. The main forms \n",
    "of precipitation include drizzle, rain, sleet, snow, graupel and hail… \n",
    "Precipitation forms as smaller droplets coalesce via collision with \n",
    "other rain drops or ice crystals within a cloud. Short, intense periods \n",
    "of rain in scattered locations are called “showers”.\n",
    "```\n",
    "\n",
    "**Question**:\n",
    "```\n",
    "Where do water droplets collide with ice crystals to form precipitation?\n",
    "```\n",
    "\n",
    "**Answer**: \n",
    "```\n",
    "within a cloud\n",
    "```\n",
    "\n",
    "Datasets that follow this task format:\n",
    "\n",
    "- [Stanford Question Answering Dataset (SQuAD) (EN)](https://rajpurkar.github.io/SQuAD-explorer/)\n",
    "\n",
    "- [SberQuAD (RU)](https://paperswithcode.com/dataset/sberquad)\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q deeppavlov"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then make sure that all the required packages for the model are installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov install squad_bert"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`squad_bert` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
    "\n",
    "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n",
    "The full list of the models with their config names can be found in the [table](#3.-Models-list).\n",
    "\n",
    "# 3. Models list\n",
    "\n",
    "The table presents a list of all of the Context Question Answering models available in DeepPavlov Library.\n",
    "\n",
    "| Config name  | Dataset | Language | Model Size | F1 score | EM  |\n",
    "| :--- | --- | --- | --- | --- | ---: |\n",
    "| squad_bert | SQuAD v1.1 | En | 1.3 GB | 88.86 | 81.49 |\n",
    "| qa_squad2_bert | SQuAD v2.0 | En | 1.3 GB | 83.56 | 75.54 |\n",
    "| qa_multisberquad_bert | MultiSQuAD | Multi | 2 GB | 80.76 | 63.81 |\n",
    "| squad_ru_bert | SberQuAD | Ru | 2.0 GB | 84.71 | 66.21 |\n",
    "| squad_ru_convers_distilrubert_2L | SberQuAD | Ru | 1.2 GB | 65.20 | 44.52 |\n",
    "| squad_ru_convers_distilrubert_6L | SberQuAD | Ru | 1.6 GB | 80.57 | 61.54 |\n",
    "\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "## 4.1 Predict using Python\n",
    "\n",
    "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model\n",
    "\n",
    "model = build_model('squad_bert', download=True, install=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Input**: List[context], List[question]\n",
    "\n",
    "**Output**: List[answer, start_character, logit]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['a library for NLP and dialog systems'], [14], [200928.390625]]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model(['DeepPavlov is a library for NLP and dialog systems.'], ['What is DeepPavlov?'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2 Predict using CLI\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI (Command Line Interface)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov interact squad_bert -d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n",
    "\n",
    "Or make predictions for samples from *stdin*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov predict squad_bert -f <file-name>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Train the model on your data\n",
    "\n",
    "\n",
    "## 5.1 Train your model from Python\n",
    "\n",
    "### Provide your data path\n",
    "\n",
    "To train the model on your data, you need to change the path to the training data in the *config_file*.\n",
    "\n",
    "Parse the *config_file* and change the path to your data from Python."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "~/.deeppavlov/downloads/squad/\n"
     ]
    }
   ],
   "source": [
    "from deeppavlov import train_model\n",
    "from deeppavlov.core.commands.utils import parse_config\n",
    "\n",
    "model_config = parse_config('squad_bert')\n",
    "\n",
    "#  dataset that the model was trained on\n",
    "print(model_config['dataset_reader']['data_path'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Provide a *data_path* to your own dataset. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# download and unzip a new example dataset\n",
    "!wget http://files.deeppavlov.ai/datasets/squad-v1.1.tar.gz\n",
    "!tar -xzvf \"squad-v1.1.tar.gz\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that if you want to provide your own dataset, it should have the same format as the SQuAD dataset downloaded in this cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# provide a path to the train file\n",
    "model_config['dataset_reader']['data_path'] = '/contents/train-v1.1.json'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SQuAD dataset info\n",
    "\n",
    "There are *two* versions of the SQuAD dataset available for training at the moment: \n",
    "\n",
    "- [SQuAD 1.1](https://arxiv.org/abs/1606.05250) contains 107,785 question-answer pairs on 536 articles. Dataset size: `33.52 MiB`.\n",
    "\n",
    "- [SQuAD 2.0](https://arxiv.org/abs/1806.03822) combines all of the questions from SQuAD 1.1 with over 50,000 un-answerable questions written adversarially by crowdworkers. Dataset size: `44.34 MiB`.\n",
    "\n",
    "### Train the model using new config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = train_model(model_config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use your model for prediction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['a library for NLP and dialog systems'], [14], [200928.390625]]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model(['DeepPavlov is a library for NLP and dialog systems.'], ['What is DeepPavlov?'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.2 Train your model from CLI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov train squad_bert"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6. Evaluate\n",
    "\n",
    "## 6.1 Evaluate from Python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import evaluate_model\n",
    "\n",
    "model = evaluate_model('squad_bert', download=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.2 Evaluate from CLI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov evaluate squad_bert -d"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: docs/features/models/bert.rst
================================================
BERT in DeepPavlov
==================
BERT (Bidirectional Encoder Representations from Transformers) is a Transformer pre-trained on masked language model
and next sentence prediction tasks. This approach showed state-of-the-art results on a wide range of NLP tasks in
English.

| BERT paper: https://arxiv.org/abs/1810.04805
| Google Research BERT repository: https://github.com/google-research/bert

There are several pre-trained BERT models released by Google Research, more details about these pre-trained models could be found here: https://github.com/google-research/bert#pre-trained-models

-  BERT-base, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] <https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip>`__,
   `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/cased_L-12_H-768_A-12.zip>`__
-  BERT-base, English, uncased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] <https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip>`__,
   `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/uncased_L-12_H-768_A-12.zip>`__
-  BERT-large, English, cased, 24-layer, 1024-hidden, 16-heads, 340M parameters: download from `[google] <https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip>`__
-  BERT-base, multilingual, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: download from `[google] <https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip>`__,
   `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12.zip>`__, `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12_pt.tar.gz>`__
-  BERT-base, Chinese, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] <https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip>`__,
   `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/chinese_L-12_H-768_A-12.zip>`__, `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/chinese_L-12_H-768_A-12_pt.tar.gz>`__

We have trained BERT-base model for other languages and domains:

-  RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_v2.tar.gz>`__,
   `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__
-  SlavicBERT, Slavic (bg, cs, pl, ru), cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/bg_cs_pl_ru_cased_L-12_H-768_A-12_v1.tar.gz>`__,
   `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__
-  Conversational BERT, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12_v1.tar.gz>`__,
   `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__
-  Conversational RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12.tar.gz>`__,
   `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__
-  Conversational DistilRuBERT, Russian, cased, 6-layer, 768-hidden, 12-heads, 135.4M parameters: `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/distil_ru_conversational_cased_L-6_H-768_A-12_pt.tar.gz>`__
-  Conversational DistilRuBERT-tiny, Russian, cased, 2-layer, 768-hidden, 12-heads, 107M parameters: `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/distil_ru_conversational_cased_L-2_H-768_A-12_pt.tar.gz>`__
-  Sentence Multilingual BERT, 101 languages, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12.tar.gz>`__,
   `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__
-  Sentence RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_ru_cased_L-12_H-768_A-12.tar.gz>`__,
   `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_ru_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__

The ``deeppavlov_pytorch`` models are designed to be run with the `HuggingFace's Transformers <https://huggingface.co/transformers/>`__ library.

RuBERT was trained on the Russian part of Wikipedia and news data. We used this training data to build vocabulary of Russian subtokens and took
multilingual version of BERT-base as initialization for RuBERT [1]_.

SlavicBERT was trained on Russian News and four Wikipedias: Bulgarian, Czech, Polish, and Russian.
Subtoken vocabulary was built using this data. Multilingual BERT was used as an initialization for SlavicBERT.
The model is described in our ACL paper [2]_.

Conversational BERT was trained on the English part of Twitter, Reddit, DailyDialogues [4]_, OpenSubtitles [5]_, Debates [6]_, Blogs [7]_, Facebook News Comments.
We used this training data to build the vocabulary of English subtokens and took
English cased version of BERT-base as initialization for English Conversational BERT.

Conversational RuBERT was trained on OpenSubtitles [5]_, Dirty, Pikabu, and Social Media segment of Taiga corpus [8]_.
We assembled new vocabulary for Conversational RuBERT model on this data and initialized model with RuBERT.

Conversational DistilRuBERT (6 transformer layers) and DistilRuBERT-tiny (2 transformer layers) were trained on the same data as Conversational RuBERT and highly inspired by DistilBERT [3]_. Namely, Distil* models (students) used pretrained Conversational RuBERT as teacher and linear combination of the following losses:

1. Masked language modeling loss (between student output logits for tokens and its true labels)
2. Kullback-Leibler divergence (between student and teacher output logits)
3. Cosine embedding loss (between averaged hidden states of the teacher and hidden states of the student)
4. Mean squared error loss (between averaged attention maps of the teacher and attention maps of the student)

Sentence Multilingual BERT is a representation-based sentence encoder for 101 languages of Multilingual BERT.
It is initialized with Multilingual BERT and then fine-tuned on English MultiNLI [9]_ and on dev set of multilingual XNLI [10]_.
Sentence representations are mean pooled token embeddings in the same manner as in Sentence-BERT [12]_.

Sentence RuBERT is a representation-based sentence encoder for Russian.
It is initialized with RuBERT and fine-tuned on SNLI [11]_ google-translated to Russian and on Russian part of XNLI dev set [10]_.
Sentence representations are mean pooled token embeddings in the same manner as in Sentence-BERT [12]_.

Here, in DeepPavlov, we made it easy to use pre-trained BERT for downstream tasks like classification, tagging, question answering and
ranking. We also provide pre-trained models and examples on how to use BERT with DeepPavlov.

BERT as Embedder
----------------

:class:`~deeppavlov.models.embedders.transformers_embedder.TransformersBertEmbedder` allows for using BERT
model outputs as token, subtoken and sentence level embeddings.

Additionally, the embeddings can be easily used in DeepPavlov. To get text level, token level and subtoken level representations,
you can use or modify a :config:`BERT embedder configuration <embedder/bert_embedder.json>`:

.. code:: python
    
    from deeppavlov.core.common.file import read_json
    from deeppavlov import build_model, configs
    
    bert_config = read_json(configs.embedder.bert_embedder)
    bert_config['metadata']['variables']['BERT_PATH'] = 'path/to/bert/directory'

    m = build_model(bert_config)

    texts = ['Hi, i want my embedding.', 'And mine too, please!']
    tokens, token_embs, subtokens, subtoken_embs, sent_max_embs, sent_mean_embs, bert_pooler_outputs = m(texts)


BERT for Classification
-----------------------

:class:`~deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel`
provides solution for classification problem using pre-trained BERT on PyTorch.
One can use several pre-trained English, multi-lingual and Russian BERT models that are
listed above. :class:`~deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel`
also supports any Transformer-based model of `Transformers <https://github.com/huggingface/transformers>`__.

Two main components of BERT classifier pipeline in DeepPavlov are
:class:`~deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersPreprocessor` and
:class:`~deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel`.
Non-processed texts should be given to ``torch_transformers_preprocessor`` for tokenization on subtokens,
encoding subtokens with their indices and creating tokens and segment masks.

``torch_transformers_classifier`` has a dense layer of number of classes size upon pooled outputs of Transformer encoder,
it is followed by ``softmax`` activation (``sigmoid`` if ``multilabel`` parameter is set to ``true`` in config).


BERT for Named Entity Recognition (Sequence Tagging)
----------------------------------------------------

Pre-trained BERT model can be used for sequence tagging. Examples of BERT application to sequence tagging
can be found :doc:`here </features/models/NER>`. The module used for tagging
is :class:`~deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger`.
The tags are obtained by applying a dense layer to the representation of
the first subtoken of each word. There is also an optional CRF layer on the top.
You can choose among different Transformers architectures by modifying the TRANSFORMER variable in the corresponding configuration files.
The possible choices are DistilBert, Albert, Camembert, XLMRoberta, Bart, Roberta, Bert, XLNet, Flaubert, XLM.

..
    TODO: fix Zero-Shot NER reference

Multilingual BERT model allows performing zero-shot transfer across languages. To use our 19 tags NER for over a
hundred languages see ner_multi_bert.


BERT for Context Question Answering (SQuAD)
-------------------------------------------
Context Question Answering on `SQuAD <https://rajpurkar.github.io/SQuAD-explorer/>`__ dataset is a task
of looking for an answer on a question in a given context. This task could be formalized as predicting answer start
and end position in a given context. :class:`~deeppavlov.models.torch_bert.torch_transformers_squad.TorchTransformersSquad` on PyTorch uses two linear
transformations to predict probability that current subtoken is start/end position of an answer. For details check
:doc:`Context Question Answering documentation page </features/models/SQuAD>`.

Using custom BERT in DeepPavlov
-------------------------------

The previous sections describe the BERT based models implemented in DeepPavlov.
To change the BERT model used for initialization in any downstream task mentioned above the following parameters of
the :doc:`config </intro/configuration>` file must be changed to match new BERT path:

* download URL in the ``metadata.download.url`` part of the config
* ``bert_config_file``, ``pretrained_bert`` in the BERT based Component. In case of PyTorch BERT, ``pretrained_bert`` can be assigned to
  string name of any Transformer-based model (e.g. ``"bert-base-uncased"``, ``"distilbert-base-uncased"``) and then ``bert_config_file`` is set to ``None``.
* ``vocab_file`` in the ``torch_transformers_preprocessor``. ``vocab_file`` can be assigned to
  string name of used pre-trained BERT (e.g. ``"bert-base-uncased"``).

.. [1] Kuratov, Y., Arkhipov, M. (2019). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language. arXiv preprint arXiv:1905.07213.
.. [2] Arkhipov M., Trofimova M., Kuratov Y., Sorokin A. (2019). `Tuning Multilingual Transformers for Language-Specific Named Entity Recognition <https://www.aclweb.org/anthology/W19-3712/>`__ . ACL anthology W19-3712.
.. [3] Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108.
.. [4] Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. IJCNLP 2017.
.. [5] P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th International Conference on Language Resources and Evaluation (LREC 2016)
.. [6] Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Proceedings of NAACL, 2016.
.. [7] J. Schler, M. Koppel, S. Argamon and J. Pennebaker (2006). Effects of Age and Gender on Blogging in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs.
.. [8] Shavrina T., Shapovalova O. (2017) TO THE METHODOLOGY OF CORPUS CONSTRUCTION FOR MACHINE LEARNING: «TAIGA» SYNTAX TREE CORPUS AND PARSER. in proc. of “CORPORA2017”, international conference , Saint-Petersbourg, 2017.
.. [9] Williams A., Nangia N. & Bowman S. (2017) A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference. arXiv preprint arXiv:1704.05426
.. [10] Williams A., Bowman S. (2018) XNLI: Evaluating Cross-lingual Sentence Representations. arXiv preprint arXiv:1809.05053
.. [11] S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. (2015) A large annotated corpus for learning natural language inference. arXiv preprint arXiv:1508.05326
.. [12] N. Reimers, I. Gurevych (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint arXiv:1908.10084


================================================
FILE: docs/features/models/classification.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Classification\n",
    "\n",
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/classification.ipynb)\n",
    "\n",
    "[![Medium](https://img.shields.io/badge/Medium-12100E?style=for-the-badge&logo=medium&logoColor=white)](https://medium.com/deeppavlov/text-classification-using-deeppavlov-library-with-pytorch-and-transformers-f14db5528821)\n",
    "\n",
    "# Table of contents\n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1. [Predict using Python](#4.1-Predict-using-Python)\n",
    "\n",
    "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
    "\n",
    "5. [Evaluation](#5.-Evaluation)\n",
    "\n",
    "    5.1. [from Python](#5.1-Evaluate-from-Python)\n",
    "\n",
    "    5.2. [from CLI](#5.2-Evaluate-from-CLI)\n",
    "\n",
    "6. [Train the model on your data](#6.-Train-the-model-on-your-data)\n",
    "\n",
    "    6.1. [from Python](#6.1-Train-your-model-from-Python)\n",
    "\n",
    "    6.2. [from CLI](#6.2-Train-your-model-from-CLI)\n",
    "\n",
    "7. [Simple few-shot classifiers](#7.-Simple-few-shot-classifiers)\n",
    "\n",
    "    7.1. [Few-shot setting](#7.1-Few-shot-setting)\n",
    "\n",
    "    7.2. [Multiple languages support](#7.2-Multiple-languages-support)\n",
    "\n",
    "    7.3. [Dataset and Scores](#7.3-Dataset-and-Scores)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "This section describes a family of BERT-based models that solve a variety of different classification tasks.\n",
    "\n",
    "**Insults detection** is a binary classification task of identifying whether a given sequence is an insult of another participant of communication.\n",
    "\n",
    "**Sentiment analysis** is a task of classifying the polarity of the given sequence. The number of classes may vary depending on the data: positive/negative binary classification, multiclass classification with a neutral class added or with a number of different emotions.\n",
    "\n",
    "The models trained for the **paraphrase detection** task identify whether two sentences expressed with different words convey the same meaning.\n",
    "\n",
    "**Topic classification** refers to the task of classifying an utterance by the topic which belongs to the conversational domain.\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q deeppavlov"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then make sure that all the required packages for the model are installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov install insults_kaggle_bert"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`insults_kaggle_bert` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\n",
    "\n",
    "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n",
    "The full list of classification models with their config names can be found in the [table](#3.-Models-list).\n",
    "\n",
    "# 3. Models list\n",
    "\n",
    "The table presents a list of all of the classification models available in DeepPavlov Library.\n",
    "\n",
    "| Config name  | Language | Task | Dataset | Model Size | Metric | Score |\n",
    "| :--- | --- | --- | --- | --- | --- | ---: |\n",
    "| insults_kaggle_bert | En | Insults | [Insults](https://www.kaggle.com/c/detecting-insults-in-social-commentary) | 1.1 GB | ROC-AUC | 0.8770 |\n",
    "| paraphraser_rubert | Ru | Paraphrase | [Paraphrase Corpus](http://paraphraser.ru/download/) | 2.0 GB | F1 | 0.8738 |\n",
    "| paraphraser_convers_distilrubert_2L | Ru | Paraphrase | [Paraphrase Corpus](http://paraphraser.ru/download/) | 1.2 GB | F1 | 0.7396 |\n",
    "| paraphraser_convers_distilrubert_6L | Ru | Paraphrase | [Paraphrase Corpus](http://paraphraser.ru/download/) | 1.6 GB | F1 | 0.8354 |\n",
    "| sentiment_sst_conv_bert | En | Sentiment | [SST](https://paperswithcode.com/dataset/sst) | 1.1 GB | Accuracy | 0.6626 |\n",
    "| sentiment_twitter | Ru | Sentiment | [Twitter Mokoron](https://github.com/mokoron/sentirueval) | 6.2 GB | F1-macro | 0.9961 |\n",
    "| rusentiment_bert | Ru | Sentiment | [RuSentiment](https://text-machine.cs.uml.edu/projects/rusentiment/) | 1.3 GB | F1-weighted | 0.7005 |\n",
    "| rusentiment_convers_bert | Ru | Sentiment | [RuSentiment](https://text-machine.cs.uml.edu/projects/rusentiment/) | 1.5 GB | F1-weighted | 0.7724  |\n",
    "| topics_distilbert_base_uncased | En | Topics | [DeepPavlov Topics](https://deeppavlov.ai/datasets/topics) | 6.2 GB | F1-macro | 0.9961 |\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "## 4.1 Predict using Python\n",
    "\n",
    "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model\n",
    "\n",
    "model = build_model('insults_kaggle_bert', download=True, install=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Input format**: List[sentences]\n",
    "\n",
    "**Output format**: List[labels]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Insult', 'Not Insult']"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model(['You are kind of stupid', 'You are a wonderful person!'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2 Predict using CLI\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI (Command Line Interface)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov interact insults_kaggle_bert -d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n",
    "\n",
    "Or make predictions for samples from *stdin*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov predict insults_kaggle_bert -f <file-name>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Evaluation\n",
    "\n",
    "## 5.1 Evaluate from Python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import evaluate_model\n",
    "\n",
    "model = evaluate_model('insults_kaggle_bert', download=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.2 Evaluate from CLI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov evaluate insults_kaggle_bert -d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6. Train the model on your data\n",
    "\n",
    "## 6.1 Train your model from Python\n",
    "\n",
    "### Provide your data path\n",
    "\n",
    "To train the model on your data, you need to change the path to the training data in the *config_file*.\n",
    "\n",
    "Parse the *config_file* and change the path to your data from Python."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "~/.deeppavlov/downloads/insults_data\n"
     ]
    }
   ],
   "source": [
    "from deeppavlov import train_model\n",
    "from deeppavlov.core.commands.utils import parse_config\n",
    "\n",
    "model_config = parse_config('insults_kaggle_bert')\n",
    "\n",
    "# dataset that the model was trained on\n",
    "print(model_config['dataset_reader']['data_path'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Provide a *data_path* to your own dataset. You can also change any of the hyperparameters of the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# download and unzip a new example dataset\n",
    "!wget http://files.deeppavlov.ai/datasets/insults_data.tar.gz\n",
    "!tar -xzvf \"insults_data.tar.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# provide a path to the directory with your train, valid and test files\n",
    "model_config['dataset_reader']['data_path'] = \"./contents/\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "### Train dataset format\n",
    "\n",
    "### Train the model using new config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = train_model(model_config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use your model for prediction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Insult', 'Not Insult']"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model(['You are kind of stupid', 'You are a wonderful person!'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.2 Train your model from CLI\n",
    "\n",
    "To train the model on your data, create a copy of a config file and change the *data_path* variable in it. After that, train the model using your new *config_file*. You can also change any of the hyperparameters of the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov train model_config.json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 7. Simple few-shot classifiers\n",
    "\n",
    "Additionally, in the [faq](https://github.com/deeppavlov/DeepPavlov/tree/master/deeppavlov/configs/faq) section you can find a config for a fast and simple pre-BERT model, which consists of a fasttext vectorizer and a simple logistic regression classifier.\n",
    "\n",
    "## 7.1 Few-shot setting\n",
    "\n",
    "In the current setting the config can be used for few-shot classification - a task, in which only a few training examples are available for each class (usually from 5 to 10). Note that the config takes the full version of the dataset as the input and samples N examples for each class of the train data in the iterator.\n",
    "\n",
    "The sampling is done within the `basic_classification_iterator` component of the pipeline and the `shot` parameter defines the number of examples to be sampled. By default the `shot` parameter is set to `None` (no sampling applied).\n",
    "\n",
    "## 7.2 Multiple languages support\n",
    "\n",
    "By default `fasttext_logreg` supports classification in English, but can be modified for classification in Russian.\n",
    "\n",
    "In order to change `fasttext_logreg` language to Russian, change `LANGUAGE` variable in the `metadata.variables` section from `en` to `ru` and change the Spacy model by changing `SPACY_MODEL` variable from `en_core_web_sm` to `ru_core_news_sm`.\n",
    "\n",
    "You can do that by directly editing the config file through an editor or change it through Python (example below). N.B. `read_json` and `find_config` combination is intentionally used instead of `parse_config` to read config in the example, because `parse_config` will replace all `LANGUAGE` and `SPACY_MODEL` usages in the config with the default values from `metadata.variables`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model\n",
    "from deeppavlov.core.common.file import read_json, find_config\n",
    "\n",
    "model_config = read_json(find_config('fasttext_logreg'))\n",
    "model_config['metadata']['variables']['LANGUAGE'] = 'ru'\n",
    "model_config['metadata']['variables']['SPACY_MODEL'] = 'ru_core_news_sm'\n",
    "model = build_model(model_config, install=True, download=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7.3 Dataset and Scores\n",
    "\n",
    "To demonstrate the performance of the model in two languages, we use the English and Russian subsets of [the MASSIVE dataset](https://github.com/alexa/massive).\n",
    "\n",
    "MASSIVE is a parallel dataset of utterances in 52 languages with annotations for the Natural Language Understanding tasks of intent prediction and slot annotation. We only employ the intent classification data. You can see the results of the given configs in the 5-shot classification setting in the table below.\n",
    "\n",
    "| Config name | Language | Train accuracy | Validation accuracy | Test accuracy |\n",
    "| :--- | --- | --- | --- | ---: |\n",
    "| fasttext_logreg | en | 0.9632 | 0.5239 | 0.5155 |\n",
    "| fasttext_logreg | ru | 0.9231 | 0.4565 | 0.4304 |"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: docs/features/models/entity_extraction.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Entity Extraction\n",
    "\n",
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/entity_extraction.ipynb)\n",
    "\n",
    "# Table of contents \n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1 [Predict using Python](#4.1-Predict-using-Python)\n",
    "    \n",
    "    4.2 [Predict using CLI](#4.2-Predict-using-CLI)\n",
    "\n",
    "5. [Customize the model](#5.-Customize-the-model)\n",
    "    \n",
    "    5.1 [Description of config parameters](#5.1-Description-of-config-parameters)\n",
    "    \n",
    "    5.2 [Training entity detection model](#5.2-Training-entity-detection-model)\n",
    "    \n",
    "    5.3 [Using custom knowledge base](#5.3-Using-custom-knowledge-base)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "\n",
    "**Entity Detection** is the task of identifying entity mentions in text with corresponding entity types. Entity Detection models in DeepPavlov split the input text into fragments shorter than 512 tokens and find entities with BERT-based models.\n",
    "\n",
    "**Entity Linking** is the task of finding knowledge base entity ids for entity mentions in text. Entity Linking in DeepPavlov supports Wikidata and Wikipedia. Entity Linking component performs the following steps:\n",
    "\n",
    "* extraction of candidate entities from SQLite database;\n",
    "* candidate entities sorting by entity tags (if entity tags are provided);\n",
    "* ranking of candidate entities by connections in Wikidata knowledge graph of candidate entities for different mentions;\n",
    "* candidate entities ranking by context and descriptions using Transformer model [bert-small](https://huggingface.co/prajjwal1/bert-small) in English config and [distilrubert-tiny](https://huggingface.co/DeepPavlov/distilrubert-tiny-cased-conversational-v1) in Russian config.\n",
    "\n",
    "**Entity Extraction** configs perform subsequent Entity Detection and Entity Linking of extracted entity mentions.\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q deeppavlov"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then make sure that all the required packages for the model are installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov install entity_extraction_en"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`entity_extraction_en` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\n",
    "\n",
    "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\n",
    "The full list of models for entity detection, linking and extraction with their config names can be found in the [table](#3.-Models-list).\n",
    "\n",
    "# 3. Models list\n",
    "\n",
    "The table presents a list of all of the models for entity detection, linking and extraction available in the DeepPavlov Library.\n",
    "\n",
    "| Config name | Language | RAM | GPU |\n",
    "| :--- | --- | --- | --- |\n",
    "| entity_detection_en | En | 2.5 Gb | 3.7 Gb |\n",
    "| entity_detection_ru | Ru | 2.5 Gb | 5.3 Gb |\n",
    "| entity_linking_en | En | 2.4 Gb | 1.2 Gb |\n",
    "| entity_linking_ru | Ru | 2.2 Gb | 1.1 Gb |\n",
    "| entity_extraction_en | En | 2.5 Gb | 3.7 Gb |\n",
    "| entity_extraction_ru | Ru | 2.5 Gb | 5.3 Gb |\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "## 4.1 Predict using Python\n",
    "\n",
    "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict.\n",
    "\n",
    "### Entity Detection\n",
    "\n",
    "**For English:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "from deeppavlov import build_model\n",
    "\n",
    "ed_en = build_model('entity_detection_en', download=True, install=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**The output elements:**\n",
    "\n",
    "* entity substrings\n",
    "* entity offsets (indices of start and end symbols of entities in text)\n",
    "* entity positions (indices of entity tokens in text)\n",
    "* entity tags\n",
    "* sentences offsets\n",
    "* list of sentences in text\n",
    "* confidences of detected entities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ed_en(['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**For Russian:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ed_ru = build_model('entity_detection_ru', download=True, install=True)\n",
    "ed_ru(['Москва — столица России, центр Центрального федерального округа и центр Московской области.'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Entity Linking\n",
    "\n",
    "**For English:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "el_en = build_model('entity_linking_en', download=True, install=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**The input elements:**\n",
    "\n",
    "* entity substrings\n",
    "* entity tags (optional argument)\n",
    "* confidences of entity substrings (optional argument)\n",
    "* sentences (context) of the entities (optional argument)\n",
    "* entity offsets (optional argument)\n",
    "* sentences offsets (optional argument)\n",
    "\n",
    "**The output elements:**\n",
    "\n",
    "* entity ids\n",
    "* entity confidences (for each entity - the list with three confidences: substring matching confidence, popularity ranking confidence and context ranking confidence)\n",
    "* entity pages in Wikipedia\n",
    "* entity labels in Wikidata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "el_en([['forrest gump', 'robert zemeckis', 'eric roth']],\n",
    "      [['WORK_OF_ART', 'PERSON', 'PERSON']],\n",
    "      [[1.0, 1.0, 1.0]],\n",
    "      [['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.']],\n",
    "      [[(0, 12), (48, 63), (79, 88)]],\n",
    "      [[(0, 89)]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**For Russian:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "el_ru = build_model('entity_linking_ru', download=True, install=True)\n",
    "\n",
    "el_ru([['москва', 'россии', 'центрального федерального округа', 'московской области']],\n",
    "      [['CITY', 'COUNTRY', 'LOC', 'LOC']],\n",
    "      [[1.0, 1.0, 1.0, 1.0]],\n",
    "      [['Москва — столица России, центр Центрального федерального округа и центр Московской области.']],\n",
    "      [[(0, 6), (17, 23), (31, 63), (72, 90)]],\n",
    "      [[(0, 91)]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Entity Extraction\n",
    "\n",
    "**For English:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ex_en = build_model('entity_extraction_en', download=True, install=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**The output elements:**\n",
    "\n",
    "* entity substrings\n",
    "* entity tags\n",
    "* entity offsets\n",
    "* entity ids in the knowledge base\n",
    "* entity linking confidences\n",
    "* entity pages\n",
    "* entity labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ex_en(['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**For Russian:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ex_ru = build_model('entity_extraction_ru', download=True, install=True)\n",
    "\n",
    "ex_ru(['Москва — столица России, центр Центрального федерального округа и центр Московской области.'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2 Predict using CLI\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI (Command Line Interface)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov interact entity_extraction_en -d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Customize the model\n",
    "\n",
    "## 5.1 Description of config parameters\n",
    "\n",
    "Parameters of ``ner_chunker`` component:\n",
    "\n",
    "- ``batch_size: int`` - each text from the input text batch is split into chunks with the length lower than the threshold (because Transformer-based models for entity detection work with limited lengths of the input sequences), then all chunks are concatenated into one list and the list is split into batches of the size ``batch_size``;\n",
    "- ``max_seq_len: int`` - maximum length of chunk (in wordpiece tokens);\n",
    "- ``vocab_file: str`` - vocab file of Transformer tokenizer, which is used to tokenize the text for further splitting into chunks.\n",
    "\n",
    "Parameters of ``entity_detection_parser`` component:\n",
    "    \n",
    "- ``thres_proba: float`` - the NER models return tag confidences for each token; if the probability of \"O\" tag (which is used for tokens not related to entities) for the token is lower than the ``thres_proba``, the tag with the maximum probability from entity tags list is chosen;\n",
    "- ``o_tag: str`` - tag for non-entity tokens (by default is \"O\" tag);\n",
    "- ``tags_file: str`` - the filename with the list of tags used in the NER model.\n",
    "\n",
    "Parameters of ``ner_chunk_model`` component:\n",
    "\n",
    "- ``ner: deeppavlov.core.common.chainer:Chainer`` - the config for entity recognition, which defines entity tags (or \"O\" tag) and tag probabilities for each token in the input text;\n",
    "- ``ner_parser: deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser`` - the component which processes the tags and tag probabilities returned by the entity recognition model and defines entity substrings;\n",
    "- ``ner2: deeppavlov.core.common.chainer:Chainer`` - (optional) an additional entity recognition config, which can improve the quality of entity recognition in the case of joint usage with ``ner`` config;\n",
    "- ``ner_parser2: deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser`` - (optional) an additional config for processing entity recognition output.\n",
    "\n",
    "Parameters of ``entity_linker`` component:\n",
    "\n",
    "- ``load_path: str`` - the path to the folder with the inverted index;\n",
    "- ``entity_ranker`` - the component for ranking of candidate entities by descriptions;\n",
    "- ``entities_database_filename: str`` - file with the inverted index (the mapping between entity titles and entity IDs);\n",
    "- ``words_dict_filename: str`` - file with mapping of entity titles to the tags of entity detection model;\n",
    "- ``ngrams_matrix_filename: str`` - matrix of char ngrams of words from entity titles from the knowledge base;\n",
    "- ``num_entities_for_bert_ranking: int`` - number of candidate entities which are re-ranked by context and description using Transformer-based model;\n",
    "- ``num_entities_for_conn_ranking: int`` - number of candidate entities which are re-ranked by connections in the knowledge graph between entities for different mentions in the text;\n",
    "- ``num_entities_to_return: int`` - the number of entity IDs, returned for each entity mention in text; \n",
    "- ``max_paragraph_len: int`` - maximum length of context used for ranking of entities by description;\n",
    "- ``lang: str`` - language of the entity linking model (Russian or English);\n",
    "- ``use_descriptions: bool`` - whether to perform ranking of candidate entities by similarity of their descriptions to the context;\n",
    "- ``alias_coef: float`` - the coefficient which is multiplied by the substring matching score of the entity if the entity mention in the text matches with the entity title;\n",
    "- ``use_tags: bool`` - whether to search only those entity IDs in the inverted index, which have the same tag as the entity mention;\n",
    "- ``lemmatize: bool`` - whether to lemmatize entity mentions before searching candidate entity IDs in the inverted index;\n",
    "- ``full_paragraph: bool`` - whether to use full context for ranking of entities by descriptions or cut the paragraph to one sentence with entity mention;\n",
    "- ``use_connections: bool`` - whether to use connections between candidate entities for different mentions for ranking;\n",
    "- ``kb_filename: str`` - file with the knowledge base in .hdt format;\n",
    "- ``prefixes: Dict[str, Any]`` - prefixes in the knowledge base for entities and relations.\n",
    "\n",
    "## 5.2 Training entity detection model\n",
    "\n",
    "The configs `entity_detection_en` and `entity_extraction_en` use `ner_ontonotes_bert` model for detection of entity mentions, the configs `entity_detection_ru` and `entity_extraction_ru` use `ner_rus_bert_probas` model. [How to train a NER model](http://docs.deeppavlov.ai/en/master/features/models/NER.html#6.-Customize-the-model).\n",
    "\n",
    "## 5.3 Using custom knowledge base\n",
    "\n",
    "The database filename is defined with the **entities_database_filename** in entity linking configs. The file is in SQLite format with FTS5 extensions for full-text search of entities by entity mention. The database file should contain the **inverted_index** table with the following columns:\n",
    "\n",
    "* ``title`` - entity title (name or alias) in the knowledge base;\n",
    "* ``entity_id`` - entity ID in the knowledge base;\n",
    "* ``num_rels`` - number of relations of the entity with other entities in the knowledge graph;\n",
    "* ``ent_tag`` - entity tag of the entity detection model (for example, CITY, PERSON, WORK_OF_ART, etc.);\n",
    "* ``page`` - page title of the entity (for Wikidata entities - the Wikipedia page);\n",
    "* ``label`` - entity label in the knowledge base;\n",
    "* ``descr`` - entity description in the knowledge base.\n",
    "\n",
    "Tags of entities in the knowledge base should correspond with the tags of the custom NER model or default `ner_ontonotes_bert` or `ner_rus_bert_probas` models. The list of `ner_ontonotes_bert` tags is listed in tags.dict file in ~/.deeppavlov/models/ner_ontonotes_bert_torch_crf directory, the list of `ner_rus_bert_probas` tags - in tags.dict file in ~/.deeppavlov/models/wiki_ner_rus_bert directory."
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: docs/features/models/few_shot_classification.ipynb
================================================
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Few-shot Text Classification\n",
    "\n",
    "# Table of contents \n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1 [Dataset format](#4.1-Dataset-format)\n",
    "\n",
    "    4.2. [Predict using Python](#4.2-Predict-using-Python)\n",
    "    \n",
    "    4.3. [Predict using CLI](#4.3-Predict-using-CLI)\n",
    "\n",
    "5. [Customize the model](#5.-Customize-the-model)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "\n",
    "__Text classification__ is the task of assigning one of the pre-defined labels to a given utterance, where the label is one of N classes or \"OOS\" (out-of-scope examples - utterances that do not belong to any of the predefined classes). We consider the few-shot setting, where only a few examples (5 or 10) per intent class are given as a training set.\n",
    "\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q deeppavlov"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then make sure that all the required packages are installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov install few_shot_roberta"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`few_shot_roberta` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
    "\n",
    "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n",
    "Some of few-shot classification models with their config names can be found in the [table](#3.-Models-list).\n",
    "\n",
    "# 3. Models list\n",
    "\n",
    "At the moment, only `few_shot_roberta` config supports out-of-scope detection.\n",
    "\n",
    "| Config name  | Dataset | Shot | Model Size | In-domain accuracy | Out-of-scope recall | Out-of-scope precision |\n",
    "| :--- | --- | --- | --- | --- |  --- | ---: |\n",
    "| few_shot_roberta| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 1.4 GB | 84.1±1.9 | 93.2±0.8 | 97.8±0.3 |\n",
    "| few_shot_roberta| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 1.4 GB | 59.4±1.4 | 87.9±1.2 | 40.3±0.7 |\n",
    "| few_shot_roberta| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent)  | 5 | 1.4 GB | 51.4±2.1 | 93.7±0.7 | 82.7±1.4 |\n",
    "| fasttext_logreg*| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 37 KB |24.8±2.2 | 98.2±0.4 | 74.8±0.6 |\n",
    "| fasttext_logreg*| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 37 KB | 13.4±0.5 | 98.6±0.2 | 20.5±0.1 |\n",
    "| fasttext_logreg*| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent)  | 5 | 37 KB |10.7±0.8 | 99.0±0.3 | 36.4±0.2 |\n",
    "\n",
    "\n",
    "With zero threshold we can get a classification accuracy without OOS detection:\n",
    "\n",
    "| Config name  | Dataset | Shot | Model Size | Accuracy |\n",
    "| :--- | --- | --- | --- | ---: |\n",
    "| few_shot_roberta| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 1.4 GB | 89.6 |\n",
    "| few_shot_roberta| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 1.4 GB | 79.6 |\n",
    "| few_shot_roberta| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent)  | 5 | 1.4 GB | 55.1 |\n",
    "| fasttext_logreg*| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 37 KB | 86.3 |\n",
    "| fasttext_logreg*| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent)  | 5 | 37 KB | 73.6 |\n",
    "| fasttext_logreg*| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent)  | 5 | 37 KB | 51.6 |\n",
    "\n",
    "\\* \\- config file was modified to predict OOS examples\n",
    "\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "Base model `few_shot_roberta` was already pre-trained to recognize similar utterances, so you can use the off-the-shelf model to make predictions and evaluation. No additional training is needed.\n",
    "\n",
    "## 4.1 Dataset format\n",
    "\n",
    "DNNC model compares input text to every example in dataset to determine, which class the input example belongs to. The dataset based on which classification is performed has the following format:\n",
    "\n",
    "```\n",
    "[\n",
    "    [\"text_1\",  \"label_1\"],\n",
    "    [\"text_2\",  \"label_2\"],\n",
    "             ...\n",
    "    [\"text_n\",  \"label_n\"]\n",
    "]\n",
    "```\n",
    "\n",
    "## 4.2 Predict using Python\n",
    "\n",
    "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model\n",
    "\n",
    "model = build_model(\"few_shot_roberta\", download=True)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you set `download` flag to `True`, then existing model weights will be overwritten.\n",
    "\n",
    "Setting the `install` argument to `True` is equivalent to executing the command line `install` command. If set to `True`, it will first install all the required packages.\n",
    "\n",
    "**Input**: List[texts, dataset]\n",
    "\n",
    "**Output**: List[labels]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['translate', 'exchange_rate', 'car_rental']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "texts = [\n",
    "    \"what expression would i use to say i love you if i were an italian\",\n",
    "    \"what's the currency conversion between krones and yen\",\n",
    "    \"i'd like to reserve a high-end car\"\n",
    "]\n",
    "\n",
    "dataset = [\n",
    "    [\"please help me book a rental car for nashville\",                       \"car_rental\"],\n",
    "    [\"how can i rent a car in boston\",                                       \"car_rental\"],\n",
    "    [\"help me get a rental car for march 2 to 6th\",                          \"car_rental\"],\n",
    "    \n",
    "    [\"how many pesos can i get for one dollar\",                              \"exchange_rate\"],\n",
    "    [\"tell me the exchange rate between rubles and dollars\",                 \"exchange_rate\"],\n",
    "    [\"what is the exchange rate in pesos for 100 dollars\",                   \"exchange_rate\"],\n",
    "    \n",
    "    [\"can you tell me how to say 'i do not speak much spanish', in spanish\", \"translate\"],\n",
    "    [\"please tell me how to ask for a taxi in french\",                       \"translate\"],\n",
    "    [\"how would i say thank you if i were russian\",                          \"translate\"]\n",
    "]\n",
    "\n",
    "model(texts, dataset)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.3 Predict using CLI\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI (Command Line Interface)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov interact few_shot_roberta -d"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with all other files needed to run the model.\n",
    "\n",
    "Or make predictions for samples from *stdin*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov predict few_shot_roberta -f <file-name>"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Customize the model\n",
    "\n",
    "Out-of-scope (OOS) examples are determined via confidence with *confidence_threshold* parameter. For each input text, if the confidence of the model is lower than the *confidence_threshold*, then the input example is considered out-of-scope. The higher the threshold, the more often the model predicts \"oos\" class. By default it is set to 0, but you can change it to your preferences in configuration file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0\n"
     ]
    }
   ],
   "source": [
    "from deeppavlov import build_model\n",
    "from deeppavlov.core.commands.utils import parse_config\n",
    "\n",
    "model_config = parse_config('few_shot_roberta')\n",
    "model_config['chainer']['pipe'][-1]['confidence_threshold'] = 0.1\n",
    "model = build_model(model_config)"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: docs/features/models/morpho_tagger.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Morphotagger\n",
    "\n",
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/morpho_tagger.ipynb)\n",
    "\n",
    "# Table of contents \n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1. [Predict using Python](#4.1-Predict-using-Python)\n",
    "\n",
    "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
    "\n",
    "5. [Customize the model](#5.-Customize-the-model)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "\n",
    "Morphological tagging is the task of assigning morphological tags, such as case, number, gender, aspect etc., to text tokens.\n",
    "\n",
    "An example:\n",
    "```\n",
    "Я шёл домой по незнакомой улице.\n",
    "```\n",
    "```\n",
    "1\tЯ\tя\tPRON\t_\tCase=Nom|Number=Sing|Person=1\t_\t_\t_\t_\n",
    "2\tшёл\tидти\tVERB\t_\tAspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\t_\t_\t_\t_\n",
    "3\tдомой\tдомой\tADV\t_\tDegree=Pos\t_\t_\t_\t_\n",
    "4\tпо\tпо\tADP\t_\t_\t_\t_\t_\t_\n",
    "5\tнезнакомой\tнезнакомый\tADJ\t_\tCase=Dat|Degree=Pos|Gender=Fem|Number=Sing\t_\t_\t_\t_\n",
    "6\tулице\tулица\tNOUN\t_\tAnimacy=Inan|Case=Dat|Gender=Fem|Number=Sing\t_\t_\t_\t_\n",
    "7\t.\t.\tPUNCT\t_\t_\t_\t_\t_\t_\n",
    "```\n",
    "\n",
    "The model is based on [BERT for token classification](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForTokenClassification).\n",
    "The model is trained on [Universal Dependencies corpora](https://universaldependencies.org/) (version 2.3).\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q deeppavlov"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before using the model make sure that all required packages are installed running the command:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov install morpho_ru_syntagrus_bert"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Models list\n",
    "\n",
    "The table presents comparison of ``morpho_ru_syntagrus_bert`` config with other models on UD2.3 dataset.\n",
    "\n",
    "| Model | Accuracy |\n",
    "| :--- | :---: |\n",
    "| UDPipe | 93.5 |\n",
    "| morpho_ru_syntagrus_bert | 97.6 |\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "## 4.1 Predict using Python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model\n",
    "\n",
    "model = build_model(\"morpho_ru_syntagrus_bert\", download=True, install=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\tЯ\tя\tPRON\t_\tCase=Nom|Number=Sing|Person=1\t_\t_\t_\t_\n",
      "2\tшёл\tшёл\tVERB\t_\tAspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\t_\t_\t_\t_\n",
      "3\tдомой\tдомой\tADV\t_\tDegree=Pos\t_\t_\t_\t_\n",
      "4\tпо\tпо\tADP\t_\t_\t_\t_\t_\t_\n",
      "5\tнезнакомой\tнезнакомый\tADJ\t_\tCase=Dat|Degree=Pos|Gender=Fem|Number=Sing\t_\t_\t_\t_\n",
      "6\tулице\tулица\tNOUN\t_\tAnimacy=Inan|Case=Dat|Gender=Fem|Number=Sing\t_\t_\t_\t_\n",
      "7\t.\t.\tPUNCT\t_\t_\t_\t_\t_\t_\n",
      "\n",
      "1\tДевушка\tдевушка\tNOUN\t_\tAnimacy=Anim|Case=Nom|Gender=Fem|Number=Sing\t_\t_\t_\t_\n",
      "2\tпела\tпеть\tVERB\t_\tAspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\t_\t_\t_\t_\n",
      "3\tв\tв\tADP\t_\t_\t_\t_\t_\t_\n",
      "4\tцерковном\tцерковном\tADJ\t_\tCase=Loc|Degree=Pos|Gender=Masc|Number=Sing\t_\t_\t_\t_\n",
      "5\tхоре\tхор\tNOUN\t_\tAnimacy=Inan|Case=Loc|Gender=Masc|Number=Sing\t_\t_\t_\t_\n",
      "6\tо\tо\tADP\t_\t_\t_\t_\t_\t_\n",
      "7\tвсех\tвесь\tDET\t_\tCase=Loc|Number=Plur\t_\t_\t_\t_\n",
      "8\tуставших\tустать\tVERB\t_\tAspect=Perf|Case=Loc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act\t_\t_\t_\t_\n",
      "9\tв\tв\tADP\t_\t_\t_\t_\t_\t_\n",
      "10\tчужом\tчужом\tADJ\t_\tCase=Loc|Degree=Pos|Gender=Masc|Number=Sing\t_\t_\t_\t_\n",
      "11\tкраю\tкрай\tNOUN\t_\tAnimacy=Inan|Case=Loc|Gender=Masc|Number=Sing\t_\t_\t_\t_\n",
      "12\t.\t.\tPUNCT\t_\t_\t_\t_\t_\t_\n"
     ]
    }
   ],
   "source": [
    "sentences = [\"Я шёл домой по незнакомой улице.\", \"Девушка пела в церковном хоре о всех уставших в чужом краю.\"]\n",
    "for parse in model(sentences):\n",
    "    print(parse)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2 Predict using CLI\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI (Command Line Interface)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov interact morpho_ru_syntagrus_bert -d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n",
    "\n",
    "# 5. Customize the model\n",
    "\n",
    "To train **morphotagger** on your own data, you should prepare a dataset in **CoNLL-U format**. The description of **CoNLL-U format** can be found [here](https://universaldependencies.org/format.html#conll-u-format).\n",
    "\n",
    "Then you should place files for training, validation and testing into the ``\"data_path\"`` directory of ``morphotagger_dataset_reader``, change file names in ``morphotagger_dataset_reader`` to your filenames and launch the training:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import train_model\n",
    "\n",
    "train_model(\"<your_morphotagging_config_name>\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "or **using CLI**:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov train <your_morphotagging_config_name>"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: docs/features/models/multitask_bert.rst
================================================
Multi-task BERT in DeepPavlov
=============================

Multi-task BERT in DeepPavlov is an implementation of BERT training algorithm published in the paper
`Knowledge Transfer Between Tasks and Languages in the Multi-task
Encoder-agnostic Transformer-based Models <https://www.dialog-21.ru/media/5902/karpovdpluskonovalovv002.pdf>`_.

The idea is to share BERT body between several tasks. This is necessary if a model pipe has several
components using BERT and the amount of GPU memory is limited. Each task has its own 'head' part attached to the
output of the BERT encoder. If multi-task BERT has :math:`T` heads, one training iteration consists of

- composing :math:`T` lists of examples, one for each task,

- :math:`T` gradient steps, one gradient step for each task.

By default, on every training step the lists of examples for all but one task are empty, as in the original MT-DNN repository.

When one of BERT heads is being trained, other heads' parameters do not change. On each training step both BERT head
and body parameters are modified.

Currently multitask bert heads support classification, regression, NER and multiple choice tasks. 

At this page, multi-task BERT usage is explained on a toy configuration file of a model that is trained for the
single-sentence classification, sentence pair classification, regression, multiple choice and NER.
The config for this model is :config:`multitask_example <configs/multitask/multitask_example.json>`.

Other examples of using multitask models can be found in :config:`mt_glue <configs/multitask/mt_glue.json>`.

Train config
------------

When using ``multitask_transformer`` component, you can use the same inference file as the train file.

Data reading and iteration is performed by :class:`~deeppavlov.dataset_readers.multitask_reader.MultiTaskReader`
and :class:`~deeppavlov.dataset_iterators.multitask_iterator.MultiTaskIterator`. These classes are composed
of task readers and iterators and generate batches that contain data from heterogeneous datasets. Example below
demonstrates the usage of multitask dataset reader:

.. code:: json

  "dataset_reader": {
    "class_name": "multitask_reader",
    "task_defaults": {
      "class_name": "huggingface_dataset_reader",
      "path": "glue",
      "train": "train",
      "valid": "validation",
      "test": "test"
    },
    "tasks": {
      "cola": {"name": "cola"},
      "copa": {
        "path": "super_glue",
        "name": "copa"
      },
      "conll": {
        "class_name": "conll2003_reader",
        "use_task_defaults": false,
        "data_path": "{DOWNLOADS_PATH}/conll2003/",
        "dataset_name": "conll2003",
        "provide_pos": false
      }
    }
  }

Nested dataset readers are listed in the ``tasks`` section. By default, the parameters of nested readers are taken from
the ``task_defaults`` section. Values from the ``tasks`` section can complement these parameters, like the ``name`` parameter in
``dataset_reader.tasks.cola``, and can overwrite default parameter values, like the ``path`` parameter in
``dataset_reader.tasks.copa``. In ``dataset_reader.tasks.conll``, ``use_task_defaults`` is ``False``. This is a special
parameter that forces ``multitask_reader`` to ignore ``task_defaults`` while creating the nested reader, which means that
the dataset reader for the ``conll`` task will use only parameters from ``dataset_reader.tasks.conll``.

The same principle with default values applies to ``multitask_iterator``.

Batches generated by ``multitask_iterator`` are tuples of two elements: inputs of the model and labels. 
Both inputs and labels are lists of tuples. The inputs have the following format:
``[(first_task_inputs[0], second_task_inputs[0],...), (first_task_inputs[1], second_task_inputs[1], ...), ...]``
where ``first_task_inputs``, ``second_task_inputs``, and so on are x values of batches from task dataset iterators.
The labels in the second element have the similar format.

If task datasets have different sizes, then for smaller datasets the lists are padded with ``None`` values. For example,
if the first task dataset inputs are ``[0, 1, 2, 3, 4, 5, 6]``, the second task dataset inputs are ``[7, 8, 9]``,
and the batch size is ``2``, then multi-task input mini-batches will be ``[(0, 7), (1, 8)]``, ``[(2, 9), (3, None)]``,
``[(4, None), (5, None)]``, ``[(6, None)]``.

In this tutorial, there are 5 datasets. Considering the batch structure, ``chainer`` inputs in
:config:`multitask_example <configs/multitask/multitask_example.json>` are:

.. code:: json

  "in": ["x_cola", "x_rte", "x_stsb", "x_copa", "x_conll"],
  "in_y": ["y_cola", "y_rte", "y_stsb", "y_copa", "y_conll"]

Sometimes a task dataset iterator returns inputs or labels consisting of more than one element. For example, a
model input element could consist of two strings. If there is a necessity to split such a variable, ``InputSplitter``
component can be used. Data preparation in the multitask setting can be similar to the preparation in singletask setting
except for the names of the variables.

For streamlining the code, however, ``input_splitter`` and ``tokenizer`` can be unified into the
``multitask_pipeline_preprocessor``. This preprocessor takes either a single preprocessor class name for all tasks
(the ``preprocessor`` parameter) or a list of preprocessor names (the ``preprocessors`` parameter). After splitting input by
``possible_keys_to_extract``, every preprocessor (being initialized by the input beforehand) processes the input.
Note that if the ``strict`` parameter (default: ``False``) is set to ``True``, we always try to split data. Here is the definition of
``multitask_pipeline_preprocessor`` from the :config:`multitask_example <configs/multitask/multitask_example.json>`:

.. code:: json

  "class_name": "multitask_pipeline_preprocessor",
  "possible_keys_to_extract": [0, 1],
  "preprocessors": [
    "TorchTransformersPreprocessor",
    "TorchTransformersPreprocessor",
    "TorchTransformersPreprocessor",
    "TorchTransformersMultiplechoicePreprocessor",
    "TorchTransformersNerPreprocessor"
  ],
  "do_lower_case": true,
  "n_task": 5,
  "vocab_file": "{BACKBONE}",
  "max_seq_length": 200,
  "max_subword_length": 15,
  "token_masking_prob": 0.0,
  "return_features": true,
  "in": ["x_cola", "x_rte", "x_stsb", "x_copa", "x_conll"],
  "out": [
    "bert_features_cola",
    "bert_features_rte",
    "bert_features_stsb",
    "bert_features_copa",
    "bert_features_conll"
  ]

The ``multitask_transformer`` component has common and task-specific parameters. Task-specific parameters are provided inside
the ``tasks`` parameter. ``tasks`` is a dictionary whose keys are task names and whose values are task-specific parameters
(``type``, ``options``). Common parameters are ``backbone_model`` (the same parameter as in the tokenizer) and all parameters from torch_bert.
**The order of tasks MATTERS.**

Here is the definition of ``multitask_transformer`` from the :config:`multitask_example <configs/multitask/multitask_example.json>`:

.. code:: json

  "id": "multitask_transformer",
  "class_name": "multitask_transformer",
  "optimizer_parameters": {"lr": 2e-5},
  "gradient_accumulation_steps": "{GRADIENT_ACC_STEPS}",
  "learning_rate_drop_patience": 2,
  "learning_rate_drop_div": 2.0,
  "return_probas": true,
  "backbone_model": "{BACKBONE}",
  "save_path": "{MODEL_PATH}",
  "load_path": "{MODEL_PATH}",
  "tasks": {
    "cola": {
      "type": "classification",
      "options": 2
    },
    "rte": {
      "type": "classification",
      "options": 2
    },
    "stsb": {
      "type": "regression",
      "options": 1
    },
    "copa": {
      "type": "multiple_choice",
      "options": 2
    },
    "conll": {
      "type": "sequence_labeling",
      "options": "#vocab_conll.len"
    }
  },
  "in": [
    "bert_features_cola",
    "bert_features_rte",
    "bert_features_stsb",
    "bert_features_copa",
    "bert_features_conll"
  ],
  "in_y": ["y_cola", "y_rte", "y_stsb", "y_copa", "y_ids_conll"],
  "out": [
    "y_cola_pred_probas",
    "y_rte_pred_probas",
    "y_stsb_pred",
    "y_copa_pred_probas",
    "y_conll_pred_ids"
  ]
         
Note that ``proba2labels`` can now take several arguments.

.. code:: json

  {
    "in":["y_cola_pred_probas", "y_rte_pred_probas", "y_copa_pred_probas"],
    "out":["y_cola_pred_ids", "y_rte_pred_ids", "y_copa_pred_ids"],
    "class_name":"proba2labels",
    "max_proba":true
  }

You may need to create your own metric for early stopping. In this example, the target metric is an average of AUC ROC
for insults and sentiment tasks and F1 for NER task:

.. code:: python

    from deeppavlov.metrics.roc_auc_score import roc_auc_score

    def roc_auc__roc_auc__ner_f1(true_onehot1, pred_probas1, true_onehot2, pred_probas2, ner_true3, ner_pred3):
        roc_auc1 = roc_auc_score(true_onehot1, pred_probas1)
        roc_auc2 = roc_auc_score(true_onehot2, pred_probas2)
        ner_f1_3 = ner_f1(ner_true3, ner_pred3) / 100
        return (roc_auc1 + roc_auc2 + ner_f1_3) / 3

If the code above is saved in ``custom_metric.py``, the metric can be used in the config as
``custom_metric:roc_auc__roc_auc__ner_f1`` (``module.submodules:function_name`` reference format).

You can make an inference-only config. In this config, there is no need for a dataset reader and dataset iterator.
A ``train`` field and components preparing ``in_y`` are removed. In ``multitask_transformer`` component configuration
all training parameters (learning rate, optimizer, etc.) are omitted.

Here are the results of ``deeppavlov/configs/multitask/mt_glue.json`` compared to the analogous single-task configs,
according to the test server.

+-------------------+-------------+----------------+----------+---------------+-----------------------+---------------+------------+----------+----------+----------------+
| Task              | Score       | CoLA           | SST-2    | MRPC          | STS-B                 | QQP           | MNLI(m/mm) | QNLI     | RTE      | AX             |
+-------------------+-------------+----------------+----------+---------------+-----------------------+---------------+------------+----------+----------+----------------+
| Metric            | from server | Matthew's Corr | Accuracy | F1 / Accuracy | Pearson/Spearman Corr | F1 / Accuracy | Accuracy   | Accuracy | Accuracy | Matthew's Corr |
+===================+=============+================+==========+===============+=======================+===============+============+==========+==========+================+
| Multitask config  | 77.8        | 43.6           | 93.2     | 88.6/84.2     | 84.3/84.0             | 70.1/87.9     | 83.0/82.6  | 90.6     | 75.4     | 35.4           |
+-------------------+-------------+----------------+----------+---------------+-----------------------+---------------+------------+----------+----------+----------------+
| Singletask config | 77.6        | 53.6           | 92.7     | 87.7/83.6     | 84.4/83.1             | 70.5/88.9     | 84.4/83.2  | 90.3     | 63.4     | 36.3           |
+-------------------+-------------+----------------+----------+---------------+-----------------------+---------------+------------+----------+----------+----------------+


================================================
FILE: docs/features/models/neural_ranking.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Neural Ranking\n",
    "\n",
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/neural_ranking.ipynb)\n",
    "\n",
    "# Table of contents \n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1. [Predict using Python](#4.1-Predict-using-Python)\n",
    "    \n",
    "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
    "\n",
    "5. [Customize the model](#5.-Customize-the-model)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "\n",
    "This model solves the tasks of ranking and paraphrase identification based on semantic similarity which is trained with siamese neural networks. The trained network can retrieve the response closest semantically to a given context from some database or answer whether two sentences are paraphrases or not. It is possible to build automatic semantic FAQ systems with such neural architectures.\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q deeppavlov"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then make sure that all the required packages for the model are installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov install ranking_ubuntu_v2_torch_bert_uncased"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`ranking_ubuntu_v2_torch_bert_uncased` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\n",
    "\n",
    "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\n",
    "The full list of models for neural ranking with their config names can be found in the [table](#3.-Models-list).\n",
    "\n",
    "# 3. Models list\n",
    "\n",
    "| Config | Language | Dataset | Transformer model |\n",
    "| :--- | :---: | :--- | :--- |\n",
    "| ranking/ranking_ubuntu_v2_torch_bert_uncased.json | En | [Ubuntu v2](https://github.com/rkadlec/ubuntu-ranking-dataset-creator) | bert-base-uncased |\n",
    "| classifiers/paraphraser_rubert.json | Ru | [paraphraser.ru](https://paraphraser.ru) | DeepPavlov/rubert-base-cased |\n",
    "| classifiers/paraphraser_convers_distilrubert_2L.json | Ru | [paraphraser.ru](https://paraphraser.ru) | DeepPavlov/distilrubert-tiny-cased-conversational |\n",
    "| classifiers/paraphraser_convers_distilrubert_6L.json | Ru | [paraphraser.ru](https://paraphraser.ru) | DeepPavlov/distilrubert-base-cased-conversational |\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "## 4.1 Predict using Python\n",
    "\n",
    "### English"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import configs, build_model\n",
    "\n",
    "\n",
    "ranking = build_model(\"ranking_ubuntu_v2_torch_bert_uncased\", download=True, install=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ranking([[\"Forrest Gump is a 1994 American epic comedy-drama film directed by Robert Zemeckis.\",\n",
    "          \"Robert Zemeckis directed Forrest Gump.\",\n",
    "          \"Robert Lee Zemeckis was born on May 14, 1952, in Chicago.\"]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Input:** List[List[sentence1, sentence2, ...]], where the sentences from the second to the last will be ranked by similarity with the first sentence.\n",
    "\n",
    "**Output:** List[List[scores]] - similarity scores to the first sentence of the sentences from the second to the last.\n",
    "\n",
    "### Russian"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import configs, build_model\n",
    "\n",
    "\n",
    "ranking = build_model(\"paraphraser_rubert\", download=True, install=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ranking([\"Форрест Гамп - комедийная драма, девятый полнометражный фильм режиссёра Роберта Земекиса.\"],\n",
    "        [\"Роберт Земекис был режиссером фильма «Форрест Гамп».\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Input:** Tuple[List[sentences1], List[sentence2]], where each element of the list of sentences1 will be compared with the corresponding element of the sentence2 list.\n",
    "\n",
    "**Output:** List[labels] - each label is 1 or 0, 1 - if the sentence from the first list is a paraphrase to the corresponding sentence from the second list, 0 - otherwise.\n",
    "\n",
    "## 4.2 Predict using CLI\n",
    "\n",
    "### English\n",
    "\n",
    "It is not intended to use the class ``deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel`` in the interact mode, so it is better to launch the config ranking/ranking_ubuntu_v2_torch_bert_uncased.json [using Python](#4.1-Predict-using-Python).\n",
    "\n",
    "### Russian\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI (Command Line Interface)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov interact paraphraser_rubert -d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Customize the model\n",
    "\n",
    "## English\n",
    "\n",
    "To train the ranking model on your own data, you should make a dataset in the following format:\n",
    "\n",
    "- the dataset should have **train.csv**, **valid.csv** and **test.csv** files.\n",
    "\n",
    "- **train.csv** file should contain the following columns: Context, Utterance, Label. Context and utterance are two texts and label (0 or 1) shows the relevance of the utterance to the context.\n",
    "\n",
    "- **valid.csv** and **test.csv** files should contain the following columns: Context, Ground Truth Utterance, Distractor_0, Distractor_1, ..., Distractor_N. Distractor utterances are negative samples (utterances, irrelevant to the context).\n",
    "\n",
    "Then you should put train.csv, valid.csv and test.csv files into the directory ``\"data_path\"`` in the dataset reader from the config and launch training of the model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov train ranking_ubuntu_v2_torch_bert_uncased"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Russian\n",
    "\n",
    "To train the ranking model on your own data, you should make a dataset with two files: **paraphrases.xml** (for training) and **paraphrases_gold.xml** (for testing).\n",
    "\n",
    "The xml files should have the following format:\n",
    "\n",
    "    <?xml version='1.0' encoding='UTF8'?>\n",
    "    <data>\n",
    "      <head>\n",
    "        <title>Russian Paraphrase Corpus</title>\n",
    "        <description>This file contains a collection of sentence pairs with crowdsourced annotation. Paraphrase classes: -1: non-paraphrases, 0: loose paraphrases, 1: strict paraphrases.</description>\n",
    "        <reference>http://paraphraser.ru</reference>\n",
    "        <version>1.0 beta</version>\n",
    "        <date>2015-11-28</date>\n",
    "      </head>\n",
    "      <corpus>\n",
    "        <paraphrase>\n",
    "          <value name=\"id\">1</value>\n",
    "          <value name=\"id_1\">201</value>\n",
    "          <value name=\"id_2\">8159</value>\n",
    "          <value name=\"text_1\">text 1</value>\n",
    "          <value name=\"text_2\">text 2</value>\n",
    "          <value name=\"jaccard\">0.65</value>\n",
    "          <value name=\"class\">0</value>\n",
    "        </paraphrase>\n",
    "        <paraphrase>\n",
    "          ...\n",
    "        </paraphrase>\n",
    "      </corpus>\n",
    "    </data>\n",
    "\n",
    "Place **paraphrases.xml** and **paraphrases_gold.xml** files into the directory ``\"data_path\"`` in the dataset reader from the config and launch training of the model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov train paraphraser_rubert"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: docs/features/models/popularity_ranking.rst
================================================
=================
Popularity Ranker
=================

Popularity Ranker re-ranks results obtained via :doc:`TF-IDF Ranker <tfidf_ranking>` using information about
the number of article views. The number of Wikipedia article views is an open piece of information which can be
obtained via `Wikimedia REST API <https://wikimedia.org/api/rest_v1/>`_.
We assigned a mean number of views for the period from 2017/11/05 to 2018/11/05 to each article in our
English Wikipedia database `enwiki20180211 <http://files.deeppavlov.ai/datasets/wikipedia/enwiki.tar.gz>`_.

The inner algorithm of Popularity Ranker is a Logistic Regression classifier based on 3 features:

- tfidf score of the article
- popularity of the article
- product of the two features above

The classifier is trained on `SQuAD-v1.1`_ train set.

Quick Start
===========

Before using the model, make sure that all required packages are installed by running the command:

.. code:: bash

    python -m deeppavlov install en_ranker_pop_wiki

Building the model

.. code:: python

    from deeppavlov import build_model

    ranker = build_model('en_ranker_pop_wiki', download=True)

Inference

.. code:: python

    result = ranker(['Who is Ivan Pavlov?'])
    print(result[:5])

Output

::

    >> ['Ivan Pavlov', 'Vladimir Bekhterev', 'Classical conditioning', 'Valentin Pavlov', 'Psychology']

Text for the output titles can be further extracted with :class:`~deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab` class.


Configuration
=============

Default ranker config is
:config:`doc_retrieval/en_ranker_pop_wiki.json <doc_retrieval/en_ranker_pop_wiki.json>`

Running the Ranker
==================

.. note::

    About **17 GB of RAM** required.

Interacting
-----------

When interacting, the ranker returns document titles of the relevant
documents.

Run the following to interact with the ranker:

.. code:: bash

    python -m deeppavlov interact en_ranker_pop_wiki -d


Available Data and Pretrained Models
====================================

Available information about Wikipedia articles popularity is downloaded to ``~/.deeppavlov/downloads/odqa/popularities.json``
and pre-trained logistic regression classifier is downloaded to ``~/.deeppavlov/models/odqa/logreg_3features.joblib`` by default.


References
==========

.. target-notes::

.. _`SQuAD-v1.1`: https://arxiv.org/abs/1606.05250


================================================
FILE: docs/features/models/relation_extraction.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Relation Extraction\n",
    "\n",
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/relation_extraction.ipynb)\n",
    "\n",
    "# Table of contents \n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1 [Predict using Python](#4.1-Predict-using-Python)\n",
    "    \n",
    "    4.2 [Predict using CLI](#4.2-Predict-using-CLI)\n",
    "\n",
    "5. [Customize the model](#5.-Customize-the-model)\n",
    "    \n",
    "    5.1 [Description of config parameters](#5.1-Description-of-config-parameters)\n",
    "    \n",
    "    5.2 [Train Relation Extraction on custom data](#5.2-Train-Relation-Extraction-on-custom-data)\n",
    "\n",
    "6. [Relations list](#6.-Relations-list)\n",
    "\n",
    "    6.1 [Relations used in English model](#6.1-Relations-used-in-English-model)\n",
    "    \n",
    "    6.2 [Relations used in Russian model](#6.2-Relations-used-in-Russian-model)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "\n",
    "Relation extraction is the task of detecting and classifying the relationship between two entities in text.\n",
    "DeepPavlov provides document-level relation extraction, meaning that a relation can be detected between entities that are not in the same sentence.\n",
    "\n",
    "**RE Model Architecture**\n",
    "\n",
    "We based our model on the [Adaptive Thresholding and Localized Context Pooling](https://arxiv.org/pdf/2010.11304.pdf) model and used NER entity tags as additional input. Two core ideas of this model are:\n",
    "\n",
    "- Adaptive Threshold\n",
    "\n",
    "The usual global threshold for converting the RE classifier output probability to a relation label is replaced with a learnable one. A new threshold class that learns an entity-dependent threshold value is introduced and learnt like all other classes. During prediction, the positive classes (= relations that indeed hold in the sample) are taken to be the classes with higher logits than the TH class, while all others are negative ones.\n",
    "\n",
    "- Localised Context Pooling\n",
    "\n",
    "The embedding of each entity pair is enhanced with an additional local context embedding related to both entities. Such representation, which is attended to the relevant context in the document, is useful to decide the relation for exactly this entity pair. For incorporating the context information the attention heads are directly used.\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q deeppavlov"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before using the model, make sure that all required packages are installed by running the command:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov install re_docred"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Models list\n",
    "\n",
    "The table presents a list of all of the relation extraction models available in the DeepPavlov Library.\n",
    "\n",
    "| Config | Language | Dataset |\n",
    "| :--- | :---: | :--- |\n",
    "| relation_extraction/re_docred.json | En | [DocRED](https://www.aclweb.org/anthology/P19-1074/) |\n",
    "| relation_extraction/re_rured.json | Ru | [RuRED](http://www.dialog-21.ru/media/5093/gordeevdiplusetal-031.pdf) |\n",
    "\n",
    "## Some details on DocRED corpus English RE model was trained on\n",
    "\n",
    "The English RE model was trained on DocRED English corpus. It was constructed from Wikipedia and Wikidata and is now the largest human-annotated dataset for document-level RE from plain text.\n",
    "\n",
    "As the original DocRED test dataset contains only unlabeled data, while we need labeled data in order to perform evaluation, we decided to:\n",
    "1. merge train and dev data (= labeled data)\n",
    "2. split them into new train, dev and test dataset\n",
    "\n",
    "Currently, there are two types of possible splittings provided:\n",
    "\n",
    "- user can set the relative size of dev and test data (e.g. 1/7)\n",
    "- user can set the absolute size of dev and test data (e.g. 2000 samples)\n",
    "\n",
    "In our experiment, we set the absolute size of dev and test data == 150 initial documents. It resulted in approximately 3500 samples.\n",
    "\n",
    "Where necessary, we additionally generated negative samples to obtain the following proportions:\n",
    "- for train set: negative samples are twice as many as positive ones\n",
    "- for dev & test set: negative samples are the same amount as positive ones\n",
    "\n",
    "| Train | Dev | Test |\n",
    "| :---: | :---: | :---: |\n",
    "| 130650 | 3406 | 3545 |\n",
    "\n",
    "| Train Positive | Train Negative | Dev Positive   | Dev Negative   | Test Positive  | Test Negative  |\n",
    "| :---: | :---: | :---: | :---: | :---: | :---: |\n",
    "| 44823          | 89214          | 1239           | 1229           | 1043           | 1036           |\n",
    "\n",
    "## Some details on RuRED corpus Russian RE model was trained on\n",
    "\n",
    "In the case of RuRED, we used the train, dev and test sets from the original RuRED setting. Where necessary, we additionally generated negative samples to obtain the following proportions:\n",
    "\n",
    "- for train set: negative samples are twice as many as positive ones\n",
    "- for dev & test set: negative samples are the same amount as positive ones\n",
    "\n",
    "| Train         | Dev           | Test           |\n",
    "| :---: | :---: | :---: |\n",
    "| 12855         | 1076          |1072            |\n",
    "\n",
    "| Train Positive | Train Negative | Dev Positive | Dev Negative | Test Positive | Test Negative |\n",
    "| :---: | :---: | :---: | :---: | :---: | :---: |\n",
    "| 4285           | 8570           | 538          | 538          | 536           | 536           |\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "## 4.1 Predict using Python\n",
    "\n",
    "### English"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import configs, build_model\n",
    "\n",
    "re_model = build_model(configs.relation_extraction.re_docred, download=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['P26'], ['spouse']]\n"
     ]
    }
   ],
   "source": [
    "sentence_tokens = [[\"Barack\", \"Obama\", \"is\", \"married\", \"to\", \"Michelle\", \"Obama\", \",\", \"born\", \"Michelle\", \"Robinson\", \".\"]]\n",
    "entity_pos = [[[(0, 2)], [(5, 7), (9, 11)]]]\n",
    "entity_tags = [[\"PER\", \"PER\"]]\n",
    "pred = re_model(sentence_tokens, entity_pos, entity_tags)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Model Input**:\n",
    "\n",
    "- list of tokens of a text document\n",
    "- list of entities positions (i.e. all start and end positions of both entities' mentions)\n",
    "- list of NER tags of both entities.\n",
    "\n",
    "As NER tags, we adopted the ones used in the DocRED corpus, which are, in turn, inherited from [Tjong Kim Sang and De Meulder (2003)](https://aclanthology.org/W03-0419/).\n",
    "\n",
    "**The whole list of 6 English NER tags**\n",
    "\n",
    "| Tag | Description |\n",
    "| :--- | :--- |\n",
    "|PER | People, including fictional |\n",
    "|ORG    | Companies, universities, institutions, political or religious groups, etc.                     |\n",
    "|LOC    | Geographically defined locations, including mountains, waters, etc. <br> Politically defined locations, including countries, cities, states, streets, etc. <br> Facilities, including buildings, museums, stadiums, hospitals, factories, airports, etc.       |\n",
    "|TIME   | Absolute or relative dates or periods.                                                         |\n",
    "|NUM    | Percents, money, quantities                                                                    |\n",
    "|MISC   | Products, including vehicles, weapons, etc. <br> Events, including elections, battles, sporting events, etc. <br> Laws, cases, languages, etc.   |\n",
    "\n",
    "**Model Output**: one or several of the [97 relations](#6.1-Relations-used-in-English-model) found between the given entities; relation id in [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) (e.g. 'P26') and relation name ('spouse').\n",
    "\n",
    "### Russian"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import configs, build_model\n",
    "\n",
    "re_model = build_model(configs.relation_extraction.re_rured)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['P495'], ['страна происхождения']]\n"
     ]
    }
   ],
   "source": [
    "sentence_tokens = [[\"Илон\", \"Маск\", \"живет\", \"в\", \"Сиэттле\", \".\"]]\n",
    "entity_pos = [[[(0, 2)], [(4, 5)]]]\n",
    "entity_tags = [[\"PERSON\", \"CITY\"]]\n",
    "pred = re_model(sentence_tokens, entity_pos, entity_tags)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Model Input**:\n",
    "\n",
    "- list of tokens of a text document\n",
    "- list of entities positions (i.e. all start and end positions of both entities' mentions)\n",
    "- list of NER tags of both entities.\n",
    "\n",
    "**Model Output**: one or several of the [30 relations](#6.2-Relations-used-in-Russian-model) found between the given entities; a Russian relation name (e.g. \"участник\") or an English one, if Russian one is unavailable, and, if applicable, its id in [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) (e.g. 'P710').\n",
    "\n",
    "## 4.2 Predict using CLI\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov interact re_docred [-d]\n",
    "! python -m deeppavlov interact re_rured [-d]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`-d` is an optional download key (alternative to `download=True` in Python code). It is used to download the pre-trained model along with embeddings and all other files needed to run the model."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Customize the model\n",
    "\n",
    "## 5.1 Description of config parameters\n",
    "\n",
    "Parameters of ``re_preprocessor`` component:\n",
    "\n",
    "- ``ner_tags: List[str]`` - ner tags of the entities, which are one-hot encoded and concatenated to entity embeddings in the output of the Transformer;\n",
    "- ``special_token: str`` - the token which is added before and after the entities (subject and object in the triplet) mentions;\n",
    "- ``default_tag: str`` - default ner tags, if no tags are provided;\n",
    "- ``do_lower_case: bool`` - set True if lowercasing is needed.\n",
    "\n",
    "Parameters of ``re_classifier`` component:\n",
    "\n",
    "- ``n_classes: int`` - number of relations which the model supports;\n",
    "- ``num_ner_tags: int`` - number of ner tags;\n",
    "- ``return_probas: bool`` - whether to return confidences of predicted relations.\n",
    "\n",
    "Parameters of ``re_postprocessor`` component:\n",
    "    \n",
    "- ``rel2id_path: str`` - the file with mapping of relation IDs in the knowledge base to relation number (for example, \"P19\": 24);\n",
    "- ``rel2label_path: str`` - the file with mapping of relation IDs to relation labels.\n",
    "\n",
    "## 5.2 Train Relation Extraction on custom data\n",
    "\n",
    "There are two kinds of dataset readers for relation extraction in DeepPavlov library:\n",
    "\n",
    "- ``docred_reader``, which takes into account partition of the text into sentences and several mentions in the text for one entity;\n",
    "- ``rured_reader``, a simplified dataset reader.\n",
    "\n",
    "### Train with ``docred_reader``\n",
    "\n",
    "You should prepare **train_annotated.json**, **dev.json**, **test.json** in the following format:\n",
    "\n",
    "    {\n",
    "      \"vertexSet\": [\n",
    "        [\n",
    "          {\n",
    "            \"name\": entity1_mention1,\n",
    "            \"pos\": [mention1 start token index, mention1 end token index],\n",
    "            \"sent_id\": ID of the sentence with the entity1 mention1,\n",
    "            \"type\": ner tag\n",
    "          },\n",
    "          {\n",
    "            \"name\": entity1_mention2,\n",
    "            ...\n",
    "          },\n",
    "          ...\n",
    "        ],\n",
    "        [ ... ]\n",
    "      ],\n",
    "      \"labels\": [\n",
    "        {\n",
    "          \"r\": relation ID,\n",
    "          \"h\": index of head entity of the triplet in the vertexSet list,\n",
    "          \"t\": index of tail entity of the triplet in the vertexSet list,\n",
    "          \"evidence\": [\n",
    "            indices of the sentences with the triplet\n",
    "          ]\n",
    "        },\n",
    "        ...\n",
    "      ],\n",
    "      \"title\": doc title,\n",
    "      \"sentences\": [\n",
    "        list of tokens of sentence 1,\n",
    "        list of tokens of sentence 2,\n",
    "        ...\n",
    "      ],\n",
    "      ...\n",
    "    }\n",
    "\n",
    "For example,\n",
    "\n",
    "    {\n",
    "      \"vertexSet\": [\n",
    "        [\n",
    "          {\n",
    "            \"name\": \"Elon Musk\",\n",
    "            \"pos\": [0, 2],\n",
    "            \"sent_id\": 0,\n",
    "            \"type\": \"PER\"\n",
    "          }\n",
    "        ],\n",
    "        [\n",
    "          {\n",
    "            \"name\": \"Seattle\",\n",
    "            \"pos\": [4, 5],\n",
    "            \"sent_id\": 0,\n",
    "            \"type\": \"CITY\"\n",
    "          }\n",
    "        ]\n",
    "      ],\n",
    "      \"labels\": [\n",
    "        {\n",
    "          \"r\": \"P551\",\n",
    "          \"h\": 0,\n",
    "          \"t\": 1,\n",
    "          \"evidence\": [0]\n",
    "        }\n",
    "      ],\n",
    "      \"title\": \"title1\",\n",
    "      \"sentences\": [\n",
    "        [\"Elon\", \"Musk\", \"lives\", \"in\", \"Seattle\", \".\"]\n",
    "      ]\n",
    "    }\n",
    "\n",
    "### Train with  ``rured_reader``\n",
    "\n",
    "You should prepare **train.json**, **dev.json**, **test.json** in the following format:\n",
    "\n",
    "    {\n",
    "        \"token\": list of text tokens,\n",
    "        \"relation\": relation ID,\n",
    "        \"subj_start\": index of the token of the subject start in the list,\n",
    "        \"subj_end\": index of the token of the subject end in the list,\n",
    "        \"obj_start\": index of the token of the object start in the list,\n",
    "        \"obj_end\": index of the token of the object end in the list,\n",
    "        \"subj_type\": ner tag of the subject entity,\n",
    "        \"obj_type\": ner tag of the object entity,\n",
    "    },\n",
    "\n",
    "for example:\n",
    "\n",
    "    {\n",
    "        \"token\": [\"Илон\", \"Маск\", \"живет\", \"в\", \"Сиэттле\", \".\"],\n",
    "        \"relation\": \"P551\",\n",
    "        \"subj_start\": 0,\n",
    "        \"subj_end\": 2,\n",
    "        \"obj_start\": 4,\n",
    "        \"obj_end\": 5,\n",
    "        \"subj_type\": \"PERSON\",\n",
    "        \"obj_type\": \"CITY\"\n",
    "    }\n",
    "\n",
    "#### Train the model using Python:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import train_model\n",
    "\n",
    "train_model(\"re_docred\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**or using CLI:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov train re_docred"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6. Relations list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.1 Relations used in English model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "|Relation id     |  Relation                                           |\n",
    "| :--- | :--- |\n",
    "|P6              |  head of government                                 |\n",
    "|P17             |  country                                            |\n",
    "|P19             |  place of birth                                     |\n",
    "|P20             |  place of death                                     |\n",
    "|P22             |  father                                             |\n",
    "|P25             |  mother                                             |\n",
    "|P26             |  spouse                                             |\n",
    "|P27             |  country of citizenship                             |\n",
    "|P30             |  continent                                          |\n",
    "|P31             |  instance of                                        |\n",
    "|P35             |  head of state                                      |\n",
    "|P36             |  capital                                            |\n",
    "|P37             |  official language                                  |\n",
    "|P39             |  position held                                      |\n",
    "|P40             |  child                                              |\n",
    "|P50             |  author                                             |\n",
    "|P54             |  member of sports team                              |\n",
    "|P57             |  director                                           |\n",
    "|P58             |  screenwriter                                       |\n",
    "|P69             |  educated at                                        |\n",
    "|P86             |  composer                                           |\n",
    "|P102            |  member of political party                          |\n",
    "|P108            |  employer                                           |\n",
    "|P112            |  founded by                                         |\n",
    "|P118            |  league                                             |\n",
    "|P123            |  publisher                                          |\n",
    "|P127            |  owned by                                           |\n",
    "|P131            |  located in the administrative territorial entity   |\n",
    "|P136            |  genre                                              |\n",
    "|P137            |  operator                                           |\n",
    "|P140            |  religion                                           |\n",
    "|P150            |  contains administrative territorial entity         |\n",
    "|P155            |  follows                                            |\n",
    "|P156            |  followed by                                        |\n",
    "|P159            |  headquarters location                              |\n",
    "|P161            |  cast member                                        |\n",
    "|P162            |  producer                                           |\n",
    "|P166            |  award received                                     |\n",
    "|P170            |  creator                                            |\n",
    "|P171            |  parent taxon                                       |\n",
    "|P172            |  ethnic group                                       |\n",
    "|P175            |  performer                                          |\n",
    "|P176            |  manufacturer                                       |\n",
    "|P178            |  developer                                          |\n",
    "|P179            |  series                                             |\n",
    "|P190            |  sister city                                        |\n",
    "|P194            |  legislative body                                   |\n",
    "|P205            |  basin country                                      |\n",
    "|P206            |  located in or next to body of water                |\n",
    "|P241            |  military branch                                    |\n",
    "|P264            |  record label                                       |\n",
    "|P272            |  production company                                 |\n",
    "|P276            |  location                                           |\n",
    "|P279            |  subclass of                                        |\n",
    "|P355            |  subsidiary                                         |\n",
    "|P361            |  part of                                            |\n",
    "|P364            |  original language of work                          |\n",
    "|P400            |  platform                                           |\n",
    "|P403            |  mouth of the watercourse                           |\n",
    "|P449            |  original network                                   |\n",
    "|P463            |  member of                                          |\n",
    "|P488            |  chairperson                                        |\n",
    "|P495            |  country of origin                                  |\n",
    "|P527            |  has part                                           |\n",
    "|P551            |  residence                                          |\n",
    "|P569            |  date of birth                                      |\n",
    "|P570            |  date of death                                      |\n",
    "|P571            |  inception                                          |\n",
    "|P576            |  dissolved, abolished or demolished                 |\n",
    "|P577            |  publication date                                   |\n",
    "|P580            |  start time                                         |\n",
    "|P582            |  end time                                           |\n",
    "|P585            |  point in time                                      |\n",
    "|P607            |  conflict                                           |\n",
    "|P674            |  characters                                         |\n",
    "|P676            |  lyrics by                                          |\n",
    "|P706            |  located on terrain feature                         |\n",
    "|P710            |  participant                                        |\n",
    "|P737            |  influenced by                                      |\n",
    "|P740            |  location of formation                              |\n",
    "|P749            |  parent organization                                |\n",
    "|P800            |  notable work                                       |\n",
    "|P807            |  separated from                                     |\n",
    "|P840            |  narrative location                                 |\n",
    "|P937            |  work location                                      |\n",
    "|P1001           |  applies to jurisdiction                            |\n",
    "|P1056           |  product or material produced                       |\n",
    "|P1198           |  unemployment rate                                  |\n",
    "|P1336           |  territory claimed by                               |\n",
    "|P1344           |  participant of                                     |\n",
    "|P1365           |  replaces                                           |\n",
    "|P1366           |  replaced by                                        |\n",
    "|P1376           |  capital of                                         |\n",
    "|P1412           |  languages spoken, written or signed                |\n",
    "|P1441           |  present in work                                    |\n",
    "|P3373           |  sibling                                            |"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.2 Relations used in Russian model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "| Relation                   | Relation id       | Russian relation                |\n",
    "| :--- | :--- | :--- |\n",
    "| MEMBER                     | P710              | участник                        |\n",
    "| WORKS_AS                   | P106              | род занятий                     |\n",
    "| WORKPLACE                  | --                | --                              |\n",
    "| OWNERSHIP                  | P1830             | владеет                         |\n",
    "| SUBORDINATE_OF             | --                | --                              |\n",
    "| TAKES_PLACE_IN             | P276              | местонахождение                 |\n",
    "| EVENT_TAKES_PART_IN        | P1344             | участвовал в                    |\n",
    "| SELLS_TO                   | --                | --                              |\n",
    "| ALTERNATIVE_NAME           | --                | --                              |\n",
    "| HEADQUARTERED_IN           | P159              | расположение штаб-квартиры      |\n",
    "| PRODUCES                   | P1056             | продукция                       |\n",
    "| ABBREVIATION               | --                | --                              |\n",
    "| DATE_DEFUNCT_IN            | P576              | дата прекращения существования  |\n",
    "| SUBEVENT_OF                | P361              | часть от                        |\n",
    "| DATE_FOUNDED_IN            | P571              | дата основания/создания/возн-я  |\n",
    "| DATE_TAKES_PLACE_ON        | P585              | момент времени                  |\n",
    "| NUMBER_OF_EMPLOYEES_FIRED  | --                | --                              |\n",
    "| ORIGINS_FROM               | P495              | страна происхождения            |\n",
    "| ACQUINTANCE_OF             | --                | --                              |\n",
    "| PARENT_OF                  | P40               | дети                            |\n",
    "| ORGANIZES                  | P664              | организатор                     |\n",
    "| FOUNDED_BY                 | P112              | основатель                      |\n",
    "| PLACE_RESIDES_IN           | P551              | место жительства                |\n",
    "| BORN_IN                    | P19               | место рождения                  |\n",
    "| AGE_IS                     | --                | --                              |\n",
    "| RELATIVE                   | --                | --                              |\n",
    "| NUMBER_OF_EMPLOYEES        | P1128             | число сотрудников               |\n",
    "| SIBLING                    | P3373             | брат/сестра                     |\n",
    "| DATE_OF_BIRTH              | P569              | дата рождения                   |"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: docs/features/models/spelling_correction.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Spelling correction\n",
    "\n",
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/spelling_correction.ipynb)\n",
    "\n",
    "# Table of contents \n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1. [Predict using Python](#4.1-Predict-using-Python)\n",
    "\n",
    "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
    "\n",
    "5. [Customize the model](#5.-Customize-the-model)\n",
    "\n",
    "    5.1. [Training configuration](#5.1-Training-configuration)\n",
    "\n",
    "    5.2. [Language model](#5.2-Language-model)\n",
    "\n",
    "6. [Comparison](#6.-Comparison)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "\n",
    "Spelling correction is the detection of words with spelling errors in a text and their replacement with the correct ones.\n",
    "\n",
    "For example, the sentence\n",
    "\n",
    "```\n",
    "The platypus lives in eastern Astralia, inkluding Tasmania.\n",
    "```\n",
    "\n",
    "with spelling mistakes ('Astralia', 'inkluding') will be corrected as\n",
    "\n",
    "```\n",
    "The platypus lives in eastern Australia, including Tasmania.\n",
    "```\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q deeppavlov"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then make sure that all the required packages for the model are installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov install brillmoore_wikitypos_en"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`brillmoore_wikitypos_en` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\n",
    "\n",
    "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\n",
    "The full list of models for spelling correction with their config names can be found in the [table](#3.-Models-list).\n",
    "\n",
    "# 3. Models list\n",
    "\n",
    "The table presents a list of all of the models for spelling correction available in the DeepPavlov Library.\n",
    "\n",
    "| Config name | Language | RAM |\n",
    "| :--- | --- | --- |\n",
    "| brillmoore_wikitypos_en | En | 6.7 Gb |\n",
    "| levenshtein_corrector_ru | Ru | 8.7 Gb |\n",
    "\n",
    "We provide two types of pipelines for spelling correction:\n",
    "\n",
    "* [levenshtein_corrector](#4.1.1-Levenshtein-corrector) uses simple Damerau-Levenshtein distance to find correction candidates\n",
    "\n",
    "* [brillmoore](#4.1.2-Brillmoore) uses a statistics-based error model for it.\n",
    "\n",
    "In both cases correction candidates are chosen based on context with the help of a [kenlm language model](https://docs.deeppavlov.ai/en/master/features/models/spelling_correction.html#language-model).\n",
    "\n",
    "You can find [the comparison](#6.-Comparison) of these and other approaches near the end of this readme.\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "## 4.1 Predict using Python\n",
    "\n",
    "### 4.1.1 Levenshtein corrector\n",
    "\n",
    "[This component](https://docs.deeppavlov.ai/en/master/apiref/models/spelling_correction.html#deeppavlov.models.spelling_correction.levenshtein.LevenshteinSearcherComponent) finds all the candidates in a static dictionary within a set Damerau-Levenshtein distance. It can separate one token into two but it will not work the other way around.\n",
    "\n",
    "**Component config parameters**:\n",
    "\n",
    "-  ``in`` — list with one element: name of this component's input in\n",
    "   chainer's shared memory\n",
    "-  ``out`` — list with one element: name for this component's output in\n",
    "   chainer's shared memory\n",
    "-  ``class_name`` always equals to ``\"spelling_levenshtein\"`` or ``deeppavlov.models.spelling_correction.levenshtein.searcher_component:LevenshteinSearcherComponent``.\n",
    "-  ``words`` — list of all correct words (should be a reference)\n",
    "-  ``max_distance`` — maximum allowed Damerau-Levenshtein distance\n",
    "   between source words and candidates\n",
    "-  ``error_probability`` — assigned probability for every edit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model, configs\n",
    "\n",
    "model = build_model('levenshtein_corrector_ru', download=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['утконос живет в восточной австралии на обширном ареале от холодных плато тасмании и австралийских альп до дождевых лесов прибрежного квинсленда.']\n"
     ]
    }
   ],
   "source": [
    "model(['Утканос живет в Васточной Австралии на обширном ареале от холодных плато Тасмании и Австралийских Альп до дождевых лесов прибрежного Квинсленда.'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.1.2 Brillmoore\n",
    "\n",
    "[This component](https://docs.deeppavlov.ai/en/master/apiref/models/spelling_correction.html#deeppavlov.models.spelling_correction.brillmoore.ErrorModel) is based on [An Improved Error Model for Noisy Channel Spelling Correction](http://www.aclweb.org/anthology/P00-1037) by Eric Brill and Robert C. Moore and uses statistics based error model to find best candidates in a static dictionary.\n",
    "\n",
    "**Component config parameters:**\n",
    "\n",
    "-  ``in`` — list with one element: name of this component's input in\n",
    "   chainer's shared memory\n",
    "-  ``out`` — list with one element: name for this component's output in\n",
    "   chainer's shared memory\n",
    "-  ``class_name`` always equals to ``\"spelling_error_model\"`` or ``deeppavlov.models.spelling_correction.brillmoore.error_model:ErrorModel``.\n",
    "-  ``save_path`` — path where the model will be saved after a\n",
    "   training session\n",
    "-  ``load_path`` — path to the pretrained model\n",
    "-  ``window`` — window size for the error model from ``0`` to ``4``,\n",
    "   defaults to ``1``\n",
    "-  ``candidates_count`` — maximum allowed count of candidates for every\n",
    "   source token\n",
    "-  ``dictionary`` — description of a static dictionary model, instance\n",
    "   of (or inherited from)\n",
    "   ``deeppavlov.vocabs.static_dictionary.StaticDictionary``\n",
    "\n",
    "   -  ``class_name`` — ``\"static_dictionary\"`` for a custom dictionary or one\n",
    "      of two provided:\n",
    "\n",
    "      -  ``\"russian_words_vocab\"`` to automatically download and use a\n",
    "         list of russian words from\n",
    "         [https://github.com/danakt/russian-words/](https://github.com/danakt/russian-words/)\n",
    "      -  ``\"wikitionary_100K_vocab\"`` to automatically download a list\n",
    "         of most common words from Project Gutenberg from\n",
    "         [Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#Project_Gutenberg)\n",
    "\n",
    "   -  ``dictionary_name`` — name of a directory where a dictionary will\n",
    "      be built to and loaded from, defaults to ``\"dictionary\"`` for\n",
    "      static\\_dictionary\n",
    "   -  ``raw_dictionary_path`` — path to a file with a line-separated\n",
    "      list of dictionary words, required for static\\_dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model, configs\n",
    "\n",
    "model = build_model('brillmoore_wikitypos_en', download=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['the platypus lives in australia.']\n"
     ]
    }
   ],
   "source": [
    "model(['The platypus lives in Astralia.'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2 Predict using CLI\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI (Command Line Interface)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov interact brillmoore_wikitypos_en -d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Customize the model\n",
    "\n",
    "## 5.1 Training configuration\n",
    "\n",
    "For the training phase, the config file also needs to include these\n",
    "parameters:\n",
    "\n",
    "-  ``dataset_iterator`` — it should always be set like\n",
    "   ``\"dataset_iterator\": {\"class_name\": \"typos_iterator\"}``\n",
    "\n",
    "   -  ``class_name`` always equals ``typos_iterator``\n",
    "   -  ``test_ratio`` — ratio of test data to train, from ``0.`` to\n",
    "      ``1.``, defaults to ``0.``\n",
    "\n",
    "-  ``dataset_reader``\n",
    "\n",
    "   -  ``class_name`` — ``typos_custom_reader`` for a custom dataset or one of\n",
    "      two provided:\n",
    "\n",
    "      -  ``typos_kartaslov_reader`` to automatically download and\n",
    "         process misspellings dataset for russian language from\n",
    "         https://github.com/dkulagin/kartaslov/tree/master/dataset/orfo_and_typos\n",
    "      -  ``typos_wikipedia_reader`` to automatically download and\n",
    "         process a list of common misspellings from english\n",
    "         Wikipedia - https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines\n",
    "\n",
    "   -  ``data_path`` — required for typos\\_custom\\_reader as a path to\n",
    "      a dataset file,\n",
    "      where each line contains a misspelling and a correct spelling\n",
    "      of a word separated by a tab symbol\n",
    "\n",
    "Component's configuration for ``spelling_error_model`` also has to\n",
    "have a ``fit_on`` parameter — a list of two elements:\n",
    "names of the component's input and true output in chainer's shared\n",
    "memory.\n",
    "\n",
    "## 5.2 Language model\n",
    "\n",
    "Provided pipelines use [KenLM](http://kheafield.com/code/kenlm/) to process language models, so if you want to build your own, we suggest you consult its website. We do also provide our own language models for\n",
    "[english](http://files.deeppavlov.ai/lang_models/en_wiki_no_punkt.arpa.binary.gz) (5.5GB) and\n",
    "[russian](http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz) (3.1GB) languages.\n",
    "\n",
    "# 6. Comparison\n",
    "\n",
    "We compared our pipelines with\n",
    "[Yandex.Speller](http://api.yandex.ru/speller/),\n",
    "[JamSpell](https://github.com/bakwc/JamSpell) and\n",
    "[PyHunSpell](https://github.com/blatinier/pyhunspell)\n",
    "on the [test set](http://www.dialog-21.ru/media/3838/test_sample_testset.txt) for the [SpellRuEval\n",
    "competition](http://www.dialog-21.ru/en/evaluation/2016/spelling_correction/)\n",
    "on Automatic Spelling Correction for Russian:\n",
    "\n",
    "| Correction method | Precision | Recall | F-measure | Speed (sentences/s) |\n",
    "| :---------------- | --------- | ------ | --------- | ------------------- |\n",
    "| Yandex.Speller | 83.09 | 59.86 | 69.59 | 5. |\n",
    "| DeepPavlov levenshtein_corrector_ru | 59.38 | 53.44 | 56.25 | 39.3 |\n",
    "| Hunspell + lm | 41.03 | 48.89 | 44.61 | 2.1 |\n",
    "| JamSpell | 44.57 | 35.69 | 39.64 | 136.2 |\n",
    "| Hunspell | 30.30 | 34.02 | 32.06 | 20.3 |"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: docs/features/models/superglue.rst
================================================
Russian SuperGLUE Submission
==========================================
The DeepPavlov library provides a way to train your Russian SuperGLUE models and submit the results to the leaderboard in a couple of easy steps.

Task definition
---------------
`Russian SuperGLUE <https://russiansuperglue.com/>`__ is a benchmark that contains a set of tasks in Russian developed for evaluating general language understanding.

There are 9 tasks in the Russian SuperGLUE set:

**DaNetQA (Yes/no Question Answering Dataset for Russian)** is a binary classification task of question answering, in which the model is asked to answer a yes/no question based on a given context fragment.

**PARus (Choice of Plausible Alternatives for Russian language)** is a causal reasoning task. The model is asked to choose the most plausible alternative that has causal relation with the given premise.

**RCB (Russian Commitment Bank)** is a classification task in which the model is asked to define the type of textual entailment (Entailment, Contradiction, Neutral) between two sentences.

In the **MuSeRC (Russian Multi-Sentence Reading Comprehension)** task the model needs to process information from multiple sentences at once and identify the correct answers for the
question from the given list.

In the **RuCoS (Russian reading comprehension with Commonsense reasoning)** task the model has to choose the answer to each query from a list of text spans from a fragment.

**RUSSE (Russian Word-in-Context)** is a reading comprehension task in which the model has to identify whether a given word is used in the same
meaning in two different sentences.

In **RWSD (The Russian Winograd Schema Challenge)** the data is a set of sentences that differ by one or two words
in which syntactic ambiguity is resolved differently. The model is trained to predict whether it is resolved correctly.

**LiDiRus** is a diagnostic task in which the model has to identify whether there is entailment between two sentences.

**TERRa (Textual Entailment Recognition for Russian)** is a binary classification task of identifying whether there is entailment between two sentences.


For more detailed description of each task see `this <https://russiansuperglue.com/tasks/>`__.

Train your model
----------------
Modify the configuration file you need and train your own model for the task (see :doc:`here </intro/quick_start>` 
for more detailed instructions). The full list of models designed for each task can be found in the table below.

Create your submission files
----------------------------
To do that, use the ``submit`` command with the name of the configuration file that defines the path to your model.
Note that the name of the Russian SuperGLUE task should be defined in the ``["metadata"]["variables"]["TASK"]`` variable in the config file.

.. code:: bash

    python -m deeppavlov.utils.benchmarks.superglue <config_name> [-d] [-o <output_file_name.jsonl>]

* ``-d``: downloads model specific data before starting submission generation.
* ``-o <output_file_name.jsonl>``: set output file name. By default, output filenames for Russian SuperGLUE models
  comply with benchmark requirements.

For example, ``russian_superglue_danetqa_rubert`` solves **Yes/no Question Answering Dataset for the Russian** task.
The following command will generate ``DaNetQA.jsonl`` ready for submission:

.. code:: bash

    python -m deeppavlov.utils.benchmarks.superglue russian_superglue_danetqa_rubert -d

The prediction results will be saved in the correct format and the file will be automatically named with the name required by the system and saved to the current directory. All you have to do next 
is to zip the files you want into one archive and `submit them to leaderboard <https://russiansuperglue.com/guide/>`__.

Scores
------
The scores for DeepPavlov's pretrained models on the tasks are presented in the table.
    
+--------------------------------------------------------------------------------------------------------+----------------+-----------------+
| Model                                                                                                  |     Metric     |      Score      |
+========================================================================================================+================+=================+
|  :config:`russian_superglue_danetqa_rubert <russian_super_glue/russian_superglue_danetqa_rubert.json>` |    Accuracy    |      0.647      |
+--------------------------------------------------------------------------------------------------------+----------------+-----------------+
|  :config:`russian_superglue_parus_rubert <russian_super_glue/russian_superglue_parus_rubert.json>`     |    Accuracy    |      0.588      |
+--------------------------------------------------------------------------------------------------------+----------------+-----------------+
|  :config:`russian_superglue_russe_rubert <russian_super_glue/russian_superglue_russe_rubert.json>`     |    Accuracy    |      0.641      |
+--------------------------------------------------------------------------------------------------------+----------------+-----------------+
|  :config:`russian_superglue_lidirus_rubert <russian_super_glue/russian_superglue_lidirus_rubert.json>` | Matthew's Corr |      0.251      |
+--------------------------------------------------------------------------------------------------------+----------------+-----------------+
|  :config:`russian_superglue_rcb_rubert <russian_super_glue/russian_superglue_rcb_rubert.json>`         |     F1/Acc     |  0.336 / 0.486  |
+--------------------------------------------------------------------------------------------------------+----------------+-----------------+
|  :config:`russian_superglue_rwsd_rubert <russian_super_glue/russian_superglue_rwsd_rubert.json>`       |    Accuracy    |      0.669      |
+--------------------------------------------------------------------------------------------------------+----------------+-----------------+
|  :config:`russian_superglue_muserc_rubert <russian_super_glue/russian_superglue_muserc_rubert.json>`   |     F1a/Em     |  0.689 / 0.298  |
+--------------------------------------------------------------------------------------------------------+----------------+-----------------+
|  :config:`russian_superglue_rucos_rubert <russian_super_glue/russian_superglue_rucos_rubert.json>`     |      F1/EM     |   0.77 / 0.768  |
+--------------------------------------------------------------------------------------------------------+----------------+-----------------+
|  :config:`russian_superglue_terra_rubert <russian_super_glue/russian_superglue_terra_rubert.json>`     |    Accuracy    |      0.65       |
+--------------------------------------------------------------------------------------------------------+----------------+-----------------+


================================================
FILE: docs/features/models/syntax_parser.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Syntax Parser\n",
    "\n",
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/syntax_parser.ipynb)\n",
    "\n",
    "# Table of contents \n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1. [Predict using Python](#4.1-Predict-using-Python)\n",
    "\n",
    "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
    "\n",
    "5. [Customize the model](#5.-Customize-the-model)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "\n",
    "Syntactic parsing is the task of prediction of the syntactic tree given the tokenized (or raw) sentence.\n",
    "\n",
    "To define a tree, for each word one should know its syntactic head and the dependency label for the edge between them.\n",
    "For example, the tree above can be restored from the data\n",
    "\n",
    "```\n",
    "    1\tJohn    2\tnsubj\t\n",
    "    2\tbought  0\troot\t\n",
    "    3\ta       6\tdet\t\n",
    "    4\tvery    5\tadvmod\t\n",
    "    5\ttasty   6\tamod\t\n",
    "    6\tcake    2\tobj\n",
    "    7\t.       2\tpunct\n",
    "```\n",
    "Here the third column contains the positions of syntactic heads and the last one -- the dependency labels.\n",
    "The words are enumerated from 1 since 0 is the index of the artificial root of the tree, whose only\n",
    "dependent is the actual syntactic head of the sentence (usually a verb).\n",
    "\n",
    "Syntactic trees can be used in many information extraction tasks. For example, to detect who is the winner\n",
    "and who is the loser in the sentence *Manchester defeated Liverpool* one relies on the word order. However,\n",
    "many languages, such as Russian, Spanish and German, have relatively free word order, which means we need\n",
    "other cues. Note also that syntactic relations (`nsubj`, `obj` and so on) have clear semantic counterparts,\n",
    "which makes syntactic parsing an appealing preprocessing step for the semantic-oriented tasks.\n",
    "\n",
    "We use BERT as the lowest layer of our model (the embedder). To extract syntactic information we apply\n",
    "the biaffine network of [Dozat, Manning, 2017](https://arxiv.org/pdf/1611.01734.pdf).\n",
    "For each sentence of length `K` this network produces two outputs: the first is an array of shape ``K*(K+1)``,\n",
    "where `i`-th row is the probability distribution of the head of `i`-th word over the sentence elements.\n",
    "The 0-th element of this distribution is the probability of the word to be a root of the sentence.\n",
    "The second output of the network is of shape `K*D`, where `D` is the number of possible dependency labels.\n",
    "\n",
    "The easiest way to obtain a tree is simply to return the head with the highest probability\n",
    "for each word in the sentence. However, the graph obtained in such a way may fail to be a valid tree:\n",
    "it may either contain a cycle or have multiple nodes with head at position 0.\n",
    "Therefore we apply the well-known Chu-Liu-Edmonds algorithm for minimal spanning tree\n",
    "to return the optimal tree, using the open-source modification from [dependency_decoding package](https://pypi.org/project/ufal.chu-liu-edmonds/).\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q deeppavlov"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before using the model make sure that all required packages are installed by running the command:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov install syntax_ru_syntagrus_bert"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Models list\n",
    "\n",
    "The table presents a list of all of the syntax parsing models available in the DeepPavlov Library.\n",
    "\n",
    "| Config | Description |\n",
    "| :--- | :--- |\n",
    "| morpho_syntax_parser/syntax_ru_syntagrus_bert.json | Config with the model which defines for each token in the sentence <br> its head and dependency type in the syntactic tree. |\n",
    "| morpho_syntax_parser/ru_syntagrus_joint_parsing.json | Config which unifies syntax parsing and morphological tagging. |\n",
    "\n",
    "The table presents comparison of syntax_ru_syntagrus_bert config with other models on UD2.3 dataset.\n",
    "\n",
    "| Model | UAS | LAS |\n",
    "| :--- | :---: | :---: |\n",
    "| [UD Pipe 2.3](http://ufal.mff.cuni.cz/udpipe) (Straka et al., 2017)  | 90.3 | 89.0 |\n",
    "| [UD Pipe Future](https://github.com/CoNLL-UD-2018/UDPipe-Future) (Straka, 2018) | 93.0 | 91.5 |\n",
    "| [UDify (multilingual BERT)](https://github.com/hyperparticle/udify) (Kondratyuk, 2018) | 94.8 | 93.1 |\n",
    "| Our BERT model (morpho_syntax_parser/syntax_ru_syntagrus_bert.json) | 94.9 | 93.4 |\n",
    "\n",
    "So our model is the state-of-the-art system for Russian syntactic parsing.\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "## 4.1 Predict using Python\n",
    "\n",
    "### Syntax Parser\n",
    "\n",
    "Our model produces the output in [CONLL-U format](http://universaldependencies.org/format.html)\n",
    "and is trained on Universal Dependency corpora, available on http://universaldependencies.org/format.html .\n",
    "The example usage for inference is"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model\n",
    "\n",
    "model = build_model(\"syntax_ru_syntagrus_bert\", download=True, install=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\tЯ\t_\t_\t_\t_\t2\tnsubj\t_\t_\n",
      "2\tшёл\t_\t_\t_\t_\t0\troot\t_\t_\n",
      "3\tдомой\t_\t_\t_\t_\t2\tadvmod\t_\t_\n",
      "4\tпо\t_\t_\t_\t_\t6\tcase\t_\t_\n",
      "5\tнезнакомой\t_\t_\t_\t_\t6\tamod\t_\t_\n",
      "6\tулице\t_\t_\t_\t_\t2\tobl\t_\t_\n",
      "7\t.\t_\t_\t_\t_\t2\tpunct\t_\t_\n",
      "\n",
      "1\tДевушка\t_\t_\t_\t_\t2\tnsubj\t_\t_\n",
      "2\tпела\t_\t_\t_\t_\t0\troot\t_\t_\n",
      "3\tв\t_\t_\t_\t_\t5\tcase\t_\t_\n",
      "4\tцерковном\t_\t_\t_\t_\t5\tamod\t_\t_\n",
      "5\tхоре\t_\t_\t_\t_\t2\tobl\t_\t_\n",
      "6\t.\t_\t_\t_\t_\t2\tpunct\t_\t_\n"
     ]
    }
   ],
   "source": [
    "sentences = [\"Я шёл домой по незнакомой улице.\", \"Девушка пела в церковном хоре.\"]\n",
    "for parse in model(sentences):\n",
    "    print(parse, end=\"\\n\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As prescribed by UD standards, our model writes the head information to the 7th column and the dependency\n",
    "information -- to the 8th. Our parser does not return morphological tags and even does not use them in\n",
    "training.\n",
    "\n",
    "### Joint Syntax Parser and Morphological tagger\n",
    "\n",
    "Our model in principle supports joint prediction of morphological tags and syntactic information, however, the quality of the joint model is slightly inferior to the separate ones. Therefore we release a special component that can combine the outputs of tagger and parser: `deeppavlov.models.syntax_parser.joint.JointTaggerParser`. Its sample output for the Russian language with default settings (see the configuration file `morpho_syntax_parser/ru_syntagrus_joint_parsing.json` for exact options) looks like"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model\n",
    "\n",
    "model = build_model(\"ru_syntagrus_joint_parsing\", download=True, install=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\tЯ\tя\tPRON\t_\tCase=Nom|Number=Sing|Person=1\t2\tnsubj\t_\t_\n",
      "2\tшёл\tшёл\tVERB\t_\tAspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\t0\troot\t_\t_\n",
      "3\tдомой\tдомой\tADV\t_\tDegree=Pos\t2\tadvmod\t_\t_\n",
      "4\tпо\tпо\tADP\t_\t_\t6\tcase\t_\t_\n",
      "5\tнезнакомой\tнезнакомый\tADJ\t_\tCase=Dat|Degree=Pos|Gender=Fem|Number=Sing\t6\tamod\t_\t_\n",
      "6\tулице\tулица\tNOUN\t_\tAnimacy=Inan|Case=Dat|Gender=Fem|Number=Sing\t2\tobl\t_\t_\n",
      "7\t.\t.\tPUNCT\t_\t_\t2\tpunct\t_\t_\n",
      "1\tДевушка\tдевушка\tNOUN\t_\tAnimacy=Anim|Case=Nom|Gender=Fem|Number=Sing\t2\tnsubj\t_\t_\n",
      "2\tпела\tпеть\tVERB\t_\tAspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\t0\troot\t_\t_\n",
      "3\tв\tв\tADP\t_\t_\t5\tcase\t_\t_\n",
      "4\tцерковном\tцерковном\tADJ\t_\tCase=Loc|Degree=Pos|Gender=Masc|Number=Sing\t5\tamod\t_\t_\n",
      "5\tхоре\tхор\tNOUN\t_\tAnimacy=Inan|Case=Loc|Gender=Masc|Number=Sing\t2\tobl\t_\t_\n",
      "6\t.\t.\tPUNCT\t_\t_\t2\tpunct\t_\t_\n"
     ]
    }
   ],
   "source": [
    "sentences = [\"Я шёл домой по незнакомой улице.\", \"Девушка пела в церковном хоре.\"]\n",
    "for parse in model(sentences):\n",
    "    print(parse, end=\"\\n\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the basic case the model outputs a human-readable string with parse data for each sentence. If you need\n",
    "to use the output in Python, consult the `deeppavlov.models.syntax_parser.joint.JointTaggerParser` documentation and source code.\n",
    "\n",
    "## 4.2 Predict using CLI\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI (Command Line Interface)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov interact syntax_ru_syntagrus_bert -d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n",
    "\n",
    "# 5. Customize the model\n",
    "\n",
    "To train **syntax parser** on your own data, you should prepare a dataset in **CoNLL-U format**. The description of **CoNLL-U format** can be found [here](https://universaldependencies.org/format.html#conll-u-format).\n",
    "\n",
    "Then you should place files for training, validation and testing into the ``\"data_path\"`` directory of ``morphotagger_dataset_reader``, change file names in ``morphotagger_dataset_reader`` to your filenames and launch the training:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import train_model\n",
    "\n",
    "train_model(\"<your_syntax_parsing_config_name>\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "or **using CLI**:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov train <your_syntax_parser_config_name>"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: docs/features/models/tfidf_ranking.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Tfidf Ranking\n",
    "\n",
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/tfidf_ranking.ipynb)\n",
    "\n",
    "# Table of contents \n",
    "\n",
    "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
    "\n",
    "2. [Get started with the model](#2.-Get-started-with-the-model)\n",
    "\n",
    "3. [Models list](#3.-Models-list)\n",
    "\n",
    "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n",
    "\n",
    "    4.1. [Predict using Python](#4.1-Predict-using-Python)\n",
    "    \n",
    "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
    "\n",
    "5. [Customize the model](#5.-Customize-the-model)\n",
    "    \n",
    "    5.1. [Fit on Wikipedia](#5.1-Fit-on-Wikipedia)\n",
    "    \n",
    "    5.2. [Download, parse new Wikipedia dump, build database and index](#5.2-Download,-parse-new-Wikipedia-dump,-build-database-and-index)\n",
    "\n",
    "# 1. Introduction to the task\n",
    "\n",
    "This is an implementation of a passage ranker based on tf-idf vectorization.\n",
    "The ranker implementation is based on [DrQA](https://github.com/facebookresearch/DrQA/) project.\n",
    "The default ranker implementation takes a batch of queries as input and returns 100 passage titles sorted by relevance.\n",
    "\n",
    "# 2. Get started with the model\n",
    "\n",
    "First make sure you have the DeepPavlov Library installed.\n",
    "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q deeppavlov"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then make sure that all the required packages for the model are installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m deeppavlov install en_ranker_tfidf_wiki"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`en_ranker_tfidf_wiki` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\n",
    "\n",
    "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\n",
    "The full list of models for tfidf ranking with their config names can be found in the [table](#3.-Models-list).\n",
    "\n",
    "# 3. Models list\n",
    "\n",
    "| Config | Language | Description | RAM |\n",
    "| :--- | :---: | :--- | :---: |\n",
    "| doc_retrieval/en_ranker_tfidf_wiki.json | En | Config for TF-IDF ranking over Wikipedia | 2.9 Gb |\n",
    "| doc_retrieval/en_ranker_pop_wiki.json | En | Config for TF-IDF ranking, followed by <br> popularity ranking, over Wikipedia | 8.1 Gb |\n",
    "| doc_retrieval/ru_ranker_tfidf_wiki.json | Ru | TF-IDF ranking config over Wikipedia | 8.4 Gb |\n",
    "\n",
    "# 4. Use the model for prediction\n",
    "\n",
    "## 4.1 Predict using Python\n",
    "\n",
    "### English\n",
    "\n",
    "Building (if you don't have your own data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model, configs\n",
    "\n",
    "ranker = build_model(configs.doc_retrieval.en_ranker_tfidf_wiki, download=True, install=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[18155097, 628663, 17123727, 628662, 19097375]\n"
     ]
    }
   ],
   "source": [
    "result = ranker(['Who is Ivan Pavlov?'])\n",
    "print(result[0][:5])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Russian"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import build_model, configs\n",
    "\n",
    "ranker = build_model(configs.doc_retrieval.ru_ranker_tfidf_wiki, download=True, install=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[4902620, 1900377, 11129584, 1720563, 1720658]\n"
     ]
    }
   ],
   "source": [
    "result = ranker(['Когда произошла Куликовская битва?'])\n",
    "print(result[0][:5])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Text for the output titles can be further extracted with [deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab](https://docs.deeppavlov.ai/en/master/apiref/vocabs.html#deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab) class.\n",
    "\n",
    "## 4.2 Predict using CLI\n",
    "\n",
    "You can also get predictions in an interactive mode through CLI (Command Line Interface)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov interact en_ranker_tfidf_wiki -d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Customize the model\n",
    "\n",
    "## 5.1 Fit on Wikipedia\n",
    "\n",
    "Run the following to fit the ranker on **English** Wikipedia:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov train en_ranker_tfidf_wiki"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Run the following to fit the ranker on **Russian** Wikipedia:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m deeppavlov train ru_ranker_tfidf_wiki"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a result of ranker training, a SQLite database and tf-idf matrix are created.\n",
    "\n",
    "## 5.2 Download, parse new Wikipedia dump, build database and index\n",
    "\n",
    "**enwiki.db** SQLite database consists of ~21 M Wikipedia articles and is built by the following steps:\n",
    "\n",
    "- Download a Wikipedia dump file. We took the latest\n",
    "   [enwiki dump](https://dumps.wikimedia.org/enwiki/20230501/)\n",
    "\n",
    "- Unpack and extract the articles with [WikiExtractor](https://github.com/attardi/wikiextractor)\n",
    "   (with ``--json``, ``--no-templates``, ``--filter_disambig_pages``\n",
    "   options)\n",
    "\n",
    "- [Build a database](#5.1-Fit-on-Wikipedia).\n",
    "\n",
    "**enwiki_tfidf_matrix.npz** is a full Wikipedia tf-idf matrix of size **hash_size x number of documents** which is\n",
    "$2^{24}$ x 21 M. This matrix is built with [deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer](https://docs.deeppavlov.ai/en/master/apiref/models/vectorizers.html#deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer) class.\n",
    "\n",
    "**ruwiki.db** SQLite database consists of ~12 M Wikipedia articles and is built by the following steps:\n",
    "\n",
    "- Download a Wikipedia dump file. We took the latest [ruwiki dump](https://dumps.wikimedia.org/ruwiki/20230501/)\n",
    "\n",
    "- Unpack and extract the articles with [WikiExtractor](https://github.com/attardi/wikiextractor)\n",
    "   (with ``--json``, ``--no-templates``, ``--filter_disambig_pages``\n",
    "   options)\n",
    "\n",
    "- [Build a database](#5.1-Fit-on-Wikipedia).\n",
    "\n",
    "**ruwiki_tfidf_matrix.npz** is a full Wikipedia tf-idf matrix of size **hash_size x number of documents** which is\n",
    "$2^{24}$ x 12 M. This matrix is built with\n",
    "[deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer](https://docs.deeppavlov.ai/en/master/apiref/models/vectorizers.html#deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer) class."
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: docs/features/overview.rst
================================================
Features
========

.. contents:: :local:

Models
------

NER model :doc:`[docs] </features/models/NER>`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The Named Entity Recognition task in DeepPavlov is solved with a BERT-based model.
The model predicts tags (in BIO format) for tokens in the input.

BERT-based model is described in  `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
<https://arxiv.org/abs/1810.04805>`__.

+---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+
| Dataset                                                 | Lang  | Model                                                                                      |   Test F1   |
+=========================================================+=======+============================================================================================+=============+
| Persons-1000 dataset with additional LOC and ORG markup | Ru    | :config:`ner_rus_bert.json <ner/ner_rus_bert.json>`                                        |    97.9     |
+                                                         +       +--------------------------------------------------------------------------------------------+-------------+
| (Collection 3)                                          |       | :config:`ner_rus_convers_distilrubert_2L.json  <ner/ner_rus_convers_distilrubert_2L.json>` |  88.4 ± 0.5 |
+                                                         +       +--------------------------------------------------------------------------------------------+-------------+
|                                                         |       | :config:`ner_rus_convers_distilrubert_6L.json  <ner/ner_rus_convers_distilrubert_6L.json>` |  93.3 ± 0.3 |
+---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+
| Ontonotes                                               | Multi | :config:`ner_ontonotes_bert_mult.json <ner/ner_ontonotes_bert_mult.json>`                  |    88.9     |
+                                                         +-------+--------------------------------------------------------------------------------------------+-------------+
|                                                         | En    | :config:`ner_ontonotes_bert.json <ner/ner_ontonotes_bert.json>`                            |    89.2     |
+---------------------------------------------------------+       +--------------------------------------------------------------------------------------------+-------------+
| CoNLL-2003                                              |       | :config:`ner_conll2003_bert.json <ner/ner_conll2003_bert.json>`                            |    91.7     |
+---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+

Classification model :doc:`[docs] </features/models/classification>`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Model for classification tasks (intents, sentiment, etc.) at the word level. Shallow-and-wide CNN, Deep CNN, BiLSTM,
BiLSTM with self-attention and other models are presented. The model also allows multilabel classification of texts.
Several pre-trained models are available and are listed in the table below.


+------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
| Task             | Dataset             | Lang | Model                                                                                              | Metric      | Valid            | Test            | Downloads |
+==================+=====================+======+====================================================================================================+=============+==================+=================+===========+
| Insult detection | `Insults`_          | En   | :config:`English BERT<classifiers/insults_kaggle_bert.json>`                                       | ROC-AUC     | 0.9327           | 0.8602          |  1.1 Gb   |
+------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
| Sentiment        | `SST`_              |      | :config:`5-classes SST on conversational BERT <classifiers/sentiment_sst_conv_bert.json>`          | Accuracy    | 0.6293           | 0.6626          |  1.1 Gb   |
+------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
| Sentiment        | `Twitter mokoron`_  | Ru   | :config:`RuWiki+Lenta emb w/o preprocessing <classifiers/sentiment_twitter.json>`                  | Accuracy    | 0.9918           | 0.9923          |  5.8 Gb   |
+                  +---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
|                  | `RuSentiment`_      |      | :config:`Multi-language BERT <classifiers/rusentiment_bert.json>`                                  | F1-weighted | 0.6787           | 0.7005          |  1.3 Gb   |
+                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
|                  |                     |      | :config:`Conversational RuBERT <classifiers/rusentiment_convers_bert.json>`                        |             | 0.739            | 0.7724          |  1.5 Gb   |
+                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
|                  |                     |      | :config:`Conversational DistilRuBERT-tiny <classifiers/rusentiment_convers_distilrubert_2L.json>`  |             |  0.703 ± 0.0031  | 0.7348 ± 0.0028 |  690 Mb   |
+                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
|                  |                     |      | :config:`Conversational DistilRuBERT-base <classifiers/rusentiment_convers_distilrubert_6L.json>`  |             |  0.7376 ± 0.0045 | 0.7645 ± 0.035  |  1.0 Gb   |
+------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+

.. _`DSTC 2`: http://camdial.org/~mh521/dstc/
.. _`SNIPS-2017`: https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines
.. _`Insults`: https://www.kaggle.com/c/detecting-insults-in-social-commentary
.. _`AG News`: https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html
.. _`Twitter mokoron`: http://study.mokoron.com/
.. _`RuSentiment`: http://text-machine.cs.uml.edu/projects/rusentiment/
.. _`Yahoo-L31`: https://webscope.sandbox.yahoo.com/catalog.php?datatype=l
.. _`Yahoo-L6`: https://webscope.sandbox.yahoo.com/catalog.php?datatype=l
.. _`SST`: https://nlp.stanford.edu/sentiment/index.html

As no one had published intent recognition for DSTC-2 data, the
comparison of the presented model is given on **SNIPS** dataset. The
evaluation of model scores was conducted in the same way as in [3]_ to
compare with the results from the report of the authors of the dataset.
The results were achieved with tuning of parameters and embeddings
trained on Reddit dataset.

+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
| Model                  | AddToPlaylist   | BookRestaurant   | GetWeather    | PlayMusic    | RateBook     | SearchCreativeWork   | SearchScreeningEvent   |
+========================+=================+==================+===============+==============+==============+======================+========================+
| api.ai                 | 0.9931          | 0.9949           | 0.9935        | 0.9811       | 0.9992       | 0.9659               | 0.9801                 |
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
| ibm.watson             | 0.9931          | 0.9950           | 0.9950        | 0.9822       | 0.9996       | 0.9643               | 0.9750                 |
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
| microsoft.luis         | 0.9943          | 0.9935           | 0.9925        | 0.9815       | 0.9988       | 0.9620               | 0.9749                 |
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
| wit.ai                 | 0.9877          | 0.9913           | 0.9921        | 0.9766       | 0.9977       | 0.9458               | 0.9673                 |
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
| snips.ai               | 0.9873          | 0.9921           | 0.9939        | 0.9729       | 0.9985       | 0.9455               | 0.9613                 |
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
| recast.ai              | 0.9894          | 0.9943           | 0.9910        | 0.9660       | 0.9981       | 0.9424               | 0.9539                 |
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
| amazon.lex             | 0.9930          | 0.9862           | 0.9825        | 0.9709       | 0.9981       | 0.9427               | 0.9581                 |
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
| Shallow-and-wide CNN   | **0.9956**      | **0.9973**       | **0.9968**    | **0.9871**   | **0.9998**   | **0.9752**           | **0.9854**             |
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+

.. [3] https://www.slideshare.net/KonstantinSavenkov/nlu-intent-detection-benchmark-by-intento-august-2017


Automatic spelling correction model :doc:`[docs] </features/models/spelling_correction>`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Pipelines that use candidates search in a static dictionary and an ARPA language model to correct spelling errors.

.. note::

    About 4.4 GB of disk space is required for the Russian language model and about 7 GB for the English one.

Comparison on the `test set <http://www.dialog-21.ru/media/3838/test_sample_testset.txt>`__ for the `SpellRuEval
competition <http://www.dialog-21.ru/en/evaluation/2016/spelling_correction/>`__
on Automatic Spelling Correction for Russian:

+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| Correction method                                                                       | Precision | Recall | F-measure | Speed (sentences/s) |
+=========================================================================================+===========+========+===========+=====================+
| Yandex.Speller                                                                          | 83.09     | 59.86  | 69.59     | 5.                  |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| :config:`Damerau Levenshtein 1 + lm<spelling_correction/levenshtein_corrector_ru.json>` | 53.26     | 53.74  | 53.50     | 29.3                |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| Hunspell + lm                                                                           | 41.03     | 48.89  | 44.61     | 2.1                 |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| JamSpell                                                                                | 44.57     | 35.69  | 39.64     | 136.2               |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| Hunspell                                                                                | 30.30     | 34.02  | 32.06     | 20.3                |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+


Ranking model :doc:`[docs] </features/models/neural_ranking>`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Available pre-trained models for paraphrase identification:

.. table::
   :widths: auto

   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
   |    Dataset             | Model config                                                                                         | Val (accuracy) | Test (accuracy) | Val (F1)   | Test (F1)  | Val (log_loss) | Test (log_loss) | Downloads |
   +========================+======================================================================================================+================+=================+============+============+================+=================+===========+
   | `paraphraser.ru`_      | :config:`paraphrase_rubert <classifiers/paraphraser_rubert.json>`                                    |   89.8         |   84.2          |   92.2     |  87.4      |   --           |   --            | 1325M     |
   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_2L <classifiers/paraphraser_convers_distilrubert_2L.json>` |  76.1 ± 0.2    |  64.5 ± 0.5     | 81.8 ± 0.2 | 73.9 ± 0.8 |   --           |   --            | 618M      |
   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_6L <classifiers/paraphraser_convers_distilrubert_6L.json>` |  86.5 ± 0.5    |  78.9 ± 0.4     | 89.6 ± 0.3 | 83.2 ± 0.5 |   --           |   --            | 930M      |
   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+

.. _`paraphraser.ru`: https://paraphraser.ru/


References:

* Yu Wu, Wei Wu, Ming Zhou, and Zhoujun Li. 2017. Sequential match network: A new architecture for multi-turn response selection in retrieval-based chatbots. In ACL, pages 372–381. https://www.aclweb.org/anthology/P17-1046

* Xiangyang Zhou, Lu Li, Daxiang Dong, Yi Liu, Ying Chen, Wayne Xin Zhao, Dianhai Yu and Hua Wu. 2018. Multi-Turn Response Selection for Chatbots with Deep Attention Matching Network. Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1118-1127, ACL. http://aclweb.org/anthology/P18-1103

* Chongyang Tao, Wei Wu, Can Xu, Wenpeng Hu, Dongyan Zhao, and Rui Yan. Multi-Representation Fusion Network for Multi-turn Response Selection in Retrieval-based Chatbots. In WSDM'19. https://dl.acm.org/citation.cfm?id=3290985

* Gu, Jia-Chen & Ling, Zhen-Hua & Liu, Quan. (2019). Interactive Matching Network for Multi-Turn Response Selection in Retrieval-Based Chatbots. https://arxiv.org/abs/1901.01824


TF-IDF Ranker model :doc:`[docs] </features/models/tfidf_ranking>`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Based on `Reading Wikipedia to Answer Open-Domain Questions <https://github.com/facebookresearch/DrQA/>`__. The model solves the task of document retrieval for a given query.

+---------------+-------------------------------------------------------------------+----------------------+-----------------+-----------+
| Dataset       | Model                                                             |  Wiki dump           |  Recall@5       | Downloads |
+===============+===================================================================+======================+=================+===========+
| `SQuAD-v1.1`_ | :config:`doc_retrieval <doc_retrieval/en_ranker_tfidf_wiki.json>` |  enwiki (2018-02-11) |   75.6          | 33 GB     |
+---------------+-------------------------------------------------------------------+----------------------+-----------------+-----------+


Question Answering model :doc:`[docs] </features/models/SQuAD>`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Models in this section solve the task of finding an answer to a
question in a given context (`SQuAD <https://rajpurkar.github.io/SQuAD-explorer/>`__ task format).
There are two models for this task in DeepPavlov: BERT-based and R-Net. Both models predict answer start and end
position in a given context.

BERT-based model is described in  `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
<https://arxiv.org/abs/1810.04805>`__.

RuBERT-based model is described in  `Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language
<https://arxiv.org/abs/1905.07213>`__.

+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
|    Dataset     | Model config                                                                                                  | lang  |    EM (dev)    |    F-1 (dev)    |    Downloads    |
+================+===============================================================================================================+=======+================+=================+=================+
| `SQuAD-v1.1`_  | :config:`DeepPavlov BERT <squad/squad_bert.json>`                                                             |  en   |     81.49      |     88.86       |     1.2 Gb      |
+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
| `SQuAD-v2.0`_  | :config:`DeepPavlov BERT <squad/qa_squad2_bert.json>`                                                         |  en   |     75.71      |     80.72       |     1.2 Gb      |
+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
| `SDSJ Task B`_ | :config:`DeepPavlov RuBERT <squad/squad_ru_bert.json>`                                                        |  ru   |     66.21      |     84.71       |     1.7 Mb      |
+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
| `SDSJ Task B`_ | :config:`DeepPavlov RuBERT, trained with tfidf-retrieved negative samples <squad/qa_sberquad2_bert.json>`     |  ru   |     66.24      |     84.71       |     1.6 Gb      |
+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-tiny <squad/squad_ru_convers_distilrubert_2L.json>`                          |  ru   |  44.2 ± 0.46   |  65.1 ± 0.36    |     867Mb       |
+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-base <squad/squad_ru_convers_distilrubert_6L.json>`                          |  ru   |  61.23 ± 0.42  |  80.36 ± 0.28   |     1.18Gb      |
+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+

For cases where the answer is not necessarily present in the given context, we have the :config:`qa_squad2_bert <squad/qa_squad2_bert.json>`
model. This model outputs an empty string if there is no answer in the context.


ODQA :doc:`[docs] </features/models/ODQA>`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

An open domain question answering model. The model accepts free-form questions about the world and outputs an answer
based on its Wikipedia knowledge.


+----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
| Dataset        | Model config                                                       |  Wiki dump            |   F1   | Downloads |
+================+====================================================================+=======================+========+===========+
| `SQuAD-v1.1`_  | :config:`ODQA <odqa/en_odqa_infer_wiki.json>`                      | enwiki (2018-02-11)   |  46.24 | 9.7Gb     |
+----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
| `SDSJ Task B`_ | :config:`ODQA with RuBERT <odqa/ru_odqa_infer_wiki.json>`          | ruwiki (2018-04-01)   |  37.83 | 4.3Gb     |
+----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+


AutoML
--------------------

Hyperparameters optimization :doc:`[docs] </features/hypersearch>`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Hyperparameters optimization by cross-validation for DeepPavlov models
that requires only some small changes in a config file.


Embeddings
----------

Pre-trained embeddings :doc:`[docs] </features/pretrained_vectors>`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Word vectors for the Russian language trained on joint `Russian Wikipedia <https://ru.wikipedia.org/>`__ and `Lenta.ru
<https://lenta.ru/>`__ corpora.


Examples of some models
---------------------------

-  Run insults detection model with console interface:

   .. code-block:: bash

      python -m deeppavlov interact insults_kaggle_bert -d

-  Run insults detection model with REST API:

   .. code-block:: bash

      python -m deeppavlov riseapi insults_kaggle_bert -d

-  Predict whether it is an insult on every line in a file:

   .. code-block:: bash

      python -m deeppavlov predict insults_kaggle_bert -d --batch-size 15 < /data/in.txt > /data/out.txt


.. _`SQuAD-v1.1`: https://arxiv.org/abs/1606.05250
.. _`SQuAD-v2.0`: https://arxiv.org/abs/1806.03822
.. _`SDSJ Task B`: https://arxiv.org/abs/1912.09723


================================================
FILE: docs/features/pretrained_vectors.rst
================================================
Pre-trained embeddings
======================

BERT
----

We are publishing several pre-trained BERT models:

* RuBERT for Russian language
* Slavic BERT for Bulgarian, Czech, Polish, and Russian
* Conversational BERT for informal English
* Conversational BERT for informal Russian
* Sentence Multilingual BERT for encoding sentences in 101 languages
* Sentence RuBERT for encoding sentences in Russian

Description of these models is available in the :doc:`BERT section </features/models/bert>` of the docs.

License
~~~~~~~

The pre-trained models are distributed under the `License Apache
2.0 <https://www.apache.org/licenses/LICENSE-2.0>`__.

Downloads
~~~~~~~~~

The ``TensorFlow`` models can be run with the original `BERT repo <https://github.com/google-research/bert>`_ code
while the ``PyTorch`` models can be run with the `HuggingFace's Transformers <https://github.com/huggingface/transformers>`__ library.
The download links are:

+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
| Description                | Model parameters                      | Download links                                                                                                       |
+============================+=======================================+======================================================================================================================+
| RuBERT                     | vocab size = 120K, parameters = 180M, | `[pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__,           |
|                            | size = 632MB                          | `[tensorflow] <http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_v2.tar.gz>`__            |
+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
| Slavic BERT                | vocab size = 120K, parameters = 180M, | `[pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__,      |
|                            | size = 632MB                          | `[tensorflow] <http://files.deeppavlov.ai/deeppavlov_data/bert/bg_cs_pl_ru_cased_L-12_H-768_A-12_v1.tar.gz>`__       |
+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
| Conversational BERT        | vocab size = 30K, parameters = 110M,  | `[pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__,   |
|                            | size = 385MB                          | `[tensorflow] <http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12_v1.tar.gz>`__    |
+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
| Conversational RuBERT      | vocab size = 120K, parameters = 180M, | `[pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__,|
|                            | size = 630MB                          | `[tensorflow] <http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12.tar.gz>`__    |
+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
| Sentence Multilingual BERT | vocab size = 120K, parameters = 180M, | `[pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__,   |
|                            | size = 630MB                          | `[tensorflow] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12.tar.gz>`__       |
+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
| Sentence RuBERT            | vocab size = 120K, parameters = 180M, | `[pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_ru_cased_L-12_H-768_A-12_pt_v1.tar.gz>`__,      |
|                            | size = 630MB                          | `[tensorflow] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_ru_cased_L-12_H-768_A-12.tar.gz>`__          |
+----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+


ELMo
----

ELMo can be used via Python code as follows:

.. code:: python

   import tensorflow as tf
   import tensorflow_hub as hub
   elmo = hub.Module("http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz", trainable=True)
   sess = tf.Session()
   sess.run(tf.global_variables_initializer())
   embeddings = elmo(["это предложение", "word"], signature="default", as_dict=True)["elmo"]
   sess.run(embeddings)


TensorFlow Hub module also supports tokenized sentences in the following format.

.. code:: python

   tokens_input = [["мама", "мыла", "раму"], ["рама", "", ""]]
   tokens_length = [3, 1]
   embeddings = elmo(inputs={"tokens": tokens_input,"sequence_len": tokens_length},signature="tokens",as_dict=True)["elmo"]
   sess.run(embeddings)


Downloads
~~~~~~~~~

The models can be downloaded and run by tensorflow hub module from:


+--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Description                                                        | Dataset parameters                          | Perplexity       | Tensorflow hub module                                                                                                                                                                                                                 |
+====================================================================+=============================================+==================+=======================================================================================================================================================================================================================================+
| ELMo on  `Russian Wikipedia <https://ru.wikipedia.org/>`__         | lines = 1M, tokens = 386M, size = 5GB       | 43.692           | `module_spec <http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-wiki_600k_steps.tar.gz>`__                                                                                                                                           |
+--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ELMo on  `Russian WMT News <http://www.statmt.org/>`__             | lines = 63M, tokens = 946M, size = 12GB     | 49.876           | `module_spec <http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz>`__                                                                                                                                  |
+--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ELMo on  `Russian Twitter <https://twitter.com/>`__                | lines = 104M, tokens = 810M, size = 8.5GB   | 94.145           | `module_spec <http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz>`__                                                                                                                        |
+--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


fastText
--------

We are publishing pre-trained word vectors for the Russian language.
Several models were trained on joint `Russian
Wikipedia <https://ru.wikipedia.org/>`__
and `Lenta.ru <https://lenta.ru/>`__ corpora.
We also introduce one model for Russian conversational language that
was trained on `Russian Twitter <https://twitter.com/>`__ corpus.

All vectors are 300-dimensional. We used fastText skip-gram (see
`Bojanowski et al. (2016) <https://arxiv.org/abs/1607.04606>`__) for
vectors training as well as various preprocessing options (see below).

You can get vectors either in binary or in text (vec) formats for FastText.

License
~~~~~~~

The pre-trained word vectors are distributed under the `License Apache
2.0 <https://www.apache.org/licenses/LICENSE-2.0>`__.

Downloads
~~~~~~~~~

The pre-trained **fastText skipgram** models can be downloaded from:

+-----------------------+---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Domain                | Preprocessing                                           | Vectors                                                                                                                                                                                                                                                                                                                            |
+=======================+=========================================================+====================================================================================================================================================================================================================================================================================================================================+
| Wiki+Lenta            | tokenize (nltk word\_tokenize), lemmatize (pymorphy2)   | `bin <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lemmatize/ft_native_300_ru_wiki_lenta_lemmatize.bin>`__, `vec <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lemmatize/ft_native_300_ru_wiki_lenta_lemmatize.vec>`__                                                                   |
+                       +---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                       | tokenize (nltk word\_tokenize), lowercasing             | `bin <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lower_case/ft_native_300_ru_wiki_lenta_lower_case.bin>`__, `vec <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lower_case/ft_native_300_ru_wiki_lenta_lower_case.vec>`__                                                               |
+                       +---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                       | tokenize (nltk wordpunct\_tokenize)                     | `bin <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin>`__, `vec <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec>`__           |
+                       +---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                       | tokenize (nltk word\_tokenize)                          | `bin <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_word_tokenize/ft_native_300_ru_wiki_lenta_nltk_word_tokenize.bin>`__, `vec <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_word_tokenize/ft_native_300_ru_wiki_lenta_nltk_word_tokenize.vec>`__                               |
+                       +---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                       | tokenize (nltk word\_tokenize), remove stopwords        | `bin <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_remstopwords/ft_native_300_ru_wiki_lenta_remstopwords.bin>`__, `vec <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_remstopwords/ft_native_300_ru_wiki_lenta_remstopwords.vec>`__                                                       |
+-----------------------+---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Twitter               | tokenize (nltk word\_tokenize)                          | `bin <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin>`__, `vec <http://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.vec>`__                                                                                                                                   |
+-----------------------+---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

Word vectors training parameters
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

These word vectors were trained with the following parameters ([...] denotes the
default value):

fastText (skipgram)
                   

-  lr [0.1]
-  lrUpdateRate [100]
-  dim 300
-  ws [5]
-  epoch [5]
-  neg [5]
-  loss [softmax]
-  pretrainedVectors []
-  saveOutput [0]


================================================
FILE: docs/index.rst
================================================
Welcome to DeepPavlov's documentation!
======================================

.. toctree::
   :glob:
   :maxdepth: 1

   Installation <intro/installation>
   QuickStart <intro/quick_start>
   General concepts <intro/overview>
   Configuration file <intro/configuration>
   Python pipelines <intro/python.ipynb>
   Models overview <features/overview>


.. toctree::
   :glob:
   :maxdepth: 2
   :caption: Features

   Pre-trained embeddings <features/pretrained_vectors>
   AutoML <features/hypersearch>


.. toctree::
   :glob:
   :maxdepth: 1
   :caption: Models

   Multitask BERT <features/models/multitask_bert>
   Context Question Answering <features/models/SQuAD.ipynb>
   Classification <features/models/classification.ipynb>
   Few-shot Classification <features/models/few_shot_classification>
   Named Entity Recognition <features/models/NER.ipynb>
   Entity Extraction <features/models/entity_extraction.ipynb>
   BERT-based models <features/models/bert>
   Morphological Tagging <features/models/morpho_tagger.ipynb>
   Neural Ranking <features/models/neural_ranking.ipynb>
   Spelling Correction <features/models/spelling_correction.ipynb>
   Syntactic Parsing <features/models/syntax_parser.ipynb>
   TF-IDF Ranking <features/models/tfidf_ranking.ipynb>
   Popularity Ranking <features/models/popularity_ranking.ipynb>
   Knowledge Base Question answering <features/models/KBQA.ipynb>
   Relation Extraction <features/models/relation_extraction.ipynb>
   SuperGLUE Submission <features/models/superglue>
   Open-Domain Question Answering <features/models/ODQA.ipynb>


.. toctree::
   :glob:
   :maxdepth: 3
   :caption: Integrations

   REST API <integrations/rest_api>
   Socket API <integrations/socket_api>
   Amazon AWS deployment <integrations/aws_ec2>
   DeepPavlov settings <integrations/settings>


.. toctree::
   :glob:
   :maxdepth: 3
   :caption: Developer Guides

   Contribution guide <devguides/contribution_guide>
   Register your model <devguides/registry>


.. toctree::
   :glob:
   :maxdepth: 3
   :caption: Internships

   Internships <internships/internships>


.. toctree::
   :glob:
   :maxdepth: 3
   :caption: Package Reference

   apiref/*


Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`


================================================
FILE: docs/integrations/aws_ec2.rst
================================================
Amazon AWS deployment
=====================

Here is a manual for deploying DeepPavlov (with ODQA as an example) on Amazon Web Services using an EC2 virtual machine.

Deployment process consists of two main stages:

1. AWS EC2 machine launch
2. DeepPavlov ODQA deployment

1. AWS EC2 machine launch
-------------------------

1.  Login to your AWS console and proceed to the EC2 services dashboard.

.. image:: ../_static/aws_ec2/01_login_to_aws.png
   :width: 800

2.  Choose Ubuntu Server 18.04 LTS 64-bit x86 machine.

.. image:: ../_static/aws_ec2/02_choose_ubuntu.png
   :width: 800

3.  You should select appropriate instance type because of high memory consumption by ODQA.
    32 GiB memory is a minimum. Then press *"Next: ..."*

.. image:: ../_static/aws_ec2/03_select_instance_type.png
   :width: 800

4.  Proceed to Step 4. Your instance storage size should be no less than 50 GiB to
    store ODQA models.

.. image:: ../_static/aws_ec2/04_add_storage.png
   :width: 800

5.  Proceed to Step 7. Check your instance parameters and press *"Launch"* button.
    You will be prompted to create and save security key pair for further access to your instance.

.. image:: ../_static/aws_ec2/05_review_instance.png
   :width: 800

6.  Return to your EC2 services dashboard and navigate to your running instances list.

.. image:: ../_static/aws_ec2/06_go_to_running_instances.png
   :width: 800

7.  Wait until instance initialization finishes (instance status becomes *"running"*).

.. image:: ../_static/aws_ec2/07_wait_init.png
   :width: 800

8.  To make the DeepPavlov ODQA model REST API accessible from the Internet, you should set
    the corresponding inbound security rules:

    8.1 Navigate to your instance security group dashboard
    (in this example security group has name *"launch-wizard-2"*).

    .. image:: ../_static/aws_ec2/08_01_set_sec_group.png
       :width: 800

    8.2 Select *"Inbound"* rules tab, click *"Edit"*, then click *"Add Rule"*.
    For your new rule select *"Custom TCP Rule"* type, *"Anywhere"* source and input
    port for your ODQA API. Click *"Save"*.

    .. image:: ../_static/aws_ec2/08_02_set_inbound.png
       :width: 800

9.  Connecting to your instance by SSH:

    9.1 Navigate to your instance dashboard, right-click your instance, select *"Connect"*.

    .. image:: ../_static/aws_ec2/09_01_select_connect.png
       :width: 800

    You will be redirected to connection instructions screen for your dashboard.
    Follow instructions for standalone SSH client. SSH connection bash command example will
    already contain valid user and host name. To connect to your Amazon instance just run
    the example with valid path to your saved key pair (instead of *"dp_key_pair.pem"*
    in this example).

    .. image:: ../_static/aws_ec2/09_02_connection_info.png
       :width: 800

2. DeepPavlov ODQA deployment
-----------------------------

1.  Login to your AWS EC2 instance.

2.  For now DeepPavlov requires Python 3.6 to run. Below are instructions for DeepPavlov ODQA
    deployment under Ubuntu 18.04 (which has pre-installed Python 3.6) and virtualenv.

3.  Install pip3:

    ``sudo apt update``

    ``sudo apt install python3-pip``

4.  Install virtualenv:

    ``sudo pip3 install virtualenv``

5.  Create and activate a Python 3.6 virtual environment:

    ``virtualenv env -p python3.6``

    ``source env/bin/activate``

6.  Install DeepPavlov:

    ``pip install deeppavlov``

7.  Install ODQA dependencies:

    ``python -m deeppavlov install en_odqa_infer_wiki``

8.  Download ODQA models (it will take quite some time):

    ``python -m deeppavlov download en_odqa_infer_wiki``

9.  Run the ODQA REST API service, where <port> is the port you defined in the TCP
    inbound rules for your AWS instance:

    ``python -m deeppavlov riseapi en_odqa_infer_wiki -p <port>``

3. Accessing your ODQA API
--------------------------

1.  Get your AWS instance public DNS from the instance dashboard.

2.  Get full info about your ODQA API from its Swagger UI by navigating to
    the following URL in your browser:

    ``http://<your_aws_instance_public_dns>:<your_odqa_service_port>``


================================================
FILE: docs/integrations/rest_api.rst
================================================
REST API
========

Each DeepPavlov model can be easily made available for
inference as a REST web service. The general method is:

.. code:: bash

    python -m deeppavlov riseapi <config_path> [-d] [-p <port>] [--https] [--key <SSL key file path>] \
    [--cert <SSL certificate file path>]


* ``-d``: downloads model specific data before starting the service.
* ``-p <port>``: sets the port to ``<port>``. Overrides default
  value from ``deeppavlov/utils/settings/server_config.json``.
* ``--https``: use https instead of http. Overrides default
  value from ``deeppavlov/utils/settings/server_config.json``.
* ``--key <SSL key file path>``: path to SSL key file. Overrides default
  value from ``deeppavlov/utils/settings/server_config.json``.
* ``--cert <SSL certificate file path>``: path to SSL certificate file. Overrides default
  value from ``deeppavlov/utils/settings/server_config.json``.

The command will print the used host and port. Default web service properties
(host, port, POST request arguments) can be modified via changing
``deeppavlov/utils/settings/server_config.json`` file.

.. warning::

    Starting from version 1.0.0rc2, the model response format in riseapi mode matches the :class:`~deeppavlov.core.common.chainer.Chainer`
    response format. To start a model with the old format, set the ``COMPATIBILITY_MODE`` environment variable to any
    non-empty value (e.g. ``COMPATIBILITY_MODE=true python -m deeppavlov riseapi ...``).
    ``COMPATIBILITY_MODE`` will be removed in DeepPavlov 1.2.0.

API routes
----------

/model
""""""
Send POST request to ``<host>:<port>/model`` to infer model. See details at
:ref:`rest_api_docs`.

/probe
""""""
Send POST request to ``<host>:<port>/probe`` to check if API is working. The
server will send a response ``["Test passed"]`` if it is working. Requests to
``/probe`` are not logged.

/api
""""
To get model argument and response names send GET request to ``<host>:<port>/api``. Server
will return dict with model input and output names.

.. _rest_api_docs:

/docs
"""""
To interact with the REST API via graphical interface open
``<host>:<port>/docs`` in a browser (Swagger UI).

/metrics
""""""""
Endpoint to monitor a running service using Prometheus. Metrics:

* ``http_requests_count``: Counter, tracks number of processed requests. Labels: ``endpoint``, ``status_code``.
* ``http_requests_latency_seconds``: Histogram, tracks response latency (only with 200 status code). Labels:
  ``endpoint``.
* ``http_requests_in_progress``: Gauge, tracks in-progress requests. Labels: ``endpoint``.

Advanced configuration
----------------------

By modifying ``deeppavlov/utils/settings/server_config.json`` you can change
host, port, POST request arguments and other properties of the API service.

Properties from ``common_defaults`` section are used by default unless
they are overridden by model-specific properties, provided in ``model_defaults``
section of the ``server_config.json``. Model-specific properties are bound
to the model by ``server_utils`` label in ``metadata`` section of the model
config. Value of ``server_utils`` label from model config should match with
properties key from ``model_defaults`` section of ``server_config.json``.

For example, adding the ``metadata/server_utils`` key to ``kbqa/kbqa_cq.json``
with the value *KBQA* will initiate a search for the *KBQA* tag
in the ``model_defaults`` section of ``server_config.json``. Therefore, if this
section is present, all parameters with non-empty (i.e. not ``""``,
not ``[]`` etc.) values stored under this tag will overwrite the parameter values
in ``common_defaults``.

If the ``model_args_names`` parameter of ``server_config.json`` is an empty string,
then model argument names are provided as a list from the ``chainer/in`` section of
the model config file, where the argument order corresponds to the model API.
When running model inference via the REST API, JSON payload keys should match
the model argument names from the ``chainer/in`` section.
If the ``model_args_names`` parameter of ``server_config.json`` is a list, its values
are used as model argument names instead of the list from the model config's
``chainer/in`` section.
Here are POST request payload examples for some of the library models:

+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| Model                                   | POST request JSON payload example                                                                                                                   |
+=========================================+=====================================================================================================================================================+
| **One argument models**                                                                                                                                                                       |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| NER model                               | {"x":["Elon Musk launched his cherry Tesla roadster to the Mars orbit"]}                                                                            |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| Intent classification model             | {"x":["I would like to go to a restaurant with Asian cuisine this evening"]}                                                                        |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| Automatic spelling correction model     | {"x":["errror"]}                                                                                                                                    |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| Ranking model                           | {"x":["What is the average cost of life insurance services?"]}                                                                                      |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| Goal-oriented bot                       | {"x":["Hello, can you help me to find and book a restaurant this evening?"]}                                                                        |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| **Multiple arguments models**                                                                                                                                                                 |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| Question Answering model                | | {"context_raw":["After 1765, growing philosophical and political differences strained the relationship between Great Britain and its colonies."], |
|                                         | |  "question_raw":["What strained the relationship between Great Britain and its colonies?"]}                                                       |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+

REST API Usage Example
======================

To start server with ``squad_bert`` model run:

.. code:: bash

    python -m deeppavlov riseapi squad_bert -id

To get response from this model on another terminal run:

.. code:: bash

    curl -X POST http://0.0.0.0:5000/model -H 'Content-Type: application/json' -d '{
        "context_raw": [
            "All work and no play makes Jack a dull boy.",
            "I used to be an adventurer like you, then I took an arrow in the knee."
        ],
        "question_raw": [
            "What makes Jack a dull boy?",
            "Who I used to be?"
        ]
    }'


================================================
FILE: docs/integrations/settings.rst
================================================
DeepPavlov settings
===================

DeepPavlov provides some tools to facilitate its usage (e.g. dialog logging, settings management). This document is aimed to guide you through them.

1. Settings files access and management
---------------------------------------

Most of DeepPavlov settings are located in settings files, which in turn are located in a settings folder. Default settings folder location is ``deeppavlov/utils/settings``.

You can override a settings directory path by setting the ``DP_SETTINGS_PATH`` environment variable. Missing files will be added automatically when running any deeppavlov script.

You can get current full path to settings directory with ``python -m deeppavlov.settings``.
To reset settings in the current settings directory one can use ``python -m deeppavlov.settings -d``.

2. Dialog logging
-----------------

DeepPavlov supports logging of inferred utterances and DeepPavlov model responses. You can manage dialog logging by
editing ``dialog_logger_config.json`` file in a settings directory.

The following dialog logging settings are available:

1. **enabled** (default: ``false``): turns on/off dialog logging for DeepPavlov instance;
2. **log_path** (default: ``~/.deeppavlov/dialog_logs``): sets directory where dialog logs are stored;
3. **logger_name** (default: ``default``): sets subdirectory name for storing dialog logs;
4. **logfile_max_size_kb** (default: ``10240``): sets logfile maximum size in kilobytes. If exceeded, new log file is created;
5. **ensure_ascii** (default: ``false``): If ``true``, converts all non-ASCII symbols in logged content to Unicode code points.

3. Environment variables
------------------------

- **DP_SETTINGS_PATH** — custom path to a directory that contains settings files. It's automatically populated with missing files when running any deeppavlov scripts.
- **DP_SKIP_NLTK_DOWNLOAD** — set to ``TRUE`` to prevent automatic downloading of **nltk** packages (``punkt``, ``stopwords``, ``perluniprops``, ``nonbreaking_prefixes``).


================================================
FILE: docs/integrations/socket_api.rst
================================================
Socket API
==========

Each DeepPavlov model can be made available as a socket server. The general
method is:

.. code:: bash

    python -m deeppavlov risesocket <config_path> [-d] [--socket-type <address_family>] [-p <port>] \
    [--socket-file <unix_socket_file>]


* ``-d``: downloads model specific data before starting the service.
* ``--socket-type <address_family>``: sets socket address family to ``AF_INET``
  if ``<address_family>`` is ``TCP`` or to ``AF_UNIX`` if ``<address_family>``
  is ``UNIX``. Overrides default value from
  ``deeppavlov/utils/settings/server_config.json``.
* ``-p <port>``: sets the port to ``<port>`` if socket address family is
  ``AF_INET``. Overrides default value from
  ``deeppavlov/utils/settings/server_config.json``.
* ``--socket-file <unix_socket_file>``: sets the file for socket binding to
  ``<unix_socket_file>`` if socket address family is ``AF_UNIX``. Overrides
  default value from ``deeppavlov/utils/settings/server_config.json``.

The command will print the binding address: host and port for ``AF_INET``
socket family and path to the UNIX socket file for ``AF_UNIX`` socket family.
Default service properties (socket address family, host, port, path to the UNIX
socket file, socket buffer size, binding message) can be modified via changing
``deeppavlov/utils/settings/server_config.json`` file.

Advanced configuration
~~~~~~~~~~~~~~~~~~~~~~

By modifying ``deeppavlov/utils/settings/server_config.json`` you can change
socket address family, host, port, path to the UNIX socket file and other
properties of the API service.

Properties from ``common_defaults`` section are used by default unless
they are overridden by model-specific properties, provided in ``model_defaults``
section of the ``server_config.json``. Model-specific properties are bound
to the model by ``server_utils`` label in ``metadata`` section of the model
config. Value of ``server_utils`` label from model config should match with
properties key from ``model_defaults`` section of ``server_config.json``.

For example, adding the ``metadata/server_utils`` key to ``kbqa/kbqa_cq.json``
with the value *KBQA* will initiate a search for the *KBQA* tag
in the ``model_defaults`` section of ``server_config.json``. Therefore, if this
section is present, all parameters with non-empty (i.e. not ``""``,
not ``[]`` etc.) values stored under this tag will overwrite the parameter values
in ``common_defaults``.

If the ``model_args_names`` parameter of ``server_config.json`` is an empty string,
then model argument names are provided as a list from the ``chainer/in`` section of
the model config file, where the argument order corresponds to the model API.
When running model inference via the socket API, serialized JSON payload keys should match
the model argument names from the ``chainer/in`` section.
If the ``model_args_names`` parameter of ``server_config.json`` is a list, its values
are used as model argument names instead of the list from the model config's
``chainer/in`` section.

+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| Model                                   | POST request JSON payload example                                                                                                                   |
+=========================================+=====================================================================================================================================================+
| **One argument models**                                                                                                                                                                       |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| NER model                               | {"x":["Elon Musk launched his cherry Tesla roadster to the Mars orbit"]}                                                                            |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| Intent classification model             | {"x":["I would like to go to a restaurant with Asian cuisine this evening"]}                                                                        |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| Automatic spelling correction model     | {"x":["errror"]}                                                                                                                                    |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| Ranking model                           | {"x":["What is the average cost of life insurance services?"]}                                                                                      |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| Goal-oriented bot                       | {"x":["Hello, can you help me to find and book a restaurant this evening?"]}                                                                        |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| **Multiple arguments models**                                                                                                                                                                 |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
| Question Answering model                | | {"context_raw":["After 1765, growing philosophical and political differences strained the relationship between Great Britain and its colonies."], |
|                                         | |  "question_raw":["What strained the relationship between Great Britain and its colonies?"]}                                                       |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+

Socket client example (Python)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Socket client for :doc:`SQuAD </features/models/SQuAD>` model with a batch of
two elements:

.. code-block:: python

    # squad-client.py

    import json
    import socket
    from struct import unpack

    from deeppavlov.utils.socket import encode

    socket_payload = {
        "context_raw": [
            "All work and no play makes Jack a dull boy",
            "I used to be an adventurer like you, then I took an arrow in the knee"
        ],
        "question_raw": [
            "What makes Jack a dull boy?",
            "Who I used to be?"
        ]
    }
    serialized_socket_payload = encode(socket_payload)

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.connect(('0.0.0.0', 5000))
        s.sendall(serialized_socket_payload)
        header = s.recv(4)
        body_len = unpack('<I', header)[0]
        serialized_response = s.recv(body_len)
        json_payload = json.loads(serialized_response)

    print(json_payload)

To start socket server with ``squad_bert`` model run:

.. code:: bash

    python -m deeppavlov risesocket -d squad_bert --socket-type TCP -p 5000


To start socket client on another terminal run:

.. code:: bash

    python squad-client.py


================================================
FILE: docs/internships/internships.rst
================================================

Internships
===========

Do you have ideas on how to improve dialog systems for everyone? Are you ready to make an impact across the world?
Great, then join us!

Let’s shape the future of Conversational AI together. An internship is for aspiring graduate and undergraduate students
who are passionate about Conversational AI technology and offer diverse perspectives.

As an intern, you will work on some of the most ambitious technical problems, develop new ML solutions that will impact
future DeepPavlov products and make the lives of DeepPavlov users easier.

All interns are paired with a mentor and will participate directly in DeepPavlov's groundbreaking work.
There are no restrictions on publications based on internships. International candidates are welcome to apply.

Each of our research teams has specific test assignments for interested candidates, so please familiarize yourself
with our `projects <https://deeppavlov.ai/research>`_ that best match your skills and interests.

`Apply now at our website <https://deeppavlov.ai/internships#application>`_.


================================================
FILE: docs/intro/configuration.rst
================================================
Configuration file
==================

An NLP pipeline config is a JSON file that contains one required element ``chainer``:

.. code:: python

    {
      "chainer": {
        "in": ["x"],
        "in_y": ["y"],
        "pipe": [
          ...
        ],
        "out": ["y_predicted"]
      }
    }

:class:`~deeppavlov.core.common.chainer.Chainer` is a core concept of DeepPavlov library: chainer builds a pipeline from
heterogeneous components (Rule-Based/ML/DL) and allows one to train or infer from the pipeline as a whole. Each component in the
pipeline specifies its inputs and outputs as arrays of names, for example: ``"in": ["tokens", "features"]`` and
``"out": ["token_embeddings", "features_embeddings"]`` and you can chain outputs of one component with inputs of other
components:

.. code:: python

    {
      "class_name": "deeppavlov.models.preprocessors.str_lower:str_lower",
      "in": ["x"],
      "out": ["x_lower"]
    },
    {
      "class_name": "nltk_tokenizer",
      "in": ["x_lower"],
      "out": ["x_tokens"]
    },

Pipeline elements could be child classes of :class:`~deeppavlov.core.models.component.Component` or functions.

Each :class:`~deeppavlov.core.models.component.Component` in the pipeline must implement the :meth:`__call__` method and have a
``class_name`` parameter, which is its registered codename, or full name of any python class in the form of
``"module_name:ClassName"``. It can also have any other parameters which repeat its :meth:`__init__` method arguments.
Default values of :meth:`__init__` arguments will be overridden with the config values during the initialization of a
class instance.

You can reuse components in the pipeline to process different parts of data with the help of ``id`` and ``ref``
parameters:

.. code:: python

    {
      "class_name": "nltk_tokenizer",
      "id": "tokenizer",
      "in": ["x_lower"],
      "out": ["x_tokens"]
    },
    {
      "ref": "tokenizer",
      "in": ["y"],
      "out": ["y_tokens"]
    },


Nested configuration files
--------------------------

Any configuration file could be used inside another configuration file as an element of the
:class:`~deeppavlov.core.common.chainer.Chainer` or as a field of another component using ``config_path`` key.
Any field of the nested configuration file could be overwritten using ``overwrite`` field:

.. code::

    "chainer": {
      "pipe": [
        ...
        {
          "class_name": "ner_chunk_model",
          "ner": {
            "config_path": "{CONFIGS_PATH}/ner/ner_ontonotes_bert.json",
            "overwrite": {
              "chainer.out": ["x_tokens", "tokens_offsets", "y_pred", "probas"]
            }
          },
          ...
        }
      ]
    }

In this example ``ner_ontonotes_bert.json`` is used as ``ner`` argument value in ``ner_chunk_model`` component.
``chainer.out`` value is overwritten with new list. Overwritten fields names are defined using dot notation. In this
notation numeric fields are treated as indexes of lists. For example, to change ``class_name`` value of the second
element of the pipe to ``ner_chunker`` (1 is the index of the second element), use
``"chainer.pipe.1.class_name": "ner_chunker"`` key-value pair.


Variables
---------

As of *version 0.1.0* every string value in a configuration file is interpreted
as a `format string <https://docs.python.org/3.6/library/string.html#formatstrings>`__ where fields are evaluated
from ``metadata.variables`` element:

.. code:: python

    {
      "chainer": {
        "in": ["x"],
        "pipe": [
          {
            "class_name": "my_component",
            "in": ["x"],
            "out": ["x"],
            "load_path": "{MY_PATH}/file.obj"
          },
          {
            "in": ["x"],
            "out": ["y_predicted"],
            "config_path": "{CONFIGS_PATH}/classifiers/insults_kaggle_bert.json"
          }
        ],
        "out": ["y_predicted"]
      },
      "metadata": {
        "variables": {
          "MY_PATH": "/some/path",
          "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
        }
      }
    }

Variable ``DEEPPAVLOV_PATH`` is always preset to be a path to the ``deeppavlov`` python module.

One can override configuration variables using environment variables with prefix ``DP_``. So environment variable
``DP_VARIABLE_NAME`` will override ``VARIABLE_NAME`` inside a configuration file.

For example, adding ``DP_ROOT_PATH=/my_path/to/large_hard_drive`` will make most configs use this path for downloading and reading embeddings/models/datasets.

Training
--------

There are two abstract classes for trainable components: :class:`~deeppavlov.core.models.estimator.Estimator`
and :class:`~deeppavlov.core.models.nn_model.NNModel`.

:class:`~deeppavlov.core.models.estimator.Estimator` are fit once on any data with no batching or early stopping,
so it can be safely done at the time of pipeline initialization. :meth:`fit` method has to be implemented for each
:class:`~deeppavlov.core.models.estimator.Estimator`. One example is :class:`~deeppavlov.core.data.vocab.Vocab`.

:class:`~deeppavlov.core.models.nn_model.NNModel` requires more complex training. It can only be trained in a supervised
mode (as opposed to :class:`~deeppavlov.core.models.estimator.Estimator` which can be trained in both supervised and
unsupervised settings). This process takes multiple epochs with periodic validation and logging.
:meth:`~deeppavlov.core.models.nn_model.NNModel.train_on_batch` method has to be implemented for each
:class:`~deeppavlov.core.models.nn_model.NNModel`.

Training is triggered by :func:`~deeppavlov.train_model` function.


Train config
~~~~~~~~~~~~

:class:`~deeppavlov.core.models.estimator.Estimator` s that are trained should also have ``fit_on`` parameter which
contains a list of input parameter names. An :class:`~deeppavlov.core.models.nn_model.NNModel` should have the ``in_y``
parameter which contains a list of ground truth answer names. For example:

.. code:: python

    [
      {
        "id": "classes_vocab",
        "class_name": "default_vocab",
        "fit_on": ["y"],
        "level": "token",
        "save_path": "vocabs/classes.dict",
        "load_path": "vocabs/classes.dict"
      },
      {
        "in": ["x"],
        "in_y": ["y"],
        "out": ["y_predicted"],
        "class_name": "intent_model",
        "save_path": "classifiers/intent_cnn",
        "load_path": "classifiers/intent_cnn",
        "classes_vocab": {
          "ref": "classes_vocab"
        }
      }
    ]

The config for training the pipeline should have three additional elements: ``dataset_reader``, ``dataset_iterator``
and ``train``:

.. code:: python

    {
      "dataset_reader": {
        "class_name": ...,
        ...
      },
      "dataset_iterator": {
        "class_name": ...,
        ...
      },
      "chainer": {
        ...
      },
      "train": {
        ...
      }
    }


A simplified version of the training pipeline contains two elements: ``dataset`` and ``train``. The ``dataset`` element
can currently be used for training on classification data in ``csv`` and ``json`` formats.


Train Parameters
~~~~~~~~~~~~~~~~

``train`` element can contain a ``class_name`` parameter that references a trainer class (default value is
:class:`torch_trainer <deeppavlov.core.trainers.torch_trainer.TorchTrainer>`).
All other parameters will be passed as keyword arguments to the trainer class's constructor.


Metrics
_______

.. code:: python

    "train": {
      "class_name": "torch_trainer",
      "metrics": [
        "f1",
        {
          "name": "accuracy",
          "inputs": ["y", "y_labels"]
        },
        {
          "name": "sklearn.metrics:accuracy_score",
          "alias": "unnormalized_accuracy",
          "inputs": ["y", "y_labels"],
          "normalize": false
        }
      ],
      ...
    }

The first metric in the list is used for early stopping.

Each metric can be described as a JSON object with ``name``, ``alias`` and ``inputs`` properties, where:

  - ``name`` is either a registered name of a metric function or ``module.submodules:function_name``.
  - ``alias`` is a metric name. Default value is ``name`` value.
  - ``inputs`` is a list of parameter names from chainer's inner memory that will be passed to the metric function.
    Default value is a concatenation of chainer's ``in_y`` and ``out`` parameters.

All other arguments are interpreted as kwargs when the metric is called.
If a metric is given as a string, this string is interpreted as a metric name, i.e. ``"f1"`` in the example
above is equivalent to ``{"name": "f1"}``.


DatasetReader
~~~~~~~~~~~~~

:class:`~deeppavlov.core.data.dataset_reader.DatasetReader` class reads data and returns it in a specified format.
A concrete :class:`DatasetReader` class should be inherited from this base class and registered with a codename:


.. code:: python

    from deeppavlov.core.common.registry import register
    from deeppavlov.core.data.dataset_reader import DatasetReader

    @register('conll2003_reader')
    class Conll2003DatasetReader(DatasetReader):


DataLearningIterator and DataFittingIterator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

:class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` forms the sets of data ('train', 'valid',
'test') needed for training/inference and divides them into batches. A concrete :class:`DataLearningIterator` class
should be registered and can be inherited from :class:`deeppavlov.core.data.data_learning_iterator.DataLearningIterator`
class. This is a base class and can be used as a :class:`DataLearningIterator` as well.

:class:`~deeppavlov.core.data.data_fitting_iterator.DataFittingIterator` iterates over provided dataset without
train/valid/test splitting and is useful for :class:`~deeppavlov.core.models.estimator.Estimator` s that do not require
training.


Inference
---------

All components inherited from :class:`~deeppavlov.core.models.component.Component` abstract class can be used for
inference. The :meth:`__call__` method should return standard output of a component. For example, a `tokenizer`
should return `tokens`, a `NER recognizer` should return `recognized entities`, a `bot` should return an `utterance`.
A particular format of returned data should be defined in :meth:`__call__`.

Inference is triggered by :func:`~deeppavlov.core.commands.infer.interact_model` function. There is no need for a
separate JSON file for inference.

Model Configuration
-------------------

Each DeepPavlov model is determined by its configuration file. You can use
existing config files or create yours. You can also choose a config file and 
modify preprocessors/tokenizers/embedders/vectorizers there. The components
below have the same interface and are responsible for the same functions,
therefore they can be used in the same parts of a config pipeline.

Here is a list of useful
:class:`~deeppavlov.core.models.component.Component`\ s aimed to preprocess,
postprocess and vectorize your data.

Preprocessors
~~~~~~~~~~~~~

Preprocessor is a component that processes batch of samples.

* Already implemented universal preprocessors of **tokenized texts** (each
  sample is a list of tokens):

    - :class:`~deeppavlov.models.preprocessors.mask.Mask` (registered as
      ``mask``) returns binary mask of corresponding length (padding up to the
      maximum length per batch).

    - :class:`~deeppavlov.models.preprocessors.sanitizer.Sanitizer`
      (registered as ``sanitizer``) removes all combining characters like
      diacritical marks from tokens.

* Already implemented universal preprocessors of **non-tokenized texts**
  (each sample is a string):

    - :class:`~deeppavlov.models.preprocessors.dirty_comments_preprocessor.DirtyCommentsPreprocessor`
      (registered as ``dirty_comments_preprocessor``) preprocesses samples
      converting samples to lowercase, paraphrasing English combinations with
      apostrophe ``'``, transforming more than three identical symbols in a row to two
      symbols.

    - :meth:`~deeppavlov.models.preprocessors.str_lower.str_lower` converts samples to lowercase.

* Already implemented universal preprocessors of another type of features:

    - :class:`~deeppavlov.models.preprocessors.one_hotter.OneHotter`
      (registered as ``one_hotter``) performs one-hotting operation for the
      batch of samples where each sample is an integer label or a list of
      integer labels (can be combined in one batch). If ``multi_label``
      parameter is set to ``True``, returns one one-dimensional vector per
      sample with several elements equal to ``1``.


Tokenizers
~~~~~~~~~~

Tokenizer is a component that processes batch of samples (each sample is a text
string).

    - :class:`~deeppavlov.models.tokenizers.nltk_tokenizer.NLTKTokenizer`
      (registered as ``nltk_tokenizer``) tokenizes using tokenizers from
      ``nltk.tokenize``, e.g. ``nltk.tokenize.wordpunct_tokenize``.

    - :class:`~deeppavlov.models.tokenizers.nltk_moses_tokenizer.NLTKMosesTokenizer`
      (registered as ``nltk_moses_tokenizer``) tokenizes and detokenizes using
      ``nltk.tokenize.moses.MosesDetokenizer``,
      ``nltk.tokenize.moses.MosesTokenizer``.

    - :class:`~deeppavlov.models.tokenizers.spacy_tokenizer.StreamSpacyTokenizer`
      (registered as ``stream_spacy_tokenizer``) tokenizes or lemmatizes texts
      with spacy ``en_core_web_sm`` models by default.

    - :class:`~deeppavlov.models.tokenizers.split_tokenizer.SplitTokenizer`
      (registered as ``split_tokenizer``) tokenizes using string method
      ``split``.


Embedders
~~~~~~~~~

Embedder is a component that converts every token in a tokenized batch to a
vector of a particular dimension (optionally, returns a single vector per
sample).

    - :class:`~deeppavlov.models.embedders.fasttext_embedder.FasttextEmbedder`
      (registered as ``fasttext``) reads embedding file in fastText format.
      If ``mean`` returns one vector per sample - mean of embedding vectors
      of tokens.

    - :class:`~deeppavlov.models.embedders.tfidf_weighted_embedder.TfidfWeightedEmbedder`
      (registered as ``tfidf_weighted``) accepts embedder, tokenizer (for
      detokenization, by default, detokenize with joining with space), TFIDF
      vectorizer or counter vocabulary, optionally accepts tags vocabulary (to
      assign additional multiplicative weights to particular tags). If ``mean``
      returns one vector per sample - mean of embedding vectors of tokens.

Vectorizers
~~~~~~~~~~~

Vectorizer is a component that converts batch of text samples to batch of
vectors.

    - :class:`~deeppavlov.models.sklearn.sklearn_component.SklearnComponent`
      (registered as ``sklearn_component``) is a DeepPavlov wrapper for most
      of sklearn estimators, vectorizers etc. For example, to get
      TFIDF-vectorizer one should assign in config ``model_class`` to
      ``sklearn.feature_extraction.text:TfidfVectorizer``, ``infer_method``
      to ``transform``, pass ``load_path``, ``save_path`` and other sklearn
      model parameters.

    - :class:`~deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer`
      (registered as ``hashing_tfidf_vectorizer``) implements hashing version
      of usual TFIDF-vectorizer. It creates a TFIDF matrix from collection of
      documents of size ``[n_documents X n_features(hash_size)]``.


================================================
FILE: docs/intro/installation.rst
================================================
Installation
============

DeepPavlov supports **Linux**, **Windows 10+** (through WSL/WSL2), **MacOS** (Big Sur+) platforms, **Python 3.6-3.11**.
Depending on the model used, you may need from 4 to 16 GB RAM.

Install with pip
~~~~~~~~~~~~~~~~

You should install DeepPavlov in a `virtual environment <https://docs.python.org/3/library/venv.html>`_. If you’re
unfamiliar with Python virtual environments, take a look at this
`guide <https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/>`_. A virtual
environment makes it easier to manage different projects, and avoid compatibility issues between dependencies.

#. Create a virtual environment:

    .. code:: bash

        python -m venv env

#. Activate the virtual environment on Linux (``source`` could be replaced with ``.``):

    .. code:: bash

        source env/bin/activate

#. Install DeepPavlov inside this virtual environment:

    .. code:: bash

        pip install deeppavlov

Install from source
~~~~~~~~~~~~~~~~~~~

Install DeepPavlov **dev** branch from source with the following command:

    .. code:: bash

        pip install git+http://github.com/deeppavlov/DeepPavlov@dev

This command installs the bleeding edge dev version rather than the latest release version. The dev version is useful
for staying up-to-date with the latest developments. For instance, if a bug has been fixed since the last release but
a new release hasn’t been rolled out yet. However, this means the dev version may not always be stable.

Editable install
~~~~~~~~~~~~~~~~

You will need an editable install if you want to make changes in the DeepPavlov source code that immediately take place
without requiring a new installation.

Clone the repository and install DeepPavlov with the following commands:

    .. code:: bash

        git clone http://github.com/deeppavlov/DeepPavlov.git
        pip install -e DeepPavlov

Docker Images
~~~~~~~~~~~~~

We have built several DeepPavlov based Docker images, which include:

    * DeepPavlov based Jupyter notebook Docker image;
    * Docker images which serve some of our models and allow to access them
      via REST API (:doc:`riseapi </integrations/rest_api>` mode).

Here is our `DockerHub repository <https://hub.docker.com/u/deeppavlov/>`_ with
images and deployment instructions.


================================================
FILE: docs/intro/overview.rst
================================================
Conceptual overview
===================

Our goal is to enable AI-application developers and researchers with:

-  A set of pre-trained NLP models, pre-defined dialog system components
   (ML/DL/Rule-based), and pipeline templates;
-  A framework for implementing and testing their own dialog models;
-  Tools for application integration with adjacent infrastructure
   (messengers, helpdesk software, etc.);
-  Benchmarking environments for conversational models and uniform access
   to relevant datasets.

.. image:: ../_static/dp_agnt_diag.png


Key Concepts
------------

-  A ``Model`` is any NLP model that doesn't necessarily communicate
   with the user in natural language.
-  A ``Component`` is a reusable functional part of a ``Model``.
-  ``Rule-based Models`` cannot be trained.
-  ``Machine Learning Models`` can be trained only standalone.
-  ``Deep Learning Models`` can be trained independently and in an
   end-to-end mode being joined in a chain.
-  A ``Chainer`` builds a model pipeline from heterogeneous
   components (Rule-based/ML/DL). It allows one to train and infer models in
   a pipeline as a whole.

The smallest building block of the library is a ``Component``.
A ``Component`` stands for any kind of function in an NLP pipeline. It can
be implemented as a neural network, a non-neural ML model, or a
rule-based system.

``Component``\ s can be joined into a ``Model``. A ``Model``
solves a larger NLP task than a ``Component``. However, in terms of
implementation, ``Model``\ s are not different from ``Component``\ s.

Most DeepPavlov models are built on top of `PyTorch <https://www.pytorch.org/>`__.
Other external libraries can be used to build basic components.


================================================
FILE: docs/intro/python.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6d5cd16b",
   "metadata": {},
   "source": [
    "#### Python pipelines"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "da10fd80",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/intro/python.ipynb)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d55ebe35",
   "metadata": {},
   "source": [
    "Python models could be used without .json configuration files.\n",
    "\n",
    "The code below is an alternative to building [insults_kaggle_bert](https://github.com/deeppavlov/DeepPavlov/blob/master/deeppavlov/configs/classifiers/insults_kaggle_bert.json) model and using it with\n",
    "\n",
    "```python\n",
    "from deeppavlov import build_model\n",
    "\n",
    "model = build_model('insults_kaggle_bert', download=True)\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fa1db63b",
   "metadata": {},
   "source": [
    "At first, define variables for model components and download model data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d6671e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov.core.commands.utils import expand_path\n",
    "from deeppavlov.download import download_resource\n",
    "\n",
    "\n",
    "classifiers_path = expand_path('~/.deeppavlov/models/classifiers')\n",
    "model_path = classifiers_path / 'insults_kaggle_torch_bert'\n",
    "transformer_name = 'bert-base-uncased'\n",
    "\n",
    "download_resource(\n",
    "    'http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_torch_bert_v5.tar.gz',\n",
    "    {classifiers_path}\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "332d644e",
   "metadata": {},
   "source": [
    "Then, initialize model components."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "809c31ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov.core.data.simple_vocab import SimpleVocabulary\n",
    "from deeppavlov.models.classifiers.proba2labels import Proba2Labels\n",
    "from deeppavlov.models.preprocessors.torch_transformers_preprocessor import TorchTransformersPreprocessor\n",
    "from deeppavlov.models.torch_bert.torch_transformers_classifier import TorchTransformersClassifierModel\n",
    "\n",
    "\n",
    "preprocessor = TorchTransformersPreprocessor(\n",
    "    vocab_file=transformer_name,\n",
    "    max_seq_length=64\n",
    ")\n",
    "\n",
    "classes_vocab = SimpleVocabulary(\n",
    "    load_path=model_path/'classes.dict',\n",
    "    save_path=model_path/'classes.dict'\n",
    ")\n",
    "\n",
    "classifier =  TorchTransformersClassifierModel(\n",
    "    n_classes=classes_vocab.len,\n",
    "    return_probas=True,\n",
    "    pretrained_bert=transformer_name,\n",
    "    save_path=model_path/'model',\n",
    "    optimizer_parameters={'lr': 1e-05}\n",
    ")\n",
    "\n",
    "proba2labels = Proba2Labels(max_proba=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87e8ec20",
   "metadata": {},
   "source": [
    "Finally, create a model from components. ``Element`` is a wrapper for a component. ``Element`` receives the component and the names of the incoming and outgoing arguments. ``Model`` combines ``Element``s into a pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "acfe29de",
   "metadata": {},
   "outputs": [],
   "source": [
    "from deeppavlov import Element, Model\n",
    "\n",
    "model = Model(\n",
    "    x=['x'],\n",
    "    out=['y_pred_labels'],\n",
    "    pipe=[\n",
    "        Element(component=preprocessor, x=['x'], out=['bert_features']),\n",
    "        Element(component=classifier, x=['bert_features'], out=['y_pred_probas']),\n",
    "        Element(component=proba2labels, x=['y_pred_probas'], out=['y_pred_ids']),\n",
    "        Element(component=classes_vocab, x=['y_pred_ids'], out=['y_pred_labels'])\n",
    "    ]\n",
    ")\n",
    "\n",
    "model(['you are stupid', 'you are smart'])"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: docs/intro/quick_start.rst
================================================
QuickStart
------------

First, follow instructions on :doc:`Installation page </intro/installation>`
to install ``deeppavlov`` package for Python 3.6-3.11.

DeepPavlov contains a bunch of great pre-trained NLP models. Each model is
determined by its config file. List of models is available on
:doc:`the doc page </features/overview>` or in
the ``deeppavlov.configs``:

    .. code:: python
        
        from deeppavlov import configs

When you've decided on the model (+ config file), there are two ways to train,
evaluate and infer it:

* via `Command line interface (CLI)`_ and
* via `Python`_.

Before choosing an interface, install the model's package requirements
(CLI):

    .. code:: bash
        
        python -m deeppavlov install <config_path>

    * where ``<config_path>`` is model name without ``.json`` extension (e.g. ``insults_kaggle_bert``) or path to the
      chosen model's config file (e.g. ``deeppavlov/configs/classifiers/insults_kaggle_bert.json``)


Command line interface (CLI)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To get predictions from a model interactively through CLI, run

    .. code:: bash
        
        python -m deeppavlov interact <config_path> [-d] [-i]

    * ``-d`` downloads required data -- pretrained model files and embeddings (optional).
    * ``-i`` installs model requirements (optional).

You can train it in the same simple way:

    .. code:: bash
        
        python -m deeppavlov train <config_path> [-d] [-i]

    Dataset will be downloaded regardless of whether there was ``-d`` flag or not.

    To train on your own data, you need to modify the dataset reader path in the config as described in the
    `train section doc <configuration.html#train-config>`__. The data format is
    specified in the corresponding model doc page.

There are even more actions you can perform with configs:

    .. code:: bash
        
        python -m deeppavlov <action> <config_path> [-d] [-i]

    * ``<action>`` can be
        * ``install`` to install model requirements (same as ``-i``),
        * ``download`` to download model's data (same as ``-d``),
        * ``train`` to train the model on the data specified in the config file,
        * ``evaluate`` to calculate metrics on the same dataset,
        * ``interact`` to interact via CLI,
        * ``riseapi`` to run a REST API server (see :doc:`docs
          </integrations/rest_api>`),
        * ``risesocket`` to run a socket API server (see :doc:`docs
          </integrations/socket_api>`),
        * ``predict`` to get prediction for samples from ``stdin`` or from
          ``<file_path>`` if ``-f <file_path>`` is specified.
    * ``<config_path>`` specifies path (or name) of model's config file
    * ``-d`` downloads required data
    * ``-i`` installs model requirements


Python
~~~~~~

To get predictions from a model interactively through Python, run

    .. code:: python
        
        from deeppavlov import build_model

        model = build_model(<config_path>, install=True, download=True)

        # get predictions for 'input_text1', 'input_text2'
        model(['input_text1', 'input_text2'])

where

    * ``install=True`` installs model requirements (optional),
    * ``download=True`` downloads required data from web -- pretrained model files and embeddings (optional),
    * ``<config_path>`` is path to the chosen model's config file (e.g.
      ``"deeppavlov/configs/ner/ner_ontonotes_bert_mult.json"``) or
      ``deeppavlov.configs`` attribute (e.g.
      ``deeppavlov.configs.ner.ner_ontonotes_bert_mult`` without quotation marks).

You can train it in the same simple way:

    .. code:: python
        
        from deeppavlov import train_model 

        model = train_model(<config_path>, install=True, download=True)

    * ``download=True`` downloads pretrained model, therefore the pretrained
      model will be, first, loaded and then trained (optional).

    Dataset will be downloaded regardless of whether ``download=True`` was passed or not.

    To train on your own data, you need to modify the dataset reader path in the config as described in the
    `train section doc <configuration.html#Train-config>`__. The data format is
    specified in the corresponding model doc page.

You can also calculate metrics on the dataset specified in your config file:

    .. code:: python
        
        from deeppavlov import evaluate_model 

        model = evaluate_model(<config_path>, install=True, download=True)


Using GPU
~~~~~~~~~

To run or train **PyTorch**-based DeepPavlov models on GPU you should have `CUDA <https://developer.nvidia.com/cuda-toolkit>`__
installed on your host machine, and install model's package requirements. CUDA version should be compatible with
DeepPavlov :dp_file:`required PyTorch version <deeppavlov/requirements/pytorch.txt>`.
GPU with Pascal or newer architecture and 4+ GB VRAM is recommended.

.. warning::
    If you use the latest NVIDIA architecture, the PyTorch package installed from PyPI by DeepPavlov may not support
    your device's CUDA capability. In that case you will receive an incompatible device warning after model
    initialization. You can install a compatible package from
    `download.pytorch.org <https://download.pytorch.org/whl/torch_stable.html>`_. For example:

    .. code:: bash

        pip3 install torch==1.8.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html

If you want to run the code on GPU, just make the device visible for the script.
If you want to use a particular device, you may set it in command line:

    .. code:: bash

        export CUDA_VISIBLE_DEVICES=3; python -m deeppavlov train <config_path>

or in Python script:

    .. code:: python

        import os

        os.environ["CUDA_VISIBLE_DEVICES"]="3"

In case you want to keep GPU visible but disable GPU acceleration for a specific component, use the ``device`` parameter
(available for :class:`~deeppavlov.core.models.torch_model.TorchModel` child classes): ``"device": "cpu"``.


Pretrained models
~~~~~~~~~~~~~~~~~

DeepPavlov provides a wide range of pretrained models.
See :doc:`features overview </features/overview>` for more info. Please
note that most of our models are trained on specific datasets for
specific tasks and may require further training on your data.
You can find a list of our out-of-the-box models `below <#out-of-the-box-pretrained-models>`_.


Docker images
~~~~~~~~~~~~~

You can run DeepPavlov models in :doc:`riseapi </integrations/rest_api>` mode or start Jupyter server
via Docker without installing DeepPavlov. Both your CPU and GPU (we support NVIDIA graphic
processors) can be utilised; please refer to the run instructions of our
`Docker <https://hub.docker.com/r/deeppavlov/deeppavlov>`_ images.


Out-of-the-box pretrained models
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

While the best way to solve most of the NLP tasks lies through collecting datasets
and training models according to the domain and an actual task itself, DeepPavlov
offers several pretrained models, which can be strong baselines for a wide range of tasks.

You can run these models `via Docker <#docker-images>`_ or in ``riseapi``/``risesocket`` mode to use in
solutions. See :doc:`riseapi </integrations/rest_api>` and :doc:`risesocket </integrations/socket_api>`
modes documentation for API details.


Text Question Answering
=======================

Text Question Answering component answers a question based on a given context (e.g.,
a paragraph of text), where the answer to the question is a segment of the context.

.. code:: python

    from deeppavlov import build_model

    model = build_model('squad_bert', download=True, install=True)
    contexts = ['DeepPavlov is a library for NLP and dialog systems.', 'All work and no play makes Jack a dull boy']
    questions = ['What is DeepPavlov?', 'What makes Jack a dull boy?']
    answer, answers_start_idx, score = model(contexts, questions)
    print(answer)

.. code:: bash

    ['a library for NLP and dialog systems', 'All work and no play']

To get list of available models for Text Question Answering see :doc:`documentation </features/models/SQuAD>`.

Open-Domain Question Answering
==============================

Open Domain Question Answering (ODQA) answers any question based on the document collection covering a wide range of
topics. The ODQA task combines two challenges of document retrieval (finding the relevant articles) with that of machine
comprehension of text (identifying the answer span from those articles). This component can be used to answer questions
based on the company knowledge base.

.. code:: python

    from deeppavlov import build_model

    model = build_model('en_odqa_infer_wiki', download=True, install=True)
    questions = ["What is the name of Darth Vader's son?", 'Who was the first president of France?']
    answer, answer_score, answer_place = model(questions)
    print(answer)

.. code:: bash

    ['Luke Skywalker', 'Louis-Napoleon Bonaparte']

To get list of available models for Open-Domain Question Answering see :doc:`documentation </features/models/ODQA>`.

Knowledge Base Question Answering
=================================

Knowledge Base Question Answering (KBQA) answers any question based on Knowledge Base (Knowledge Graph) -
a comprehensive repository of information about a given domain or a number of domains that reflects the ways we model
knowledge about a given subject or subjects, in terms of concepts, entities, properties, and relationships. KBQA models
validate questions against a preconfigured list of question templates, disambiguate entities using Entity Linking,
and answer questions asked in natural language.

.. code:: python

    from deeppavlov import build_model

    model = build_model('kbqa_cq_en', download=True, install=True)
    questions = ['What is the currency of Sweden?', 'When did the Korean War end?']
    answers, answer_ids, query = model(questions)
    print(answers)

.. code:: bash

    ['Swedish krona', '27 July 1953']

To get list of available models for Knowledge Base Question Answering see :doc:`documentation </features/models/KBQA>`.

Classification (insult and paraphrase detection, sentiment analysis, topic classification)
==========================================================================================

Insult detection predicts whether a text (e.g., post or speech in some public discussion) is considered insulting to one
of the persons it is related to.

Sentiment analysis is a task of classifying the polarity of the given sequence.

The models trained for the paraphrase detection task identify whether two sentences expressed with different words
convey the same meaning.

Topic classification refers to the task of classifying an utterance by the topic which belongs to the conversational
domain.

.. code:: python

    from deeppavlov import build_model

    model = build_model('insults_kaggle_bert', download=True, install=True)
    phrases = ['You are kind of stupid', 'You are a wonderful person!']
    labels = model(phrases)
    print(labels)

.. code:: bash

    ['Insult', 'Not Insult']

To get list of available models for Classification see :doc:`documentation </features/models/classification>`.

Name Entity Recognition
=======================

Named Entity Recognition (NER) classifies tokens in text into predefined categories
(tags), such as person names, quantity expressions, percentage expressions, names
of locations, organizations, as well as expression of time, currency and others.

.. code:: python

    from deeppavlov import build_model

    model = build_model('ner_ontonotes_bert', download=True, install=True)
    phrases = ['Bob Ross lived in Florida', 'Elon Musk founded Tesla']
    tokens, tags = model(phrases)
    print(tokens, tags, sep='\n')

.. code:: bash

    [['Bob', 'Ross', 'lived', 'in', 'Florida'], ['Elon', 'Musk', 'founded', 'Tesla']]
    [['B-PERSON', 'I-PERSON', 'O', 'O', 'B-GPE'], ['B-PERSON', 'I-PERSON', 'O', 'B-ORG']]

To get list of available models for Name Entity Recognition see :doc:`documentation </features/models/NER>`.

Entity Extraction
=================

Entity Detection is the task of identifying entity mentions in text with corresponding entity types.
Entity Linking is the task of finding knowledge base entity ids for entity mentions in text.
Entity Extraction configs perform subsequent Entity Detection and Entity Linking of extracted entity mentions.

.. code:: python

    from deeppavlov import build_model

    model = build_model('entity_extraction_en', download=True, install=True)
    phrases = ['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.']
    entity_substr, tags, entity_offsets, entity_ids, entity_conf, entity_pages, entity_labels = model(phrases)
    print(entity_substr, tags, entity_ids, entity_labels, sep='\n')

.. code:: bash

    [['forrest gump', 'robert zemeckis', 'eric roth']]
    [['WORK_OF_ART', 'PERSON', 'PERSON']]
    [[['Q134773', 'Q552213', 'Q12016774'], ['Q187364', 'Q36951156'], ['Q942932', 'Q89320386', 'Q89909683']]]
    [[['Forrest Gump', 'Forrest Gump', 'Forrest Gump'], ['Robert Zemeckis', 'Welcome to Marwen'], ['Eric Roth', 'Eric Roth', 'Eric W Roth']]]

To get list of available models for Entity Extraction see :doc:`documentation </features/models/entity_extraction>`.

Spelling Correction
===================

Spelling Correction models detect and correct spelling errors in texts.

.. code:: python

    from deeppavlov import build_model

    model = build_model('brillmoore_wikitypos_en', download=True, install=True)
    phrases_w_typos = ['I think this is the begining of a beautifull frendship.', "I'll be bak"]
    correct_phrases = model(phrases_w_typos)
    print(correct_phrases)

.. code:: bash

    ['i think this is the beginning of a beautiful friendship.', "i'll be back"]

To get list of available models for Spelling Correction see :doc:`documentation </features/models/spelling_correction>`.


================================================
FILE: requirements.txt
================================================
fastapi>=0.47.0,<=0.89.1
filelock>=3.0.0,<3.10.0
nltk>=3.2.4,<3.10.0
numpy<1.24
pandas>=1.0.0,<1.6.0
prometheus-client>=0.13.0,<=1.16.0
pydantic<2
pybind11==2.10.3
requests>=2.19.0,<3.0.0
scikit-learn>=0.24,<1.1.0;python_version<="3.10"
scikit-learn==1.4.0;python_version=="3.11.*"
tqdm>=4.42.0,<4.65.0
uvicorn>=0.13.0,<0.19.0
wheel
scipy<1.10.0;python_version<"3.8"
scipy==1.10.0;python_version>="3.8"


================================================
FILE: setup.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re

from setuptools import setup, find_packages

# Absolute path of the directory holding this script, so that files next to it
# are located the same way whatever the current working directory is.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

# Execute deeppavlov/_meta.py in this module's namespace; the names it defines
# (the setup() call below reads __version__, __description__, __author__, ...)
# become module globals here.
meta_path = os.path.join(__location__, 'deeppavlov', '_meta.py')
# Explicit encoding keeps the read independent of the platform locale and
# consistent with the other open() calls in this file.
with open(meta_path, encoding='utf8') as meta:
    exec(meta.read())


def read_requirements():
    """Parse ``requirements.txt`` into keyword arguments for ``setup()``.

    Blank lines and lines starting with ``#`` are skipped. Every remaining
    line is stripped of surrounding whitespace and classified: entries that
    contain ``://`` are treated as dependency links, all others as regular
    requirement specifiers.

    Returns:
        dict with two keys: ``'install_requires'`` (list of requirement
        strings) and ``'dependency_links'`` (list of URL strings), both in
        file order.
    """
    reqs_path = os.path.join(__location__, 'requirements.txt')
    names = []
    links = []
    with open(reqs_path, encoding='utf8') as f:
        for line in f:
            req = line.strip()
            # Empty entries are not valid requirements, so drop blank lines
            # together with comment lines.
            if not req or req.startswith('#'):
                continue
            if '://' in req:
                links.append(req)
            else:
                names.append(req)
    return {'install_requires': names, 'dependency_links': links}


def readme():
    """Return the text of ``README.md`` with relative references made absolute.

    Markdown link targets and ``src="..."`` attributes that do not start with
    ``http://`` or ``https://`` are prefixed with the GitHub ``blob/master``
    and ``raw.githubusercontent.com`` URLs of the repository respectively.
    """
    readme_path = os.path.join(__location__, 'README.md')
    with open(readme_path, encoding='utf8') as readme_file:
        content = readme_file.read()

    # (pattern, replacement) pairs, applied in order.
    rewrites = (
        (r']\((?!https?://)', r'](https://github.com/deeppavlov/DeepPavlov/blob/master/'),
        (r'\ssrc="(?!https?://)', r' src="https://raw.githubusercontent.com/deeppavlov/DeepPavlov/master/'),
    )
    for pattern, replacement in rewrites:
        content = re.sub(pattern, replacement, content)
    return content


if __name__ == '__main__':
    # Optional dependency groups passed to setup() as ``extras_require``.
    tests_extras = ['flake8', 'pytest', 'pytest-instafail', 'pexpect']

    # Documentation toolchain; several pins are selected by environment
    # markers on the Python version.
    docs_extras = [
        'sphinx==3.5.4;python_version<="3.7"',
        'sphinx==5.0.0;python_version=="3.8"',
        'sphinx==5.0.0;python_version=="3.9"',
        'sphinx==5.0.0;python_version=="3.10"',
        'sphinx==7.2.*;python_version=="3.11.*"',
        'sphinx_rtd_theme==0.5.2;python_version<="3.10"',
        'sphinx_rtd_theme==2.0.0;python_version=="3.11.*"',
        'docutils<0.17,>=0.12;python_version<="3.10"',
        'docutils==0.20.1;python_version=="3.11.*"',
        'nbsphinx==0.8.4;python_version<="3.10"',
        'nbsphinx==0.9.3;python_version=="3.11.*"',
        'ipykernel==5.5.4',
        'jinja2<=3.0.3',
        'sphinx-copybutton==0.5.0',
        'pandoc==2.3',
        'ipython_genutils==0.2.0',
    ]

    s3_extras = ['boto3']

    # The dunder names below are not assigned in this file directly; they are
    # expected to come from the exec'd deeppavlov/_meta.py.
    setup(
        name='deeppavlov',
        packages=find_packages(exclude=('tests', 'docs', 'utils')),
        version=__version__,
        description=__description__,
        long_description=readme(),
        long_description_content_type='text/markdown',
        author=__author__,
        author_email=__email__,
        license=__license__,
        url='https://github.com/deeppavlov/DeepPavlov',
        download_url=f'https://github.com/deeppavlov/DeepPavlov/archive/{__version__}.tar.gz',
        keywords=__keywords__,
        include_package_data=True,
        extras_require={
            'tests': tests_extras,
            'docs': docs_extras,
            's3': s3_extras,
        },
        # Adds 'install_requires' and 'dependency_links' parsed from requirements.txt.
        **read_requirements()
    )


================================================
FILE: tests/__init__.py
================================================


================================================
FILE: tests/test_configs/doc_retrieval/en_ranker_pop_wiki_test.json
================================================
{
  "dataset_reader": {
    "class_name": "odqa_reader",
    "data_path": "{DOWNLOADS_PATH}/odqa/enwiki_test",
    "save_path": "{DOWNLOADS_PATH}/odqa/enwiki_test.db",
    "dataset_format": "txt"
  },
  "dataset_iterator": {
    "class_name": "sqlite_iterator",
    "shuffle": false,
    "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_test.db"
  },
  "chainer": {
    "in": [
      "docs"
    ],
    "in_y": [
      "doc_ids",
      "doc_nums"
    ],
    "out": [
      "pop_doc_ids"
    ],
    "pipe": [
      {
        "class_name": "hashing_tfidf_vectorizer",
        "id": "vectorizer",
        "fit_on": [
          "docs",
          "doc_ids",
          "doc_nums"
        ],
        "save_path": "{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz",
        "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz",
        "tokenizer": {
          "class_name": "stream_spacy_tokenizer",
          "lemmas": true,
          "ngram_range": [
            1,
            2
          ]
        }
      },
      {
        "class_name": "tfidf_ranker",
        "top_n": 20,
        "in": [
          "docs"
        ],
        "out": [
          "tfidf_doc_ids",
          "tfidf_doc_scores"
        ],
        "vectorizer": "#vectorizer"
      },
      {
        "class_name": "pop_ranker",
        "pop_dict_path": "{DOWNLOADS_PATH}/odqa/enwiki_popularities.json",
        "load_path": "{MODELS_PATH}/odqa/logreg_3features_v2.joblib",
        "top_n": 10,
        "in": [
          "tfidf_doc_ids",
          "tfidf_doc_scores"
        ],
        "out": [
          "pop_doc_ids",
          "pop_doc_scores"
        ]
      }
    ]
  },
  "train": {
    "batch_size": 10000,
    "evaluation_targets": [],
    "class_name": "fit_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/datasets/wikipedia/enwiki_test.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/odqa"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_popularities.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/odqa"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/ranking/logreg_3features_v2.joblib",
        "subdir": "{MODELS_PATH}/odqa"
      }
    ]
  }
}


================================================
FILE: tests/test_configs/doc_retrieval/en_ranker_tfidf_wiki_test.json
================================================
{
  "dataset_reader": {
    "class_name": "odqa_reader",
    "data_path": "{DOWNLOADS_PATH}/odqa/enwiki_test",
    "save_path": "{DOWNLOADS_PATH}/odqa/enwiki_test.db",
    "dataset_format": "txt"
  },
  "dataset_iterator": {
    "class_name": "sqlite_iterator",
    "shuffle": false,
    "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_test.db"
  },
  "chainer": {
    "in": [
      "docs"
    ],
    "in_y": [
      "doc_ids",
      "doc_nums"
    ],
    "out": [
      "tfidf_doc_ids"
    ],
    "pipe": [
      {
        "class_name": "hashing_tfidf_vectorizer",
        "id": "vectorizer",
        "fit_on": [
          "docs",
          "doc_ids",
          "doc_nums"
        ],
        "save_path": "{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz",
        "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz",
        "tokenizer": {
          "class_name": "stream_spacy_tokenizer",
          "lemmas": true,
          "ngram_range": [
            1,
            2
          ]
        }
      },
      {
        "class_name": "tfidf_ranker",
        "top_n": 20,
        "in": [
          "docs"
        ],
        "out": [
          "tfidf_doc_ids",
          "tfidf_doc_scores"
        ],
        "vectorizer": "#vectorizer"
      }
    ]
  },
  "train": {
    "batch_size": 2,
    "evaluation_targets": [],
    "class_name": "fit_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/datasets/wikipedia/enwiki_test.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/odqa"
      }
    ]
  }
}

================================================
FILE: tests/test_configs/doc_retrieval/ru_ranker_tfidf_wiki_test.json
================================================
{
  "dataset_reader": {
    "class_name": "odqa_reader",
    "data_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test",
    "save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test.db",
    "dataset_format": "txt"
  },
  "dataset_iterator": {
    "class_name": "sqlite_iterator",
    "shuffle": false,
    "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test.db"
  },
  "chainer": {
    "in": [
      "docs"
    ],
    "in_y": [
      "doc_ids",
      "doc_nums"
    ],
    "out": [
      "tfidf_doc_ids"
    ],
    "pipe": [
      {
        "class_name": "hashing_tfidf_vectorizer",
        "id": "vectorizer",
        "fit_on": [
          "docs",
          "doc_ids",
          "doc_nums"
        ],
        "save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test_tfidf.npz",
        "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test_tfidf.npz",
        "tokenizer": {
          "class_name": "stream_spacy_tokenizer",
          "spacy_model": "ru_core_news_sm",
          "lemmas": true,
          "lowercase": true,
          "filter_stopwords": true,
          "ngram_range": [
            1,
            2
          ]
        }
      },
      {
        "class_name": "tfidf_ranker",
        "top_n": 20,
        "in": [
          "docs"
        ],
        "out": [
          "tfidf_doc_ids",
          "tfidf_doc_scores"
        ],
        "vectorizer": "#vectorizer"
      }
    ]
  },
  "train": {
    "batch_size": 2,
    "evaluation_targets": [],
    "class_name": "fit_trainer"
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/datasets/wikipedia/ruwiki_test.tar.gz",
        "subdir": "{DOWNLOADS_PATH}/odqa"
      }
    ]
  }
}

================================================
FILE: tests/test_quick_start.py
================================================
import io
import json
import logging
import os
import shutil
import signal
import socket
import sys
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from struct import unpack
from time import sleep
from typing import Optional, Union
from urllib.parse import urljoin

import pexpect
import pexpect.popen_spawn
import pytest
import requests

import deeppavlov
from deeppavlov import build_model
from deeppavlov.core.commands.utils import parse_config, parse_value_with_config
from deeppavlov.core.common.aliases import ALIASES
from deeppavlov.core.data.utils import get_all_elems_from_json
from deeppavlov.download import deep_download
from deeppavlov.utils.server import get_server_params
from deeppavlov.utils.socket import encode

# Directory layout, resolved relative to this test module and to the installed package.
tests_dir = Path(__file__).parent
test_configs_path = tests_dir / "deeppavlov" / "configs"
src_dir = Path(deeppavlov.__path__[0]) / "configs"
test_src_dir = tests_dir / "test_configs"
download_path = tests_dir / "download"

# The cache directory is set unless DP_PYTEST_NO_CACHE holds a non-empty value.
cache_dir: Optional[Path] = None if os.getenv('DP_PYTEST_NO_CACHE') else tests_dir / 'download_cache'

SKIP_TF = os.getenv('SKIP_TF', False)

# Port taken from DP_PYTEST_API_PORT, converted to int when the variable is set.
api_port = os.getenv('DP_PYTEST_API_PORT')
api_port = int(api_port) if api_port is not None else None

TEST_MODES = [
    'IP',  # test_inferring_pretrained_model
    'TI',  # test_consecutive_training_and_inferring
]

ALL_MODES = tuple(TEST_MODES)

# Generic inference checks: the leading items are dummy inputs and the trailing
# ``None`` takes the position where other entries hold an expected response.
ONE_ARGUMENT_INFER_CHECK = ('Dummy text',) + (None,)
TWO_ARGUMENTS_INFER_CHECK = ('Dummy text',) * 2 + (None,)
FOUR_ARGUMENTS_INFER_CHECK = ('Dummy text',) * 3 + ('Dummy_text', None)

LIST_ARGUMENTS_INFER_CHECK = (
    ['Dummy text', 'Dummy text'],
    ['Dummy text', 'Dummy text'],
    None,
)

RECORD_ARGUMENTS_INFER_CHECK = (
    "Index",
    "Dummy query text",
    "Dummy passage text",
    "Dummy entity",
    1,
    None,
)

# Mapping from model name to config-model_dir-ispretrained and corresponding queries-response list.
PARAMS = {
    "relation_extraction": {
        ("relation_extraction/re_docred.json", "relation_extraction", ('IP',)):
            [
                (
                    [["Barack", "Obama", "is", "married", "to", "Michelle", "Obama", ",", "born", "Michelle",
                      "Robinson", "."]],
                    [[[(0, 2)], [(5, 7), (9, 11)]]],
                    [["PER", "PER"]],
                    (
                        'P26',
                        'spouse'
                    )
                )
            ],
        ("relation_extraction/re_rured.json", "relation_extraction", ('IP',)):
            [
                (
                    [["Илон", "Маск", "живет", "в", "Сиэттле", "."]],
                    [[[(0, 2)], [(4, 6)]]],
                    [["PERSON", "CITY"]],
                    (
                        'P495',
                        'страна происхождения'
                    )
                ),
            ]
    },
    "faq": {
        ("faq/fasttext_logreg.json", "fasttext_logreg", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK],  # TODO: add ru test
    },
    "spelling_correction": {
        ("spelling_correction/brillmoore_wikitypos_en.json", "error_model", ALL_MODES):
            [
                ("helllo", ("hello",)),
                ("datha", ("data",))
            ],
        ("spelling_correction/levenshtein_corrector_ru.json", "error_model", ('IP',)):
            [
                ("преветствую", ("приветствую",)),
                ("Я джва года хочу такую игру", ("я два года хочу такую игру",))
            ]
    },
    "classifiers": {
        ("classifiers/paraphraser_rubert.json", "classifiers", ('IP', 'TI')): [TWO_ARGUMENTS_INFER_CHECK],
        ("classifiers/insults_kaggle_bert.json", "classifiers", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],
        ("classifiers/rusentiment_bert.json", "classifiers", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],
        ("classifiers/sentiment_twitter.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK],
        ("classifiers/sentiment_sst_conv_bert.json", "classifiers", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],
        ("classifiers/glue/glue_mrpc_roberta.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("classifiers/glue/glue_stsb_roberta.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("classifiers/glue/glue_mnli_roberta.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("classifiers/glue/glue_rte_roberta_mnli.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("classifiers/glue/glue_cola_roberta.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],
        ("classifiers/glue/glue_qnli_roberta.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("classifiers/glue/glue_qqp_roberta.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("classifiers/glue/glue_sst2_roberta.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],
        ("classifiers/glue/glue_wnli_roberta.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("classifiers/superglue/superglue_copa_roberta.json", "classifiers", ('TI',)): [LIST_ARGUMENTS_INFER_CHECK],
        ("classifiers/superglue/superglue_boolq_roberta_mnli.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("classifiers/superglue/superglue_record_roberta.json", "classifiers", ('TI',)): [RECORD_ARGUMENTS_INFER_CHECK],
        ("classifiers/superglue/superglue_wic_bert.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("classifiers/topics_distilbert_base_uncased.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],
        ("classifiers/few_shot_roberta.json", "classifiers", ('IP',)): [
            ('Dummy text', ['Dummy text Dummy text', 'Dummy class'], ('Dummy class',))
        ]
    },
    "distil": {
        ("classifiers/paraphraser_convers_distilrubert_2L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],
        ("classifiers/paraphraser_convers_distilrubert_6L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],
        ("classifiers/rusentiment_convers_distilrubert_2L.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK],
        ("classifiers/rusentiment_convers_distilrubert_6L.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK],
        ("ner/ner_rus_convers_distilrubert_2L.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK],
        ("ner/ner_rus_convers_distilrubert_6L.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK],
        ("ner/ner_case_agnostic_mdistilbert.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK],
        ("squad/squad_ru_convers_distilrubert_2L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],
        ("squad/squad_ru_convers_distilrubert_6L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK]
    },
    "russian_super_glue": {
        ("russian_super_glue/russian_superglue_lidirus_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("russian_super_glue/russian_superglue_danetqa_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("russian_super_glue/russian_superglue_terra_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("russian_super_glue/russian_superglue_rcb_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("russian_super_glue/russian_superglue_russe_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("russian_super_glue/russian_superglue_rwsd_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("russian_super_glue/russian_superglue_muserc_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
        ("russian_super_glue/russian_superglue_parus_rubert.json", "russian_super_glue", ('IP',)): [LIST_ARGUMENTS_INFER_CHECK],
        ("russian_super_glue/russian_superglue_rucos_rubert.json", "russian_super_glue", ('IP',)): [RECORD_ARGUMENTS_INFER_CHECK]
    },
    "multitask":{
        ("multitask/multitask_example.json", "multitask", ALL_MODES): [
            ('Dummy text',) + (('Dummy text', 'Dummy text'),) * 3 + ('Dummy text',) + (None,)],
        ("multitask/mt_glue.json", "multitask", ALL_MODES): [
            ('Dummy text',) * 2 + (('Dummy text', 'Dummy text'),) * 6 + (None,)]
    },
    "entity_extraction": {
        ("entity_extraction/entity_detection_en.json", "entity_extraction", ('IP',)):
            [
                ("Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.",
                 (['forrest gump', 'robert zemeckis', 'eric roth'],
                  [(0, 12), (48, 63), (79, 88)],
                  [[0, 1], [10, 11], [15, 16]],
                  ['WORK_OF_ART', 'PERSON', 'PERSON'],
                  [(0, 89)],
                  ['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.'],
                  [0.8798, 0.9986, 0.9985]))
            ],
        ("entity_extraction/entity_detection_ru.json", "entity_extraction", ('IP',)):
            [
                ("Москва — столица России, центр Центрального федерального округа и центр Московской области.",
                 (['москва', 'россии', 'центрального федерального округа', 'московской области'],
                  [(0, 6), (17, 23), (31, 63), (72, 90)],
                  [[0], [3], [6, 7, 8], [11, 12]],
                  ['CITY', 'COUNTRY', 'LOC', 'LOC'],
                  [(0, 91)],
                  ['Москва — столица России, центр Центрального федерального округа и центр Московской области.'],
                  [0.8359, 0.938, 0.9917, 0.9803]))
            ],
        ("entity_extraction/entity_extraction_en.json", "entity_extraction", ('IP',)):
            [
                ("Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.",
                 (['forrest gump', 'robert zemeckis', 'eric roth'],
                  ['WORK_OF_ART', 'PERSON', 'PERSON'],
                  [(0, 12), (48, 63), (79, 88)],
                  [['Q134773', 'Q552213', 'Q12016774'], ['Q187364', 'Q36951156'],
                   ['Q942932', 'Q89320386', 'Q89909683']],
                  [[[1.1, 110, 1.0], [1.1, 13, 0.73], [1.1, 8, 0.04]], [[1.1, 73, 1.0], [0.5, 52, 0.29]],
                   [[1.1, 37, 0.95], [1.1, 2, 0.35], [0.67, 2, 0.35]]],
                  [['Forrest Gump', 'Forrest Gump (novel)', ''], ['Robert Zemeckis', 'Welcome to Marwen'],
                   ['Eric Roth', '', '']],
                  [['Forrest Gump', 'Forrest Gump', 'Forrest Gump'], ['Robert Zemeckis', 'Welcome to Marwen'],
                   ['Eric Roth', 'Eric Roth', 'Eric W Roth']]))
            ],
        ("entity_extraction/entity_extraction_ru.json", "entity_extraction", ('IP',)):
            [
                ("Москва — столица России, центр Центрального федерального округа и центр Московской области.",
                 (['москва', 'россии', 'центрального федерального округа', 'московской области'],
                  ['CITY', 'COUNTRY', 'LOC', 'LOC'],
                  [(0, 6), (17, 23), (31, 63), (72, 90)],
                  [['Q649', 'Q1023006', 'Q2380475'], ['Q159', 'Q2184', 'Q139319'], ['Q190778', 'Q4504288', 'Q27557290'],
                   ['Q1697', 'Q4303932', 'Q24565285']],
                  [[[1.1, 200, 1.0], [1.0, 20, 0.0], [1.0, 18, 0.0]],
                   [[1.1, 200, 1.0], [1.0, 58, 1.0], [1.0, 29, 0.85]],
                   [[1.1, 200, 1.0], [0.67, 3, 0.92], [0.67, 3, 0.89]],
                   [[0.9, 200, 1.0], [0.9, 6, 0.83], [0.61, 8, 0.03]]],
                  [['Москва', 'Москоу (Канзас)', 'Москоу (Теннесси)'],
                   ['Россия', 'Российская Советская Федеративная Социалистическая Республика',
                    'Российская республика'],
                   ['Центральный федеральный округ', 'Центральный округ (Краснодар)', ''],
                   ['Московская область', 'Московская область (1917—1918)',
                    'Мостовский (Волгоградская область)']],
                  [['Москва', 'Москоу', 'Москоу'],
                   ['Россия', 'Российская Советская Федеративная Социалистическая Республика',
                    'Российская республика'],
                   ['Центральный федеральный округ', 'Центральный округ (Краснодар)', 'Центральный округ (Братск)'],
                   ['Московская область', 'Московская область', 'Мостовский']]))
            ]
    },
    "ner": {
        ("ner/ner_bert_base.json", "ner_bert_base", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],
        ("ner/ner_conll2003_bert.json", "ner_conll2003_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],
        ("ner/ner_ontonotes_bert.json", "ner_ontonotes_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],
        ("ner/ner_ontonotes_bert_mult.json", "ner_ontonotes_bert_mult", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],
        ("ner/ner_rus_bert.json", "ner_rus_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],
        ("ner/ner_collection3_bert.json", "ner_collection3_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],
        ("ner/ner_conll2003_deberta_crf.json", "ner_conll2003_deberta_crf", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],
        ("ner/ner_ontonotes_deberta_crf.json", "ner_ontonotes_deberta_crf", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],
    },
    "sentence_segmentation": {
        ("sentence_segmentation/sentseg_dailydialog_bert.json", "sentseg_dailydialog_bert", ('IP', 'TI')): [
            (["hey", "alexa", "how", "are", "you"], None)]
    },
    "kbqa": {
        ("kbqa/kbqa_cq_en.json", "kbqa", ('IP',)):
            [
                ("What is the currency of Sweden?",
                 ("Swedish krona", ["Q122922"], ["SELECT ?answer WHERE { wd:Q34 wdt:P38 ?answer. }"])),
                ("Where was Napoleon Bonaparte born?",
                 ("Ajaccio", ["Q40104"], ["SELECT ?answer WHERE { wd:Q517 wdt:P19 ?answer. }"])),
                ("When did the Korean War end?",
                 ("27 July 1953", ["+1953-07-27^^T"], ["SELECT ?answer WHERE { wd:Q8663 wdt:P582 ?answer. }"])),
                ("   ", ("Not Found", [], []))
            ],            
        ("kbqa/kbqa_cq_ru.json", "kbqa", ('IP',)):
            [
                ("Кто такой Оксимирон?",
                 ("российский рэп-исполнитель", ['российский рэп-исполнитель"@ru'],
                  ["SELECT ?answer WHERE { wd:Q4046107 wdt:P0 ?answer. }"])),
                ("Кто написал «Евгений Онегин»?",
                 ("Александр Сергеевич Пушкин", ["Q7200"], ["SELECT ?answer WHERE { wd:Q50948 wdt:P50 ?answer. }"])),
                ("абв", ("Not Found", [], []))
            ]
    },
    "ranking": {
        ("ranking/ranking_ubuntu_v2_torch_bert_uncased.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK]
    },
    "doc_retrieval": {
        ("doc_retrieval/en_ranker_tfidf_wiki_test.json", "doc_retrieval", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],
        ("doc_retrieval/ru_ranker_tfidf_wiki_test.json", "doc_retrieval", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],
        ("doc_retrieval/en_ranker_pop_wiki_test.json", "doc_retrieval", ('TI',)): [ONE_ARGUMENT_INFER_CHECK]
    },
    "squad": {
        ("squad/squad_ru_bert.json", "squad_ru_bert", ('IP', 'TI')): [TWO_ARGUMENTS_INFER_CHECK],
        ("squad/squad_bert.json", "squad_bert", ('IP', 'TI')): [TWO_ARGUMENTS_INFER_CHECK]
    },
    "odqa": {
        ("odqa/en_odqa_infer_wiki.json", "odqa", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],
        ("odqa/ru_odqa_infer_wiki.json", "odqa", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],
        ("odqa/en_odqa_pop_infer_wiki.json", "odqa", ('IP',)): [ONE_ARGUMENT_INFER_CHECK]
    },
    "morpho_tagger": {
        ("morpho_syntax_parser/morpho_ru_syntagrus_bert.json", "morpho_tagger_bert", ('IP', 'TI')):
            [ONE_ARGUMENT_INFER_CHECK]
    },
    "syntax_tagger": {
        ("morpho_syntax_parser/syntax_ru_syntagrus_bert.json", "syntax_ru_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],
        ("morpho_syntax_parser/ru_syntagrus_joint_parsing.json", "syntax_ru_bert", ('IP',)): [ONE_ARGUMENT_INFER_CHECK]
    },
}

# Maps a pytest mark name to the PARAMS model groups that should carry it.
MARKS = {"gpu_only": ["squad"], "slow": ["error_model", "squad"]}  # marks defined in pytest.ini

# One ``pytest.param`` per (model group, config) pair found in PARAMS; the class-level
# ``parametrize`` below feeds every test method from this list.
TEST_GRID = []
for model in PARAMS.keys():
    for conf_file, model_dir, mode in PARAMS[model].keys():
        marks = []
        for mark in MARKS.keys():
            if model in MARKS[mark]:
                # Plain attribute lookup: same MarkDecorator as the former
                # eval("pytest.mark." + mark), without evaluating a built string.
                marks.append(getattr(pytest.mark, mark))
        grid_unit = pytest.param(model, conf_file, model_dir, mode, marks=marks)
        TEST_GRID.append(grid_unit)


def _override_with_test_values(item: Union[dict, list]) -> None:
    if isinstance(item, dict):
        keys = [k for k in item.keys() if k.startswith('pytest_')]
        for k in keys:
            item[k[len('pytest_'):]] = item.pop(k)
        item = item.values()

    for child in item:
        if isinstance(child, (dict, list)):
            _override_with_test_values(child)


def download_config(config_path):
    """Copy a config into the test configs directory with test-sized train settings.

    The config is read from ``src_dir`` or, failing that, from ``test_src_dir``.
    Configs it references through ``config_path`` entries are processed recursively
    when they are not present in ``test_configs_path`` yet. Before the copy is written,
    ``pytest_epochs``/``pytest_max_batches``/``pytest_max_test_batches`` defaults are
    added to the ``train`` section and all ``pytest_*`` keys are applied.

    Args:
        config_path: Config location relative to the configs root.

    Raises:
        RuntimeError: If the config is found in neither source directory.
    """
    source = src_dir / config_path
    if not source.is_file():
        source = test_src_dir / config_path
        if not source.is_file():
            raise RuntimeError('No config file {}'.format(config_path))

    with source.open(encoding='utf8') as fin:
        config: dict = json.load(fin)

    # Download referenced config files
    for referenced in get_all_elems_from_json(parse_config(config), 'config_path'):
        parts = referenced.split("/")
        # Keep only the portion of the path that follows the "configs" directory.
        first_subdir_index = parts.index("configs") + 1
        group_name = parts[first_subdir_index]
        relative_ref = '/'.join(parts[first_subdir_index:])

        test_configs_path.joinpath(group_name).mkdir(exist_ok=True)
        if not test_configs_path.joinpath(relative_ref).exists():
            download_config(relative_ref)

    # Update config for testing
    train_section = config.setdefault('train', {})
    train_section.setdefault('pytest_epochs', 1)
    train_section.setdefault('pytest_max_batches', 2)
    train_section.setdefault('pytest_max_test_batches', 2)
    _override_with_test_values(config)

    target = test_configs_path / config_path
    target.parent.mkdir(exist_ok=True, parents=True)
    with target.open("w", encoding='utf8') as fout:
        json.dump(config, fout)


def install_config(config_path):
    """Run ``python -m deeppavlov install <config_path>`` and fail loudly on error.

    Args:
        config_path: Path of the config passed to the ``install`` command.

    Raises:
        RuntimeError: If the subprocess exits with a non-zero code; the captured
            output of the process is included in the message.
    """
    command = sys.executable + " -m deeppavlov install " + str(config_path)
    captured = io.BytesIO(b'')
    process = pexpect.popen_spawn.PopenSpawn(command, timeout=None, logfile=captured)
    # Drain the output so the process can run to completion.
    process.readlines()
    exit_code = process.wait()
    if exit_code == 0:
        return
    raise RuntimeError('Installing process of {} returned non-zero exit code: \n{}'
                       .format(config_path, captured.getvalue().decode()))


def setup_module():
    """Prepare a clean test environment before any test of this module runs.

    Removes leftovers of previous runs, writes test versions of every config listed
    in PARAMS and points the ``DP_*`` environment variables to the test directories.
    """
    for stale_dir in (test_configs_path, download_path):
        shutil.rmtree(str(stale_dir), ignore_errors=True)
    test_configs_path.mkdir(parents=True)

    for m_name in PARAMS:
        test_configs_path.joinpath(m_name).mkdir(exist_ok=True, parents=True)
        for config_path, _, _ in PARAMS[m_name]:
            download_config(config_path)

    os.environ['DP_ROOT_PATH'] = str(download_path)
    os.environ['DP_CONFIGS_PATH'] = str(test_configs_path)

    if cache_dir:
        cache_dir.mkdir(parents=True, exist_ok=True)
        os.environ['DP_CACHE_DIR'] = str(cache_dir.resolve())


def teardown_module():
    """Delete the directories created for the tests once the module has finished."""
    doomed = [test_configs_path.parent, download_path]
    if cache_dir:
        doomed.append(cache_dir)

    for directory in doomed:
        shutil.rmtree(str(directory), ignore_errors=True)


def _infer(config, inputs, download=False):
    """Build the model described by ``config`` and run it once on ``inputs``.

    Args:
        config: Config passed to ``build_model``.
        inputs: Sequence of positional argument batches for the model; when empty
            the model is still built but not called.
        download: Forwarded to ``build_model``.

    Returns:
        An empty list when ``inputs`` is empty; otherwise the model prediction,
        wrapped in a one-element list if the model has a single output parameter.
    """
    chainer = build_model(config, download=download)
    if not inputs:
        return []

    prediction = chainer(*inputs)
    if len(chainer.out_params) == 1:
        return [prediction]
    return prediction


@pytest.mark.parametrize("model,conf_file,model_dir,mode", TEST_GRID, scope='class')
class TestQuickStart(object):
    """Inference, REST API, socket and training checks run for every TEST_GRID entry.

    Each test receives ``model`` (PARAMS group), ``conf_file`` (config path relative
    to the test configs dir), ``model_dir`` (directory under ``download_path``) and
    ``mode``: tests guarded by ``'IP'`` use the pretrained model, the test guarded
    by ``'TI'`` trains the model first and then infers with it.
    """

    @staticmethod
    def infer(config_path, qr_list=None, check_outputs=True):
        """Run the model in a separate process and optionally compare its outputs.

        Args:
            config_path: Config of the model to build.
            qr_list: Tuples whose last element is the expected output and whose
                preceding elements are the model inputs. May be empty or None.
            check_outputs: If True, compare outputs with the expected values;
                expected values equal to None are not checked.

        Raises:
            RuntimeError: If any checked output differs from its expected value.
        """

        # Transpose qr_list: all columns but the last are inputs, the last one is expectations.
        *inputs, expected_outputs = zip(*qr_list) if qr_list else ([],)
        # The model is built and called in a one-worker process pool, not in the pytest process.
        with ProcessPoolExecutor(max_workers=1) as executor:
            f = executor.submit(_infer, config_path, inputs)
        # Transpose back so that each element holds all outputs for one sample.
        outputs = list(zip(*f.result()))

        if check_outputs:
            errors = ';'.join([f'expected `{expected}` got `{output}`'
                               for output, expected in zip(outputs, expected_outputs)
                               if expected is not None and expected != output])
            if errors:
                raise RuntimeError(f'Unexpected results for {config_path}: {errors}')

    @staticmethod
    def infer_api(config_path, qr_list):
        """Start ``riseapi`` for the config and check that GET /api and a POST return 200.

        Args:
            config_path: Config of the model to serve.
            qr_list: Tuples of model inputs followed by the expected output; only the
                inputs are sent, the response body is not compared with anything.

        Raises:
            RuntimeError: If the server process output ends before the expected
                URL appears.
        """
        *inputs, expected_outputs = zip(*qr_list)
        server_params = get_server_params(config_path)

        url_base = 'http://{}:{}'.format(server_params['host'], api_port or server_params['port'])
        # Requests are sent to 127.0.0.1 whenever the server is configured with host 0.0.0.0.
        url = urljoin(url_base.replace('http://0.0.0.0:', 'http://127.0.0.1:'), server_params['model_endpoint'])

        post_headers = {'Accept': 'application/json'}

        logfile = io.BytesIO(b'')
        args = [sys.executable, "-m", "deeppavlov", "riseapi", str(config_path)]
        if api_port:
            args += ['-p', str(api_port)]
        p = pexpect.popen_spawn.PopenSpawn(' '.join(args),
                                           timeout=None, logfile=logfile)
        try:
            # Block until the server output matches the base URL.
            p.expect(url_base)

            get_url = urljoin(url_base.replace('http://0.0.0.0:', 'http://127.0.0.1:'), '/api')
            get_response = requests.get(get_url)
            response_code = get_response.status_code
            assert response_code == 200, f"GET /api request returned error code {response_code} with {config_path}"

            # The 'in' field of the GET /api response lists the model argument names.
            model_args_names = get_response.json()['in']
            post_payload = dict(zip(model_args_names, inputs))
            # TODO: remove this if from here and socket
            if 'docred' in str(config_path) or 'rured' in str(config_path):
                post_payload = {k: v[0] for k, v in post_payload.items()}
            post_response = requests.post(url, json=post_payload, headers=post_headers)
            response_code = post_response.status_code
            assert response_code == 200, f"POST request returned error code {response_code} with {config_path}"

        except pexpect.exceptions.EOF:
            raise RuntimeError('Got unexpected EOF: \n{}'.format(logfile.getvalue().decode()))

        finally:
            # Always stop the server, whether the checks passed or not.
            p.kill(signal.SIGTERM)
            p.wait()
            # if p.wait() != 0:
            #     raise RuntimeError('Error in shutting down API server: \n{}'.format(logfile.getvalue().decode()))

    @staticmethod
    def infer_socket(config_path, socket_type):
        """Start ``risesocket`` for the config and check that a dummy request gets status OK.

        Args:
            config_path: Config of the model to serve.
            socket_type: ``'TCP'`` for an AF_INET socket; any other value selects
                an AF_UNIX socket.

        Raises:
            RuntimeError: If the server process output ends before the launch
                message appears.
            ValueError: If the response body is shorter than announced by its
                header or is not valid JSON.
        """
        socket_params = get_server_params(config_path)
        model_args_names = socket_params['model_args_names']

        host = socket_params['host']
        host = host.replace('0.0.0.0', '127.0.0.1')
        port = api_port or socket_params['port']

        # Every model argument receives the same dummy one-element batch.
        socket_payload = {}
        for arg_name in model_args_names:
            arg_value = ' '.join(['qwerty'] * 10)
            socket_payload[arg_name] = [arg_value]

        if 'parus' in str(config_path):
            socket_payload = {k: [v] for k, v in socket_payload.items()}

        logfile = io.BytesIO(b'')
        args = [sys.executable, "-m", "deeppavlov", "risesocket", str(config_path), '--socket-type', socket_type]
        if socket_type == 'TCP':
            args += ['-p', str(port)]
            address_family = socket.AF_INET
            connect_arg = (host, port)
        else:
            address_family = socket.AF_UNIX
            connect_arg = socket_params['unix_socket_file']
        p = pexpect.popen_spawn.PopenSpawn(' '.join(args),
                                           timeout=None, logfile=logfile)
        try:
            p.expect(socket_params['socket_launch_message'])
            with socket.socket(address_family, socket.SOCK_STREAM) as s:
                try:
                    s.connect(connect_arg)
                except ConnectionRefusedError:
                    # One retry after a short pause if the first connection is refused.
                    sleep(1)
                    s.connect(connect_arg)
                s.sendall(encode(socket_payload))
                s.settimeout(120)
                # The response starts with a 4-byte little-endian unsigned body length.
                header = s.recv(4)
                body_len = unpack('<I', header)[0]
                data = bytearray()
                while len(data) < body_len:
                    chunk = s.recv(body_len - len(data))
                    if not chunk:
                        raise ValueError(f'header does not match body\nheader: {body_len}\nbody length: {len(data)}'
                                         f'data: {data}')
                    data.extend(chunk)
            try:
                resp = json.loads(data)
            except json.decoder.JSONDecodeError:
                raise ValueError(f"Can't decode model response {data}")
            assert resp['status'] == 'OK', f"{socket_type} socket request returned status: {resp['status']}" \
                                           f" with {config_path}\n{logfile.getvalue().decode()}"

        except pexpect.exceptions.EOF:
            raise RuntimeError(f'Got unexpected EOF: \n{logfile.getvalue().decode()}')

        # NOTE(review): decoding errors of json.loads above are already converted to a plain
        # ValueError by the inner handler, so this branch looks unreachable for them - confirm.
        except json.JSONDecodeError:
            raise ValueError(f'Got JSON not serializable response from model: "{data}"\n{logfile.getvalue().decode()}')

        finally:
            p.kill(signal.SIGTERM)
            p.wait()

    def test_inferring_pretrained_model(self, model, conf_file, model_dir, mode):
        """Install requirements, download data and check outputs of the pretrained model."""
        if 'IP' in mode:
            config_file_path = str(test_configs_path.joinpath(conf_file))
            install_config(config_file_path)
            deep_download(config_file_path)

            self.infer(test_configs_path / conf_file, PARAMS[model][(conf_file, model_dir, mode)])
        else:
            pytest.skip("Unsupported mode: {}".format(mode))

    def test_inferring_pretrained_model_api(self, model, conf_file, model_dir, mode):
        """Check that the pretrained model answers requests when served with ``riseapi``."""
        if 'IP' in mode:
            self.infer_api(test_configs_path / conf_file, PARAMS[model][(conf_file, model_dir, mode)])
        else:
            pytest.skip("Unsupported mode: {}".format(mode))

    def test_inferring_pretrained_model_socket(self, model, conf_file, model_dir, mode):
        """Socket check of the pretrained model; currently skipped unconditionally."""
        # pytest.skip raises, so nothing below this call is executed at the moment.
        pytest.skip(f"Disabled")
        if 'IP' in mode:
            self.infer_socket(test_configs_path / conf_file, 'TCP')

            if 'TI' not in mode:
                shutil.rmtree(str(download_path), ignore_errors=True)
        else:
            pytest.skip(f"Unsupported mode: {mode}")


    def test_consecutive_training_and_inferring(self, model, conf_file, model_dir, mode):
        """Train the model with ``deeppavlov train`` and run inference with the result.

        Outputs of the freshly trained model are not compared with expected values.
        """
        if 'TI' in mode:
            c = test_configs_path / conf_file
            model_path = download_path / model_dir

            # Without 'IP' the pretrained-model test has not installed/downloaded anything yet.
            if 'IP' not in mode:
                config_path = str(test_configs_path.joinpath(conf_file))
                install_config(config_path)
                deep_download(config_path)
            # Remove the model directory before the training command is run.
            shutil.rmtree(str(model_path), ignore_errors=True)

            logfile = io.BytesIO(b'')
            p = pexpect.popen_spawn.PopenSpawn(sys.executable + " -m deeppavlov train " + str(c), timeout=None,
                                               logfile=logfile)
            p.readlines()
            if p.wait() != 0:
                raise RuntimeError('Training process of {} returned non-zero exit code: \n{}'
                                   .format(model_dir, logfile.getvalue().decode()))
            self.infer(c, PARAMS[model][(conf_file, model_dir, mode)], check_outputs=False)

            shutil.rmtree(str(download_path), ignore_errors=True)
        else:
            pytest.skip("Unsupported mode: {}".format(mode))


def test_crossvalidation():
    """Check that ``deeppavlov crossval`` with two folds finishes successfully for the FAQ config."""
    conf_file = 'faq/fasttext_logreg.json'
    model_dir = 'faq'

    download_config(conf_file)

    config_file = test_configs_path / conf_file
    install_config(config_file)
    deep_download(config_file)
    # Drop the model directory before cross-validation is started.
    shutil.rmtree(str(download_path / model_dir), ignore_errors=True)

    command = sys.executable + f" -m deeppavlov crossval {config_file} --folds 2"
    captured = io.BytesIO(b'')
    process = pexpect.popen_spawn.PopenSpawn(command, timeout=None, logfile=captured)
    process.readlines()
    exit_code = process.wait()
    if exit_code != 0:
        raise RuntimeError('Training process of {} returned non-zero exit code: \n{}'
                           .format(model_dir, captured.getvalue().decode()))

    shutil.rmtree(str(download_path), ignore_errors=True)


def test_hashes_existence():
    """Check that an ``.md5`` file is served for every download URL hosted on files.deeppavlov.ai.

    Raises:
        RuntimeError: Lists every ``.md5`` URL that did not respond with status 200.
    """
    url_root = 'http://files.deeppavlov.ai/'
    config_files = list(src_dir.glob('**/*.json')) + list(test_src_dir.glob('**/*.json'))

    downloads_urls = set()
    for config_file in config_files:
        config = json.loads(config_file.read_text(encoding='utf-8'))
        # TODO: replace with get downloads from config
        # TODO: download only headers
        # TODO: make requests in async mode
        raw_urls = set()
        for entry in config.get('metadata', {}).get('download', []):
            # A download entry is either a bare URL or a dict holding it under 'url'.
            raw_urls.add(entry if isinstance(entry, str) else entry['url'])
        downloads_urls.update({parse_value_with_config(url, config) for url in raw_urls})

    md5_urls = [url + '.md5' for url in downloads_urls if url.startswith(url_root)]

    logging.getLogger("urllib3").setLevel(logging.WARNING)

    messages = []
    for md5_url in md5_urls:
        status = requests.get(md5_url).status_code
        if status == 200:
            continue
        messages.append(f'got status_code {status} for {md5_url}')

    if messages:
        raise RuntimeError('\n'.join(messages))


def test_aliases():
    """Check config names for uniqueness and for consistency with ALIASES.

    No alias key may still exist as a config name, and every alias target must
    exist as a config name.
    """
    config_names = [config.stem for config in src_dir.glob('**/*.json')]
    known_names = set(config_names)

    assert len(config_names) == len(known_names), 'Some model names are duplicated'

    aliases_in_configs = set(ALIASES.keys()) & known_names
    assert aliases_in_configs == set(), f'Following model(s) marked as deprecated but still present in configs list: ' \
                                        f'{", ".join(aliases_in_configs)}.'

    alias_targets_not_in_configs = set(ALIASES.values()) - known_names
    assert alias_targets_not_in_configs == set(), f'Following model(s) marked as alias targets but there is no such ' \
                                                  f'config in the library: {", ".join(alias_targets_not_in_configs)}'


================================================
FILE: utils/Docker/Dockerfile
================================================
# Test image: builds the Python interpreter selected by PYTHON_VERSION from source
# on top of BASE_IMAGE. Both values are supplied as build arguments.
ARG BASE_IMAGE

FROM $BASE_IMAGE

# Run the following RUN instructions with bash instead of the default shell.
SHELL ["/bin/bash", "-c"]

# Environment defaults for the test run inside the container.
ENV DP_PYTEST_API_PORT=5000
ENV DP_PYTEST_NO_CACHE=True
ENV LANG='en_US.UTF-8'

# Build-time only: keep apt from asking interactive questions.
ARG DEBIAN_FRONTEND=noninteractive
# Full version of the interpreter to build, e.g. 3.8.16.
ARG PYTHON_VERSION

# Single layer that:
#   1. drops the CUDA apt source lists and installs the build dependencies,
#   2. generates the en_US.UTF-8 locale,
#   3. downloads the Python $PYTHON_VERSION sources from python.org
#      (NOTE: the TLS certificate check is disabled for this download),
#   4. builds and installs the interpreter with `make altinstall`,
#   5. links `python` and `pip` to the versioned binaries; ${PYTHON_VERSION%.*}
#      strips the last dotted component (3.8.16 -> 3.8),
#   6. upgrades pip, installs pybind11 and removes the sources and apt lists.
RUN rm -f /etc/apt/sources.list.d/cuda*.list && \
    apt update && \
    apt install -y --no-install-recommends \
        build-essential \
        dpkg-dev \
        gcc \
        git	\
        libbz2-dev \
        libc6-dev \
        libexpat1-dev \
        libffi-dev \
        libgdbm-dev \
        liblzma-dev \
        libncursesw5-dev \
        libreadline-dev \
        libsqlite3-dev \
        libssl-dev \
        libxslt-dev \
        locales \
        make \
        pandoc \
        tk-dev \
        wget \
        xz-utils \
        zlib1g-dev && \
    locale-gen en_US.UTF-8 && \
    wget --no-check-certificate -O python.tar.xz https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tar.xz && \
    mkdir -p /usr/src/python && \
	tar -xC /usr/src/python --strip-components=1 -f python.tar.xz && \
    rm python.tar.xz && \
    cd /usr/src/python && \
	./configure && \
	make -j "$(nproc)" altinstall && \
	ln -s /usr/local/bin/python${PYTHON_VERSION%.*} /usr/local/bin/python && \
    ln -s /usr/local/bin/pip${PYTHON_VERSION%.*} /usr/local/bin/pip && \
    pip install --upgrade pip && \
    pip install pybind11==2.2.4 && \
    rm -rf /usr/src/python /var/lib/apt/lists/*

WORKDIR /app

# two commands to prevent caching of the next layers:
# a new EPOCH build argument value changes the ENV layer, so the COPY below is not taken from cache
ARG EPOCH
ENV EPOCH=$EPOCH

# Copy the whole build context into /app.
COPY . .

# Default command of the container: the script that runs docs build, flake8 and pytest.
CMD utils/Docker/cmd.sh


================================================
FILE: utils/Docker/README.md
================================================


================================================
FILE: utils/Docker/cmd.sh
================================================
#!/bin/bash

# Container command: install the package, build the docs, lint and run the tests.
# Any failing step aborts the script.
set -e

# Quoted so the shell can never treat the extras list as a glob pattern.
pip install '.[tests,docs]'

# Remove every top-level entry except tests, Jenkinsfile and docs.
# `-exec rm -rf {} +` passes the names straight to rm, so entries containing
# whitespace are handled correctly (the former backtick form word-split them).
find . -mindepth 1 -maxdepth 1 ! -name tests ! -name Jenkinsfile ! -name docs -exec rm -rf {} +

cd docs
make clean
make html
cd ..

# Lint the installed package; only the error classes selected below are reported.
flake8 "$(python -c 'import deeppavlov; print(deeppavlov.__path__[0])')" --count --select=E9,F63,F7,F82 --show-source --statistics

# PYTEST_ARGS is intentionally left unquoted: it may carry several pytest arguments.
pytest -v --disable-warnings --instafail $PYTEST_ARGS


================================================
FILE: utils/Docker/docker-compose.yml
================================================
# One service per Python version under test. All services build the same
# utils/Docker/Dockerfile from the repository root and differ only in the
# PYTHON_VERSION build argument and in the GPU variable they use
# (TEST_GPU_0 and TEST_GPU_1 alternate between consecutive services).
# EPOCH, UID, GID, TEST_GPU_0, TEST_GPU_1 and PYTEST_ARGS are substituted
# by docker-compose from the calling environment.
version: '3.7'
services:
  # Python 3.6
  py36:
    build:
      context: ../../
      dockerfile: utils/Docker/Dockerfile
      args:
        - EPOCH=$EPOCH
        - PYTHON_VERSION=3.6.15
        - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04
    user: '${UID}:${GID}'
    environment:
      - CUDA_VISIBLE_DEVICES=$TEST_GPU_0
      - PYTEST_ARGS=$PYTEST_ARGS
      - DP_PYTEST_NO_CACHE=True
  # Python 3.7
  py37:
    build:
      context: ../../
      dockerfile: utils/Docker/Dockerfile
      args:
        - EPOCH=$EPOCH
        - PYTHON_VERSION=3.7.16
        - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04
    user: '${UID}:${GID}'
    environment:
      - CUDA_VISIBLE_DEVICES=$TEST_GPU_1
      - PYTEST_ARGS=$PYTEST_ARGS
      - DP_PYTEST_NO_CACHE=True
  # Python 3.8
  py38:
    build:
      context: ../../
      dockerfile: utils/Docker/Dockerfile
      args:
        - EPOCH=$EPOCH
        - PYTHON_VERSION=3.8.16
        - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04
    user: '${UID}:${GID}'
    environment:
      - CUDA_VISIBLE_DEVICES=$TEST_GPU_0
      - PYTEST_ARGS=$PYTEST_ARGS
      - DP_PYTEST_NO_CACHE=True
  # Python 3.9
  py39:
    build:
      context: ../../
      dockerfile: utils/Docker/Dockerfile
      args:
        - EPOCH=$EPOCH
        - PYTHON_VERSION=3.9.16
        - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04
    user: '${UID}:${GID}'
    environment:
      - CUDA_VISIBLE_DEVICES=$TEST_GPU_1
      - PYTEST_ARGS=$PYTEST_ARGS
      - DP_PYTEST_NO_CACHE=True
  # Python 3.10
  py310:
    build:
      context: ../../
      dockerfile: utils/Docker/Dockerfile
      args:
        - EPOCH=$EPOCH
        - PYTHON_VERSION=3.10.9
        - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04
    user: '${UID}:${GID}'
    environment:
      - CUDA_VISIBLE_DEVICES=$TEST_GPU_0
      - PYTEST_ARGS=$PYTEST_ARGS
      - DP_PYTEST_NO_CACHE=True
  # Python 3.11
  py311:
    build:
      context: ../../
      dockerfile: utils/Docker/Dockerfile
      args:
        - EPOCH=$EPOCH
        - PYTHON_VERSION=3.11.6
        - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04
    user: '${UID}:${GID}'
    environment:
      - CUDA_VISIBLE_DEVICES=$TEST_GPU_1
      - PYTEST_ARGS=$PYTEST_ARGS
      - DP_PYTEST_NO_CACHE=True


================================================
FILE: utils/__init__.py
================================================


================================================
FILE: utils/prepare/__init__.py
================================================


================================================
FILE: utils/prepare/hashes.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import gzip
import sys
import tarfile
from hashlib import md5
from pathlib import Path
from typing import Dict, Optional, Union
from zipfile import ZipFile

from deeppavlov.core.data.utils import file_md5


def tar_md5(fpath: Union[str, Path], chunk_size: int = 2 ** 16) -> Dict[str, str]:
    """Compute MD5 hashes of all regular files stored in a tar archive.

    Args:
        fpath: Path to the tar archive (any compression ``tarfile.open`` detects).
        chunk_size: Number of bytes read from a member at a time.

    Returns:
        Mapping from member name to the hex MD5 digest of its content, in archive
        order. Members that are not regular files are skipped.
    """
    res = {}
    # The context manager closes the archive even if hashing fails midway;
    # previously the handle was never closed.
    with tarfile.open(fpath) as tar:
        for item in tar:
            if not item.isfile():
                continue
            file_hash = md5()
            with tar.extractfile(item) as f:
                for chunk in iter(lambda: f.read(chunk_size), b""):
                    file_hash.update(chunk)
            res[item.name] = file_hash.hexdigest()
    return res


def gzip_md5(fpath: Union[str, Path], chunk_size: int = 2 ** 16) -> str:
    """Compute the MD5 hash of the decompressed content of a gzip file.

    Args:
        fpath: Path to the gzip file.
        chunk_size: Number of decompressed bytes read at a time.

    Returns:
        Hex MD5 digest of the decompressed data.
    """
    digest = md5()
    with gzip.open(fpath, 'rb') as stream:
        while True:
            block = stream.read(chunk_size)
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()


def zip_md5(fpath: Union[str, Path], chunk_size: int = 2 ** 16) -> Dict[str, str]:
    """Compute MD5 hashes of all files stored in a zip archive.

    Args:
        fpath: Path to the zip archive.
        chunk_size: Number of bytes read from a member at a time.

    Returns:
        Mapping from member file name to the hex MD5 digest of its content.
        Directory entries are skipped.
    """
    digests = {}
    with ZipFile(fpath) as archive:
        file_members = (info for info in archive.infolist() if not info.is_dir())
        for member in file_members:
            digest = md5()
            with archive.open(member) as stream:
                while True:
                    block = stream.read(chunk_size)
                    if block == b"":
                        break
                    digest.update(block)
            digests[member.filename] = digest.hexdigest()
    return digests


def compute_hashes(fpath: Union[str, Path]) -> Dict[str, str]:
    """Compute MD5 hashes for a file, looking inside supported archives.

    Args:
        fpath: Path to the file; ``~`` is expanded.

    Returns:
        For a path with a ``.tar`` suffix anywhere among its suffixes, or a final
        ``.zip`` suffix: hashes of the archive members. For a final ``.gz`` suffix:
        the hash of the decompressed data keyed by the file name without ``.gz``.
        Otherwise: the hash of the file itself keyed by its name.

    Raises:
        RuntimeError: If the path does not point to an existing file.
    """
    p = Path(fpath).expanduser()
    if not p.is_file():
        raise RuntimeError(f'{p} is not a file')

    all_suffixes = {s.lower() for s in p.suffixes}
    last_suffix = p.suffix.lower()

    # Checked first so that compressed tarballs (.tar.gz etc.) are read as tar archives.
    if '.tar' in all_suffixes:
        return tar_md5(p)
    if last_suffix == '.gz':
        return {p.with_suffix('').name: gzip_md5(p)}
    if last_suffix == '.zip':
        return zip_md5(p)
    return {p.name: file_md5(p)}


def main(fname: str, outfile: Optional[str] = None) -> None:
    """Compute hashes for ``fname`` and write them as ``<md5> *<name>`` lines.

    Args:
        fname: Path to the file to hash; ``~`` is expanded.
        outfile: Where to write the hashes. ``None`` writes next to the input file
            with ``.md5`` appended to its name, ``'-'`` writes to stdout, any other
            value is treated as the output file path.
    """
    p = Path(fname).expanduser()
    hashes = compute_hashes(p)

    if outfile == '-':
        out, owns_out = sys.stdout, False
    else:
        if outfile is None:
            out_path = p.with_suffix(p.suffix + '.md5')
        else:
            out_path = Path(outfile).expanduser()
        out, owns_out = out_path.open('w', encoding='utf-8'), True

    try:
        # Separate loop name: the original loop reused (and overwrote) the `fname` argument.
        for name, fhash in hashes.items():
            print(f'{fhash} *{name}', file=out, flush=True)
    finally:
        # Close the file we opened even when writing fails; stdout is left open.
        if owns_out:
            out.close()


if __name__ == '__main__':
    # Command-line entry point: one positional file name and an optional output location.
    cli = argparse.ArgumentParser()
    cli.add_argument("fname", help="path to a file to compute hash for", type=str)
    cli.add_argument('-o', '--outfile', help='where to write the hashes', default=None, type=str)

    cli_args = cli.parse_args()
    main(cli_args.fname, cli_args.outfile)


================================================
FILE: utils/prepare/optimize_ipynb.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
from pathlib import Path

# nbformat is imported inside a guard so that a missing installation produces
# an error message that tells which package to install.
try:
    import nbformat as nbf
except ModuleNotFoundError:
    raise ModuleNotFoundError(f"Please, run `pip install nbformat==5.8.0` before using this script.")

# Log bare messages (no level or logger name) at INFO level and above.
logging.basicConfig(level=logging.INFO, format="%(message)s")


def merge_markdown(nb: nbf.notebooknode.NotebookNode) -> None:
    """Merges consecutive markdown cells into one, in place.

    Every run of two or more adjacent markdown cells is replaced by its first cell,
    whose source becomes the right-stripped sources of the run joined by blank
    lines. Runs that reach the end of the notebook are merged as well.

    Args:
        nb: Notebook whose ``cells`` list is modified.
    """
    start_idx = None
    slices = []
    for i, cell in enumerate(nb["cells"]):
        if cell["cell_type"] == "markdown":
            if start_idx is None:
                start_idx = i
        else:
            if start_idx is not None and i - start_idx > 1:
                slices.append(slice(start_idx, i))
            start_idx = None
    # A run that is still open here ends with the last cell of the notebook;
    # without this check trailing markdown cells were never merged.
    cells_count = len(nb["cells"])
    if start_idx is not None and cells_count - start_idx > 1:
        slices.append(slice(start_idx, cells_count))

    # Process runs back to front so that earlier indices stay valid after deletions.
    for sl in slices[::-1]:
        nb["cells"][sl.start]["source"] = "\n\n".join([c["source"].rstrip() for c in nb["cells"][sl]])
        del nb["cells"][sl.start + 1: sl.stop]  # nb["cells"][sl] does not work properly


def drop_metadata(nb: nbf.notebooknode.NotebookNode) -> None:
    """Resets the metadata of the notebook and of each of its cells to an empty dict."""
    nb["metadata"] = dict()
    for cell in nb["cells"]:
        cell["metadata"] = dict()


def update_file(path: Path, update_ckpts: bool) -> None:
    """Optimizes ipynb files in order to reduce further git diffs.

    For every processed notebook consecutive markdown cells are merged and all
    metadata is dropped; the notebook is then written back to the same path.

    Args:
        path: File to update, if this is file. If this is dir - recursively searches and updates .ipynb files in it.
        update_ckpts: If False and path is dir, will skip all found ipynb files from .ipynb_checkpoints.
    """
    if path.is_dir():
        logging.info(f"Updating .ipynb files in {path} dir"
                     f"{', excluding files from .ipynb_checkpoints subdirs' if update_ckpts is False else ''}.")
        for f in path.rglob('*.ipynb'):
            if update_ckpts is False and '.ipynb_checkpoints' in f.parts:
                continue
            update_file(f, update_ckpts)
    else:
        logging.info(f"Updating {path}.")
        nb = nbf.read(path, nbf.NO_CONVERT)
        merge_markdown(nb)
        drop_metadata(nb)
        # Explicit UTF-8: the platform default encoding may be unable to encode
        # non-ASCII notebook content.
        with open(path, "w", encoding="utf-8") as fout:
            nbf.write(nb, fout)


def main() -> None:
    """Parses the command line and optimizes the given notebook file or directory."""
    cli = argparse.ArgumentParser()
    cli.add_argument("fname", help="path to an ipynb file to optimize", type=Path)
    cli.add_argument("--update-ckpts", help="update checkpoints in .ipynb_checkpoints subdirs", action="store_true")
    options = cli.parse_args()

    target = options.fname.resolve()
    update_file(target, options.update_ckpts)


if __name__ == "__main__":
    main()


================================================
FILE: utils/prepare/registry.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import pkgutil
from importlib import import_module, reload

import deeppavlov
from deeppavlov.core.common.metrics_registry import _registry_path as m_registry_path, _REGISTRY as M_REGISTRY
from deeppavlov.core.common.registry import _registry_path as c_registry_path, _REGISTRY as C_REGISTRY

if __name__ == '__main__':
    # Empty both in-memory registries before the package modules are imported again.
    for registry in (C_REGISTRY, M_REGISTRY):
        registry.clear()

    # Import and reload every module found under the deeppavlov package, except the two registry
    # modules themselves.
    # NOTE(review): presumably the reload re-runs the registration code that fills the registries,
    # and the registry modules are skipped so the objects cleared above stay the live ones - confirm.
    registry_modules = ('deeppavlov.core.common.registry', 'deeppavlov.core.common.metrics_registry')
    for module_info in pkgutil.walk_packages(deeppavlov.__path__, deeppavlov.__name__ + '.'):
        if module_info.name in registry_modules:
            continue
        reload(import_module(module_info.name))

    # Write each registry to its json file with the entries sorted by key.
    for registry_path, registry in ((c_registry_path, C_REGISTRY), (m_registry_path, M_REGISTRY)):
        with registry_path.open('w', encoding='utf-8') as fout:
            json.dump(dict(sorted(registry.items())), fout, indent=2)


================================================
FILE: utils/prepare/upload.py
================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import pathlib
import tarfile
from pathlib import Path

from deeppavlov.core.commands.utils import parse_config
from deeppavlov.core.common.file import find_config
from hashes import main


def upload(config_in_file: str, tar_name: str, tar_output_dir: Path):
    """Pack the ``MODEL_PATH`` of a config into a ``.tar.gz`` archive and pass the archive to ``hashes.main``.

    Args:
        config_in_file: Path to an existing config file. ``MODEL_PATH`` is read from its
            ``metadata.variables`` section after the config is parsed.
        tar_name: Name of the archive without the ``.tar.gz`` extension. If None, the stem of the
            config file is used.
        tar_output_dir: Existing directory in which the archive is created.

    Raises:
        RuntimeError: If ``tar_output_dir`` or ``config_in_file`` does not exist, or if the target
            archive already exists.
        FileNotFoundError: If the model path taken from the config does not exist.
    """
    # Imported here to keep the block self-contained. ``os`` used to be reached as ``pathlib.os``,
    # which depends on pathlib's internal imports rather than on a public API.
    import os

    if not tar_output_dir.exists():
        raise RuntimeError(f'A folder {tar_output_dir} does not exist')

    print(f'Config: {config_in_file}')
    if not Path(config_in_file).exists():
        raise RuntimeError(f'A config {config_in_file} does not exist')

    config_in = parse_config(config_in_file)
    config_in_file = find_config(config_in_file)

    model_path = Path(config_in['metadata']['variables']['MODEL_PATH']).expanduser()
    model_name, class_name = config_in_file.stem, config_in_file.parent.name

    if tar_name is None:
        tar_name = model_name
        print(f'tar_name set to {tar_name}')

    full_tar_name = tar_output_dir / f'{tar_name}.tar.gz'
    if full_tar_name.exists():
        raise RuntimeError(f'An archive {full_tar_name} already exists')

    print(f'model_path: {model_path}')
    print(f'class_name: {class_name}')
    print(f'model_name: {model_name}')

    # Check the model path before the archive file is created: opening the archive for writing
    # creates it right away, so failing later would leave a partial archive behind and the next
    # run would stop on the "already exists" check above. ``lexists`` does not follow symlinks.
    if not os.path.lexists(model_path):
        raise FileNotFoundError(f'A model path {model_path} does not exist')

    print(f'Start tarring to {full_tar_name}')
    with tarfile.open(str(full_tar_name), "w|gz") as archive:
        # The path separator as arcname puts the contents of model_path at the root of the archive.
        archive.add(model_path, arcname=os.sep)

    print("Stop tarring")
    print(f'Tar archive: {full_tar_name} has been created')

    print("Calculating hash")
    main(full_tar_name)


if __name__ == '__main__':
    # Command line entry point: parse the options and build the archive with ``upload``.
    parser = argparse.ArgumentParser()
    # The config is mandatory. Without it ``upload`` receives None and fails with a TypeError when
    # it builds a Path from it, so argparse is asked to report the missing option instead.
    parser.add_argument('-c', '--config_in', help='path to a config', required=True, type=str)
    parser.add_argument('-n', '--tar_name', help='name of the tar archive (without tar.gz extension)',
                        default=None, required=False, type=str)
    # argparse applies ``type`` to string defaults, so the default is passed on as a Path too.
    parser.add_argument('-o', '--tar_output_dir', help='dir to save a tar archive', default='./',
                        required=False, type=Path)
    args = parser.parse_args()
    upload(args.config_in, args.tar_name, args.tar_output_dir)